git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	#include <vm/lz4_assembly_select.h>
	30	#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7
	31
	32	/* void lz4_encode_2gb(uint8_t ** dst_ptr,
	33	size_t dst_size,
	34	const uint8_t ** src_ptr,
	35	const uint8_t * src_begin,
	36	size_t src_size,
	37	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
	38	int skip_final_literals) */
	39
	40	.globl _lz4_encode_2gb
	41	.syntax unified
	42
	43	#define dst_ptr r0
	44	#define dst_end r1
	45	#define src_ptr r2
	46	#define src_beg r3
	47	#define src_end r4
	48	#define table r5
	49	#define mch_ptr r6
	50	#define mch_dis r8
	51	#define mch_len r9
	52	#define mch_ref r10
	53
	54	#define margin 128
	55
	56	.macro establish_frame
	57	push {r4-r7, lr}
	58	add r7, sp, #12
	59	push {r8-r11}
	60	ldrd r4, r5, [sp, #36]
	61	push {r0, r2}
	62	ldr dst_ptr, [r0]
	63	ldr src_ptr, [r2]
	64	subs r1, r1, margin // subtract safety margin from dst_size
	65	bls L_done
	66	add dst_end, dst_ptr, r1 // dst end - margin
	67	sub r4, r4, margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
	68	add src_end, src_ptr, r4 // src end - margin.
	69	vmov.i8 q1, #255 // vector of all 1s used to emit
	70	.endm
	71
	72	.macro clear_frame_and_return
	73	pop {r1, r3}
	74	str dst_ptr, [r1]
	75	str src_ptr, [r3]
	76	pop {r8-r11}
	77	pop {r4-r7, pc}
	78	.endm
	79
	80	.p2align 4
	81	_lz4_encode_2gb:
	82	establish_frame
	83	L_next_match:
	84	// Start searching for the next match, starting at the end of the last one.
	85	// [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
	86	// within the search loop itself]. Load the hash magic number in lr, and
	87	// zero out mch_len (when we find a match, its length will initially be
	88	// four, but we actually work with the match length minus four at all times).
	89	ldr lr, L_hash
	90	sub mch_ptr, src_ptr, #1
	91
	92	L_match_candidate_loop:
	93	// If mch_ptr >= src_end, we are near the end of the source buffer (remember
	94	// that we subtracted margin from src_end, so we are not actually past the
	95	// end just yet).
	96	cmp mch_ptr, src_end
	97	bhs L_trailing_literal
	98
	99	// Load the four-byte word starting at mch_ptr, and get the address of the
	100	// corresponding row of the hash table.
	101	ldr r9, [mch_ptr, #1]!
	102	sub r8, mch_ptr, src_beg
	103	mul r12, r9, lr
	104	mvn r10, #0x7
	105	and r12, r10, r12, lsr #17 // byte offset of table entry.
	106
	107	// Load offset and word from hash table row, then update with the new offset
	108	// and word that we just computed.
	109	ldrd r10,r11, [table, r12]
	110	strd r8, r9, [table, r12]
	111
	112	// At this point, we only know that the hashes of the words match; check to
	113	// see if the words themselves match. If not, move on to the next candidate.
	114	cmp r9, r11
	115	bne L_match_candidate_loop
	116
	117	// It's not enough for the words to match; the match distance must also be
	118	// in the representable range (i.e. less than 0x10000).
	119	sub mch_dis, r8, r10
	120	add mch_ref, src_beg, r10
	121	cmp mch_dis, #0x10000
	122	bhs L_match_candidate_loop
	123
	124	// We have found a match; registers at this point are as follows:
	125	//
	126	// register symbolic name meaning
	127	// r0 dst_ptr pointer into destination buffer where the
	128	// match information will be stored.
	129	// r1 dst_end pointer to the end of the destination buffer,
	130	// less margin.
	131	// r2 src_ptr pointer to the byte after the last match that
	132	// we found, or to the point from which we
	133	// started searching if this is the first match.
	134	// r3 src_beg pointer to the actual start of the buffer.
	135	// r4 src_end pointer to the end of the source buffer, less
	136	// margin.
	137	// r5 table address of hash table.
	138	// r6 mch_ptr pointer to match.
	139	// r8 mch_dis match distance ("D")
	140	// r9 mch_len length of match less four ("M")
	141	// r10 mch_ref pointer to match reference.
	142	// r11 -
	143	// r12 -
	144	// lr -
	145	//
	146	// Attempt to grow the match backwards (typically we only grow backwards by
	147	// a byte or two, if at all, so we use a byte-by-byte scan).
	148	eor mch_len, mch_len
	149	0:cmp mch_ref, src_beg
	150	cmpne mch_ptr, src_ptr
	151	beq 1f
	152	ldrb r11, [mch_ref, #-1]
	153	ldrb r12, [mch_ptr, #-1]
	154	cmp r11, r12
	155	bne 1f
	156	sub mch_ref, #1
	157	sub mch_ptr, #1
	158	add mch_len, #1
	159	b 0b
	160
	161	// Now that we have the start of the match, we can compute the literal
	162	// length. Then advance the mch and ref pointers to the end of the match
	163	// and its reference. Because mch_len is the real match length minus four,
	164	// we actually advance to four before the end of the match, but our loop
	165	// to grow the matches uses pre-incremented loads with writeback, so this
	166	// works out correctly.
	167	#define lit_len lr
	168	1:sub lit_len, mch_ptr, src_ptr
	169	add mch_ptr, mch_len
	170	add mch_ref, mch_len
	171
	172	// Now attempt to grow the match forwards. This is much more common, and
	173	// there is a safety margin at the end of the buffer, so we grow forwards
	174	// in four-byte chunks.
	175	0:ldr r11, [mch_ptr, #4]!
	176	ldr r12, [mch_ref, #4]!
	177	eors r11, r12
	178	bne 1f
	179	add mch_len, #4
	180	cmp mch_ptr, src_end
	181	blo 0b
	182	b L_emit_match
	183	// At least one of the bytes in the last comparison did not match. Identify
	184	// which byte had the mismatch and compute the final length (less four).
	185	1:rev r11, r11
	186	clz r11, r11
	187	add mch_len, r11, lsr #3
	188
	189	L_emit_match:
	190	// Time to emit what we've found!
	191	//
	192	// register symbolic name meaning
	193	// r0 dst_ptr pointer into destination buffer where the
	194	// match information will be stored.
	195	// r1 dst_end pointer to the end of the destination buffer,
	196	// less margin.
	197	// r2 src_ptr pointer to the byte after the last match that
	198	// we found, or to the point from which we
	199	// started searching if this is the first match.
	200	// r3 src_beg pointer to the actual start of the buffer.
	201	// r4 src_end pointer to the end of the source buffer, less
	202	// margin.
	203	// r5 table address of hash table.
	204	// r6 -
	205	// r8 mch_dis match distance ("D")
	206	// r9 mch_len length of match ("M")
	207	// r10 -
	208	// r11 -
	209	// r12 -
	210	// lr lit_len literal length ("L")
	211	// q1 vector of all ones
	212	//
	213	// Synthesize control byte under the assumption that L and M are both less
	214	// than 15, jumping out of the fast path if one of them is not.
	215	cmp lit_len, #15
	216	orr r10, mch_len, lit_len, lsl #4
	217	cmplo mch_len, #15
	218	bhs L_emit_careful
	219	// L and M are both less than 15, which means (a) we use the most compact
	220	// encoding for the match and (b) we do not need to do a bounds check on
	221	// the destination buffer before writing, only before continuing our search.
	222	// Store the command byte.
	223	strb r10, [dst_ptr], #1
	224	// Copy literal.
	225	vld1.8 q0, [src_ptr]
	226	add src_ptr, lit_len
	227	vst1.8 q0, [dst_ptr]
	228	add dst_ptr, lit_len
	229	// Restore "true" match length before updating src_ptr.
	230	add mch_len, #4
	231	// Store match distance (D) and update the source pointer.
	232	strh r8, [dst_ptr], #2
	233	add src_ptr, mch_len
	234	// If we're not into the safety margin of the destination buffer, look for
	235	// another match.
	236	cmp dst_ptr, dst_end
	237	blo L_next_match
	238	// If we are into the safety margin of the destination buffer, we're done
	239	// encoding this block; update the source and destination pointers and
	240	// return.
	241	L_done:
	242	clear_frame_and_return
	243
	244	// Constant island
	245	L_hash: .long 2654435761
	246	L_magic: .long 0x80808081
	247
	248	L_emit_careful:
	249	// Either L or M is >= 15, which means that we don't get to use the compact
	250	// encoding, and that we need to do extra bounds checks while writing.
	251	//
	252	// register symbolic name meaning
	253	// r0 dst_ptr pointer into destination buffer where the
	254	// match information will be stored.
	255	// r1 dst_end pointer to the end of the destination buffer,
	256	// less margin.
	257	// r2 src_ptr pointer to the byte after the last match that
	258	// we found, or to the point from which we
	259	// started searching if this is the first match.
	260	// r3 src_beg pointer to the actual start of the buffer.
	261	// r4 src_end pointer to the end of the source buffer, less
	262	// margin.
	263	// r5 table address of hash table.
	264	// r6 -
	265	// r8 mch_dis match distance ("D")
	266	// r9 mch_len length of match ("M") less four
	267	// r10 -
	268	// r11 -
	269	// r12 -
	270	// lr lit_len literal length ("L")
	271	// q1 vector of all ones
	272	//
	273	// Start by creating the low 4 bits of the control word; M if M < 15, 0xf
	274	// otherwise. We also load 0x80808081, which is the magic number for
	275	// division by 255; this will be required later on.
	276	ldr r12, L_magic
	277	cmp mch_len, #15
	278	mov r10, mch_len
	279	movhs r10, #0x0f
	280	subs r6, lit_len, #15
	281	bhs L_large_L
	282	// M is large, but L is < 15. This means we can use the simple approach
	283	// for copying the literal with no bounds checks.
	284	orr r10, lit_len, lsl #4
	285	strb r10, [dst_ptr], #1
	286	// Copy literal.
	287	vld1.8 q0, [src_ptr]
	288	add src_ptr, lit_len
	289	vst1.8 q0, [dst_ptr]
	290	add dst_ptr, lit_len
	291	// Store match distance (D).
	292	strh r8, [dst_ptr], #2
	293	sub r6, mch_len, #15
	294	b L_large_M
	295
	296	L_large_L:
	297	// L is large, M may or may not be. We need to encode extra literal length
	298	// bytes and we need to do bounds checks while store both those byte and the
	299	// literal itself.
	300	orr r10, #0xf0
	301	strb r10, [dst_ptr], #1
	302	// How many extra literal bytes do we need to store? We need to store
	303	// (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
	304	// (L - 15)%255. Get these quantities via magic number multiplication:
	305	// (L - 15)*0x80808081 >> (32 + 7)
	306	umull r10, r11, r6, r12
	307	mov r12, #255
	308	lsr r10, r11, #7 // (L - 15) / 255
	309	mls r11, r10, r12, r6 // (L - 15) % 255
	310	ldr r12, L_magic // may need magic number again for M.
	311	// Compute address dst_ptr will have after storing all 0xff bytes, and
	312	// check that we won't exceed dst_end in doing so.
	313	add r10, dst_ptr, r10
	314	cmp r10, dst_end
	315	bhs L_done
	316	// There's enough space for all the 0xff bytes, so go ahead and store them.
	317	0:vst1.8 q1, [dst_ptr]!
	318	cmp dst_ptr, r10
	319	blo 0b
	320	// Store the (L - 15) % 255 byte.
	321	strb r11, [r10], #1
	322	// Compute the address we'll have reached after storing all literal bytes.
	323	// If that passes dst_end, we're done.
	324	add dst_ptr, r10, lit_len
	325	cmp dst_ptr, dst_end
	326	bhs L_done
	327	// Copy the literal.
	328	0:vld1.8 q0, [src_ptr]!
	329	vst1.8 q0, [r10]!
	330	subs r6, r10, dst_ptr
	331	blo 0b
	332	// Fixup src_ptr, store match distance (D), and check whether or not M is
	333	// bigger than 14. If not, go find the next match.
	334	strh r8, [dst_ptr], #2
	335	sub src_ptr, r6
	336	subs r6, mch_len, #15
	337	bhs L_large_M
	338	// M is small, so we're all done; we just need to update the source pointer
	339	// and we can go look for the next match.
	340	add mch_len, #4
	341	add src_ptr, mch_len
	342	b L_next_match
	343
	344	L_large_M:
	345	// Just like with large L, we split (M - 15) into (M - 15) / 255 and
	346	// (M - 15) % 255 via magic number multiply.
	347	umull r10, r11, r6, r12
	348	mov r12, #255
	349	lsr r10, r11, #7 // (M - 15) / 255
	350	mls r11, r10, r12, r6 // (M - 15) % 255
	351	// Compute address dst_ptr will have after storing all 0xff bytes, and
	352	// check that we won't exceed dst_end in doing so.
	353	add r10, dst_ptr, r10
	354	cmp r10, dst_end
	355	bhs L_done
	356	// There's enough space for all the 0xff bytes, so go ahead and store them.
	357	0:vst1.8 q1, [dst_ptr]!
	358	cmp dst_ptr, r10
	359	blo 0b
	360	// Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
	361	// match.
	362	strb r11, [r10]
	363	add mch_len, #4
	364	add dst_ptr, r10, #1
	365	add src_ptr, mch_len
	366	b L_next_match
	367
	368	L_trailing_literal:
	369	// Check if skip_final_literals is set.
	370	ldr r5, [sp, #52]
	371	tst r5, r5
	372	bne L_done
	373	// Emit a trailing literal that covers the remainder of the source buffer,
	374	// if we can do so without exceeding the bounds of the destination buffer.
	375	add src_end, margin
	376	sub lit_len, src_end, src_ptr
	377	subs r6, lit_len, #15
	378	bhs L_trailing_literal_long
	379	lsl r10, lit_len, #4
	380	strb r10, [dst_ptr], #1
	381	vld1.8 q0, [src_ptr]
	382	mov src_ptr, src_end
	383	vst1.8 q0, [dst_ptr]
	384	add dst_ptr, lit_len
	385	b L_done
	386
	387	L_trailing_literal_long:
	388	ldr r12, L_magic
	389	mov r10, #0xf0
	390	add dst_end, margin
	391	strb r10, [dst_ptr], #1
	392	umull r10, r11, r6, r12
	393	mov r12, #255
	394	lsr r10, r11, #7 // (L - 15) / 255
	395	mls r11, r10, r12, r6 // (L - 15) % 255
	396	// We want to write out lit_len + (L - 15)/255 + 1 bytes. Check if we have
	397	// space for all of them.
	398	add r10, dst_ptr
	399	add r12, r10, lit_len
	400	cmp r12, dst_end
	401	bhs L_done
	402	// We have enough space, so go ahead and write them all out. Because we
	403	// know that we have enough space, and that the literal is at least 15 bytes,
	404	// we can write the block of 0xffs using vector stores, even without a
	405	// safety margin.
	406	0:vst1.8 q1, [dst_ptr]!
	407	cmp dst_ptr, r10
	408	blo 0b
	409	// Store the (L - 15) % 255 byte.
	410	strb r11, [r10], #1
	411	mov dst_ptr, r10
	412	// Now store the literal itself; here we need to actually be somewhat
	413	// careful to ensure that we don't write past the end of the destination
	414	// buffer or read past the end of the source buffer.
	415	subs lit_len, #16
	416	blo 1f
	417	0:vld1.8 q0, [src_ptr]!
	418	subs lit_len, #16
	419	vst1.8 q0, [dst_ptr]!
	420	bhs 0b
	421	1:adds lit_len, #16
	422	beq L_done
	423	2:ldrb r6, [src_ptr], #1
	424	subs lit_len, #1
	425	strb r6, [dst_ptr], #1
	426	bne 2b
	427	b L_done
	428
	429	#endif