/*
 * Copyright (c) 2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>

static struct zone *flowq_zone = NULL;
static size_t flowq_size;

#define	FQ_ZONE_MAX	(32 * 1024)	/* across all interfaces */
#define	FQ_SEQ_LT(a,b)	((int)((a)-(b)) < 0)
#define	FQ_SEQ_GT(a,b)	((int)((a)-(b)) > 0)
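
/*
 * FQ_SEQ_LT()/FQ_SEQ_GT() compare TCP sequence numbers modulo 2^32,
 * using the usual serial-number trick: the unsigned difference is
 * reinterpreted as a signed int, so the ordering survives wraparound
 * as long as the two values are within 2^31 of each other. For
 * example, FQ_SEQ_LT(0xfffffff0, 0x10) is true because
 * (int)(0xfffffff0 - 0x10) == (int)0xffffffe0 == -32, which is
 * negative.
 */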
53 | ||
54 | void | |
55 | fq_codel_init(void) | |
56 | { | |
57 | if (flowq_zone != NULL) | |
58 | return; | |
59 | ||
60 | flowq_size = sizeof (fq_t); | |
61 | flowq_zone = zinit(flowq_size, FQ_ZONE_MAX * flowq_size, | |
62 | 0, "flowq_zone"); | |
63 | if (flowq_zone == NULL) { | |
64 | panic("%s: failed to allocate flowq_zone", __func__); | |
65 | /* NOTREACHED */ | |
66 | } | |
67 | zone_change(flowq_zone, Z_EXPAND, TRUE); | |
68 | zone_change(flowq_zone, Z_CALLERACCT, TRUE); | |
69 | } | |
70 | ||
71 | fq_t * | |
72 | fq_alloc(int how) | |
73 | { | |
74 | fq_t *fq = NULL; | |
75 | fq = (how == M_WAITOK) ? zalloc(flowq_zone) : | |
76 | zalloc_noblock(flowq_zone); | |
77 | if (fq == NULL) { | |
78 | log(LOG_ERR, "%s: unable to allocate from flowq_zone\n"); | |
79 | return (NULL); | |
80 | } | |
81 | ||
82 | bzero(fq, flowq_size); | |
83 | MBUFQ_INIT(&fq->fq_mbufq); | |
84 | return (fq); | |
85 | } | |
86 | ||
87 | void | |
88 | fq_destroy(fq_t *fq) | |
89 | { | |
90 | VERIFY(MBUFQ_EMPTY(&fq->fq_mbufq)); | |
91 | VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW))); | |
92 | bzero(fq, flowq_size); | |
93 | zfree(flowq_zone, fq); | |
94 | } | |
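
/*
 * The VERIFYs in fq_destroy() imply a teardown contract: a flowq must
 * be drained and unlinked from the new/old flow lists before it is
 * freed. A minimal sketch of a conforming teardown follows; the caller
 * shown here is hypothetical, not an API defined in this file:
 *
 *	... remove fq from fq_cl->fcl_new_flows / fcl_old_flows ...
 *	fq->fq_flags &= ~(FQF_NEW_FLOW | FQF_OLD_FLOW);
 *	while (!MBUFQ_EMPTY(&fq->fq_mbufq))
 *		fq_head_drop(fqs, fq);
 *	fq_destroy(fq);
 */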
95 | ||
96 | static void | |
97 | fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl, | |
98 | u_int64_t *now) | |
99 | { | |
100 | u_int64_t maxgetqtime; | |
101 | if (FQ_IS_DELAYHIGH(flowq) || flowq->fq_getqtime == 0 || | |
102 | MBUFQ_EMPTY(&flowq->fq_mbufq) || | |
103 | flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) | |
104 | return; | |
105 | maxgetqtime = flowq->fq_getqtime + fqs->fqs_update_interval; | |
106 | if ((*now) > maxgetqtime) { | |
107 | /* | |
108 | * there was no dequeue in an update interval worth of | |
109 | * time. It means that the queue is stalled. | |
110 | */ | |
111 | FQ_SET_DELAY_HIGH(flowq); | |
112 | fq_cl->fcl_stat.fcl_dequeue_stall++; | |
113 | } | |
114 | } | |
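
/*
 * Worked example of the stall check above: assume fqs_update_interval
 * corresponds to 100ms (the real value is configured elsewhere; 100ms
 * is assumed here purely for illustration). If the flow last had a
 * packet dequeued at t = 1.00s (fq_getqtime) and a packet is enqueued
 * at now = 1.15s while the queue still holds at least
 * FQ_MIN_FC_THRESHOLD_BYTES, then now exceeds fq_getqtime + interval
 * (1.15s > 1.10s), so the flow is marked delay-high and
 * fcl_dequeue_stall is incremented.
 */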
115 | ||
116 | void | |
117 | fq_head_drop(fq_if_t *fqs, fq_t *fq) | |
118 | { | |
119 | struct mbuf *m = NULL; | |
120 | struct ifclassq *ifq = fqs->fqs_ifq; | |
121 | ||
122 | m = fq_getq_flow(fqs, fq); | |
123 | if (m == NULL) | |
124 | return; | |
125 | ||
126 | IFCQ_DROP_ADD(ifq, 1, m_length(m)); | |
127 | IFCQ_CONVERT_LOCK(ifq); | |
128 | m_freem(m); | |
129 | } | |
130 | ||
131 | int | |
132 | fq_addq(fq_if_t *fqs, struct mbuf *m, fq_if_classq_t *fq_cl) | |
133 | { | |
134 | struct pkthdr *pkt = &m->m_pkthdr; | |
135 | int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS; | |
136 | u_int64_t now; | |
137 | fq_t *fq = NULL; | |
138 | ||
139 | VERIFY(!(pkt->pkt_flags & PKTF_PRIV_GUARDED)); | |
140 | pkt->pkt_flags |= PKTF_PRIV_GUARDED; | |
141 | ||
142 | if (pkt->pkt_timestamp > 0) { | |
143 | now = pkt->pkt_timestamp; | |
144 | } else { | |
145 | now = mach_absolute_time(); | |
146 | pkt->pkt_timestamp = now; | |
147 | } | |
148 | ||
149 | /* find the flowq for this packet */ | |
150 | fq = fq_if_hash_pkt(fqs, pkt->pkt_flowid, m_get_service_class(m), | |
151 | now, TRUE); | |
152 | if (fq == NULL) { | |
153 | /* drop the packet if we could not allocate a flow queue */ | |
154 | fq_cl->fcl_stat.fcl_drop_memfailure++; | |
155 | IFCQ_CONVERT_LOCK(fqs->fqs_ifq); | |
156 | m_freem(m); | |
157 | return (CLASSQEQ_DROPPED); | |
158 | } | |
159 | ||
160 | VERIFY(fq_cl->fcl_service_class == | |
161 | (u_int32_t)mbuf_get_service_class(m)); | |
162 | ||
163 | fq_detect_dequeue_stall(fqs, fq, fq_cl, &now); | |
164 | ||
165 | if (FQ_IS_DELAYHIGH(fq)) { | |
166 | if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) && | |
167 | (pkt->pkt_flags & PKTF_FLOW_ADV)) { | |
168 | fc_adv = 1; | |
169 | /* | |
170 | * If the flow is suspended or it is not | |
171 | * TCP, drop the packet | |
172 | */ | |
173 | if (pkt->pkt_proto != IPPROTO_TCP) { | |
174 | droptype = DTYPE_EARLY; | |
175 | fq_cl->fcl_stat.fcl_drop_early++; | |
176 | } | |
177 | } else { | |
178 | /* | |
179 | * Need to drop a packet, instead of dropping this | |
180 | * one, try to drop from the head of the queue | |
181 | */ | |
182 | if (!MBUFQ_EMPTY(&fq->fq_mbufq)) { | |
183 | fq_head_drop(fqs, fq); | |
184 | droptype = DTYPE_NODROP; | |
185 | } else { | |
186 | droptype = DTYPE_EARLY; | |
187 | } | |
188 | fq_cl->fcl_stat.fcl_drop_early++; | |
189 | } | |
190 | ||
191 | } | |
192 | ||
	/*
	 * Check if this packet is a retransmission of another packet
	 * already in the queue.
	 */
	if ((pkt->pkt_flags & (PKTF_TCP_REXMT|PKTF_START_SEQ)) ==
	    (PKTF_TCP_REXMT|PKTF_START_SEQ) && fq->fq_dequeue_seq != 0) {
		if (FQ_SEQ_GT(pkt->tx_start_seq, fq->fq_dequeue_seq)) {
			fq_cl->fcl_stat.fcl_dup_rexmts++;
			droptype = DTYPE_FORCED;
		}
	}
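
	/*
	 * Example of the duplicate-retransmission check above: if the
	 * last packet dequeued from this flow started at sequence 1000
	 * (fq_dequeue_seq) and a retransmission arrives carrying
	 * tx_start_seq 2000, FQ_SEQ_GT(2000, 1000) is true: the
	 * original copy of that segment has not been dequeued from
	 * this queue yet, so the retransmission is redundant and is
	 * force-dropped.
	 */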
204 | ||
205 | /* Set the return code correctly */ | |
206 | if (fc_adv == 1 && droptype != DTYPE_FORCED) { | |
207 | if (fq_if_add_fcentry(fqs, pkt, fq_cl)) { | |
208 | fq->fq_flags |= FQF_FLOWCTL_ON; | |
209 | /* deliver flow control advisory error */ | |
210 | if (droptype == DTYPE_NODROP) { | |
211 | ret = CLASSQEQ_SUCCESS_FC; | |
212 | } else { | |
213 | /* dropped due to flow control */ | |
214 | ret = CLASSQEQ_DROPPED_FC; | |
215 | } | |
216 | } else { | |
217 | /* | |
218 | * if we could not flow control the flow, it is | |
219 | * better to drop | |
220 | */ | |
221 | droptype = DTYPE_FORCED; | |
222 | ret = CLASSQEQ_DROPPED_FC; | |
223 | fq_cl->fcl_stat.fcl_flow_control_fail++; | |
224 | } | |
225 | } | |
226 | ||
227 | /* | |
228 | * If the queue length hits the queue limit, drop a packet from the | |
229 | * front of the queue for a flow with maximum number of bytes. This | |
230 | * will penalize heavy and unresponsive flows. It will also avoid a | |
231 | * tail drop. | |
232 | */ | |
233 | if (droptype == DTYPE_NODROP && fq_if_at_drop_limit(fqs)) { | |
234 | fq_if_drop_packet(fqs); | |
235 | } | |
236 | ||
237 | if (droptype == DTYPE_NODROP) { | |
238 | MBUFQ_ENQUEUE(&fq->fq_mbufq, m); | |
239 | fq->fq_bytes += m_length(m); | |
240 | fq_cl->fcl_stat.fcl_byte_cnt += m_length(m); | |
241 | fq_cl->fcl_stat.fcl_pkt_cnt++; | |
242 | ||
243 | /* | |
244 | * check if this queue will qualify to be the next | |
245 | * victim queue | |
246 | */ | |
247 | fq_if_is_flow_heavy(fqs, fq); | |
248 | } else { | |
249 | IFCQ_CONVERT_LOCK(fqs->fqs_ifq); | |
250 | m_freem(m); | |
251 | return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED); | |
252 | } | |
253 | ||
254 | /* | |
255 | * If the queue is not currently active, add it to the end of new | |
256 | * flows list for that service class. | |
257 | */ | |
258 | if ((fq->fq_flags & (FQF_NEW_FLOW|FQF_OLD_FLOW)) == 0) { | |
259 | VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL); | |
260 | STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink); | |
261 | fq->fq_flags |= FQF_NEW_FLOW; | |
262 | ||
263 | fq_cl->fcl_stat.fcl_newflows_cnt++; | |
264 | ||
265 | fq->fq_deficit = fq_cl->fcl_quantum; | |
266 | } | |
267 | return (ret); | |
268 | } | |
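
/*
 * Summary of the return codes produced above:
 *
 *	CLASSQEQ_SUCCESS	packet enqueued, no advisory
 *	CLASSQEQ_SUCCESS_FC	packet enqueued and a flow control
 *				advisory was delivered
 *	CLASSQEQ_DROPPED	packet dropped (no flow queue, early
 *				drop, or duplicate retransmission)
 *	CLASSQEQ_DROPPED_FC	packet dropped while flow controlling
 *				the flow, or because flow control
 *				could not be applied
 */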
269 | ||
270 | struct mbuf * | |
271 | fq_getq_flow(fq_if_t *fqs, fq_t *fq) | |
272 | { | |
273 | struct mbuf *m = NULL; | |
274 | struct ifclassq *ifq = fqs->fqs_ifq; | |
275 | fq_if_classq_t *fq_cl; | |
276 | u_int64_t now; | |
277 | int64_t qdelay; | |
278 | struct pkthdr *pkt; | |
279 | u_int32_t mlen; | |
280 | ||
281 | MBUFQ_DEQUEUE(&fq->fq_mbufq, m); | |
282 | if (m == NULL) | |
283 | return (NULL); | |
284 | ||
285 | mlen = m_length(m); | |
286 | ||
287 | VERIFY(fq->fq_bytes >= mlen); | |
288 | fq->fq_bytes -= mlen; | |
289 | ||
290 | fq_cl = &fqs->fqs_classq[fq->fq_sc_index]; | |
291 | fq_cl->fcl_stat.fcl_byte_cnt -= mlen; | |
292 | fq_cl->fcl_stat.fcl_pkt_cnt--; | |
293 | IFCQ_DEC_LEN(ifq); | |
294 | IFCQ_DEC_BYTES(ifq, mlen); | |
295 | ||
	pkt = &m->m_pkthdr;
	now = mach_absolute_time();

	/* this will compute qdelay in nanoseconds */
	qdelay = now - pkt->pkt_timestamp;

	if (fq->fq_min_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay < fq->fq_min_qdelay))
		fq->fq_min_qdelay = qdelay;
	if (now >= fq->fq_updatetime || MBUFQ_EMPTY(&fq->fq_mbufq)) {
		if (fq->fq_min_qdelay >= fqs->fqs_target_qdelay) {
			if (!FQ_IS_DELAYHIGH(fq))
				FQ_SET_DELAY_HIGH(fq);
		}

		if (!FQ_IS_DELAYHIGH(fq) || MBUFQ_EMPTY(&fq->fq_mbufq)) {
			FQ_CLEAR_DELAY_HIGH(fq);
			if (fq->fq_flags & FQF_FLOWCTL_ON) {
				fq_if_flow_feedback(fqs, fq, fq_cl);
			}
		}

		/* Reset measured queue delay and update time */
		fq->fq_updatetime = now + fqs->fqs_update_interval;
		fq->fq_min_qdelay = 0;
	}
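
	/*
	 * The block above is the CoDel-style control decision: over
	 * each update interval, fq_min_qdelay tracks the smallest
	 * queueing delay observed for this flow. If even that minimum
	 * is at or above fqs_target_qdelay when the interval expires,
	 * the queueing is persistent rather than a transient burst,
	 * and the flow is marked delay-high. The delay-high state, and
	 * any flow control applied to the flow, is cleared once the
	 * flow's queue drains.
	 */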
322 | ||
323 | if ((pkt->pkt_flags & PKTF_START_SEQ) && (fq->fq_dequeue_seq == 0 || | |
324 | (FQ_SEQ_LT(fq->fq_dequeue_seq, pkt->tx_start_seq)))) | |
325 | fq->fq_dequeue_seq = pkt->tx_start_seq; | |
326 | ||
327 | pkt->pkt_timestamp = 0; | |
328 | pkt->pkt_flags &= ~PKTF_PRIV_GUARDED; | |
329 | ||
330 | if (MBUFQ_EMPTY(&fq->fq_mbufq)) { | |
331 | /* | |
332 | * Remove from large_flow field, if this happened to be | |
333 | * the one that is tagged. | |
334 | */ | |
335 | if (fqs->fqs_large_flow == fq) | |
336 | fqs->fqs_large_flow = NULL; | |
337 | ||
338 | /* Reset getqtime so that we don't count idle times */ | |
339 | fq->fq_getqtime = 0; | |
340 | } else { | |
341 | fq->fq_getqtime = now; | |
342 | } | |
343 | ||
344 | return (m); | |
345 | } |
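
/*
 * Sketch of how a scheduler's dequeue path might drive fq_getq_flow().
 * The deficit-based loop mirrors how fq_deficit and fcl_quantum are
 * used above, but the surrounding caller is hypothetical and not an
 * API defined in this file:
 *
 *	struct mbuf *m;
 *	while (fq->fq_deficit > 0 &&
 *	    (m = fq_getq_flow(fqs, fq)) != NULL) {
 *		fq->fq_deficit -= m_length(m);
 *		... hand m to the driver for transmission ...
 *	}
 *	... if the flow went idle or its deficit is spent, the
 *	    scheduler moves it between the new and old flow lists ...
 */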