]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/mptcp.c
xnu-3247.1.106.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp.c
1 /*
2 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/mbuf.h>
33 #include <sys/mcache.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 #include <sys/protosw.h>
38
39 #include <kern/zalloc.h>
40 #include <kern/locks.h>
41
42 #include <mach/thread_act.h>
43 #include <mach/sdt.h>
44
45 #include <dev/random/randomdev.h>
46
47 #include <net/if.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_seq.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/mptcp_var.h>
55 #include <netinet/mptcp.h>
56 #include <netinet/mptcp_seq.h>
57 #include <netinet/mptcp_opt.h>
58 #include <netinet/mptcp_timer.h>
59
60 int mptcp_enable = 1;
61 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
62 &mptcp_enable, 0, "Enable Multipath TCP Support");
63
64 /* Number of times to try negotiating MPTCP on SYN retransmissions */
65 int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
66 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
67 CTLFLAG_RW | CTLFLAG_LOCKED,
68 &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
69
70 /*
71 * By default, DSS checksum is turned off, revisit if we ever do
72 * MPTCP for non SSL Traffic.
73 */
74 int mptcp_dss_csum = 0;
75 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
76 &mptcp_dss_csum, 0, "Enable DSS checksum");
77
78 /*
79 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
80 * is attempted on a different path.
81 */
82 int mptcp_fail_thresh = 1;
83 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
84 &mptcp_fail_thresh, 0, "Failover threshold");
85
86
87 /*
88 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
89 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
90 * Some carrier networks have a timeout of 10 or 15 minutes.
91 */
92 int mptcp_subflow_keeptime = 60*14;
93 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
94 &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
95
96 /*
97 * MP_PRIO option.
98 */
99 int mptcp_mpprio_enable = 1;
100 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mpprio, CTLFLAG_RW | CTLFLAG_LOCKED,
101 &mptcp_mpprio_enable, 0, "Enable MP_PRIO option");
102
103 /*
104 * REMOVE_ADDR option.
105 */
106 int mptcp_remaddr_enable = 1;
107 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, remaddr, CTLFLAG_RW | CTLFLAG_LOCKED,
108 &mptcp_remaddr_enable, 0, "Enable REMOVE_ADDR option");
109
110 /*
111 * FastJoin Option
112 */
113 int mptcp_fastjoin = 1;
114 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fastjoin, CTLFLAG_RW | CTLFLAG_LOCKED,
115 &mptcp_fastjoin, 0, "Enable FastJoin Option");
116
117 int mptcp_zerortt_fastjoin = 0;
118 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, zerortt_fastjoin, CTLFLAG_RW |
119 CTLFLAG_LOCKED, &mptcp_zerortt_fastjoin, 0,
120 "Enable Zero RTT Fast Join");
121
122 /*
123 * R/W Notification on resume
124 */
125 int mptcp_rwnotify = 0;
126 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rwnotify, CTLFLAG_RW | CTLFLAG_LOCKED,
127 &mptcp_rwnotify, 0, "Enable RW notify on resume");
128
129 /*
130 * Using RTT history for sending new data
131 */
132 int mptcp_use_rtthist = 1;
133 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist, CTLFLAG_RW | CTLFLAG_LOCKED,
134 &mptcp_use_rtthist, 0, "Disable RTT History");
135
136 #define MPTCP_RTTHIST_MINTHRESH 500
137 int mptcp_rtthist_rtthresh = 600;
138 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
139 &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
140
141 /*
142 * Use RTO history for sending new data
143 */
144 int mptcp_use_rto = 1;
145 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
146 &mptcp_use_rto, 0, "Disable RTO for subflow selection");
147
148 #define MPTCP_RTO_MINTHRESH 1000
149 int mptcp_rtothresh = 1500;
150 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
151 &mptcp_rtothresh, 0, "RTO threshold");
152
153 /*
154 * Use server's chosen path for sending new data
155 */
156 int mptcp_peerswitch = 1;
157 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, use_peer, CTLFLAG_RW | CTLFLAG_LOCKED,
158 &mptcp_peerswitch, 0, "Use peer");
159
160 #define MPTCP_PEERSWITCH_CNTMIN 3
161 uint32_t mptcp_peerswitch_cnt = 3;
162 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, peerswitchno, CTLFLAG_RW | CTLFLAG_LOCKED,
163 &mptcp_peerswitch_cnt, 0, "Set threshold based on peer's data arrival");
164
165 /*
166 * Probe the preferred path, when it is not in use
167 */
168 #define MPTCP_PROBETO_MIN 500
169 uint32_t mptcp_probeto = 1000;
170 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
171 &mptcp_probeto, 0, "Disable probing by setting to 0");
172
173 #define MPTCP_PROBE_MX 15
174 uint32_t mptcp_probecnt = 5;
175 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
176 &mptcp_probecnt, 0, "Number of probe writes");
177
178 /*
179 * Static declarations
180 */
181 static int mptcp_validate_csum(struct tcpcb *, struct mbuf *, int);
182 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, int);
183
184 /*
185 * MPTCP input, called when data has been read from a subflow socket.
186 */
187 void
188 mptcp_input(struct mptses *mpte, struct mbuf *m)
189 {
190 struct socket *mp_so;
191 struct mptcb *mp_tp = NULL;
192 u_int64_t mb_dsn;
193 u_int32_t mb_datalen;
194 int count = 0;
195 struct mbuf *save = NULL, *prev = NULL;
196 struct mbuf *freelist = NULL, *tail = NULL;
197 boolean_t in_fallback = FALSE;
198
199 VERIFY(m->m_flags & M_PKTHDR);
200
201 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
202 mp_so = mpte->mpte_mppcb->mpp_socket;
203
204 DTRACE_MPTCP(input);
205
206 /*
207 * Each mbuf contains MPTCP Data Sequence Map
208 * Process the data for reassembly, delivery to MPTCP socket
209 * client, etc.
210 *
211 */
212 count = mp_so->so_rcv.sb_cc;
213
214 VERIFY(m != NULL);
215 mp_tp = mpte->mpte_mptcb;
216 VERIFY(mp_tp != NULL);
217
218 /* Ok to check for this flag without lock as its set in this thread */
219 in_fallback = (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
220
221 /*
222 * In the degraded fallback case, data is accepted without DSS map
223 */
224 if (in_fallback) {
225 fallback:
226 /*
227 * assume degraded flow as this may be the first packet
228 * without DSS, and the subflow state is not updated yet.
229 */
230 if (sbappendstream(&mp_so->so_rcv, m))
231 sorwakeup(mp_so);
232 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
233 struct socket *, mp_so,
234 struct sockbuf *, &mp_so->so_rcv,
235 struct sockbuf *, &mp_so->so_snd,
236 struct mptses *, mpte);
237 count = mp_so->so_rcv.sb_cc - count;
238 mptcplog((LOG_DEBUG, "MPTCP Receiver: Fallback read %d bytes\n",
239 count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
240 return;
241 }
242
243 MPT_LOCK(mp_tp);
244 do {
245 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
246 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
247 MPT_UNLOCK(mp_tp);
248 goto fallback;
249 }
250
251 save = m->m_next;
252 /*
253 * A single TCP packet formed of multiple mbufs
254 * holds DSS mapping in the first mbuf of the chain.
255 * Other mbufs in the chain may have M_PKTHDR set
256 * even though they belong to the same TCP packet
257 * and therefore use the DSS mapping stored in the
258 * first mbuf of the mbuf chain. mptcp_input() can
259 * get an mbuf chain with multiple TCP packets.
260 */
261 while (save && (!(save->m_flags & M_PKTHDR) ||
262 !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
263 prev = save;
264 save = save->m_next;
265 }
266 if (prev)
267 prev->m_next = NULL;
268 else
269 m->m_next = NULL;
270
271 mb_dsn = m->m_pkthdr.mp_dsn;
272 mb_datalen = m->m_pkthdr.mp_rlen;
273
274 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvatmark)) {
275 tcpstat.tcps_mp_oodata++;
276 MPT_UNLOCK(mp_tp);
277 m_freem(m);
278 return;
279 /*
280 * Reassembly queue support here in future. Per spec,
281 * senders must implement retransmission timer to
282 * retransmit unacked data. Dropping out of order
283 * gives a slight hit on performance but allows us to
284 * deploy MPTCP and protects us against in-window DoS
285 * attacks that attempt to use up memory by sending
286 * out of order data. When doing load sharing across
287 * subflows, out of order support is a must.
288 */
289 }
290
291 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvatmark)) {
292 if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
293 mp_tp->mpt_rcvatmark)) {
294 if (freelist == NULL)
295 freelist = m;
296 else
297 tail->m_next = m;
298
299 if (prev != NULL)
300 tail = prev;
301 else
302 tail = m;
303
304 m = save;
305 prev = save = NULL;
306 continue;
307 } else {
308 m_adj(m, (mp_tp->mpt_rcvatmark - mb_dsn));
309 }
310 mptcplog((LOG_INFO, "MPTCP Receiver: Left Edge %llu\n",
311 mp_tp->mpt_rcvatmark),
312 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
313 }
314
315 MPT_UNLOCK(mp_tp);
316 if (sbappendstream(&mp_so->so_rcv, m)) {
317 sorwakeup(mp_so);
318 }
319 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
320 struct sockbuf *, &mp_so->so_rcv,
321 struct sockbuf *, &mp_so->so_snd,
322 struct mptses *, mpte,
323 struct mptcb *, mp_tp);
324 MPT_LOCK(mp_tp);
325 count = mp_so->so_rcv.sb_cc - count;
326 tcpstat.tcps_mp_rcvtotal++;
327 tcpstat.tcps_mp_rcvbytes += count;
328 mptcplog((LOG_DEBUG, "MPTCP Receiver: Read %d bytes\n", count),
329 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
330
331 /*
332 * The data received at the MPTCP layer will never exceed the
333 * receive window because anything to the right of the
334 * receive window will be trimmed at the subflow level.
335 */
336 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
337 mp_tp->mpt_rcvatmark += count;
338 m = save;
339 prev = save = NULL;
340 count = mp_so->so_rcv.sb_cc;
341 } while (m);
342 MPT_UNLOCK(mp_tp);
343
344 if (freelist)
345 m_freem(freelist);
346 }
347
348 /*
349 * MPTCP output.
350 */
351 int
352 mptcp_output(struct mptses *mpte)
353 {
354 struct mptsub *mpts;
355 struct mptsub *mpts_tried = NULL;
356 struct socket *mp_so;
357 struct mptsub *preferred_mpts = NULL;
358 int error = 0;
359
360 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
361 mp_so = mpte->mpte_mppcb->mpp_socket;
362 if (mp_so->so_state & SS_CANTSENDMORE) {
363 mptcplog((LOG_DEBUG, "MPTCP Sender: cantsendmore\n"),
364 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
365 return (EPIPE);
366 }
367
368 try_again:
369 /* get the "best" subflow to be used for transmission */
370 mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
371 if (mpts == NULL) {
372 mptcplog((LOG_ERR, "MPTCP Sender: mp_so 0x%llx no subflow\n",
373 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
374 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
375 goto out;
376 }
377
378 mptcplog((LOG_DEBUG, "MPTCP Sender: mp_so 0x%llx using cid %d \n",
379 (uint64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
380 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
381
382 /* In case there's just one flow, we reattempt later */
383 MPTS_LOCK(mpts);
384 if ((mpts_tried != NULL) && ((mpts == mpts_tried) ||
385 (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
386 MPTS_UNLOCK(mpts);
387 MPTS_LOCK(mpts_tried);
388 mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
389 mpts_tried->mpts_flags |= MPTSF_ACTIVE;
390 MPTS_UNLOCK(mpts_tried);
391 mptcp_start_timer(mpte, MPTT_REXMT);
392 mptcplog((LOG_DEBUG, "MPTCP Sender: mp_so 0x%llx retry later\n",
393 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
394 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
395 goto out;
396 }
397
398 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
399 struct socket *, mp_so);
400 error = mptcp_subflow_output(mpte, mpts);
401 if (error) {
402 /* can be a temporary loss of source address or other error */
403 mpts->mpts_flags |= MPTSF_FAILINGOVER;
404 mpts->mpts_flags &= ~MPTSF_ACTIVE;
405 mpts_tried = mpts;
406 MPTS_UNLOCK(mpts);
407 mptcplog((LOG_INFO, "MPTCP Sender: Error = %d \n", error),
408 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
409 goto try_again;
410 }
411 /* The model is to have only one active flow at a time */
412 mpts->mpts_flags |= MPTSF_ACTIVE;
413 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
414 MPTS_UNLOCK(mpts);
415
416 /* Allows us to update the smoothed rtt */
417 if ((mptcp_probeto) && (mptcp_probeto >= MPTCP_PROBETO_MIN) &&
418 (mpts != preferred_mpts) && (preferred_mpts != NULL)) {
419 MPTS_LOCK(preferred_mpts);
420 if (preferred_mpts->mpts_probesoon) {
421 if ((tcp_now - preferred_mpts->mpts_probesoon) >
422 mptcp_probeto) {
423 (void) mptcp_subflow_output(mpte, preferred_mpts);
424 if (preferred_mpts->mpts_probecnt >=
425 MIN(mptcp_probecnt, MPTCP_PROBE_MX)) {
426 preferred_mpts->mpts_probesoon = 0;
427 preferred_mpts->mpts_probecnt = 0;
428 }
429 }
430 } else {
431 preferred_mpts->mpts_probesoon = tcp_now;
432 preferred_mpts->mpts_probecnt = 0;
433 }
434 MPTS_UNLOCK(preferred_mpts);
435 }
436
437 if (mpte->mpte_active_sub == NULL) {
438 mpte->mpte_active_sub = mpts;
439 } else if (mpte->mpte_active_sub != mpts) {
440 mptcplog((LOG_DEBUG, "MPTCP Sender: switch [cid %d, srtt %d]"
441 "to [cid %d, srtt %d]\n",
442 mpte->mpte_active_sub->mpts_connid,
443 mpte->mpte_active_sub->mpts_srtt >> 5,
444 mpts->mpts_connid,
445 mpts->mpts_srtt >> 5),
446 MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
447
448 MPTS_LOCK(mpte->mpte_active_sub);
449 mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
450 mpts->mpts_peerswitch = 0;
451 MPTS_UNLOCK(mpte->mpte_active_sub);
452 mpte->mpte_active_sub = mpts;
453 tcpstat.tcps_mp_switches++;
454 }
455 out:
456 /* subflow errors should not be percolated back up */
457 return (0);
458 }
459
460 /*
461 * Return the most eligible subflow to be used for sending data.
462 * This function also serves to check if any alternate subflow is available
463 * or not. best and second_best flows are chosen by their priority. third_best
464 * could be best or second_best but is under loss at the time of evaluation.
465 */
466 struct mptsub *
467 mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
468 {
469 struct mptsub *mpts;
470 struct mptsub *best = NULL;
471 struct mptsub *second_best = NULL;
472 struct mptsub *third_best = NULL;
473 struct mptsub *symptoms_best = NULL;
474 struct socket *so = NULL;
475
476 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
477
478 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
479 MPTS_LOCK(mpts);
480
481 if ((ignore) && (mpts == ignore)) {
482 MPTS_UNLOCK(mpts);
483 continue;
484 }
485
486 /* There can only be one subflow in degraded state */
487 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
488 MPTS_UNLOCK(mpts);
489 best = mpts;
490 break;
491 }
492
493 /*
494 * Subflows with Fastjoin allow data to be written before
495 * the subflow is mp capable.
496 */
497 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
498 !(mpts->mpts_flags & MPTSF_FASTJ_REQD)) {
499 MPTS_UNLOCK(mpts);
500 continue;
501 }
502
503 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
504 MPTS_UNLOCK(mpts);
505 continue;
506 }
507
508 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
509 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
510 MPTS_UNLOCK(mpts);
511 continue;
512 }
513
514 if (mpts->mpts_flags & MPTSF_FAILINGOVER) {
515 so = mpts->mpts_socket;
516 if ((so) && (!(so->so_flags & SOF_PCBCLEARING))) {
517 socket_lock(so, 1);
518 if ((so->so_snd.sb_cc == 0) &&
519 (mptcp_no_rto_spike(so))) {
520 mpts->mpts_flags &= ~MPTSF_FAILINGOVER;
521 so->so_flags &= ~SOF_MP_TRYFAILOVER;
522 socket_unlock(so, 1);
523 } else {
524 third_best = mpts;
525 mptcplog((LOG_DEBUG, "MPTCP Sender: "
526 "%s cid %d in failover\n",
527 __func__, third_best->mpts_connid),
528 MPTCP_SENDER_DBG,
529 MPTCP_LOGLVL_VERBOSE);
530 socket_unlock(so, 1);
531 MPTS_UNLOCK(mpts);
532 continue;
533 }
534 } else {
535 MPTS_UNLOCK(mpts);
536 continue;
537 }
538 }
539
540 /* When there are no preferred flows, use first one in list */
541 if ((!second_best) && !(mpts->mpts_flags & MPTSF_PREFERRED))
542 second_best = mpts;
543
544 if (mpts->mpts_flags & MPTSF_PREFERRED) {
545 best = mpts;
546 }
547
548 MPTS_UNLOCK(mpts);
549 }
550
551 /*
552 * If there is no preferred or backup subflow, and there is no active
553 * subflow use the last usable subflow.
554 */
555 if (best == NULL) {
556 return (second_best ? second_best : third_best);
557 }
558
559 if (second_best == NULL) {
560 return (best ? best : third_best);
561 }
562
563 if (preferred != NULL)
564 *preferred = best;
565
566 /* Use a hint from symptomsd if it exists */
567 symptoms_best = mptcp_use_symptoms_hints(best, second_best);
568 if (symptoms_best != NULL)
569 return (symptoms_best);
570
571 /* Compare RTTs, select second_best if best's rtt exceeds rttthresh */
572 if ((mptcp_use_rtthist) &&
573 (best->mpts_srtt) && (second_best->mpts_srtt) &&
574 (best->mpts_srtt > second_best->mpts_srtt) &&
575 (best->mpts_srtt >= MAX((MPTCP_RTTHIST_MINTHRESH << 5),
576 (mptcp_rtthist_rtthresh << 5)))) {
577 tcpstat.tcps_mp_sel_rtt++;
578 mptcplog((LOG_DEBUG, "MPTCP Sender: %s best cid %d"
579 " at rtt %d, second cid %d at rtt %d\n", __func__,
580 best->mpts_connid, best->mpts_srtt >> 5,
581 second_best->mpts_connid,
582 second_best->mpts_srtt >> 5),
583 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
584 return (second_best);
585 }
586
587 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
588 if ((mptcp_use_rto) &&
589 (best->mpts_rxtcur) && (second_best->mpts_rxtcur) &&
590 (best->mpts_rxtcur > second_best->mpts_rxtcur) &&
591 (best->mpts_rxtcur >=
592 MAX(MPTCP_RTO_MINTHRESH, mptcp_rtothresh))) {
593 tcpstat.tcps_mp_sel_rto++;
594 mptcplog((LOG_DEBUG, "MPTCP Sender: %s best cid %d"
595 " at rto %d, second cid %d at rto %d\n", __func__,
596 best->mpts_connid, best->mpts_rxtcur,
597 second_best->mpts_connid, second_best->mpts_rxtcur),
598 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
599
600 return (second_best);
601 }
602
603 /* If second_best received data, use second_best */
604 if (mptcp_peerswitch &&
605 (second_best->mpts_peerswitch >
606 MAX(MPTCP_PEERSWITCH_CNTMIN, mptcp_peerswitch_cnt))) {
607 tcpstat.tcps_mp_sel_peer++;
608 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: best cid %d"
609 " but using cid %d after receiving %d segments\n",
610 __func__, best->mpts_connid, second_best->mpts_connid,
611 second_best->mpts_peerswitch), MPTCP_SENDER_DBG,
612 MPTCP_LOGLVL_LOG);
613 return (second_best);
614 }
615 return (best);
616 }
617
618 struct mptsub *
619 mptcp_get_pending_subflow(struct mptses *mpte, struct mptsub *ignore)
620 {
621 struct mptsub *mpts = NULL;
622
623 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
624
625 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
626 MPTS_LOCK(mpts);
627
628 if ((ignore) && (mpts == ignore)) {
629 MPTS_UNLOCK(mpts);
630 continue;
631 }
632
633 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
634 MPTS_UNLOCK(mpts);
635 break;
636 }
637
638 MPTS_UNLOCK(mpts);
639 }
640 return (mpts);
641 }
642
643 static const char *
644 mptcp_event_to_str(uint32_t event)
645 {
646 const char *c = "UNDEFINED";
647 switch (event) {
648 case MPCE_CLOSE:
649 c = "MPCE_CLOSE";
650 break;
651 case MPCE_RECV_DATA_ACK:
652 c = "MPCE_RECV_DATA_ACK";
653 break;
654 case MPCE_RECV_DATA_FIN:
655 c = "MPCE_RECV_DATA_FIN";
656 break;
657 }
658 return (c);
659 }
660
661 static const char *
662 mptcp_state_to_str(mptcp_state_t state)
663 {
664 const char *c = "UNDEFINED";
665 switch (state) {
666 case MPTCPS_CLOSED:
667 c = "MPTCPS_CLOSED";
668 break;
669 case MPTCPS_LISTEN:
670 c = "MPTCPS_LISTEN";
671 break;
672 case MPTCPS_ESTABLISHED:
673 c = "MPTCPS_ESTABLISHED";
674 break;
675 case MPTCPS_CLOSE_WAIT:
676 c = "MPTCPS_CLOSE_WAIT";
677 break;
678 case MPTCPS_FIN_WAIT_1:
679 c = "MPTCPS_FIN_WAIT_1";
680 break;
681 case MPTCPS_CLOSING:
682 c = "MPTCPS_CLOSING";
683 break;
684 case MPTCPS_LAST_ACK:
685 c = "MPTCPS_LAST_ACK";
686 break;
687 case MPTCPS_FIN_WAIT_2:
688 c = "MPTCPS_FIN_WAIT_2";
689 break;
690 case MPTCPS_TIME_WAIT:
691 c = "MPTCPS_TIME_WAIT";
692 break;
693 case MPTCPS_FASTCLOSE_WAIT:
694 c = "MPTCPS_FASTCLOSE_WAIT";
695 break;
696 case MPTCPS_TERMINATE:
697 c = "MPTCPS_TERMINATE";
698 break;
699 }
700 return (c);
701 }
702
703 void
704 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
705 {
706 MPT_LOCK_ASSERT_HELD(mp_tp);
707 mptcp_state_t old_state = mp_tp->mpt_state;
708
709 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
710 uint32_t, event);
711
712 switch (mp_tp->mpt_state) {
713 case MPTCPS_CLOSED:
714 case MPTCPS_LISTEN:
715 mp_tp->mpt_state = MPTCPS_CLOSED;
716 break;
717
718 case MPTCPS_ESTABLISHED:
719 if (event == MPCE_CLOSE) {
720 mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
721 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
722 }
723 else if (event == MPCE_RECV_DATA_FIN) {
724 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
725 mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
726 }
727 break;
728
729 case MPTCPS_CLOSE_WAIT:
730 if (event == MPCE_CLOSE) {
731 mp_tp->mpt_state = MPTCPS_LAST_ACK;
732 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
733 }
734 break;
735
736 case MPTCPS_FIN_WAIT_1:
737 if (event == MPCE_RECV_DATA_ACK)
738 mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
739 else if (event == MPCE_RECV_DATA_FIN) {
740 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
741 mp_tp->mpt_state = MPTCPS_CLOSING;
742 }
743 break;
744
745 case MPTCPS_CLOSING:
746 if (event == MPCE_RECV_DATA_ACK)
747 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
748 break;
749
750 case MPTCPS_LAST_ACK:
751 if (event == MPCE_RECV_DATA_ACK)
752 mp_tp->mpt_state = MPTCPS_TERMINATE;
753 break;
754
755 case MPTCPS_FIN_WAIT_2:
756 if (event == MPCE_RECV_DATA_FIN) {
757 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
758 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
759 }
760 break;
761
762 case MPTCPS_TIME_WAIT:
763 break;
764
765 case MPTCPS_FASTCLOSE_WAIT:
766 if (event == MPCE_CLOSE) {
767 /* no need to adjust for data FIN */
768 mp_tp->mpt_state = MPTCPS_TERMINATE;
769 }
770 break;
771 case MPTCPS_TERMINATE:
772 break;
773 default:
774 VERIFY(0);
775 /* NOTREACHED */
776 }
777 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
778 uint32_t, event);
779 mptcplog((LOG_INFO, "MPTCP State: %s to %s on event %s\n",
780 mptcp_state_to_str(old_state),
781 mptcp_state_to_str(mp_tp->mpt_state),
782 mptcp_event_to_str(event)),
783 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
784 }
785
786 /*
787 * Update the mptcb send state variables, but the actual sbdrop occurs
788 * in MPTCP layer
789 */
790 void
791 mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
792 {
793 u_int64_t acked = 0;
794
795 acked = full_dack - mp_tp->mpt_snduna;
796
797 if (acked) {
798 mp_tp->mpt_snduna += acked;
799 /* In degraded mode, we may get some Data ACKs */
800 if ((tp->t_mpflags & TMPF_TCP_FALLBACK) &&
801 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
802 MPTCP_SEQ_GT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
803 /* bring back sndnxt to retransmit MPTCP data */
804 mp_tp->mpt_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
805 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
806 tp->t_inpcb->inp_socket->so_flags1 |=
807 SOF1_POST_FALLBACK_SYNC;
808 }
809 }
810 if ((full_dack == mp_tp->mpt_sndmax) &&
811 (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1)) {
812 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_ACK);
813 tp->t_mpflags &= ~TMPF_SEND_DFIN;
814 }
815 }
816
817 /* If you change this function, match up mptcp_update_rcv_state_f */
818 void
819 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
820 uint16_t csum)
821 {
822 struct mptcb *mp_tp = tptomptp(tp);
823 u_int64_t full_dsn = 0;
824
825 NTOHL(dss_info->mdss_dsn);
826 NTOHL(dss_info->mdss_subflow_seqn);
827 NTOHS(dss_info->mdss_data_len);
828
829 /* XXX for autosndbuf grow sb here */
830 MPT_LOCK(mp_tp);
831 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
832 MPT_UNLOCK(mp_tp);
833 mptcp_update_rcv_state_meat(mp_tp, tp,
834 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
835 csum);
836
837 }
838
839 void
840 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
841 u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
842 uint16_t csum)
843 {
844 if (mdss_data_len == 0) {
845 mptcplog((LOG_INFO, "MPTCP Receiver: Infinite Mapping.\n"),
846 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
847
848 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
849 mptcplog((LOG_ERR, "MPTCP Receiver: Bad checksum %x \n",
850 csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
851 }
852 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
853 return;
854 }
855 MPT_LOCK(mp_tp);
856 mptcplog((LOG_DEBUG,
857 "MPTCP Receiver: seqn = %x len = %x full = %llx "
858 "rcvnxt = %llu \n",
859 seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
860 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
861
862 /* Process a Data FIN packet , handled in mptcp_do_fin_opt */
863 if ((seqn == 0) && (mdss_data_len == 1)) {
864 mptcplog((LOG_INFO, "MPTCP Receiver: Data FIN in %s state \n",
865 mptcp_state_to_str(mp_tp->mpt_state)),
866 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
867 MPT_UNLOCK(mp_tp);
868 return;
869 }
870 MPT_UNLOCK(mp_tp);
871 mptcp_notify_mpready(tp->t_inpcb->inp_socket);
872 tp->t_rcv_map.mpt_dsn = full_dsn;
873 tp->t_rcv_map.mpt_sseq = seqn;
874 tp->t_rcv_map.mpt_len = mdss_data_len;
875 tp->t_rcv_map.mpt_csum = csum;
876 tp->t_mpflags |= TMPF_EMBED_DSN;
877 }
878
879
880 void
881 mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt *dss_info, struct tcpcb *tp,
882 uint16_t csum)
883 {
884 u_int64_t full_dsn = 0;
885 struct mptcb *mp_tp = tptomptp(tp);
886
887 NTOHL(dss_info->mdss_dsn);
888 NTOHL(dss_info->mdss_subflow_seqn);
889 NTOHS(dss_info->mdss_data_len);
890 MPT_LOCK(mp_tp);
891 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
892 MPT_UNLOCK(mp_tp);
893 mptcp_update_rcv_state_meat(mp_tp, tp,
894 full_dsn,
895 dss_info->mdss_subflow_seqn,
896 dss_info->mdss_data_len,
897 csum);
898 }
899
900 void
901 mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt *dss_info,
902 struct tcpcb *tp, uint16_t csum)
903 {
904 u_int64_t dsn = mptcp_ntoh64(dss_info->mdss_dsn);
905 struct mptcb *mp_tp = tptomptp(tp);
906
907 NTOHL(dss_info->mdss_subflow_seqn);
908 NTOHS(dss_info->mdss_data_len);
909 mptcp_update_rcv_state_meat(mp_tp, tp,
910 dsn,
911 dss_info->mdss_subflow_seqn,
912 dss_info->mdss_data_len,
913 csum);
914 }
915
916 static int
917 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
918 int hdrlen)
919 {
920 u_int32_t sseq, datalen;
921
922 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
923 return 0;
924
925 sseq = m->m_pkthdr.mp_rseq + tp->irs;
926 datalen = m->m_pkthdr.mp_rlen;
927
928 #if 0
929 /* enable this to test TCP fallback post connection establishment */
930 if (SEQ_GT(sseq, (tp->irs+1)))
931 datalen = m->m_pkthdr.len - hdrlen - 1;
932 #endif
933
934 /* unacceptable DSS option, fallback to TCP */
935 if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
936 mptcplog((LOG_ERR, "MPTCP Receiver: "
937 "%s: mbuf len %d, MPTCP expected %d",
938 __func__, m->m_pkthdr.len, datalen),
939 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
940 } else {
941 return 0;
942 }
943 tp->t_mpflags |= TMPF_SND_MPFAIL;
944 mptcp_notify_mpfail(so);
945 m_freem(m);
946 return -1;
947 }
948
949 int
950 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
951 {
952 if (mptcp_validate_csum(tp, m, drop_hdrlen) != 0)
953 return -1;
954
955 mptcp_insert_rmap(tp, m);
956 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
957 drop_hdrlen) != 0)
958 return -1;
959 return 0;
960 }
961
962 /*
963 * MPTCP Checksum support
964 * The checksum is calculated whenever the MPTCP DSS option is included
965 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
966 * header and the actual data indicated by the length specified in the
967 * DSS option.
968 */
969
970 static int
971 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
972 {
973 uint16_t mptcp_csum = 0;
974 mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen);
975 if (mptcp_csum) {
976 tp->t_mpflags |= TMPF_SND_MPFAIL;
977 tp->t_mpflags &= ~TMPF_EMBED_DSN;
978 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
979 m_freem(m);
980 tcpstat.tcps_mp_badcsum++;
981 return -1;
982 }
983 return 0;
984 }
985
986 static uint16_t
987 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, int off)
988 {
989 struct mptcb *mp_tp = tptomptp(tp);
990 uint32_t sum = 0;
991 uint64_t dsn;
992 uint32_t sseq;
993 uint16_t len;
994 uint16_t csum;
995
996 if (mp_tp == NULL)
997 return (0);
998
999 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
1000 return (0);
1001
1002 if (!(tp->t_mpflags & TMPF_EMBED_DSN))
1003 return (0);
1004
1005 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
1006 return (0);
1007
1008 /*
1009 * The remote side may send a packet with fewer bytes than the
1010 * claimed DSS checksum length.
1011 */
1012 if ((int)m_length2(m, NULL) < (off + tp->t_rcv_map.mpt_len))
1013 return (0xffff);
1014
1015 if (tp->t_rcv_map.mpt_len != 0)
1016 sum = m_sum16(m, off, tp->t_rcv_map.mpt_len);
1017
1018 dsn = mptcp_hton64(tp->t_rcv_map.mpt_dsn);
1019 sseq = htonl(tp->t_rcv_map.mpt_sseq);
1020 len = htons(tp->t_rcv_map.mpt_len);
1021 csum = tp->t_rcv_map.mpt_csum;
1022 sum += in_pseudo64(dsn, sseq, (len + csum));
1023 ADDCARRY(sum);
1024 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1025 uint32_t, sum);
1026 mptcplog((LOG_DEBUG, "MPTCP Receiver: sum = %x \n", sum),
1027 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1028 return (~sum & 0xffff);
1029 }
1030
1031 void
1032 mptcp_output_csum(struct tcpcb *tp, struct mbuf *m, int32_t len,
1033 unsigned hdrlen, u_int64_t dss_val, u_int32_t *sseqp)
1034 {
1035 struct mptcb *mp_tp = tptomptp(tp);
1036 u_int32_t sum = 0;
1037 uint32_t sseq;
1038 uint16_t dss_len;
1039 uint16_t csum = 0;
1040 uint16_t *csump = NULL;
1041
1042 if (mp_tp == NULL)
1043 return;
1044
1045 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
1046 return;
1047
1048 if (sseqp == NULL)
1049 return;
1050
1051 if (len)
1052 sum = m_sum16(m, hdrlen, len);
1053
1054 dss_val = mptcp_hton64(dss_val);
1055 sseq = *sseqp;
1056 dss_len = *(uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t));
1057 sum += in_pseudo64(dss_val, sseq, (dss_len + csum));
1058
1059 ADDCARRY(sum);
1060 sum = ~sum & 0xffff;
1061 csump = (uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t) +
1062 sizeof (uint16_t));
1063 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1064 uint32_t, sum);
1065 *csump = sum;
1066 mptcplog((LOG_DEBUG, "MPTCP Sender: sum = %x \n", sum),
1067 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1068 }
1069
1070 /*
1071 * When WiFi signal starts fading, there's more loss and RTT spikes.
1072 * Check if there has been a large spike by comparing against
1073 * a tolerable RTT spike threshold.
1074 */
1075 boolean_t
1076 mptcp_no_rto_spike(struct socket *so)
1077 {
1078 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1079 int32_t spike = 0;
1080
1081 if (tp->t_rxtcur > MAX(mptcp_rtothresh, MPTCP_RTO_MINTHRESH)) {
1082 spike = tp->t_rxtcur - mptcp_rtothresh;
1083
1084 mptcplog((LOG_DEBUG, "MPTCP Socket: %s: spike = %d rto = %d"
1085 "best = %d cur = %d\n", __func__, spike,
1086 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1087 tp->t_rttcur),
1088 (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1089
1090 }
1091
1092 if (spike > 0 ) {
1093 return (FALSE);
1094 } else {
1095 return (TRUE);
1096 }
1097 }