]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/vsock_domain.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / bsd / kern / vsock_domain.c
1 /*
2 * Copyright (c) 2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/domain.h>
30 #include <sys/socket.h>
31 #include <sys/protosw.h>
32 #include <sys/mcache.h>
33 #include <sys/systm.h>
34 #include <sys/sysctl.h>
35 #include <sys/random.h>
36 #include <sys/mbuf.h>
37 #include <sys/vsock_domain.h>
38 #include <sys/vsock_transport.h>
39 #include <kern/task.h>
40 #include <kern/zalloc.h>
41 #include <kern/locks.h>
42 #include <machine/atomic.h>
43
/* Fetch the vsock protocol control block stored in a socket's so_pcb field. */
#define sotovsockpcb(so) ((struct vsockpcb *)(so)->so_pcb)

/* Local ports below this value may only be bound by the kernel or the superuser. */
#define VSOCK_PORT_RESERVED 1024

/* VSock Protocol Globals */

/* The currently registered transport; installed and cleared with atomic compare-exchange. */
static struct vsock_transport * _Atomic the_vsock_transport = NULL;
/* Zone from which every struct vsockpcb is allocated. */
static ZONE_DECLARE(vsockpcb_zone, "vsockpcbzone",
    sizeof(struct vsockpcb), ZC_NONE);
static LCK_GRP_DECLARE(vsock_lock_grp, "vsock");
/* Shared protocol state: the bound list, the list of all PCBs, counters and their locks. */
static struct vsockpcbinfo vsockinfo;

/* Send/receive buffer sizes handed to soreserve() when a socket attaches. */
static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8;
static uint32_t vsock_recvspace = VSOCK_MAX_PACKET_SIZE * 8;
58
59 /* VSock PCB Helpers */
60
61 static uint32_t
62 vsock_get_peer_space(struct vsockpcb *pcb)
63 {
64 return pcb->peer_buf_alloc - (pcb->tx_cnt - pcb->peer_fwd_cnt);
65 }
66
67 static struct vsockpcb *
68 vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
69 {
70 struct vsockpcb *preferred = NULL;
71 struct vsockpcb *match = NULL;
72 struct vsockpcb *pcb = NULL;
73
74 lck_rw_lock_shared(&vsockinfo.bound_lock);
75 LIST_FOREACH(pcb, &vsockinfo.bound, bound) {
76 // Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration)
77 socket_lock(pcb->so, 1);
78 if ((pcb->so->so_state & SS_ISCONNECTED || pcb->so->so_state & SS_ISCONNECTING) &&
79 pcb->local_address.cid == src.cid && pcb->local_address.port == src.port &&
80 pcb->remote_address.port == dst.port) {
81 preferred = pcb;
82 break;
83 } else if ((pcb->local_address.cid == src.cid || pcb->local_address.cid == VMADDR_CID_ANY) &&
84 pcb->local_address.port == src.port) {
85 match = pcb;
86 }
87 socket_unlock(pcb->so, 1);
88 }
89 if (!preferred && match) {
90 socket_lock(match->so, 1);
91 preferred = match;
92 }
93 lck_rw_done(&vsockinfo.bound_lock);
94
95 return preferred;
96 }
97
98 static errno_t
99 vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port)
100 {
101 socket_lock_assert_owned(pcb->so);
102
103 // Privileged ports.
104 if (local_port != VMADDR_PORT_ANY && local_port < VSOCK_PORT_RESERVED &&
105 current_task() != kernel_task && proc_suser(current_proc()) != 0) {
106 return EACCES;
107 }
108
109 bool taken = false;
110 const bool check_remote = (remote_cid != VMADDR_CID_ANY && remote_port != VMADDR_PORT_ANY);
111
112 struct vsockpcb *pcb_match = NULL;
113
114 socket_unlock(pcb->so, 0);
115 lck_rw_lock_exclusive(&vsockinfo.bound_lock);
116 LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) {
117 socket_lock(pcb_match->so, 1);
118 if (pcb == pcb_match ||
119 (!check_remote && pcb_match->local_address.port == local_port) ||
120 (check_remote && pcb_match->local_address.port == local_port &&
121 pcb_match->remote_address.cid == remote_cid && pcb_match->remote_address.port == remote_port)) {
122 socket_unlock(pcb_match->so, 1);
123 taken = true;
124 break;
125 }
126 socket_unlock(pcb_match->so, 1);
127 }
128 socket_lock(pcb->so, 0);
129 if (!taken) {
130 pcb->local_address = (struct vsock_address) { .cid = local_cid, .port = local_port };
131 pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port };
132 LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound);
133 }
134 lck_rw_done(&vsockinfo.bound_lock);
135
136 return taken ? EADDRINUSE : 0;
137 }
138
139 static errno_t
140 vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsock_address raddr)
141 {
142 if (!pcb) {
143 return EINVAL;
144 }
145
146 socket_lock_assert_owned(pcb->so);
147
148 // Certain CIDs are reserved.
149 if (laddr.cid == VMADDR_CID_HYPERVISOR || laddr.cid == VMADDR_CID_RESERVED || laddr.cid == VMADDR_CID_HOST) {
150 return EADDRNOTAVAIL;
151 }
152
153 // Remote address must be fully specified or not specified at all.
154 if ((raddr.cid == VMADDR_CID_ANY) ^ (raddr.port == VMADDR_PORT_ANY)) {
155 return EINVAL;
156 }
157
158 // Cannot bind if already bound.
159 if (pcb->local_address.port != VMADDR_PORT_ANY) {
160 return EINVAL;
161 }
162
163 uint32_t transport_cid;
164 struct vsock_transport *transport = pcb->transport;
165 errno_t error = transport->get_cid(transport->provider, &transport_cid);
166 if (error) {
167 return error;
168 }
169
170 // Local CID must be this transport's CID or any.
171 if (laddr.cid != transport_cid && laddr.cid != VMADDR_CID_ANY) {
172 return EINVAL;
173 }
174
175 if (laddr.port != VMADDR_PORT_ANY) {
176 error = vsock_bind_address_if_free(pcb, laddr.cid, laddr.port, raddr.cid, raddr.port);
177 } else {
178 lck_mtx_lock(&vsockinfo.port_lock);
179
180 const uint32_t first = VSOCK_PORT_RESERVED;
181 const uint32_t last = VMADDR_PORT_ANY - 1;
182 uint32_t count = last - first + 1;
183 uint32_t *last_port = &vsockinfo.last_port;
184
185 if (pcb->so->so_flags & SOF_BINDRANDOMPORT) {
186 uint32_t random = 0;
187 read_frandom(&random, sizeof(random));
188 *last_port = first + (random % count);
189 }
190
191 do {
192 if (count == 0) {
193 lck_mtx_unlock(&vsockinfo.port_lock);
194 return EADDRNOTAVAIL;
195 }
196 count--;
197
198 ++*last_port;
199 if (*last_port < first || *last_port > last) {
200 *last_port = first;
201 }
202
203 error = vsock_bind_address_if_free(pcb, laddr.cid, *last_port, raddr.cid, raddr.port);
204 } while (error);
205
206 lck_mtx_unlock(&vsockinfo.port_lock);
207 }
208
209 return error;
210 }
211
212 static void
213 vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked)
214 {
215 if (!pcb) {
216 return;
217 }
218
219 socket_lock_assert_owned(pcb->so);
220
221 soisdisconnected(pcb->so);
222
223 if (!pcb->bound.le_prev) {
224 return;
225 }
226
227 if (!is_locked) {
228 socket_unlock(pcb->so, 0);
229 lck_rw_lock_exclusive(&vsockinfo.bound_lock);
230 socket_lock(pcb->so, 0);
231 if (!pcb->bound.le_prev) {
232 lck_rw_done(&vsockinfo.bound_lock);
233 return;
234 }
235 }
236
237 LIST_REMOVE(pcb, bound);
238 pcb->bound.le_next = NULL;
239 pcb->bound.le_prev = NULL;
240
241 if (!is_locked) {
242 lck_rw_done(&vsockinfo.bound_lock);
243 }
244 }
245
246 static struct sockaddr *
247 vsock_new_sockaddr(struct vsock_address *address)
248 {
249 if (!address) {
250 return NULL;
251 }
252
253 struct sockaddr_vm *addr;
254 MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME,
255 M_WAITOK | M_ZERO);
256 if (!addr) {
257 return NULL;
258 }
259
260 addr->svm_len = sizeof(*addr);
261 addr->svm_family = AF_VSOCK;
262 addr->svm_port = address->port;
263 addr->svm_cid = address->cid;
264
265 return (struct sockaddr *)addr;
266 }
267
268 static errno_t
269 vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbuf_t m)
270 {
271 if (!pcb) {
272 if (m != NULL) {
273 mbuf_freem_list(m);
274 }
275 return EINVAL;
276 }
277
278 socket_lock_assert_owned(pcb->so);
279
280 errno_t error;
281
282 struct vsock_address dst = pcb->remote_address;
283 if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) {
284 if (m != NULL) {
285 mbuf_freem_list(m);
286 }
287 return EINVAL;
288 }
289
290 struct vsock_address src = pcb->local_address;
291 if (src.cid == VMADDR_CID_ANY) {
292 uint32_t transport_cid;
293 struct vsock_transport *transport = pcb->transport;
294 error = transport->get_cid(transport->provider, &transport_cid);
295 if (error) {
296 if (m != NULL) {
297 mbuf_freem_list(m);
298 }
299 return error;
300 }
301 src.cid = transport_cid;
302 }
303
304 uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat;
305 uint32_t fwd_cnt = pcb->fwd_cnt;
306
307 if (src.cid == dst.cid) {
308 pcb->last_buf_alloc = buf_alloc;
309 pcb->last_fwd_cnt = fwd_cnt;
310
311 socket_unlock(pcb->so, 0);
312 error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m);
313 socket_lock(pcb->so, 0);
314 } else {
315 struct vsock_transport *transport = pcb->transport;
316 error = transport->put_message(transport->provider, src, dst, operation, buf_alloc, fwd_cnt, m);
317
318 if (!error) {
319 pcb->last_buf_alloc = buf_alloc;
320 pcb->last_fwd_cnt = fwd_cnt;
321 }
322 }
323
324 return error;
325 }
326
327 static errno_t
328 vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
329 {
330 if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) {
331 return EINVAL;
332 }
333
334 errno_t error;
335 struct vsock_transport *transport = NULL;
336
337 if (src.cid == VMADDR_CID_ANY) {
338 transport = os_atomic_load(&the_vsock_transport, relaxed);
339 if (transport == NULL) {
340 return ENODEV;
341 }
342
343 uint32_t transport_cid;
344 error = transport->get_cid(transport->provider, &transport_cid);
345 if (error) {
346 return error;
347 }
348 src.cid = transport_cid;
349 }
350
351 if (src.cid == dst.cid) {
352 error = vsock_put_message(src, dst, VSOCK_RESET, 0, 0, NULL);
353 } else {
354 if (!transport) {
355 transport = os_atomic_load(&the_vsock_transport, relaxed);
356 if (transport == NULL) {
357 return ENODEV;
358 }
359 }
360 error = transport->put_message(transport->provider, src, dst, VSOCK_RESET, 0, 0, NULL);
361 }
362
363 return error;
364 }
365
366 static errno_t
367 vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst)
368 {
369 if (pcb) {
370 socket_lock_assert_owned(pcb->so);
371 socket_unlock(pcb->so, 0);
372 }
373 errno_t error = vsock_pcb_reset_address(src, dst);
374 if (pcb) {
375 socket_lock(pcb->so, 0);
376 }
377 return error;
378 }
379
380 static errno_t
381 vsock_pcb_connect(struct vsockpcb *pcb)
382 {
383 return vsock_pcb_send_message(pcb, VSOCK_REQUEST, NULL);
384 }
385
386 static errno_t
387 vsock_pcb_respond(struct vsockpcb *pcb)
388 {
389 return vsock_pcb_send_message(pcb, VSOCK_RESPONSE, NULL);
390 }
391
392 static errno_t
393 vsock_pcb_send(struct vsockpcb *pcb, mbuf_t m)
394 {
395 return vsock_pcb_send_message(pcb, VSOCK_PAYLOAD, m);
396 }
397
398 static errno_t
399 vsock_pcb_shutdown_send(struct vsockpcb *pcb)
400 {
401 return vsock_pcb_send_message(pcb, VSOCK_SHUTDOWN_SEND, NULL);
402 }
403
404 static errno_t
405 vsock_pcb_reset(struct vsockpcb *pcb)
406 {
407 return vsock_pcb_send_message(pcb, VSOCK_RESET, NULL);
408 }
409
410 static errno_t
411 vsock_pcb_credit_update(struct vsockpcb *pcb)
412 {
413 return vsock_pcb_send_message(pcb, VSOCK_CREDIT_UPDATE, NULL);
414 }
415
416 static errno_t
417 vsock_pcb_credit_request(struct vsockpcb *pcb)
418 {
419 return vsock_pcb_send_message(pcb, VSOCK_CREDIT_REQUEST, NULL);
420 }
421
422 static errno_t
423 vsock_disconnect_pcb_common(struct vsockpcb *pcb, bool is_locked)
424 {
425 socket_lock_assert_owned(pcb->so);
426 vsock_unbind_pcb(pcb, is_locked);
427 return vsock_pcb_reset(pcb);
428 }
429
430 static errno_t
431 vsock_disconnect_pcb_locked(struct vsockpcb *pcb)
432 {
433 return vsock_disconnect_pcb_common(pcb, true);
434 }
435
436 static errno_t
437 vsock_disconnect_pcb(struct vsockpcb *pcb)
438 {
439 return vsock_disconnect_pcb_common(pcb, false);
440 }
441
442 static errno_t
443 vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr)
444 {
445 if (!pcb || !pcb->so || !addr) {
446 return EINVAL;
447 }
448
449 // Validate address length.
450 if (addr->svm_len < sizeof(struct sockaddr_vm)) {
451 return EINVAL;
452 }
453
454 // Validate address family.
455 if (addr->svm_family != AF_UNSPEC && addr->svm_family != AF_VSOCK) {
456 return EAFNOSUPPORT;
457 }
458
459 // Only stream is supported currently.
460 if (pcb->so->so_type != SOCK_STREAM) {
461 return EAFNOSUPPORT;
462 }
463
464 return 0;
465 }
466 /* VSock Receive Handlers */
467
468 static errno_t
469 vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_t m)
470 {
471 socket_lock_assert_owned(pcb->so);
472
473 errno_t error = 0;
474
475 switch (op) {
476 case VSOCK_SHUTDOWN:
477 error = vsock_disconnect_pcb(pcb);
478 break;
479 case VSOCK_SHUTDOWN_RECEIVE:
480 socantsendmore(pcb->so);
481 break;
482 case VSOCK_SHUTDOWN_SEND:
483 socantrcvmore(pcb->so);
484 break;
485 case VSOCK_PAYLOAD:
486 // Add data to the receive queue then wakeup any reading threads.
487 error = !sbappendstream(&pcb->so->so_rcv, m);
488 if (!error) {
489 sorwakeup(pcb->so);
490 }
491 break;
492 case VSOCK_RESET:
493 vsock_unbind_pcb(pcb, false);
494 break;
495 default:
496 error = ENOTSUP;
497 break;
498 }
499
500 return error;
501 }
502
503 static errno_t
504 vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op)
505 {
506 socket_lock_assert_owned(pcb->so);
507
508 errno_t error = 0;
509
510 switch (op) {
511 case VSOCK_RESPONSE:
512 soisconnected(pcb->so);
513 break;
514 case VSOCK_RESET:
515 pcb->so->so_error = EAGAIN;
516 error = vsock_disconnect_pcb(pcb);
517 break;
518 default:
519 vsock_disconnect_pcb(pcb);
520 error = ENOTSUP;
521 break;
522 }
523
524 return error;
525 }
526
/*
 * Handle an incoming message for a listening socket.
 *
 * `src` is the remote sender and `dst` is the local address the message was
 * sent to, so replies and resets are sent from `dst` to `src`.
 *
 * VSOCK_REQUEST creates a new connection socket with sonewconn(), binds it to
 * (dst, src), answers with a VSOCK_RESPONSE and marks it connected. If any of
 * those steps fails the new socket is disconnected and unbound and a reset is
 * sent to the requester. Any other operation is answered with a reset.
 *
 * The caller must hold the listening socket's lock.
 */
static errno_t
vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst)
{
	socket_lock_assert_owned(pcb->so);

	errno_t error = 0;

	if (op == VSOCK_RESET) {
		return vsock_pcb_safe_reset_address(pcb, dst, src);
	}

	if (op != VSOCK_REQUEST) {
		vsock_pcb_safe_reset_address(pcb, dst, src);
		return ENOTSUP;
	}

	struct sockaddr_vm addr = (struct sockaddr_vm) {
		.svm_len = sizeof(addr),
		.svm_family = AF_VSOCK,
		.svm_reserved1 = 0,
		.svm_port = pcb->local_address.port,
		.svm_cid = pcb->local_address.cid
	};

	struct socket *child_so = sonewconn(pcb->so, 0, (struct sockaddr *)&addr);
	if (child_so == NULL) {
		// It is likely that the backlog is full. Deny this request.
		vsock_pcb_safe_reset_address(pcb, dst, src);
		return ECONNREFUSED;
	}

	struct vsockpcb *child_pcb = sotovsockpcb(child_so);
	if (child_pcb == NULL) {
		error = EINVAL;
	} else {
		error = vsock_bind_address(child_pcb, dst, src);
		if (!error) {
			error = vsock_pcb_respond(child_pcb);
		}
		if (!error) {
			soisconnected(child_so);
		}
	}

	if (error) {
		// Undo the half-built connection and tell the requester it failed.
		soisdisconnected(child_so);
		if (child_pcb != NULL) {
			vsock_unbind_pcb(child_pcb, false);
		}
		socket_unlock(child_so, 1);
		vsock_pcb_reset_address(dst, src);
	} else {
		socket_unlock(child_so, 0);
	}
	socket_lock(pcb->so, 0);

	return error;
}
598
599 /* VSock Transport */
600
601 errno_t
602 vsock_add_transport(struct vsock_transport *transport)
603 {
604 if (transport == NULL || transport->provider == NULL) {
605 return EINVAL;
606 }
607 if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, NULL, transport, acq_rel)) {
608 return EEXIST;
609 }
610 return 0;
611 }
612
613 errno_t
614 vsock_remove_transport(struct vsock_transport *transport)
615 {
616 if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, transport, NULL, acq_rel)) {
617 return ENODEV;
618 }
619 return 0;
620 }
621
622 errno_t
623 vsock_reset_transport(struct vsock_transport *transport)
624 {
625 if (transport == NULL) {
626 return EINVAL;
627 }
628
629 errno_t error = 0;
630 struct vsockpcb *pcb = NULL;
631 struct vsockpcb *tmp_pcb = NULL;
632
633 lck_rw_lock_exclusive(&vsockinfo.bound_lock);
634 LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) {
635 // Disconnect this transport's sockets. Listen and bind sockets must stay alive.
636 socket_lock(pcb->so, 1);
637 if (pcb->transport == transport && pcb->so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) {
638 errno_t dc_error = vsock_disconnect_pcb_locked(pcb);
639 if (dc_error && !error) {
640 error = dc_error;
641 }
642 }
643 socket_unlock(pcb->so, 1);
644 }
645 lck_rw_done(&vsockinfo.bound_lock);
646
647 return error;
648 }
649
650 errno_t
651 vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m)
652 {
653 struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src);
654 if (!pcb) {
655 if (op != VSOCK_RESET) {
656 vsock_pcb_reset_address(dst, src);
657 }
658 if (m != NULL) {
659 mbuf_freem_list(m);
660 }
661 return EINVAL;
662 }
663
664 socket_lock_assert_owned(pcb->so);
665
666 struct socket *so = pcb->so;
667 errno_t error = 0;
668
669 // Check if the peer's buffer has changed. Update our view of the peer's forwarded bytes.
670 int buffers_changed = (pcb->peer_buf_alloc != buf_alloc) || (pcb->peer_fwd_cnt) != fwd_cnt;
671 pcb->peer_buf_alloc = buf_alloc;
672 pcb->peer_fwd_cnt = fwd_cnt;
673
674 // Peer's buffer has enough space for the next packet. Notify any threads waiting for space.
675 if (buffers_changed && vsock_get_peer_space(pcb) >= pcb->waiting_send_size) {
676 sowwakeup(so);
677 }
678
679 switch (op) {
680 case VSOCK_CREDIT_REQUEST:
681 error = vsock_pcb_credit_update(pcb);
682 break;
683 case VSOCK_CREDIT_UPDATE:
684 break;
685 default:
686 if (so->so_state & SS_ISCONNECTED) {
687 error = vsock_put_message_connected(pcb, op, m);
688 m = NULL;
689 } else if (so->so_state & SS_ISCONNECTING) {
690 error = vsock_put_message_connecting(pcb, op);
691 } else if (so->so_options & SO_ACCEPTCONN) {
692 error = vsock_put_message_listening(pcb, op, src, dst);
693 } else {
694 // Reset the connection for other states such as 'disconnecting'.
695 error = vsock_disconnect_pcb(pcb);
696 if (!error) {
697 error = ENODEV;
698 }
699 }
700 break;
701 }
702 socket_unlock(so, 1);
703
704 if (m != NULL) {
705 mbuf_freem_list(m);
706 }
707
708 return error;
709 }
710
711 /* VSock Sysctl */
712
713 static int
714 vsock_pcblist SYSCTL_HANDLER_ARGS
715 {
716 #pragma unused(oidp,arg2)
717
718 int error;
719
720 // Only stream is supported.
721 if ((intptr_t)arg1 != SOCK_STREAM) {
722 return EINVAL;
723 }
724
725 // Get the generation count and the count of all vsock sockets.
726 lck_rw_lock_shared(&vsockinfo.all_lock);
727 uint64_t n = vsockinfo.all_pcb_count;
728 vsock_gen_t gen_count = vsockinfo.vsock_gencnt;
729 lck_rw_done(&vsockinfo.all_lock);
730
731 const size_t xpcb_len = sizeof(struct xvsockpcb);
732 struct xvsockpgen xvg;
733
734 /*
735 * The process of preparing the PCB list is too time-consuming and
736 * resource-intensive to repeat twice on every request.
737 */
738 if (req->oldptr == USER_ADDR_NULL) {
739 req->oldidx = (size_t)(2 * sizeof(xvg) + (n + n / 8) * xpcb_len);
740 return 0;
741 }
742
743 if (req->newptr != USER_ADDR_NULL) {
744 return EPERM;
745 }
746
747 bzero(&xvg, sizeof(xvg));
748 xvg.xvg_len = sizeof(xvg);
749 xvg.xvg_count = n;
750 xvg.xvg_gen = gen_count;
751 xvg.xvg_sogen = so_gencnt;
752 error = SYSCTL_OUT(req, &xvg, sizeof(xvg));
753 if (error) {
754 return error;
755 }
756
757 // Return if no sockets exist.
758 if (n == 0) {
759 return 0;
760 }
761
762 lck_rw_lock_shared(&vsockinfo.all_lock);
763
764 n = 0;
765 struct vsockpcb *pcb = NULL;
766 TAILQ_FOREACH(pcb, &vsockinfo.all, all) {
767 // Bail if there is not enough user buffer for this next socket.
768 if (req->oldlen - req->oldidx - sizeof(xvg) < xpcb_len) {
769 break;
770 }
771
772 // Populate the socket structure.
773 socket_lock(pcb->so, 1);
774 if (pcb->vsock_gencnt <= gen_count) {
775 struct xvsockpcb xpcb;
776 bzero(&xpcb, xpcb_len);
777 xpcb.xv_len = xpcb_len;
778 xpcb.xv_vsockpp = (uint64_t)VM_KERNEL_ADDRHASH(pcb);
779 xpcb.xvp_local_cid = pcb->local_address.cid;
780 xpcb.xvp_local_port = pcb->local_address.port;
781 xpcb.xvp_remote_cid = pcb->remote_address.cid;
782 xpcb.xvp_remote_port = pcb->remote_address.port;
783 xpcb.xvp_rxcnt = pcb->fwd_cnt;
784 xpcb.xvp_txcnt = pcb->tx_cnt;
785 xpcb.xvp_peer_rxhiwat = pcb->peer_buf_alloc;
786 xpcb.xvp_peer_rxcnt = pcb->peer_fwd_cnt;
787 xpcb.xvp_last_pid = pcb->so->last_pid;
788 xpcb.xvp_gencnt = pcb->vsock_gencnt;
789 if (pcb->so) {
790 sotoxsocket(pcb->so, &xpcb.xv_socket);
791 }
792 socket_unlock(pcb->so, 1);
793
794 error = SYSCTL_OUT(req, &xpcb, xpcb_len);
795 if (error != 0) {
796 break;
797 }
798 n++;
799 } else {
800 socket_unlock(pcb->so, 1);
801 }
802 }
803
804 // Update the generation count to match the sockets being returned.
805 gen_count = vsockinfo.vsock_gencnt;
806
807 lck_rw_done(&vsockinfo.all_lock);
808
809 if (!error) {
810 /*
811 * Give the user an updated idea of our state.
812 * If the generation differs from what we told
813 * her before, she knows that something happened
814 * while we were processing this request, and it
815 * might be necessary to retry.
816 */
817 bzero(&xvg, sizeof(xvg));
818 xvg.xvg_len = sizeof(xvg);
819 xvg.xvg_count = n;
820 xvg.xvg_gen = gen_count;
821 xvg.xvg_sogen = so_gencnt;
822 error = SYSCTL_OUT(req, &xvg, sizeof(xvg));
823 }
824
825 return error;
826 }
827
#ifdef SYSCTL_DECL
/* net.vsock: sysctl node for the vsock protocol. */
SYSCTL_NODE(_net, OID_AUTO, vsock, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock");
/* net.vsock.sendspace / recvspace: read-write views of the buffer sizes used by vsock_attach(). */
SYSCTL_UINT(_net_vsock, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED,
    &vsock_sendspace, 0, "Maximum outgoing vsock datagram size");
SYSCTL_UINT(_net_vsock, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED,
    &vsock_recvspace, 0, "Maximum incoming vsock datagram size");
/* net.vsock.pcblist: read-only socket list; arg1 carries SOCK_STREAM, which the handler checks. */
SYSCTL_PROC(_net_vsock, OID_AUTO, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    (caddr_t)(long)SOCK_STREAM, 0, vsock_pcblist, "S,xvsockpcb",
    "List of active vsock sockets");
#endif
839
840 /* VSock Protocol */
841
842 static int
843 vsock_attach(struct socket *so, int proto, struct proc *p)
844 {
845 #pragma unused(proto, p)
846
847 // Attach should only be run once per socket.
848 struct vsockpcb *pcb = sotovsockpcb(so);
849 if (pcb) {
850 return EINVAL;
851 }
852
853 // Get the transport for this socket.
854 struct vsock_transport *transport = os_atomic_load(&the_vsock_transport, relaxed);
855 if (transport == NULL) {
856 return ENODEV;
857 }
858
859 // Reserve send and receive buffers.
860 errno_t error = soreserve(so, vsock_sendspace, vsock_recvspace);
861 if (error) {
862 return error;
863 }
864
865 // Initialize the vsock protocol control block.
866 pcb = zalloc(vsockpcb_zone);
867 if (pcb == NULL) {
868 return ENOBUFS;
869 }
870 bzero(pcb, sizeof(*pcb));
871 pcb->so = so;
872 pcb->transport = transport;
873 pcb->local_address = (struct vsock_address) {
874 .cid = VMADDR_CID_ANY,
875 .port = VMADDR_PORT_ANY
876 };
877 pcb->remote_address = (struct vsock_address) {
878 .cid = VMADDR_CID_ANY,
879 .port = VMADDR_PORT_ANY
880 };
881 so->so_pcb = pcb;
882
883 // Tell the transport that this socket has attached.
884 error = transport->attach_socket(transport->provider);
885 if (error) {
886 return error;
887 }
888
889 // Add to the list of all vsock sockets.
890 lck_rw_lock_exclusive(&vsockinfo.all_lock);
891 TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all);
892 vsockinfo.all_pcb_count++;
893 pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt;
894 lck_rw_done(&vsockinfo.all_lock);
895
896 return 0;
897 }
898
899 static int
900 vsock_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p)
901 {
902 #pragma unused(ifp)
903
904 VERIFY(so != NULL || p == kernproc);
905
906 if (cmd != IOCTL_VM_SOCKETS_GET_LOCAL_CID) {
907 return EINVAL;
908 }
909
910 struct vsock_transport *transport;
911 if (so) {
912 struct vsockpcb *pcb = sotovsockpcb(so);
913 if (pcb == NULL) {
914 return EINVAL;
915 }
916 transport = pcb->transport;
917 } else {
918 transport = os_atomic_load(&the_vsock_transport, relaxed);
919 }
920
921 if (transport == NULL) {
922 return ENODEV;
923 }
924
925 uint32_t transport_cid;
926 errno_t error = transport->get_cid(transport->provider, &transport_cid);
927 if (error) {
928 return error;
929 }
930
931 memcpy(data, &transport_cid, sizeof(transport_cid));
932
933 return 0;
934 }
935
936 static int
937 vsock_detach(struct socket *so)
938 {
939 struct vsockpcb *pcb = sotovsockpcb(so);
940 if (pcb == NULL) {
941 return EINVAL;
942 }
943
944 vsock_unbind_pcb(pcb, false);
945
946 // Tell the transport that this socket has detached.
947 struct vsock_transport *transport = pcb->transport;
948 errno_t error = transport->detach_socket(transport->provider);
949 if (error) {
950 return error;
951 }
952
953 // Remove from the list of all vsock sockets.
954 lck_rw_lock_exclusive(&vsockinfo.all_lock);
955 TAILQ_REMOVE(&vsockinfo.all, pcb, all);
956 pcb->all.tqe_next = NULL;
957 pcb->all.tqe_prev = NULL;
958 vsockinfo.all_pcb_count--;
959 vsockinfo.vsock_gencnt++;
960 lck_rw_done(&vsockinfo.all_lock);
961
962 // Deallocate any resources.
963 zfree(vsockpcb_zone, pcb);
964 so->so_pcb = 0;
965 so->so_flags |= SOF_PCBCLEARING;
966 sofree(so);
967
968 return 0;
969 }
970
/* Protocol abort handler: mark the socket disconnected, then detach it. */
static int
vsock_abort(struct socket *so)
{
	soisdisconnected(so);

	const int detach_result = vsock_detach(so);
	return detach_result;
}
977
978 static int
979 vsock_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
980 {
981 #pragma unused(p)
982
983 struct vsockpcb *pcb = sotovsockpcb(so);
984 if (pcb == NULL) {
985 return EINVAL;
986 }
987
988 struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
989
990 errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
991 if (error) {
992 return error;
993 }
994
995 struct vsock_address laddr = (struct vsock_address) {
996 .cid = addr->svm_cid,
997 .port = addr->svm_port,
998 };
999
1000 struct vsock_address raddr = (struct vsock_address) {
1001 .cid = VMADDR_CID_ANY,
1002 .port = VMADDR_PORT_ANY,
1003 };
1004
1005 error = vsock_bind_address(pcb, laddr, raddr);
1006 if (error) {
1007 return error;
1008 }
1009
1010 return 0;
1011 }
1012
1013 static int
1014 vsock_listen(struct socket *so, struct proc *p)
1015 {
1016 #pragma unused(p)
1017
1018 struct vsockpcb *pcb = sotovsockpcb(so);
1019 if (pcb == NULL) {
1020 return EINVAL;
1021 }
1022
1023 // Only stream is supported currently.
1024 if (so->so_type != SOCK_STREAM) {
1025 return EAFNOSUPPORT;
1026 }
1027
1028 struct vsock_address *addr = &pcb->local_address;
1029
1030 if (addr->port == VMADDR_CID_ANY) {
1031 return EFAULT;
1032 }
1033
1034 struct vsock_transport *transport = pcb->transport;
1035 uint32_t transport_cid;
1036 errno_t error = transport->get_cid(transport->provider, &transport_cid);
1037 if (error) {
1038 return error;
1039 }
1040
1041 // Can listen on the transport's cid or any.
1042 if (addr->cid != transport_cid && addr->cid != VMADDR_CID_ANY) {
1043 return EFAULT;
1044 }
1045
1046 return 0;
1047 }
1048
1049 static int
1050 vsock_accept(struct socket *so, struct sockaddr **nam)
1051 {
1052 struct vsockpcb *pcb = sotovsockpcb(so);
1053 if (pcb == NULL) {
1054 return EINVAL;
1055 }
1056
1057 // Do not accept disconnected sockets.
1058 if (so->so_state & SS_ISDISCONNECTED) {
1059 return ECONNABORTED;
1060 }
1061
1062 *nam = vsock_new_sockaddr(&pcb->remote_address);
1063
1064 return 0;
1065 }
1066
1067 static int
1068 vsock_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
1069 {
1070 #pragma unused(p)
1071
1072 struct vsockpcb *pcb = sotovsockpcb(so);
1073 if (pcb == NULL) {
1074 return EINVAL;
1075 }
1076
1077 struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
1078
1079 errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
1080 if (error) {
1081 return error;
1082 }
1083
1084 uint32_t transport_cid;
1085 struct vsock_transport *transport = pcb->transport;
1086 error = transport->get_cid(transport->provider, &transport_cid);
1087 if (error) {
1088 return error;
1089 }
1090
1091 // Only supporting connections to the host, hypervisor, or self for now.
1092 if (addr->svm_cid != VMADDR_CID_HOST &&
1093 addr->svm_cid != VMADDR_CID_HYPERVISOR &&
1094 addr->svm_cid != transport_cid) {
1095 return EFAULT;
1096 }
1097
1098 soisconnecting(so);
1099
1100 // Set the remote and local address.
1101 struct vsock_address remote_addr = (struct vsock_address) {
1102 .cid = addr->svm_cid,
1103 .port = addr->svm_port,
1104 };
1105
1106 struct vsock_address local_addr = (struct vsock_address) {
1107 .cid = transport_cid,
1108 .port = VMADDR_PORT_ANY,
1109 };
1110
1111 // Bind to the address.
1112 error = vsock_bind_address(pcb, local_addr, remote_addr);
1113 if (error) {
1114 goto cleanup;
1115 }
1116
1117 // Attempt a connection using the socket's transport.
1118 error = vsock_pcb_connect(pcb);
1119 if (error) {
1120 goto cleanup;
1121 }
1122
1123 if ((so->so_state & SS_ISCONNECTED) == 0) {
1124 // Don't wait for peer's response if non-blocking.
1125 if (so->so_state & SS_NBIO) {
1126 error = EINPROGRESS;
1127 goto done;
1128 }
1129
1130 struct timespec ts = (struct timespec) {
1131 .tv_sec = so->so_snd.sb_timeo.tv_sec,
1132 .tv_nsec = so->so_snd.sb_timeo.tv_usec * 1000,
1133 };
1134
1135 lck_mtx_t *mutex_held;
1136 if (so->so_proto->pr_getlock != NULL) {
1137 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1138 } else {
1139 mutex_held = so->so_proto->pr_domain->dom_mtx;
1140 }
1141
1142 // Wait until we receive a response to the connect request.
1143 error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH, "vsock_connect", &ts);
1144 if (error) {
1145 if (error == EAGAIN) {
1146 error = ETIMEDOUT;
1147 }
1148 goto cleanup;
1149 }
1150 }
1151
1152 cleanup:
1153 if (so->so_error && !error) {
1154 error = so->so_error;
1155 so->so_error = 0;
1156 }
1157 if (!error) {
1158 error = !(so->so_state & SS_ISCONNECTED);
1159 }
1160 if (error) {
1161 vsock_unbind_pcb(pcb, false);
1162 }
1163
1164 done:
1165 return error;
1166 }
1167
1168 static int
1169 vsock_disconnect(struct socket *so)
1170 {
1171 struct vsockpcb *pcb = sotovsockpcb(so);
1172 if (pcb == NULL) {
1173 return EINVAL;
1174 }
1175
1176 return vsock_disconnect_pcb(pcb);
1177 }
1178
1179 static int
1180 vsock_sockaddr(struct socket *so, struct sockaddr **nam)
1181 {
1182 struct vsockpcb *pcb = sotovsockpcb(so);
1183 if (pcb == NULL) {
1184 return EINVAL;
1185 }
1186
1187 *nam = vsock_new_sockaddr(&pcb->local_address);
1188
1189 return 0;
1190 }
1191
1192 static int
1193 vsock_peeraddr(struct socket *so, struct sockaddr **nam)
1194 {
1195 struct vsockpcb *pcb = sotovsockpcb(so);
1196 if (pcb == NULL) {
1197 return EINVAL;
1198 }
1199
1200 *nam = vsock_new_sockaddr(&pcb->remote_address);
1201
1202 return 0;
1203 }
1204
1205 static int
1206 vsock_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, proc_t p)
1207 {
1208 #pragma unused(flags, nam, p)
1209
1210 struct vsockpcb *pcb = sotovsockpcb(so);
1211 if (pcb == NULL || m == NULL) {
1212 return EINVAL;
1213 }
1214
1215 if (control != NULL) {
1216 m_freem(control);
1217 return EOPNOTSUPP;
1218 }
1219
1220 // Ensure this socket is connected.
1221 if ((so->so_state & SS_ISCONNECTED) == 0) {
1222 if (m != NULL) {
1223 mbuf_freem_list(m);
1224 }
1225 return EPERM;
1226 }
1227
1228 errno_t error;
1229
1230 const size_t len = mbuf_pkthdr_len(m);
1231 uint32_t free_space = vsock_get_peer_space(pcb);
1232
1233 // Ensure the peer has enough space in their receive buffer.
1234 while (len > free_space) {
1235 // Record the number of free peer bytes necessary before we can send.
1236 if (len > pcb->waiting_send_size) {
1237 pcb->waiting_send_size = len;
1238 }
1239
1240 // Send a credit request.
1241 error = vsock_pcb_credit_request(pcb);
1242 if (error) {
1243 if (m != NULL) {
1244 mbuf_freem_list(m);
1245 }
1246 return error;
1247 }
1248
1249 // Check again in case free space was automatically updated in loopback case.
1250 free_space = vsock_get_peer_space(pcb);
1251 if (len <= free_space) {
1252 pcb->waiting_send_size = 0;
1253 break;
1254 }
1255
1256 // Bail if this is a non-blocking socket.
1257 if (so->so_state & SS_NBIO) {
1258 if (m != NULL) {
1259 mbuf_freem_list(m);
1260 }
1261 return EWOULDBLOCK;
1262 }
1263
1264 // Wait until our peer has enough free space in their receive buffer.
1265 error = sbwait(&so->so_snd);
1266 pcb->waiting_send_size = 0;
1267 if (error) {
1268 if (m != NULL) {
1269 mbuf_freem_list(m);
1270 }
1271 return error;
1272 }
1273
1274 // Bail if an error occured or we can't send more.
1275 if (so->so_state & SS_CANTSENDMORE) {
1276 if (m != NULL) {
1277 mbuf_freem_list(m);
1278 }
1279 return EPIPE;
1280 } else if (so->so_error) {
1281 error = so->so_error;
1282 so->so_error = 0;
1283 if (m != NULL) {
1284 mbuf_freem_list(m);
1285 }
1286 return error;
1287 }
1288
1289 free_space = vsock_get_peer_space(pcb);
1290 }
1291
1292 // Send a payload over the transport.
1293 error = vsock_pcb_send(pcb, m);
1294 if (error) {
1295 return error;
1296 }
1297
1298 pcb->tx_cnt += len;
1299
1300 return 0;
1301 }
1302
1303 static int
1304 vsock_shutdown(struct socket *so)
1305 {
1306 struct vsockpcb *pcb = sotovsockpcb(so);
1307 if (pcb == NULL) {
1308 return EINVAL;
1309 }
1310
1311 socantsendmore(so);
1312
1313 // Tell peer we will no longer send.
1314 errno_t error = vsock_pcb_shutdown_send(pcb);
1315 if (error) {
1316 return error;
1317 }
1318
1319 return 0;
1320 }
1321
1322 static int
1323 vsock_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1324 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1325 {
1326 struct vsockpcb *pcb = sotovsockpcb(so);
1327 if (pcb == NULL) {
1328 return EINVAL;
1329 }
1330
1331 user_ssize_t length = uio_resid(uio);
1332 int result = soreceive(so, psa, uio, mp0, controlp, flagsp);
1333 length -= uio_resid(uio);
1334
1335 socket_lock(so, 1);
1336
1337 pcb->fwd_cnt += length;
1338
1339 const uint32_t threshold = VSOCK_MAX_PACKET_SIZE;
1340
1341 // Send a credit update if is possible that the peer will no longer send.
1342 if ((pcb->fwd_cnt - pcb->last_fwd_cnt + threshold) >= pcb->last_buf_alloc) {
1343 errno_t error = vsock_pcb_credit_update(pcb);
1344 if (!result && error) {
1345 result = error;
1346 }
1347 }
1348
1349 socket_unlock(so, 1);
1350
1351 return result;
1352 }
1353
1354 static struct pr_usrreqs vsock_usrreqs = {
1355 .pru_abort = vsock_abort,
1356 .pru_attach = vsock_attach,
1357 .pru_control = vsock_control,
1358 .pru_detach = vsock_detach,
1359 .pru_bind = vsock_bind,
1360 .pru_listen = vsock_listen,
1361 .pru_accept = vsock_accept,
1362 .pru_connect = vsock_connect,
1363 .pru_disconnect = vsock_disconnect,
1364 .pru_send = vsock_send,
1365 .pru_shutdown = vsock_shutdown,
1366 .pru_sockaddr = vsock_sockaddr,
1367 .pru_peeraddr = vsock_peeraddr,
1368 .pru_sosend = sosend,
1369 .pru_soreceive = vsock_soreceive,
1370 };
1371
1372 static void
1373 vsock_init(struct protosw *pp, struct domain *dp)
1374 {
1375 #pragma unused(dp)
1376
1377 static int vsock_initialized = 0;
1378 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
1379 if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized, 0, 1, acq_rel)) {
1380 return;
1381 }
1382
1383 // Setup VSock protocol info struct.
1384 lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL);
1385 lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL);
1386 lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL);
1387 TAILQ_INIT(&vsockinfo.all);
1388 LIST_INIT(&vsockinfo.bound);
1389 vsockinfo.last_port = VMADDR_PORT_ANY;
1390 }
1391
1392 static struct protosw vsocksw[] = {
1393 {
1394 .pr_type = SOCK_STREAM,
1395 .pr_protocol = 0,
1396 .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD,
1397 .pr_init = vsock_init,
1398 .pr_usrreqs = &vsock_usrreqs,
1399 }
1400 };
1401
1402 static const int vsock_proto_count = (sizeof(vsocksw) / sizeof(struct protosw));
1403
1404 /* VSock Domain */
1405
1406 static struct domain *vsock_domain = NULL;
1407
1408 static void
1409 vsock_dinit(struct domain *dp)
1410 {
1411 // The VSock domain is initialized with a singleton pattern.
1412 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
1413 VERIFY(vsock_domain == NULL);
1414 vsock_domain = dp;
1415
1416 // Add protocols and initialize.
1417 for (int i = 0; i < vsock_proto_count; i++) {
1418 net_add_proto((struct protosw *)&vsocksw[i], dp, 1);
1419 }
1420 }
1421
1422 struct domain vsockdomain_s = {
1423 .dom_family = PF_VSOCK,
1424 .dom_name = "vsock",
1425 .dom_init = vsock_dinit,
1426 .dom_maxrtkey = sizeof(struct sockaddr_vm),
1427 .dom_protohdrlen = sizeof(struct sockaddr_vm),
1428 };