2 * Copyright (c) 2011-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 * Link-layer Reachability Record
32 * Each interface maintains a red-black tree which contains records related
33 * to the on-link nodes which we are interested in communicating with. Each
34 * record gets allocated and inserted into the tree in the following manner:
35 * upon processing an ARP announcement or reply from a known node (i.e. there
36 * exists a ARP route entry for the node), and if a link-layer reachability
37 * record for the node doesn't yet exist; and, upon processing a ND6 RS/RA/
38 * NS/NA/redirect from a node, and if a link-layer reachability record for the
39 * node doesn't yet exist.
41 * Each newly created record is then referred to by the resolver route entry;
42 * if a record already exists, its reference count gets increased for the new
43 * resolver entry which now refers to it. A record gets removed from the tree
44 * and freed once its reference counts drops to zero, i.e. when there is no
45 * more resolver entry referring to it.
47 * A record contains the link-layer protocol (e.g. Ethertype IP/IPv6), the
48 * HW address of the sender, the "last heard from" timestamp (lr_lastrcvd) and
49 * the number of references made to it (lr_reqcnt). Because the key for each
50 * record in the red-black tree consists of the link-layer protocol, therefore
51 * the namespace for the records is partitioned based on the type of link-layer
52 * protocol, i.e. an Ethertype IP link-layer record is only referred to by one
53 * or more ARP entries; an Ethernet IPv6 link-layer record is only referred to
54 * by one or more ND6 entries. Therefore, lr_reqcnt represents the number of
55 * resolver entry references to the record for the same protocol family.
57 * Upon receiving packets from the network, the protocol's input callback
58 * (e.g. ether_inet{6}_input) informs the corresponding resolver (ARP/ND6)
59 * about the (link-layer) origin of the packet. This results in searching
60 * for a matching record in the red-black tree for the interface where the
61 * packet arrived on. If there's no match, no further processing takes place.
62 * Otherwise, the lr_lastrcvd timestamp of the record is updated.
64 * When an IP/IPv6 packet is transmitted to the resolver (i.e. the destination
65 * is on-link), ARP/ND6 records the "last spoken to" timestamp in the route
66 * entry ({la,ln}_lastused).
68 * The reachability of the on-link node is determined by the following logic,
69 * upon sending a packet thru the resolver:
71 * a) If the record is only used by exactly one resolver entry (lr_reqcnt
72 * is 1), i.e. the target host does not have IP/IPv6 aliases that we know
73 * of, check if lr_lastrcvd is "recent." If so, simply send the packet;
74 * otherwise, re-resolve the target node.
76 * b) If the record is shared by multiple resolver entries (lr_reqcnt is
77 * greater than 1), i.e. the target host has more than one IP/IPv6 aliases
78 * on the same network interface, we can't rely on lr_lastrcvd alone, as
79 * one of the IP/IPv6 aliases could have been silently moved to another
80 * node for which we don't have a link-layer record. If lr_lastrcvd is
81 * not "recent", we re-resolve the target node. Otherwise, we perform
82 * an additional check against {la,ln}_lastused to see whether it is also
83 * "recent", relative to lr_lastrcvd. If so, simply send the packet;
84 * otherwise, re-resolve the target node.
86 * The value for "recent" is configurable by adjusting the basetime value for
87 * net.link.ether.inet.arp_llreach_base or net.inet6.icmp6.nd6_llreach_base.
88 * The default basetime value is 30 seconds, and the actual expiration time
89 * is calculated by multiplying the basetime value with some random factor,
90 * which results in a number between 15 to 45 seconds. Setting the basetime
91 * value to 0 effectively disables this feature for the corresponding resolver.
95 * The above logic is based upon the following assumptions:
97 * i) Network traffics are mostly bi-directional, i.e. the act of sending
98 * packets to an on-link node would most likely cause us to receive
99 * packets from that node.
101 * ii) If the on-link node's IP/IPv6 address silently moves to another
102 * on-link node for which we are not aware of, non-unicast packets
103 * from the old node would trigger the record's lr_lastrcvd to be
106 * We can mitigate the above by having the resolver check its {la,ln}_lastused
107 * timestamp at all times, i.e. not only when lr_reqcnt is greater than 1; but
108 * we currently optimize for the common cases.
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/kernel.h>
114 #include <sys/malloc.h>
115 #include <sys/tree.h>
116 #include <sys/sysctl.h>
117 #include <sys/mcache.h>
118 #include <sys/protosw.h>
120 #include <dev/random/randomdev.h>
122 #include <net/if_dl.h>
124 #include <net/if_var.h>
125 #include <net/if_llreach.h>
126 #include <net/dlil.h>
127 #include <net/kpi_interface.h>
128 #include <net/route.h>
130 #include <kern/assert.h>
131 #include <kern/locks.h>
132 #include <kern/zalloc.h>
135 #include <netinet6/in6_var.h>
136 #include <netinet6/nd6.h>
139 static unsigned int iflr_size
; /* size of if_llreach */
140 static struct zone
*iflr_zone
; /* zone for if_llreach */
142 #define IFLR_ZONE_MAX 128 /* maximum elements in zone */
143 #define IFLR_ZONE_NAME "if_llreach" /* zone name */
145 static struct if_llreach
*iflr_alloc(int);
146 static void iflr_free(struct if_llreach
*);
147 static __inline
int iflr_cmp(const struct if_llreach
*,
148 const struct if_llreach
*);
149 static __inline
int iflr_reachable(struct if_llreach
*, int, u_int64_t
);
150 static int sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS
;
152 /* The following is protected by if_llreach_lock */
153 RB_GENERATE_PREV(ll_reach_tree
, if_llreach
, lr_link
, iflr_cmp
);
155 SYSCTL_DECL(_net_link_generic_system
);
157 SYSCTL_NODE(_net_link_generic_system
, OID_AUTO
, llreach_info
,
158 CTLFLAG_RD
| CTLFLAG_LOCKED
, sysctl_llreach_ifinfo
,
159 "Per-interface tree of source link-layer reachability records");
162 * Link-layer reachability is based off node constants in RFC4861.
165 #define LL_COMPUTE_RTIME(x) ND_COMPUTE_RTIME(x)
167 #define LL_MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */
168 #define LL_MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */
169 #define LL_COMPUTE_RTIME(x) \
170 (((LL_MIN_RANDOM_FACTOR * (x >> 10)) + (RandomULong() & \
171 ((LL_MAX_RANDOM_FACTOR - LL_MIN_RANDOM_FACTOR) * (x >> 10)))) / 1000)
175 ifnet_llreach_init(void)
177 iflr_size
= sizeof(struct if_llreach
);
178 iflr_zone
= zinit(iflr_size
,
179 IFLR_ZONE_MAX
* iflr_size
, 0, IFLR_ZONE_NAME
);
180 if (iflr_zone
== NULL
) {
181 panic("%s: failed allocating %s", __func__
, IFLR_ZONE_NAME
);
184 zone_change(iflr_zone
, Z_EXPAND
, TRUE
);
185 zone_change(iflr_zone
, Z_CALLERACCT
, FALSE
);
189 ifnet_llreach_ifattach(struct ifnet
*ifp
, boolean_t reuse
)
191 lck_rw_lock_exclusive(&ifp
->if_llreach_lock
);
192 /* Initialize link-layer source tree (if not already) */
194 RB_INIT(&ifp
->if_ll_srcs
);
196 lck_rw_done(&ifp
->if_llreach_lock
);
200 ifnet_llreach_ifdetach(struct ifnet
*ifp
)
204 * Nothing to do for now; the link-layer source tree might
205 * contain entries at this point, that are still referred
206 * to by route entries pointing to this ifp.
211 * Link-layer source tree comparison function.
213 * An ordered predicate is necessary; bcmp() is not documented to return
214 * an indication of order, memcmp() is, and is an ISO C99 requirement.
217 iflr_cmp(const struct if_llreach
*a
, const struct if_llreach
*b
)
219 return memcmp(&a
->lr_key
, &b
->lr_key
, sizeof(a
->lr_key
));
223 iflr_reachable(struct if_llreach
*lr
, int cmp_delta
, u_int64_t tval
)
228 now
= net_uptime(); /* current approx. uptime */
230 * No need for lr_lock; atomically read the last rcvd uptime.
232 expire
= lr
->lr_lastrcvd
+ lr
->lr_reachable
;
234 * If we haven't heard back from the local host for over
235 * lr_reachable seconds, consider that the host is no
239 return expire
>= now
;
242 * If the caller supplied a reference time, consider the
243 * host is reachable if the record hasn't expired (see above)
244 * and if the reference time is within the past lr_reachable
247 return (expire
>= now
) && (now
- tval
) < lr
->lr_reachable
;
251 ifnet_llreach_reachable(struct if_llreach
*lr
)
254 * Check whether the cache is too old to be trusted.
256 return iflr_reachable(lr
, 0, 0);
260 ifnet_llreach_reachable_delta(struct if_llreach
*lr
, u_int64_t tval
)
263 * Check whether the cache is too old to be trusted.
265 return iflr_reachable(lr
, 1, tval
);
269 ifnet_llreach_set_reachable(struct ifnet
*ifp
, u_int16_t llproto
, void *addr
,
272 struct if_llreach find
, *lr
;
274 VERIFY(alen
== IF_LLREACH_MAXLEN
); /* for now */
276 find
.lr_key
.proto
= llproto
;
277 bcopy(addr
, &find
.lr_key
.addr
, IF_LLREACH_MAXLEN
);
279 lck_rw_lock_shared(&ifp
->if_llreach_lock
);
280 lr
= RB_FIND(ll_reach_tree
, &ifp
->if_ll_srcs
, &find
);
282 lck_rw_done(&ifp
->if_llreach_lock
);
286 * No need for lr_lock; atomically update the last rcvd uptime.
288 lr
->lr_lastrcvd
= net_uptime();
289 lck_rw_done(&ifp
->if_llreach_lock
);
293 ifnet_llreach_alloc(struct ifnet
*ifp
, u_int16_t llproto
, void *addr
,
294 unsigned int alen
, u_int64_t llreach_base
)
296 struct if_llreach find
, *lr
;
299 if (llreach_base
== 0) {
303 VERIFY(alen
== IF_LLREACH_MAXLEN
); /* for now */
305 find
.lr_key
.proto
= llproto
;
306 bcopy(addr
, &find
.lr_key
.addr
, IF_LLREACH_MAXLEN
);
308 lck_rw_lock_shared(&ifp
->if_llreach_lock
);
309 lr
= RB_FIND(ll_reach_tree
, &ifp
->if_ll_srcs
, &find
);
313 VERIFY(lr
->lr_reqcnt
>= 1);
315 VERIFY(lr
->lr_reqcnt
!= 0);
316 IFLR_ADDREF_LOCKED(lr
); /* for caller */
317 lr
->lr_lastrcvd
= net_uptime(); /* current approx. uptime */
319 lck_rw_done(&ifp
->if_llreach_lock
);
323 if (!lck_rw_lock_shared_to_exclusive(&ifp
->if_llreach_lock
)) {
324 lck_rw_lock_exclusive(&ifp
->if_llreach_lock
);
327 LCK_RW_ASSERT(&ifp
->if_llreach_lock
, LCK_RW_ASSERT_EXCLUSIVE
);
329 /* in case things have changed while becoming writer */
330 lr
= RB_FIND(ll_reach_tree
, &ifp
->if_ll_srcs
, &find
);
335 lr
= iflr_alloc(M_WAITOK
);
337 lck_rw_done(&ifp
->if_llreach_lock
);
342 VERIFY(lr
->lr_reqcnt
== 1);
343 IFLR_ADDREF_LOCKED(lr
); /* for RB tree */
344 IFLR_ADDREF_LOCKED(lr
); /* for caller */
345 lr
->lr_lastrcvd
= net_uptime(); /* current approx. uptime */
346 lr
->lr_baseup
= lr
->lr_lastrcvd
; /* base uptime */
348 lr
->lr_basecal
= cnow
.tv_sec
; /* base calendar time */
349 lr
->lr_basereachable
= llreach_base
;
350 lr
->lr_reachable
= LL_COMPUTE_RTIME(lr
->lr_basereachable
* 1000);
351 lr
->lr_debug
|= IFD_ATTACHED
;
353 lr
->lr_key
.proto
= llproto
;
354 bcopy(addr
, &lr
->lr_key
.addr
, IF_LLREACH_MAXLEN
);
355 lr
->lr_rssi
= IFNET_RSSI_UNKNOWN
;
356 lr
->lr_lqm
= IFNET_LQM_THRESH_UNKNOWN
;
357 lr
->lr_npm
= IFNET_NPM_THRESH_UNKNOWN
;
358 RB_INSERT(ll_reach_tree
, &ifp
->if_ll_srcs
, lr
);
360 lck_rw_done(&ifp
->if_llreach_lock
);
366 ifnet_llreach_free(struct if_llreach
*lr
)
370 /* no need to lock here; lr_ifp never changes */
373 lck_rw_lock_exclusive(&ifp
->if_llreach_lock
);
375 if (lr
->lr_reqcnt
== 0) {
376 panic("%s: lr=%p negative reqcnt", __func__
, lr
);
380 if (lr
->lr_reqcnt
> 0) {
382 lck_rw_done(&ifp
->if_llreach_lock
);
383 IFLR_REMREF(lr
); /* for caller */
386 if (!(lr
->lr_debug
& IFD_ATTACHED
)) {
387 panic("%s: Attempt to detach an unattached llreach lr=%p",
391 lr
->lr_debug
&= ~IFD_ATTACHED
;
392 RB_REMOVE(ll_reach_tree
, &ifp
->if_ll_srcs
, lr
);
394 lck_rw_done(&ifp
->if_llreach_lock
);
396 IFLR_REMREF(lr
); /* for RB tree */
397 IFLR_REMREF(lr
); /* for caller */
401 ifnet_llreach_up2calexp(struct if_llreach
*lr
, u_int64_t uptime
)
403 u_int64_t calendar
= 0;
409 getmicrotime(&cnow
); /* current calendar time */
410 unow
= net_uptime(); /* current approx. uptime */
412 * Take into account possible calendar time changes;
413 * adjust base calendar value if necessary, i.e.
414 * the calendar skew should equate to the uptime skew.
416 lr
->lr_basecal
+= (cnow
.tv_sec
- lr
->lr_basecal
) -
417 (unow
- lr
->lr_baseup
);
419 calendar
= lr
->lr_basecal
+ lr
->lr_reachable
+
420 (uptime
- lr
->lr_baseup
);
427 ifnet_llreach_up2upexp(struct if_llreach
*lr
, u_int64_t uptime
)
429 return lr
->lr_reachable
+ uptime
;
433 ifnet_llreach_get_defrouter(struct ifnet
*ifp
, int af
,
434 struct ifnet_llreach_info
*iflri
)
436 struct radix_node_head
*rnh
;
437 struct sockaddr_storage dst_ss
, mask_ss
;
441 VERIFY(ifp
!= NULL
&& iflri
!= NULL
&&
442 (af
== AF_INET
|| af
== AF_INET6
));
444 bzero(iflri
, sizeof(*iflri
));
446 if ((rnh
= rt_tables
[af
]) == NULL
) {
450 bzero(&dst_ss
, sizeof(dst_ss
));
451 bzero(&mask_ss
, sizeof(mask_ss
));
452 dst_ss
.ss_family
= af
;
453 dst_ss
.ss_len
= (af
== AF_INET
) ? sizeof(struct sockaddr_in
) :
454 sizeof(struct sockaddr_in6
);
456 lck_mtx_lock(rnh_lock
);
457 rt
= rt_lookup(TRUE
, SA(&dst_ss
), SA(&mask_ss
), rnh
, ifp
->if_index
);
459 struct rtentry
*gwrt
;
462 if ((rt
->rt_flags
& RTF_GATEWAY
) &&
463 (gwrt
= rt
->rt_gwroute
) != NULL
&&
464 rt_key(rt
)->sa_family
== rt_key(gwrt
)->sa_family
&&
465 (gwrt
->rt_flags
& RTF_UP
)) {
468 if (gwrt
->rt_llinfo_get_iflri
!= NULL
) {
469 (*gwrt
->rt_llinfo_get_iflri
)(gwrt
, iflri
);
478 lck_mtx_unlock(rnh_lock
);
483 static struct if_llreach
*
486 struct if_llreach
*lr
;
488 lr
= (how
== M_WAITOK
) ? zalloc(iflr_zone
) : zalloc_noblock(iflr_zone
);
490 bzero(lr
, iflr_size
);
491 lck_mtx_init(&lr
->lr_lock
, ifnet_lock_group
, ifnet_lock_attr
);
492 lr
->lr_debug
|= IFD_ALLOC
;
498 iflr_free(struct if_llreach
*lr
)
501 if (lr
->lr_debug
& IFD_ATTACHED
) {
502 panic("%s: attached lr=%p is being freed", __func__
, lr
);
504 } else if (!(lr
->lr_debug
& IFD_ALLOC
)) {
505 panic("%s: lr %p cannot be freed", __func__
, lr
);
507 } else if (lr
->lr_refcnt
!= 0) {
508 panic("%s: non-zero refcount lr=%p", __func__
, lr
);
510 } else if (lr
->lr_reqcnt
!= 0) {
511 panic("%s: non-zero reqcnt lr=%p", __func__
, lr
);
514 lr
->lr_debug
&= ~IFD_ALLOC
;
517 lck_mtx_destroy(&lr
->lr_lock
, ifnet_lock_group
);
518 zfree(iflr_zone
, lr
);
522 iflr_addref(struct if_llreach
*lr
, int locked
)
527 IFLR_LOCK_ASSERT_HELD(lr
);
530 if (++lr
->lr_refcnt
== 0) {
531 panic("%s: lr=%p wraparound refcnt", __func__
, lr
);
540 iflr_remref(struct if_llreach
*lr
)
543 if (lr
->lr_refcnt
== 0) {
544 panic("%s: lr=%p negative refcnt", __func__
, lr
);
548 if (lr
->lr_refcnt
> 0) {
554 iflr_free(lr
); /* deallocate it */
558 ifnet_lr2ri(struct if_llreach
*lr
, struct rt_reach_info
*ri
)
560 struct if_llreach_info lri
;
562 IFLR_LOCK_ASSERT_HELD(lr
);
564 bzero(ri
, sizeof(*ri
));
565 ifnet_lr2lri(lr
, &lri
);
566 ri
->ri_refcnt
= lri
.lri_refcnt
;
567 ri
->ri_probes
= lri
.lri_probes
;
568 ri
->ri_rcv_expire
= lri
.lri_expire
;
569 ri
->ri_rssi
= lri
.lri_rssi
;
570 ri
->ri_lqm
= lri
.lri_lqm
;
571 ri
->ri_npm
= lri
.lri_npm
;
575 ifnet_lr2iflri(struct if_llreach
*lr
, struct ifnet_llreach_info
*iflri
)
577 IFLR_LOCK_ASSERT_HELD(lr
);
579 bzero(iflri
, sizeof(*iflri
));
581 * Note here we return request count, not actual memory refcnt.
583 iflri
->iflri_refcnt
= lr
->lr_reqcnt
;
584 iflri
->iflri_probes
= lr
->lr_probes
;
585 iflri
->iflri_rcv_expire
= ifnet_llreach_up2upexp(lr
, lr
->lr_lastrcvd
);
586 iflri
->iflri_curtime
= net_uptime();
587 switch (lr
->lr_key
.proto
) {
589 iflri
->iflri_netproto
= PF_INET
;
592 iflri
->iflri_netproto
= PF_INET6
;
596 * This shouldn't be possible for the time being,
597 * since link-layer reachability records are only
598 * kept for ARP and ND6.
600 iflri
->iflri_netproto
= PF_UNSPEC
;
603 bcopy(&lr
->lr_key
.addr
, &iflri
->iflri_addr
, IF_LLREACH_MAXLEN
);
604 iflri
->iflri_rssi
= lr
->lr_rssi
;
605 iflri
->iflri_lqm
= lr
->lr_lqm
;
606 iflri
->iflri_npm
= lr
->lr_npm
;
610 ifnet_lr2lri(struct if_llreach
*lr
, struct if_llreach_info
*lri
)
612 IFLR_LOCK_ASSERT_HELD(lr
);
614 bzero(lri
, sizeof(*lri
));
616 * Note here we return request count, not actual memory refcnt.
618 lri
->lri_refcnt
= lr
->lr_reqcnt
;
619 lri
->lri_ifindex
= lr
->lr_ifp
->if_index
;
620 lri
->lri_probes
= lr
->lr_probes
;
621 lri
->lri_expire
= ifnet_llreach_up2calexp(lr
, lr
->lr_lastrcvd
);
622 lri
->lri_proto
= lr
->lr_key
.proto
;
623 bcopy(&lr
->lr_key
.addr
, &lri
->lri_addr
, IF_LLREACH_MAXLEN
);
624 lri
->lri_rssi
= lr
->lr_rssi
;
625 lri
->lri_lqm
= lr
->lr_lqm
;
626 lri
->lri_npm
= lr
->lr_npm
;
630 sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS
633 int *name
, retval
= 0;
634 unsigned int namelen
;
636 struct if_llreach
*lr
;
637 struct if_llreach_info lri
= {};
641 namelen
= (unsigned int)arg2
;
643 if (req
->newptr
!= USER_ADDR_NULL
) {
652 ifnet_head_lock_shared();
653 if (ifindex
<= 0 || ifindex
> (u_int
)if_index
) {
654 printf("%s: ifindex %u out of range\n", __func__
, ifindex
);
659 ifp
= ifindex2ifnet
[ifindex
];
662 printf("%s: no ifp for ifindex %u\n", __func__
, ifindex
);
666 lck_rw_lock_shared(&ifp
->if_llreach_lock
);
667 RB_FOREACH(lr
, ll_reach_tree
, &ifp
->if_ll_srcs
) {
668 /* Export to if_llreach_info structure */
670 ifnet_lr2lri(lr
, &lri
);
673 if ((retval
= SYSCTL_OUT(req
, &lri
, sizeof(lri
))) != 0) {
677 lck_rw_done(&ifp
->if_llreach_lock
);