]> git.saurik.com Git - apple/xnu.git/blob - bsd/net/if_llreach.c
xnu-1699.24.23.tar.gz
[apple/xnu.git] / bsd / net / if_llreach.c
1 /*
2 * Copyright (c) 2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Link-layer Reachability Record
31 *
32 * Each interface maintains a red-black tree which contains records related
33 * to the on-link nodes which we are interested in communicating with. Each
34 * record gets allocated and inserted into the tree in the following manner:
35 * upon processing an ARP announcement or reply from a known node (i.e. there
36 * exists a ARP route entry for the node), and if a link-layer reachability
37 * record for the node doesn't yet exist; and, upon processing a ND6 RS/RA/
38 * NS/NA/redirect from a node, and if a link-layer reachability record for the
39 * node doesn't yet exist.
40 *
41 * Each newly created record is then referred to by the resolver route entry;
42 * if a record already exists, its reference count gets increased for the new
43 * resolver entry which now refers to it. A record gets removed from the tree
44 * and freed once its reference counts drops to zero, i.e. when there is no
45 * more resolver entry referring to it.
46 *
47 * A record contains the link-layer protocol (e.g. Ethertype IP/IPv6), the
48 * HW address of the sender, the "last heard from" timestamp (lr_lastrcvd) and
49 * the number of references made to it (lr_reqcnt). Because the key for each
50 * record in the red-black tree consists of the link-layer protocol, therefore
51 * the namespace for the records is partitioned based on the type of link-layer
52 * protocol, i.e. an Ethertype IP link-layer record is only referred to by one
53 * or more ARP entries; an Ethernet IPv6 link-layer record is only referred to
54 * by one or more ND6 entries. Therefore, lr_reqcnt represents the number of
55 * resolver entry references to the record for the same protocol family.
56 *
57 * Upon receiving packets from the network, the protocol's input callback
58 * (e.g. ether_inet{6}_input) informs the corresponding resolver (ARP/ND6)
59 * about the (link-layer) origin of the packet. This results in searching
60 * for a matching record in the red-black tree for the interface where the
61 * packet arrived on. If there's no match, no further processing takes place.
62 * Otherwise, the lr_lastrcvd timestamp of the record is updated.
63 *
64 * When an IP/IPv6 packet is transmitted to the resolver (i.e. the destination
65 * is on-link), ARP/ND6 records the "last spoken to" timestamp in the route
66 * entry ({la,ln}_lastused).
67 *
68 * The reachability of the on-link node is determined by the following logic,
69 * upon sending a packet thru the resolver:
70 *
71 * a) If the record is only used by exactly one resolver entry (lr_reqcnt
72 * is 1), i.e. the target host does not have IP/IPv6 aliases that we know
73 * of, check if lr_lastrcvd is "recent." If so, simply send the packet;
74 * otherwise, re-resolve the target node.
75 *
76 * b) If the record is shared by multiple resolver entries (lr_reqcnt is
77 * greater than 1), i.e. the target host has more than one IP/IPv6 aliases
78 * on the same network interface, we can't rely on lr_lastrcvd alone, as
79 * one of the IP/IPv6 aliases could have been silently moved to another
80 * node for which we don't have a link-layer record. If lr_lastrcvd is
81 * not "recent", we re-resolve the target node. Otherwise, we perform
82 * an additional check against {la,ln}_lastused to see whether it is also
83 * "recent", relative to lr_lastrcvd. If so, simply send the packet;
84 * otherwise, re-resolve the target node.
85 *
86 * The value for "recent" is configurable by adjusting the basetime value for
87 * net.link.ether.inet.arp_llreach_base or net.inet6.icmp6.nd6_llreach_base.
88 * The default basetime value is 30 seconds, and the actual expiration time
89 * is calculated by multiplying the basetime value with some random factor,
90 * which results in a number between 15 to 45 seconds. Setting the basetime
91 * value to 0 effectively disables this feature for the corresponding resolver.
92 *
93 * Assumptions:
94 *
95 * The above logic is based upon the following assumptions:
96 *
97 * i) Network traffics are mostly bi-directional, i.e. the act of sending
98 * packets to an on-link node would most likely cause us to receive
99 * packets from that node.
100 *
101 * ii) If the on-link node's IP/IPv6 address silently moves to another
102 * on-link node for which we are not aware of, non-unicast packets
103 * from the old node would trigger the record's lr_lastrcvd to be
104 * kept recent.
105 *
106 * We can mitigate the above by having the resolver check its {la,ln}_lastused
107 * timestamp at all times, i.e. not only when lr_reqcnt is greater than 1; but
108 * we currently optimize for the common cases.
109 */
110
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/kernel.h>
114 #include <sys/malloc.h>
115 #include <sys/tree.h>
116 #include <sys/sysctl.h>
117 #include <sys/mcache.h>
118 #include <sys/protosw.h>
119
120 #include <net/if_dl.h>
121 #include <net/if.h>
122 #include <net/if_var.h>
123 #include <net/if_llreach.h>
124 #include <net/dlil.h>
125
126 #include <kern/assert.h>
127 #include <kern/locks.h>
128 #include <kern/zalloc.h>
129
130 #if INET6
131 #include <netinet6/in6_var.h>
132 #include <netinet6/nd6.h>
133 #endif /* INET6 */
134
135 static unsigned int iflr_size; /* size of if_llreach */
136 static struct zone *iflr_zone; /* zone for if_llreach */
137
138 #define IFLR_ZONE_MAX 128 /* maximum elements in zone */
139 #define IFLR_ZONE_NAME "if_llreach" /* zone name */
140
141 static struct if_llreach *iflr_alloc(int);
142 static void iflr_free(struct if_llreach *);
143 static __inline int iflr_cmp(const struct if_llreach *,
144 const struct if_llreach *);
145 static __inline int iflr_reachable(struct if_llreach *, int, u_int64_t);
146 static int sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS;
147
148 /* The following is protected by if_llreach_lock */
149 RB_GENERATE_PREV(ll_reach_tree, if_llreach, lr_link, iflr_cmp);
150
151 SYSCTL_DECL(_net_link_generic_system);
152
153 SYSCTL_NODE(_net_link_generic_system, OID_AUTO, llreach_info,
154 CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_llreach_ifinfo,
155 "Per-interface tree of source link-layer reachability records");
156
157 /*
158 * Link-layer reachability is based off node constants in RFC4861.
159 */
160 #if INET6
161 #define LL_COMPUTE_RTIME(x) ND_COMPUTE_RTIME(x)
162 #else
163 #define LL_MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */
164 #define LL_MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */
165 #define LL_COMPUTE_RTIME(x) \
166 (((LL_MIN_RANDOM_FACTOR * (x >> 10)) + (random() & \
167 ((LL_MAX_RANDOM_FACTOR - LL_MIN_RANDOM_FACTOR) * (x >> 10)))) / 1000)
168 #endif /* !INET6 */
169
170 void
171 ifnet_llreach_init(void)
172 {
173 iflr_size = sizeof (struct if_llreach);
174 iflr_zone = zinit(iflr_size,
175 IFLR_ZONE_MAX * iflr_size, 0, IFLR_ZONE_NAME);
176 if (iflr_zone == NULL) {
177 panic("%s: failed allocating %s", __func__, IFLR_ZONE_NAME);
178 /* NOTREACHED */
179 }
180 zone_change(iflr_zone, Z_EXPAND, TRUE);
181 zone_change(iflr_zone, Z_CALLERACCT, FALSE);
182 }
183
184 void
185 ifnet_llreach_ifattach(struct ifnet *ifp, boolean_t reuse)
186 {
187 lck_rw_lock_exclusive(&ifp->if_llreach_lock);
188 /* Initialize link-layer source tree (if not already) */
189 if (!reuse)
190 RB_INIT(&ifp->if_ll_srcs);
191 lck_rw_done(&ifp->if_llreach_lock);
192 }
193
194 void
195 ifnet_llreach_ifdetach(struct ifnet *ifp)
196 {
197 #pragma unused(ifp)
198 /*
199 * Nothing to do for now; the link-layer source tree might
200 * contain entries at this point, that are still referred
201 * to by route entries pointing to this ifp.
202 */
203 }
204
205 /*
206 * Link-layer source tree comparison function.
207 *
208 * An ordered predicate is necessary; bcmp() is not documented to return
209 * an indication of order, memcmp() is, and is an ISO C99 requirement.
210 */
211 static __inline int
212 iflr_cmp(const struct if_llreach *a, const struct if_llreach *b)
213 {
214 return (memcmp(&a->lr_key, &b->lr_key, sizeof (a->lr_key)));
215 }
216
217 static __inline int
218 iflr_reachable(struct if_llreach *lr, int cmp_delta, u_int64_t tval)
219 {
220 u_int64_t now;
221 u_int64_t expire;
222
223 now = net_uptime(); /* current approx. uptime */
224 /*
225 * No need for lr_lock; atomically read the last rcvd uptime.
226 */
227 expire = lr->lr_lastrcvd + lr->lr_reachable;
228 /*
229 * If we haven't heard back from the local host for over
230 * lr_reachable seconds, consider that the host is no
231 * longer reachable.
232 */
233 if (!cmp_delta)
234 return (expire >= now);
235 /*
236 * If the caller supplied a reference time, consider the
237 * host is reachable if the record hasn't expired (see above)
238 * and if the reference time is within the past lr_reachable
239 * seconds.
240 */
241 return ((expire >= now) && (now - tval) < lr->lr_reachable);
242 }
243
244 int
245 ifnet_llreach_reachable(struct if_llreach *lr)
246 {
247 /*
248 * Check whether the cache is too old to be trusted.
249 */
250 return (iflr_reachable(lr, 0, 0));
251 }
252
253 int
254 ifnet_llreach_reachable_delta(struct if_llreach *lr, u_int64_t tval)
255 {
256 /*
257 * Check whether the cache is too old to be trusted.
258 */
259 return (iflr_reachable(lr, 1, tval));
260 }
261
262 void
263 ifnet_llreach_set_reachable(struct ifnet *ifp, u_int16_t llproto, void *addr,
264 unsigned int alen)
265 {
266 struct if_llreach find, *lr;
267
268 VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */
269
270 find.lr_key.proto = llproto;
271 bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN);
272
273 lck_rw_lock_shared(&ifp->if_llreach_lock);
274 lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find);
275 if (lr == NULL) {
276 lck_rw_done(&ifp->if_llreach_lock);
277 return;
278 }
279 /*
280 * No need for lr_lock; atomically update the last rcvd uptime.
281 */
282 lr->lr_lastrcvd = net_uptime();
283 lck_rw_done(&ifp->if_llreach_lock);
284 }
285
286 struct if_llreach *
287 ifnet_llreach_alloc(struct ifnet *ifp, u_int16_t llproto, void *addr,
288 unsigned int alen, u_int64_t llreach_base)
289 {
290 struct if_llreach find, *lr;
291 struct timeval now;
292
293 if (llreach_base == 0)
294 return (NULL);
295
296 VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */
297
298 find.lr_key.proto = llproto;
299 bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN);
300
301 lck_rw_lock_shared(&ifp->if_llreach_lock);
302 lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find);
303 if (lr != NULL) {
304 found:
305 IFLR_LOCK(lr);
306 VERIFY(lr->lr_reqcnt >= 1);
307 lr->lr_reqcnt++;
308 VERIFY(lr->lr_reqcnt != 0);
309 IFLR_ADDREF_LOCKED(lr); /* for caller */
310 lr->lr_lastrcvd = net_uptime(); /* current approx. uptime */
311 IFLR_UNLOCK(lr);
312 lck_rw_done(&ifp->if_llreach_lock);
313 return (lr);
314 }
315
316 if (!lck_rw_lock_shared_to_exclusive(&ifp->if_llreach_lock))
317 lck_rw_lock_exclusive(&ifp->if_llreach_lock);
318
319 lck_rw_assert(&ifp->if_llreach_lock, LCK_RW_ASSERT_EXCLUSIVE);
320
321 /* in case things have changed while becoming writer */
322 lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find);
323 if (lr != NULL)
324 goto found;
325
326 lr = iflr_alloc(M_WAITOK);
327 if (lr == NULL) {
328 lck_rw_done(&ifp->if_llreach_lock);
329 return (NULL);
330 }
331 IFLR_LOCK(lr);
332 lr->lr_reqcnt++;
333 VERIFY(lr->lr_reqcnt == 1);
334 IFLR_ADDREF_LOCKED(lr); /* for RB tree */
335 IFLR_ADDREF_LOCKED(lr); /* for caller */
336 lr->lr_lastrcvd = net_uptime(); /* current approx. uptime */
337 lr->lr_baseup = lr->lr_lastrcvd; /* base uptime */
338 microtime(&now);
339 lr->lr_basecal = now.tv_sec; /* base calendar time */
340 lr->lr_basereachable = llreach_base;
341 lr->lr_reachable = LL_COMPUTE_RTIME(lr->lr_basereachable * 1000);
342 lr->lr_debug |= IFD_ATTACHED;
343 lr->lr_ifp = ifp;
344 lr->lr_key.proto = llproto;
345 bcopy(addr, &lr->lr_key.addr, IF_LLREACH_MAXLEN);
346 RB_INSERT(ll_reach_tree, &ifp->if_ll_srcs, lr);
347 IFLR_UNLOCK(lr);
348 lck_rw_done(&ifp->if_llreach_lock);
349
350 return (lr);
351 }
352
353 void
354 ifnet_llreach_free(struct if_llreach *lr)
355 {
356 struct ifnet *ifp;
357
358 /* no need to lock here; lr_ifp never changes */
359 ifp = lr->lr_ifp;
360
361 lck_rw_lock_exclusive(&ifp->if_llreach_lock);
362 IFLR_LOCK(lr);
363 if (lr->lr_reqcnt == 0) {
364 panic("%s: lr=%p negative reqcnt", __func__, lr);
365 /* NOTREACHED */
366 }
367 --lr->lr_reqcnt;
368 if (lr->lr_reqcnt > 0) {
369 IFLR_UNLOCK(lr);
370 lck_rw_done(&ifp->if_llreach_lock);
371 IFLR_REMREF(lr); /* for caller */
372 return;
373 }
374 if (!(lr->lr_debug & IFD_ATTACHED)) {
375 panic("%s: Attempt to detach an unattached llreach lr=%p",
376 __func__, lr);
377 /* NOTREACHED */
378 }
379 lr->lr_debug &= ~IFD_ATTACHED;
380 RB_REMOVE(ll_reach_tree, &ifp->if_ll_srcs, lr);
381 IFLR_UNLOCK(lr);
382 lck_rw_done(&ifp->if_llreach_lock);
383
384 IFLR_REMREF(lr); /* for RB tree */
385 IFLR_REMREF(lr); /* for caller */
386 }
387
388 u_int64_t
389 ifnet_llreach_up2cal(struct if_llreach *lr, u_int64_t uptime)
390 {
391 u_int64_t calendar = 0;
392
393 if (uptime != 0) {
394 struct timeval cnow;
395 u_int64_t unow;
396
397 getmicrotime(&cnow); /* current calendar time */
398 unow = net_uptime(); /* current approx. uptime */
399 /*
400 * Take into account possible calendar time changes;
401 * adjust base calendar value if necessary, i.e.
402 * the calendar skew should equate to the uptime skew.
403 */
404 lr->lr_basecal += (cnow.tv_sec - lr->lr_basecal) -
405 (unow - lr->lr_baseup);
406
407 calendar = lr->lr_basecal + lr->lr_reachable +
408 (uptime - lr->lr_baseup);
409 }
410
411 return (calendar);
412 }
413
414 static struct if_llreach *
415 iflr_alloc(int how)
416 {
417 struct if_llreach *lr;
418
419 lr = (how == M_WAITOK) ? zalloc(iflr_zone) : zalloc_noblock(iflr_zone);
420 if (lr != NULL) {
421 bzero(lr, iflr_size);
422 lck_mtx_init(&lr->lr_lock, ifnet_lock_group, ifnet_lock_attr);
423 lr->lr_debug |= IFD_ALLOC;
424 }
425 return (lr);
426 }
427
428 static void
429 iflr_free(struct if_llreach *lr)
430 {
431 IFLR_LOCK(lr);
432 if (lr->lr_debug & IFD_ATTACHED) {
433 panic("%s: attached lr=%p is being freed", __func__, lr);
434 /* NOTREACHED */
435 } else if (!(lr->lr_debug & IFD_ALLOC)) {
436 panic("%s: lr %p cannot be freed", __func__, lr);
437 /* NOTREACHED */
438 } else if (lr->lr_refcnt != 0) {
439 panic("%s: non-zero refcount lr=%p", __func__, lr);
440 /* NOTREACHED */
441 } else if (lr->lr_reqcnt != 0) {
442 panic("%s: non-zero reqcnt lr=%p", __func__, lr);
443 /* NOTREACHED */
444 }
445 lr->lr_debug &= ~IFD_ALLOC;
446 IFLR_UNLOCK(lr);
447
448 lck_mtx_destroy(&lr->lr_lock, ifnet_lock_group);
449 zfree(iflr_zone, lr);
450 }
451
452 void
453 iflr_addref(struct if_llreach *lr, int locked)
454 {
455 if (!locked)
456 IFLR_LOCK(lr);
457 else
458 IFLR_LOCK_ASSERT_HELD(lr);
459
460 if (++lr->lr_refcnt == 0) {
461 panic("%s: lr=%p wraparound refcnt", __func__, lr);
462 /* NOTREACHED */
463 }
464 if (!locked)
465 IFLR_UNLOCK(lr);
466 }
467
468 void
469 iflr_remref(struct if_llreach *lr)
470 {
471 IFLR_LOCK(lr);
472 if (lr->lr_refcnt == 0) {
473 panic("%s: lr=%p negative refcnt", __func__, lr);
474 /* NOTREACHED */
475 }
476 --lr->lr_refcnt;
477 if (lr->lr_refcnt > 0) {
478 IFLR_UNLOCK(lr);
479 return;
480 }
481 IFLR_UNLOCK(lr);
482
483 iflr_free(lr); /* deallocate it */
484 }
485
486 void
487 ifnet_lr2ri(struct if_llreach *lr, struct rt_reach_info *ri)
488 {
489 struct if_llreach_info lri;
490
491 IFLR_LOCK_ASSERT_HELD(lr);
492
493 bzero(ri, sizeof (*ri));
494 ifnet_lr2lri(lr, &lri);
495 ri->ri_refcnt = lri.lri_refcnt;
496 ri->ri_probes = lri.lri_probes;
497 ri->ri_rcv_expire = lri.lri_expire;
498 }
499
500 void
501 ifnet_lr2lri(struct if_llreach *lr, struct if_llreach_info *lri)
502 {
503 IFLR_LOCK_ASSERT_HELD(lr);
504
505 bzero(lri, sizeof (*lri));
506 /*
507 * Note here we return request count, not actual memory refcnt.
508 */
509 lri->lri_refcnt = lr->lr_reqcnt;
510 lri->lri_ifindex = lr->lr_ifp->if_index;
511 lri->lri_probes = lr->lr_probes;
512 lri->lri_expire = ifnet_llreach_up2cal(lr, lr->lr_lastrcvd);
513 lri->lri_proto = lr->lr_key.proto;
514 bcopy(&lr->lr_key.addr, &lri->lri_addr, IF_LLREACH_MAXLEN);
515 }
516
517 static int
518 sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS
519 {
520 #pragma unused(oidp)
521 int *name, retval = 0;
522 unsigned int namelen;
523 uint32_t ifindex;
524 struct if_llreach *lr;
525 struct if_llreach_info lri;
526 struct ifnet *ifp;
527
528 name = (int *)arg1;
529 namelen = (unsigned int)arg2;
530
531 if (req->newptr != USER_ADDR_NULL)
532 return (EPERM);
533
534 if (namelen != 1)
535 return (EINVAL);
536
537 ifindex = name[0];
538 ifnet_head_lock_shared();
539 if (ifindex <= 0 || ifindex > (u_int)if_index) {
540 printf("%s: ifindex %u out of range\n", __func__, ifindex);
541 ifnet_head_done();
542 return (ENOENT);
543 }
544
545 ifp = ifindex2ifnet[ifindex];
546 ifnet_head_done();
547 if (ifp == NULL) {
548 printf("%s: no ifp for ifindex %u\n", __func__, ifindex);
549 return (ENOENT);
550 }
551
552 lck_rw_lock_shared(&ifp->if_llreach_lock);
553 RB_FOREACH(lr, ll_reach_tree, &ifp->if_ll_srcs) {
554 /* Export to if_llreach_info structure */
555 IFLR_LOCK(lr);
556 ifnet_lr2lri(lr, &lri);
557 IFLR_UNLOCK(lr);
558
559 if ((retval = SYSCTL_OUT(req, &lri, sizeof (lri))) != 0)
560 break;
561 }
562 lck_rw_done(&ifp->if_llreach_lock);
563
564 return (retval);
565 }