From: Apple Date: Fri, 7 Aug 2009 20:02:49 +0000 (+0000) Subject: xnu-1228.15.4.tar.gz X-Git-Tag: mac-os-x-1058^0 X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/e2fac8b15b12a7979f72090454d850e612fc5b13?ds=sidebyside;hp=c910b4d9d2451126ae3917b931cd4390c11e1d52 xnu-1228.15.4.tar.gz --- diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index f3c12bb41..5bb4ec920 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -272,8 +273,34 @@ typedef struct hfsmount { /* Resize variables: */ u_int32_t hfs_resize_filesmoved; u_int32_t hfs_resize_totalfiles; + + /* + * About the sync counters: + * hfs_sync_scheduled keeps track whether a timer was scheduled but we + * haven't started processing the callback (i.e. we + * haven't begun the flush). This will be non-zero + * even if the callback has been invoked, before we + * start the flush. + * hfs_sync_incomplete keeps track of the number of callbacks that have + * not completed yet (including callbacks not yet + * invoked). We cannot safely unmount until this + * drops to zero. + * + * In both cases, we use counters, not flags, so that we can avoid + * taking locks. + */ + int32_t hfs_sync_scheduled; + int32_t hfs_sync_incomplete; + u_int64_t hfs_last_sync_request_time; + u_int64_t hfs_last_sync_time; + uint32_t hfs_active_threads; + thread_call_t hfs_syncer; // removeable devices get sync'ed by this guy + } hfsmount_t; +#define HFS_META_DELAY (100) +#define HFS_MILLISEC_SCALE (1000*1000) + typedef hfsmount_t ExtendedVCB; /* Aliases for legacy (Mac OS 9) field names */ @@ -689,6 +716,7 @@ extern int hfs_virtualmetafile(struct cnode *); extern int hfs_start_transaction(struct hfsmount *hfsmp); extern int hfs_end_transaction(struct hfsmount *hfsmp); +extern void hfs_sync_ejectable(struct hfsmount *hfsmp); /***************************************************************************** diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 1e836052f..e5ab6c8b9 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1836,12 +1836,20 @@ fail_change_next_allocation: } case HFS_GET_MOUNT_TIME: - return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time)); - break; + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_mount_time; + } else { + *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_mount_time; + } + return 0; case HFS_GET_LAST_MTIME: - return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime)); - break; + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_last_mounted_mtime; + } else { + *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_last_mounted_mtime; + } + return 0; case HFS_SET_BOOT_INFO: if (!vnode_isvroot(vp)) diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index b2e71a034..6f5c3eb53 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -827,6 +827,99 @@ hfs_reload(struct mount *mountp) return (0); } +int hfs_last_io_wait_time = 125000; +SYSCTL_INT (_kern, OID_AUTO, hfs_last_io_wait_time, CTLFLAG_RW, &hfs_last_io_wait_time, 0, "number of usecs to wait after an i/o before syncing ejectable media"); + +static void +hfs_syncer(void *arg0, void *unused) +{ +#pragma unused(unused) + + struct hfsmount *hfsmp = arg0; + uint32_t secs, usecs, delay = HFS_META_DELAY; + uint64_t now; + struct timeval nowtv, last_io; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000LL) + usecs; 
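+
+	// (Descriptive note: `now' is wall-clock time in microseconds --
+	// secs * 1000000 + usecs -- so the 5000000LL comparison below is
+	// 5 seconds and the 100000LL comparisons are 100 milliseconds,
+	// matching HFS_META_DELAY, which clock_interval_to_deadline()
+	// interprets in milliseconds via HFS_MILLISEC_SCALE.)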
+ // + // If we have put off the last sync for more than + // 5 seconds, force it so that we don't let too + // much i/o queue up (since flushing the journal + // causes the i/o queue to drain) + // + if ((now - hfsmp->hfs_last_sync_time) >= 5000000LL) { + goto doit; + } + + // + // Find out when the last i/o was done to this device (read or write). + // + throttle_info_get_last_io_time(hfsmp->hfs_mp, &last_io); + microuptime(&nowtv); + timevalsub(&nowtv, &last_io); + + // + // If the last i/o was too recent, defer this sync until later. + // The limit chosen (125 milli-seconds) was picked based on + // some experiments copying data to an SD card and seems to + // prevent us from issuing too many syncs. + // + if (nowtv.tv_sec >= 0 && nowtv.tv_usec > 0 && nowtv.tv_usec < hfs_last_io_wait_time) { + delay /= 2; + goto resched; + } + + // + // If there's pending i/o, also skip the sync. + // + if (hfsmp->hfs_devvp && hfsmp->hfs_devvp->v_numoutput > 0) { + goto resched; + } + + + // + // Only flush the journal if we have not sync'ed recently + // and the last sync request time was more than 100 milli + // seconds ago and there is no one in the middle of a + // transaction right now. Else we defer the sync and + // reschedule it for later. + // + if ( ((now - hfsmp->hfs_last_sync_time) >= 100000LL) + && ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL) + && (hfsmp->hfs_active_threads == 0) + && (hfsmp->hfs_global_lock_nesting == 0)) { + + doit: + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + + clock_get_calendar_microtime(&secs, &usecs); + hfsmp->hfs_last_sync_time = ((int64_t)secs * 1000000) + usecs; + + } else if (hfsmp->hfs_active_threads == 0) { + uint64_t deadline; + + resched: + clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline); + thread_call_enter_delayed(hfsmp->hfs_syncer, deadline); + return; + } + + // + // NOTE: we decrement these *after* we're done the journal_flush() since + // it can take a significant amount of time and so we don't want more + // callbacks scheduled until we're done this one. + // + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + wakeup((caddr_t)&hfsmp->hfs_sync_incomplete); +} + +extern int IOBSDIsMediaEjectable( const char *cdev_name ); /* * Common code for mount and mountroot @@ -855,12 +948,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, u_int32_t iswritable; daddr64_t mdb_offset; int isvirtual = 0; + int isroot = 0; ronly = vfs_isrdonly(mp); dev = vnode_specrdev(devvp); cred = p ? 
vfs_context_ucred(context) : NOCRED; mntwrapper = 0; + if (args == NULL) { + /* only hfs_mountroot passes us NULL as the 'args' argument */ + isroot = 1; + } + bp = NULL; hfsmp = NULL; mdbp = NULL; @@ -1379,6 +1478,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } + /* ejectability checks will time out when the device is root_device, so skip them */ + if (isroot == 0) { + if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 && + IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) { + hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp); + if (hfsmp->hfs_syncer == NULL) { + printf("hfs: failed to allocate syncer thread callback for %s (%s)\n", + mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); + } + } + } + /* * Start looking for free space to drop below this level and generate a * warning immediately if needed: @@ -1451,6 +1562,38 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) if (hfsmp->hfs_flags & HFS_METADATA_ZONE) (void) hfs_recording_suspend(hfsmp); + /* + * Cancel any pending timers for this volume. Then wait for any timers + * which have fired, but whose callbacks have not yet completed. + */ + if (hfsmp->hfs_syncer) + { + struct timespec ts = {0, 100000000}; /* 0.1 seconds */ + + /* + * Cancel any timers that have been scheduled, but have not + * fired yet. NOTE: The kernel considers a timer complete as + * soon as it starts your callback, so the kernel does not + * keep track of the number of callbacks in progress. + */ + if (thread_call_cancel(hfsmp->hfs_syncer)) + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + thread_call_free(hfsmp->hfs_syncer); + hfsmp->hfs_syncer = NULL; + + /* + * This waits for all of the callbacks that were entered before + * we did thread_call_cancel above, but have not completed yet. + */ + while(hfsmp->hfs_sync_incomplete > 0) + { + msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts); + } + + if (hfsmp->hfs_sync_incomplete < 0) + printf("hfs_unmount: pm_sync_incomplete underflow (%d)!\n", hfsmp->hfs_sync_incomplete); + } + /* * Flush out the b-trees, volume bitmap and Volume Header */ @@ -1931,6 +2074,15 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) journal_flush(hfsmp->jnl); } + { + uint32_t secs, usecs; + uint64_t now; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000LL) + usecs; + hfsmp->hfs_last_sync_time = now; + } + lck_rw_unlock_shared(&hfsmp->hfs_insync); return (allerror); } diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index ce577ec74..d6dc1e356 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -2347,6 +2347,46 @@ hfs_virtualmetafile(struct cnode *cp) } + +// +// Fire off a timed callback to sync the disk if the +// volume is on ejectable media. +// + __private_extern__ +void +hfs_sync_ejectable(struct hfsmount *hfsmp) +{ + if (hfsmp->hfs_syncer) { + uint32_t secs, usecs; + uint64_t now; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000) + usecs; + + if (hfsmp->hfs_sync_scheduled == 0) { + uint64_t deadline; + + hfsmp->hfs_last_sync_request_time = now; + + clock_interval_to_deadline(HFS_META_DELAY, HFS_MILLISEC_SCALE, &deadline); + + /* + * Increment hfs_sync_scheduled on the assumption that we're the + * first thread to schedule the timer. If some other thread beat + * us, then we'll decrement it. 
If we *were* the first to + * schedule the timer, then we need to keep track that the + * callback is waiting to complete. + */ + OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + if (thread_call_enter_delayed(hfsmp->hfs_syncer, deadline)) + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + else + OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + } + } +} + + __private_extern__ int hfs_start_transaction(struct hfsmount *hfsmp) @@ -2374,6 +2414,7 @@ hfs_start_transaction(struct hfsmount *hfsmp) if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { lck_rw_lock_shared(&hfsmp->hfs_global_lock); + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); unlock_on_err = 1; } @@ -2399,6 +2440,7 @@ hfs_start_transaction(struct hfsmount *hfsmp) out: if (ret != 0 && unlock_on_err) { lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); } return ret; @@ -2424,7 +2466,9 @@ hfs_end_transaction(struct hfsmount *hfsmp) } if (need_unlock) { + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_sync_ejectable(hfsmp); } return ret; diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 61875f626..6d8d6ad33 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -367,6 +367,11 @@ hfs_vnop_close(ap) } hfs_unlock(cp); + + if (ap->a_fflag & FWASWRITTEN) { + hfs_sync_ejectable(hfsmp); + } + return (0); } @@ -2619,6 +2624,16 @@ hfs_vnop_rename(ap) skip_rm: /* * All done with tvp and fvp + * + * We also jump to this point if there was no destination observed during lookup and namei. + * However, because only iocounts are held at the VFS layer, there is nothing preventing a + * competing thread from racing us and creating a file or dir at the destination of this rename + * operation. If this occurs, it may cause us to get a spurious EEXIST out of the cat_rename + * call below. To preserve rename's atomicity, we need to signal VFS to re-drive the + * namei/lookup and restart the rename operation. EEXIST is an allowable errno to be bubbled + * out of the rename syscall, but not for this reason, since it is a synonym errno for ENOTEMPTY. + * To signal VFS, we return ERECYCLE (which is also used for lookup restarts). This errno + * will be swallowed and it will restart the operation. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); @@ -2626,6 +2641,9 @@ skip_rm: hfs_systemfile_unlock(hfsmp, lockflags); if (error) { + if (error == EEXIST) { + error = ERECYCLE; + } goto out; } diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index d7711096f..f7bd0e5d9 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -106,6 +106,8 @@ #include +#include /* fd); AUDIT_ARG(cmd, uap->cmd); @@ -604,7 +607,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) } context.vc_thread = current_thread(); context.vc_ucred = fp->f_cred; - if (proc_is64bit(p)) { + + is64bit = proc_is64bit(p); + if (is64bit) { argp = uap->arg; } else { @@ -1482,13 +1487,17 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) } default: - if (uap->cmd < FCNTL_FS_SPECIFIC_BASE) { - error = EINVAL; + /* + * This is an fcntl() that we d not recognize at this level; + * if this is a vnode, we send it down into the VNOP_IOCTL + * for this vnode; this can include special devices, and will + * effectively overload fcntl() to send ioctl()'s. 
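+		 *
+		 * As a hypothetical illustration (not part of this change),
+		 * userland could then drive an fs-specific command straight
+		 * through fcntl(2); HFS_GET_MOUNT_TIME is the real command
+		 * special-cased below, the surrounding code is sketch only:
+		 *
+		 *	time_t mount_time;
+		 *	if (fcntl(fd, HFS_GET_MOUNT_TIME, &mount_time) == 0)
+		 *		printf("mounted at %ld\n", (long)mount_time);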
+		 */
+		if ((uap->cmd & IOC_VOID) && (uap->cmd & IOC_INOUT)) {
+			error = EINVAL;
 			goto out;
 		}
-
-		// if it's a fs-specific fcntl() then just pass it through
-
+
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			goto out;
@@ -1497,12 +1506,103 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval)
 		proc_fdunlock(p);

 		if ( (error = vnode_getwithref(vp)) == 0 ) {
-			error = VNOP_IOCTL(vp, uap->cmd, CAST_DOWN(caddr_t, argp), 0, &context);
+#define STK_PARAMS 128
+			char stkbuf[STK_PARAMS];
+			unsigned int size;
+			caddr_t data, memp;
+			int fix_cmd = uap->cmd;
+
+			/*
+			 * For this to work properly, we have to copy in the
+			 * ioctl() cmd argument if there is one; we must also
+			 * check that a command parameter, if present, does
+			 * not exceed the maximum command length dictated by
+			 * the number of bits we have available in the command
+			 * to represent a structure length.  Finally, we have
+			 * to copy the results back out, if it is that type of
+			 * ioctl().
+			 */
+			size = IOCPARM_LEN(uap->cmd);
+			if (size > IOCPARM_MAX) {
+				(void)vnode_put(vp);
+				error = EINVAL;
+				break;
+			}
+
+			/*
+			 * Fix up the command we should have received via
+			 * fcntl with one with a valid size and copy out
+			 * argument.
+			 */
+			if (fix_cmd == HFS_GET_MOUNT_TIME ||
+			    fix_cmd == HFS_GET_LAST_MTIME) {
+				if (is64bit)
+					size = sizeof(user_time_t);
+				else
+					size = sizeof(time_t);
+				fix_cmd |= IOC_OUT;
+			}
+
+			memp = NULL;
+			if (size > sizeof (stkbuf)) {
+				if ((memp = (caddr_t)kalloc(size)) == 0) {
+					(void)vnode_put(vp);
+					error = ENOMEM;
+					goto outdrop;
+				}
+				data = memp;
+			} else {
+				data = &stkbuf[0];
+			}
+
+			if (fix_cmd & IOC_IN) {
+				if (size) {
+					/* structure */
+					error = copyin(argp, data, size);
+					if (error) {
+						(void)vnode_put(vp);
+						if (memp)
+							kfree(memp, size);
+						goto outdrop;
+					}
+				} else {
+					/* int */
+					if (is64bit) {
+						*(user_addr_t *)data = argp;
+					} else {
+						*(uint32_t *)data = (uint32_t)argp;
+					}
+				}
+			} else if ((fix_cmd & IOC_OUT) && size) {
+				/*
+				 * Zero the buffer so the user always
+				 * gets back something deterministic.
+				 */
+				bzero(data, size);
+			} else if (fix_cmd & IOC_VOID) {
+				if (is64bit) {
+					*(user_addr_t *)data = argp;
+				} else {
+					*(uint32_t *)data = (uint32_t)argp;
+				}
+			}
+
+			/*
+			 * We pass the unmodified uap->cmd
+			 * to the underlying VNOP so that we don't confuse it;
+			 * but we are going to handle its copyout() when it
+			 * gets back.
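+			 * (For reference: IOCPARM_LEN() reads the parameter
+			 * size that ioctl(2) encodes in the high bits of the
+			 * command word -- ((cmd >> 16) & IOCPARM_MASK) --
+			 * which is the `size' bounded against IOCPARM_MAX
+			 * above.)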
+ */ + error = VNOP_IOCTL(vp, uap->cmd, CAST_DOWN(caddr_t, data), 0, &context); (void)vnode_put(vp); + + /* Copy any output data to user */ + if (error == 0 && (fix_cmd & IOC_OUT) && size) + error = copyout(data, argp, size); + if (memp) + kfree(memp, size); } break; - } outdrop: @@ -3871,9 +3971,12 @@ closef_locked(struct fileproc *fp, struct fileglob *fg, proc_t p) fg->fg_lflags |= FG_TERM; lck_mtx_unlock(&fg->fg_lock); - proc_fdunlock(p); + if (p) + proc_fdunlock(p); error = closef_finish(fp, fg, p, &context); - proc_fdlock(p); + + if (p) + proc_fdlock(p); return(error); } diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 9b568d24c..4c1b6c6eb 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -132,7 +132,7 @@ static int unp_connect(struct socket *, struct sockaddr *, proc_t); static void unp_disconnect(struct unpcb *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); -static void unp_gc(void); +__private_extern__ void unp_gc(void); static void unp_scan(struct mbuf *, void (*)(struct fileglob *)); static void unp_mark(struct fileglob *); static void unp_discard(struct fileglob *); @@ -749,7 +749,11 @@ unp_detach(struct unpcb *unp) * gets them (resulting in a "panic: closef: count < 0"). */ sorflush(unp->unp_socket); + + /* Per domain mutex deadlock avoidance */ + socket_unlock(unp->unp_socket, 0); unp_gc(); + socket_lock(unp->unp_socket, 0); } if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); @@ -1362,11 +1366,15 @@ unp_internalize(struct mbuf *control, proc_t p) } static int unp_defer, unp_gcing, unp_gcwait; +static thread_t unp_gcthread = NULL; /* always called under uipc_lock */ void unp_gc_wait(void) { + if (unp_gcthread == current_thread()) + return; + while (unp_gcing != 0) { unp_gcwait = 1; msleep(&unp_gcing, uipc_lock, 0 , "unp_gc_wait", NULL); @@ -1374,12 +1382,13 @@ unp_gc_wait(void) } -static void +__private_extern__ void unp_gc(void) { struct fileglob *fg, *nextfg; struct socket *so; - struct fileglob **extra_ref, **fpp; + static struct fileglob **extra_ref; + struct fileglob **fpp; int nunref, i; int need_gcwakeup = 0; @@ -1390,6 +1399,7 @@ unp_gc(void) } unp_gcing = 1; unp_defer = 0; + unp_gcthread = current_thread(); lck_mtx_unlock(uipc_lock); /* * before going through all this, set all FDs to @@ -1484,9 +1494,13 @@ unp_gc(void) * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. + * + * In case a file is passed onto itself we need to + * release the file lock. 
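+	 * (Releasing fg_lock first matters because unp_scan() invokes
+	 * unp_mark() on every fileglob it finds in the receive buffer;
+	 * a socket passed onto itself would otherwise bring unp_mark()
+	 * back to the very fg_lock already held here.)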
*/ - unp_scan(so->so_rcv.sb_mb, unp_mark); lck_mtx_unlock(&fg->fg_lock); + + unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); /* @@ -1564,20 +1578,13 @@ unp_gc(void) tfg = *fpp; if (tfg->fg_type == DTYPE_SOCKET && tfg->fg_data != NULL) { - int locked = 0; - so = (struct socket *)(tfg->fg_data); - /* XXXX */ - /* Assume local sockets use a global lock */ - if (so->so_proto->pr_domain->dom_family != PF_LOCAL) { - socket_lock(so, 0); - locked = 1; - } + socket_lock(so, 0); + sorflush(so); - if (locked) - socket_unlock(so, 0); + socket_unlock(so, 0); } } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) @@ -1585,6 +1592,7 @@ unp_gc(void) lck_mtx_lock(uipc_lock); unp_gcing = 0; + unp_gcthread = NULL; if (unp_gcwait != 0) { unp_gcwait = 0; diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index aad1b0250..6c26b1799 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -600,6 +600,7 @@ void IOSleep(int); struct _throttle_io_info_t { struct timeval last_normal_IO_timestamp; + struct timeval last_IO_timestamp; SInt32 numthreads_throttling; }; @@ -614,6 +615,32 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +void +throttle_info_get_last_io_time(mount_t mp, struct timeval *tv) +{ + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; + + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + *tv = _throttle_io_info[devbsdunit].last_IO_timestamp; + } else { + memset(tv, 0, sizeof(*tv)); + } +} + +void +update_last_io_time(mount_t mp) +{ + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; + + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } +} + int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit) { struct timeval elapsed; @@ -784,6 +811,18 @@ spec_strategy(struct vnop_strategy_args *ap) } } } + + if ((bflags & B_READ) == 0) { + size_t devbsdunit; + + if (buf_vnode(bp)->v_mount != NULL) + devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } + (*bdevsw[major(bdev)].d_strategy)(bp); return (0); diff --git a/bsd/netat/atp_read.c b/bsd/netat/atp_read.c index a3853eae9..940a58ae3 100644 --- a/bsd/netat/atp_read.c +++ b/bsd/netat/atp_read.c @@ -58,6 +58,8 @@ #include #include +__private_extern__ int atp_resp_seqno2big = 0; + static void atp_trans_complete(struct atp_trans *); void atp_x_done_locked(void *); void atp_treq_event(void *); @@ -139,8 +141,8 @@ gbuf_t *m; case ATP_CMD_TRESP: { register struct atp_trans *trp; - register int seqno; - register at_ddp_t *ddp; + register unsigned int seqno; + register at_ddp_t *ddp; /* * we just got a response, find the trans record @@ -155,10 +157,20 @@ gbuf_t *m; * If we can't find one then ignore the message */ seqno = athp->bitmap; + if (seqno > 7) { + atp_resp_seqno2big++; + ddp = AT_DDP_HDR(m); + dPrintf(D_M_ATP_LOW, (D_L_INPUT|D_L_ERROR), + ("atp_rput: dropping TRESP seqno too big, tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", + UAS_VALUE_NTOH(athp->tid), + ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); + gbuf_freem(m); + return; + } if (trp == NULL) { ddp = AT_DDP_HDR(m); dPrintf(D_M_ATP_LOW, 
(D_L_INPUT|D_L_ERROR), - ("atp_rput: dropping TRESP, no trp,tid=%d,loc=%d,rem=%d.%d,seqno=%d\n", + ("atp_rput: dropping TRESP, no trp,tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", UAS_VALUE_NTOH(athp->tid), ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); gbuf_freem(m); @@ -184,7 +196,7 @@ gbuf_t *m; if (!(trp->tr_bitmap&atp_mask[seqno]) || trp->tr_rcv[seqno]) { ddp = AT_DDP_HDR(m); dPrintf(D_M_ATP_LOW, (D_L_INPUT|D_L_ERROR), - ("atp_rput: dropping TRESP, duplicate,tid=%d,loc=%d,rem=%d.%d,seqno=%d\n", + ("atp_rput: dropping TRESP, duplicate,tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", UAS_VALUE_NTOH(athp->tid), ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); gbuf_freem(m); diff --git a/bsd/netat/sys_glue.c b/bsd/netat/sys_glue.c index a1d2a402c..dd22563be 100644 --- a/bsd/netat/sys_glue.c +++ b/bsd/netat/sys_glue.c @@ -99,6 +99,9 @@ SYSCTL_INT(_net_appletalk, OID_AUTO, routermix, CTLFLAG_WR, at_ddp_stats_t at_ddp_stats; /* DDP statistics */ SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); +extern int atp_resp_seqno2big; +SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD, + &atp_resp_seqno2big, 0, "Appletalk ATP seqno too big count"); static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ); static void ioccmd_t_64_to_32( user_ioccmd_t *from_p, ioccmd_t *to_p ); diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index e847a3319..db0662895 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -121,7 +121,8 @@ static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; extern struct inpcbinfo ripcbinfo; extern lck_mtx_t *ip6_mutex; -extern lck_mtx_t *nd6_mutex; +extern lck_mtx_t *nd6_mutex; +extern lck_mtx_t *inet6_domain_mutex; static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); @@ -515,8 +516,15 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ + goto rate_limit_checked; break; } if ((n->m_flags & M_EXT) != 0 @@ -531,6 +539,7 @@ icmp6_input(mp, offp) if (maxlen >= MCLBYTES) { /* Give up remote */ m_freem(n0); + goto rate_limit_checked; break; } MGETHDR(n, M_DONTWAIT, n0->m_type); /* MAC-OK */ @@ -544,6 +553,7 @@ icmp6_input(mp, offp) if (n == NULL) { /* Give up remote */ m_freem(n0); + goto rate_limit_checked; break; } M_COPY_PKTHDR(n, n0); @@ -578,6 +588,7 @@ icmp6_input(mp, offp) icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } + goto rate_limit_checked; break; case ICMP6_ECHO_REPLY: @@ -594,6 +605,12 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); @@ -602,6 +619,7 @@ icmp6_input(mp, offp) } mld6_input(n, off); /* m stays. 
*/ + goto rate_limit_checked; break; case MLD6_LISTENER_DONE: @@ -631,6 +649,11 @@ icmp6_input(mp, offp) IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), return IPPROTO_DONE); #endif + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); @@ -640,6 +663,7 @@ icmp6_input(mp, offp) icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } + goto rate_limit_checked; break; case ICMP6_WRUREPLY: @@ -653,6 +677,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); @@ -661,6 +691,7 @@ icmp6_input(mp, offp) } nd6_rs_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_ROUTER_ADVERT: @@ -669,6 +700,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); @@ -677,6 +714,7 @@ icmp6_input(mp, offp) } nd6_ra_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_NEIGHBOR_SOLICIT: @@ -685,6 +723,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); @@ -693,6 +737,7 @@ icmp6_input(mp, offp) } nd6_ns_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_NEIGHBOR_ADVERT: @@ -701,6 +746,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); @@ -709,6 +760,7 @@ icmp6_input(mp, offp) } nd6_na_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_REDIRECT: @@ -717,6 +769,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); @@ -725,6 +783,7 @@ icmp6_input(mp, offp) } icmp6_redirect_input(n, off); /* m stays. 
*/ + goto rate_limit_checked; break; case ICMP6_ROUTER_RENUMBERING: @@ -736,6 +795,11 @@ icmp6_input(mp, offp) break; default: + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), @@ -747,9 +811,15 @@ icmp6_input(mp, offp) /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ + goto rate_limit_checked; break; } deliver: + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if (icmp6_notify_error(m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return(IPPROTO_DONE); @@ -765,6 +835,11 @@ icmp6_input(mp, offp) break; } + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } +rate_limit_checked: /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); @@ -2331,7 +2406,16 @@ icmp6_redirect_input(m, off) sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); + + /* + * Radar 6843900 + * Release the IPv6 domain lock because we are going to take domain_proto_mtx + * and could otherwise cause a deadlock with other threads taking these locks + * in the reverse order -- e.g. frag6_slowtimo() from pfslowtimo() + */ + lck_mtx_unlock(inet6_domain_mutex); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); + lck_mtx_lock(inet6_domain_mutex); #if IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index 29188674f..a84715b19 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -545,9 +545,14 @@ struct in6_pktinfo { #define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_NEIGHBORGCTHRESH 46 +#define IPV6CTL_MAXIFPREFIXES 47 +#define IPV6CTL_MAXIFDEFROUTERS 48 +#define IPV6CTL_MAXDYNROUTES 49 + /* New entries should be added here from current IPV6CTL_MAXID value. */ /* to define items, should talk with KAME guys first, for *BSD compatibility */ -#define IPV6CTL_MAXID 42 +#define IPV6CTL_MAXID 50 #ifdef KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index c27a77892..a937bbf35 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -378,6 +378,11 @@ int ip6_rr_prune = 5; /* router renumbering prefix * walk list every 5 sec. 
*/ int ip6_v6only = 0; /* Mapped addresses on by default - Radar 3347718 */ +int ip6_neighborgcthresh = 2048; /* Threshold # of NDP entries for GC */ +int ip6_maxifprefixes = 16; /* Max acceptable prefixes via RA per IF */ +int ip6_maxifdefrouters = 16; /* Max acceptable def routers via RA */ +int ip6_maxdynroutes = 4096; /* Max # of routes created via redirect */ + u_int32_t ip6_id = 0UL; int ip6_keepfaith = 0; time_t ip6_log_time = (time_t)0L; @@ -515,6 +520,14 @@ SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, &rip6stat, rip6stat, ""); SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, &mrt6stat, mrt6stat, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH, + neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES, + maxifprefixes, CTLFLAG_RW, &ip6_maxifprefixes, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFDEFROUTERS, + maxifdefrouters, CTLFLAG_RW, &ip6_maxifdefrouters, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES, + maxdynroutes, CTLFLAG_RW, &ip6_maxdynroutes, 0, ""); /* net.inet6.icmp6 */ diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index c4a2fb286..f42089272 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -266,6 +266,11 @@ extern int ip6_rr_prune; /* router renumbering prefix #define ip6_mapped_addr_on (!ip6_v6only) extern int ip6_v6only; +extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ +extern int ip6_maxifprefixes; /* Max acceptable prefixes via RA per IF */ +extern int ip6_maxifdefrouters; /* Max acceptable def routers via RA */ +extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ + extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index f0c838d58..66f4cd2b9 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -119,6 +119,7 @@ extern lck_mtx_t *nd6_mutex; static void nd6_slowtimo(void *ignored_arg); + void nd6_init() { @@ -415,11 +416,11 @@ nd6_timer( struct in6_ifaddr *ia6, *nia6; struct in6_addrlifetime *lt6; struct timeval timenow; + int count = 0; getmicrotime(&timenow); - ln = llinfo_nd6.ln_next; while (ln && ln != &llinfo_nd6) { struct rtentry *rt; @@ -439,9 +440,34 @@ nd6_timer( ndi = &nd_ifinfo[ifp->if_index]; dst = (struct sockaddr_in6 *)rt_key(rt); + count++; + if (ln->ln_expire > timenow.tv_sec) { - ln = next; - continue; + + /* Radar 6871508 Check if we have too many cache entries. + * In that case purge 20% of the table to make space + * for the new entries. + * This is a bit crude but keeps the deletion in timer + * thread only. 
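+	 * (The "20%" figure comes from the (count % 5) == 0 sampling
+	 * below: roughly every fifth entry walked is forced to STALE or
+	 * PURGE with an already-expired ln_expire, so it falls through
+	 * to nd6_free() and is reclaimed.)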
+ */ + + if ((ip6_neighborgcthresh >= 0 && + nd6_inuse >= ip6_neighborgcthresh) && + ((count % 5) == 0)) { + + if (ln->ln_state > ND6_LLINFO_INCOMPLETE) + ln->ln_state = ND6_LLINFO_STALE; + else + ln->ln_state = ND6_LLINFO_PURGE; + ln->ln_expire = timenow.tv_sec; + + /* fallthrough and call nd6_free() */ + } + + else { + ln = next; + continue; + } } /* sanity check */ @@ -499,6 +525,7 @@ nd6_timer( break; case ND6_LLINFO_STALE: + case ND6_LLINFO_PURGE: /* Garbage Collection(RFC 2461 5.3) */ if (ln->ln_expire) next = nd6_free(rt); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index b85be0157..4b66e6be6 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -55,6 +55,7 @@ struct llinfo_nd6 { }; #endif /* KERNEL_PRIVATE */ +#define ND6_LLINFO_PURGE -3 #define ND6_LLINFO_NOSTATE -2 /* * We don't need the WAITDELETE state any more, but we keep the definition @@ -86,6 +87,9 @@ struct nd_ifinfo { u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ + /* keep track of routers and prefixes on this link */ + int32_t nprefixes; + int32_t ndefrouters; }; #define ND6_IFF_PERFORMNUD 0x1 diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 8ca259b74..5af29d31e 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -631,6 +631,7 @@ defrtrlist_del( struct nd_defrouter *dr, int nd6locked) { struct nd_defrouter *deldr = NULL; + struct nd_ifinfo *ndi = &nd_ifinfo[dr->ifp->if_index]; struct nd_prefix *pr; /* @@ -667,6 +668,12 @@ defrtrlist_del( if (deldr) defrouter_select(); + ndi->ndefrouters--; + if (ndi->ndefrouters < 0) { + log(LOG_WARNING, "defrtrlist_del: negative count on %s\n", + if_name(dr->ifp)); + } + if (nd6locked == 0) lck_mtx_unlock(nd6_mutex); @@ -760,6 +767,7 @@ defrtrlist_update( struct nd_defrouter *new) { struct nd_defrouter *dr, *n; + struct nd_ifinfo *ndi = &nd_ifinfo[new->ifp->if_index]; lck_mtx_lock(nd6_mutex); if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { @@ -783,6 +791,12 @@ defrtrlist_update( return(NULL); } + if (ip6_maxifdefrouters >= 0 && + ndi->ndefrouters >= ip6_maxifdefrouters) { + lck_mtx_unlock(nd6_mutex); + return (NULL); + } + n = (struct nd_defrouter *)_MALLOC(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) { lck_mtx_unlock(nd6_mutex); @@ -799,6 +813,8 @@ defrtrlist_update( TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); if (TAILQ_FIRST(&nd_defrouter) == n) defrouter_select(); + + ndi->ndefrouters++; lck_mtx_unlock(nd6_mutex); return(n); @@ -905,6 +921,40 @@ ndpr_rele(struct nd_prefix *pr, boolean_t locked) lck_mtx_unlock(nd6_mutex); } +static void +purge_detached(struct ifnet *ifp) +{ + struct nd_prefix *pr, *pr_next; + struct in6_ifaddr *ia; + struct ifaddr *ifa, *ifa_next; + + lck_mtx_lock(nd6_mutex); + + for (pr = nd_prefix.lh_first; pr; pr = pr_next) { + pr_next = pr->ndpr_next; + if (pr->ndpr_ifp != ifp || + IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && + !LIST_EMPTY(&pr->ndpr_advrtrs))) + continue; + + for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa_next) { + ifa_next = ifa->ifa_list.tqe_next; + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + ia = (struct in6_ifaddr *)ifa; + if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == + IN6_IFF_AUTOCONF && ia->ia6_ndpr == pr) { + in6_purgeaddr(ifa, 1); + } + } + if (pr->ndpr_refcnt == 0) + prelist_remove(pr, 1); + } + + lck_mtx_unlock(nd6_mutex); +} + int nd6_prelist_add( struct nd_prefix 
*pr, @@ -913,6 +963,14 @@ nd6_prelist_add( { struct nd_prefix *new = NULL; int i; + struct nd_ifinfo *ndi = &nd_ifinfo[pr->ndpr_ifp->if_index]; + + if (ip6_maxifprefixes >= 0) { + if (ndi->nprefixes >= ip6_maxifprefixes / 2) + purge_detached(pr->ndpr_ifp); + if (ndi->nprefixes >= ip6_maxifprefixes) + return(ENOMEM); + } new = (struct nd_prefix *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) @@ -953,6 +1011,9 @@ nd6_prelist_add( if (dr) { pfxrtr_add(new, dr); } + + ndi->nprefixes++; + lck_mtx_unlock(nd6_mutex); return 0; @@ -964,6 +1025,7 @@ prelist_remove( { struct nd_pfxrouter *pfr, *next; int e; + struct nd_ifinfo *ndi = &nd_ifinfo[pr->ndpr_ifp->if_index]; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; @@ -1001,6 +1063,12 @@ prelist_remove( FREE(pfr, M_IP6NDP); } + ndi->nprefixes--; + if (ndi->nprefixes < 0) { + log(LOG_WARNING, "prelist_remove: negative count on %s\n", + if_name(pr->ndpr_ifp)); + } + FREE(pr, M_IP6NDP); pfxlist_onlink_check(1); diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 0e1d6dc69..75e405b3c 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -781,6 +781,8 @@ key_checkrequest(isr, saidx, sav) * OUT: NULL: not found. * others: found and return the pointer. */ +u_int32_t sah_search_calls = 0; +u_int32_t sah_search_count = 0; struct secasvar * key_allocsa_policy(saidx) struct secasindex *saidx; @@ -794,7 +796,9 @@ key_allocsa_policy(saidx) u_int16_t dstport; lck_mtx_lock(sadb_mutex); + sah_search_calls++; LIST_FOREACH(sah, &sahtree, chain) { + sah_search_count++; if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE | CMP_REQID)) @@ -4630,7 +4634,9 @@ key_bbcmp(p1, p2, bits) * and do to remove or to expire. * XXX: year 2038 problem may remain. */ - +int key_timehandler_debug = 0; +u_int32_t spd_count = 0, sah_count = 0, dead_sah_count = 0, empty_sah_count = 0, larval_sav_count = 0, mature_sav_count = 0, dying_sav_count = 0, dead_sav_count = 0; +u_int64_t total_sav_count = 0; void key_timehandler(void) { @@ -4671,6 +4677,7 @@ key_timehandler(void) sp != NULL; sp = nextsp) { + spd_count++; nextsp = LIST_NEXT(sp, chain); if (sp->state == IPSEC_SPSTATE_DEAD) { @@ -4706,11 +4713,22 @@ key_timehandler(void) sah != NULL; sah = nextsah) { + sah_count++; nextsah = LIST_NEXT(sah, chain); /* if sah has been dead, then delete it and process next sah. */ if (sah->state == SADB_SASTATE_DEAD) { key_delsah(sah); + dead_sah_count++; + continue; + } + + if (LIST_FIRST(&sah->savtree[SADB_SASTATE_LARVAL]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_MATURE]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DYING]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DEAD]) == NULL) { + key_delsah(sah); + empty_sah_count++; continue; } @@ -4719,6 +4737,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + larval_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); if (tv.tv_sec - sav->created > key_larval_lifetime) { @@ -4755,6 +4775,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + mature_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* we don't need to check. */ @@ -4816,6 +4838,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + dying_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* we don't need to check. 
*/ @@ -4869,6 +4893,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + dead_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* sanity check */ @@ -4890,6 +4916,32 @@ key_timehandler(void) } } + if (++key_timehandler_debug >= 300) { + if (key_debug_level) { + printf("%s: total stats for %u calls\n", __FUNCTION__, key_timehandler_debug); + printf("%s: walked %u SPDs\n", __FUNCTION__, spd_count); + printf("%s: walked %llu SAs: LARVAL SAs %u, MATURE SAs %u, DYING SAs %u, DEAD SAs %u\n", __FUNCTION__, + total_sav_count, larval_sav_count, mature_sav_count, dying_sav_count, dead_sav_count); + printf("%s: walked %u SAHs: DEAD SAHs %u, EMPTY SAHs %u\n", __FUNCTION__, + sah_count, dead_sah_count, empty_sah_count); + if (sah_search_calls) { + printf("%s: SAH search cost %d iters per call\n", __FUNCTION__, + (sah_search_count/sah_search_calls)); + } + } + spd_count = 0; + sah_count = 0; + dead_sah_count = 0; + empty_sah_count = 0; + larval_sav_count = 0; + mature_sav_count = 0; + dying_sav_count = 0; + dead_sav_count = 0; + total_sav_count = 0; + sah_search_count = 0; + sah_search_calls = 0; + key_timehandler_debug = 0; + } #ifndef IPSEC_NONBLOCK_ACQUIRE /* ACQ tree */ { diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 368f4f3e6..d482714ef 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -230,6 +230,8 @@ void get_procrustime(struct time_value *tv); void load_init_program(struct proc *p); void __pthread_testcancel(int presyscall); void syscall_exit_funnelcheck(void); +void throttle_info_get_last_io_time(mount_t mp, struct timeval *tv); +void update_last_io_time(mount_t mp); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 3871c2a3b..026109d21 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -3154,6 +3154,10 @@ buf_biodone(buf_t bp) fslog_io_error(bp); } + if (bp->b_vp && bp->b_vp->v_mount && (bp->b_flags & B_READ) == 0) { + update_last_io_time(bp->b_vp->v_mount); + } + if (kdebug_enable) { int code = DKIO_DONE; diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index bfee0d8b4..643d79c07 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -4986,15 +4986,21 @@ out: * - Neither the node nor the directory are immutable. * - The user is not the superuser. * - * Deletion is not permitted if the directory is sticky and the caller is not owner of the - * node or directory. + * Deletion is not permitted if the directory is sticky and the caller is + * not owner of the node or directory. * - * If either the node grants DELETE, or the directory grants DELETE_CHILD, the node may be - * deleted. If neither denies the permission, and the caller has Posix write access to the - * directory, then the node may be deleted. + * If either the node grants DELETE, or the directory grants DELETE_CHILD, + * the node may be deleted. If neither denies the permission, and the + * caller has Posix write access to the directory, then the node may be + * deleted. + * + * As an optimization, we cache whether or not delete child is permitted + * on directories without the sticky bit set. 
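+ * (Sticky-bit directories are the exception because there the verdict
+ * depends on which child is being deleted -- only the file's owner or
+ * the directory's owner may remove it -- so a single cached
+ * per-directory answer would be unsound.)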
*/ -static int -vnode_authorize_delete(vauth_ctx vcp) +int +vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); +/*static*/ int +vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) { struct vnode_attr *vap = vcp->vap; struct vnode_attr *dvap = vcp->dvap; @@ -5004,7 +5010,7 @@ vnode_authorize_delete(vauth_ctx vcp) /* check the ACL on the directory */ delete_child_denied = 0; - if (VATTR_IS_NOT(dvap, va_acl, NULL)) { + if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; eval.ae_acl = &dvap->va_acl->acl_ace[0]; eval.ae_count = dvap->va_acl->acl_entrycount; @@ -5070,15 +5076,20 @@ vnode_authorize_delete(vauth_ctx vcp) return(EACCES); } - /* enforce sticky bit behaviour */ - if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { + /* + * enforce sticky bit behaviour; the cached_delete_child property will + * be false and the dvap contents valis for sticky bit directories; + * this makes us check the directory each time, but it's unavoidable, + * as sticky bit is an exception to caching. + */ + if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid); return(EACCES); } /* check the directory */ - if ((error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { + if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp); return(error); } @@ -5476,7 +5487,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i int result; int *errorp; int noimmutable; - boolean_t parent_authorized_for_delete = FALSE; + boolean_t parent_authorized_for_delete_child = FALSE; boolean_t found_deny = FALSE; boolean_t parent_ref= FALSE; @@ -5541,8 +5552,8 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i * can skip a whole bunch of work... we will still have to * authorize that this specific child can be removed */ - if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE) == TRUE) - parent_authorized_for_delete = TRUE; + if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) + parent_authorized_for_delete_child = TRUE; } else { dvp = NULL; } @@ -5589,7 +5600,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); goto out; } - if (dvp && parent_authorized_for_delete == FALSE) { + if (dvp && parent_authorized_for_delete_child == FALSE) { VATTR_WANTED(&dva, va_mode); VATTR_WANTED(&dva, va_uid); VATTR_WANTED(&dva, va_gid); @@ -5645,7 +5656,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0) goto out; if ((rights & KAUTH_VNODE_DELETE) && - parent_authorized_for_delete == FALSE && + parent_authorized_for_delete_child == FALSE && ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0)) goto out; @@ -5658,13 +5669,14 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i goto out; /* - * If we're not the superuser, authorize based on file properties. 
+	 * If we're not the superuser, authorize based on file properties;
+	 * note that even if parent_authorized_for_delete_child is TRUE, we
+	 * need to check on the node itself.
 	 */
 	if (!vfs_context_issuser(ctx)) {
 		/* process delete rights */
 		if ((rights & KAUTH_VNODE_DELETE) &&
-		    parent_authorized_for_delete == FALSE &&
-		    ((result = vnode_authorize_delete(vcp)) != 0))
+		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
 			goto out;

 		/* process remaining rights */
@@ -5715,12 +5727,20 @@ out:
 			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
 		}
 	}
-	if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete == FALSE) {
+	if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
 		/*
-		 * parent was successfully and newly authorized for deletions
-		 * add it to the cache
+		 * parent was successfully and newly authorized for content deletions;
+		 * add it to the cache, but only if it doesn't have the sticky
+		 * bit set on it.  This same check is done earlier guarding
+		 * fetching of dva, and if we jumped to out without having done
+		 * this, we will have returned already because of a non-zero
+		 * 'result' value.
 		 */
-		vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE);
+		if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
+		    !(dva.va_mode & (S_ISVTX))) {
+			/* OK to cache delete rights */
+			vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
+		}
 	}
 	if (parent_ref)
 		vnode_put(vp);
diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c
index 869f3f5b3..ee31d2c82 100644
--- a/bsd/vfs/vfs_syscalls.c
+++ b/bsd/vfs/vfs_syscalls.c
@@ -5120,10 +5120,11 @@ auth_exit:
 	/*
 	 * We may encounter a race in the VNOP where the destination didn't
 	 * exist when we did the namei, but it does by the time we go and
-	 * try to create the entry.  In this case, we should re-drive this rename
-	 * call from the top again.
-	 */
-	if (error == EEXIST) {
+	 * try to create the entry.  In this case, we should re-drive this rename
+	 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
+	 * but other filesystems susceptible to this race could return it, too.
+	 */
+	if (error == ERECYCLE) {
 		do_retry = 1;
 	}
diff --git a/config/MasterVersion b/config/MasterVersion
index 58454343e..6ce3d2f9d 100644
--- a/config/MasterVersion
+++ b/config/MasterVersion
@@ -1,4 +1,4 @@
-9.7.0
+9.8.0

 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
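
A minimal sketch of the retry contract that the ERECYCLE convention in the vfs_syscalls.c hunk above implies (illustrative only: `rename_one_pass' is a hypothetical stand-in for the namei + VNOP_RENAME sequence, not a real xnu function; do_retry and error mirror the variables the syscall uses):

	int do_retry;

	do {
		do_retry = 0;
		error = rename_one_pass(fromname, toname, ctx);
		if (error == ERECYCLE) {
			/*
			 * A competing create won the race window; re-drive
			 * the lookup and the VNOP from the top.  ERECYCLE
			 * is swallowed here and never reaches userland.
			 */
			do_retry = 1;
			error = 0;
		}
	} while (do_retry);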
diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp
index 6346d0c2d..3ec11b1f1 100644
--- a/iokit/bsddev/IOKitBSDInit.cpp
+++ b/iokit/bsddev/IOKitBSDInit.cpp
@@ -857,4 +857,41 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout )
     return KERN_SUCCESS;
 }

+
+int IOBSDIsMediaEjectable( const char *cdev_name )
+{
+    int ret = 0;
+    OSDictionary *dictionary;
+    OSString *dev_name;
+
+    if (strncmp(cdev_name, "/dev/", 5) == 0) {
+        cdev_name += 5;
+    }
+
+    dictionary = IOService::serviceMatching( "IOMedia" );
+    if( dictionary ) {
+        dev_name = OSString::withCString( cdev_name );
+        if( dev_name ) {
+            IOService *service;
+            mach_timespec_t tv = { 5, 0 };    // wait up to "timeout" seconds for the device
+
+            dictionary->setObject( kIOBSDNameKey, dev_name );
+            dictionary->retain();
+            service = IOService::waitForService( dictionary, &tv );
+            if( service ) {
+                OSBoolean *ejectable = (OSBoolean *) service->getProperty( "Ejectable" );
+
+                if( ejectable ) {
+                    ret = (int)ejectable->getValue();
+                }
+
+            }
+            dev_name->release();
+        }
+        dictionary->release();
+    }
+
+    return ret;
+}
+
 } /* extern "C" */
diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c
index cd5bdbb71..17143604c 100644
--- a/osfmk/i386/acpi.c
+++ b/osfmk/i386/acpi.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -136,10 +136,24 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
 	acpi_hibernate_callback_data_t data;
 	boolean_t did_hibernate;
 #endif
+	unsigned int	cpu;
+	kern_return_t	rc;
+	unsigned int	my_cpu;

 	kprintf("acpi_sleep_kernel hib=%d\n",
 			current_cpu_datap()->cpu_hibernate);

+	/* Get all CPUs to be in the "off" state */
+	my_cpu = cpu_number();
+	for (cpu = 0; cpu < real_ncpus; cpu += 1) {
+		if (cpu == my_cpu)
+			continue;
+		rc = pmCPUExitHaltToOff(cpu);
+		if (rc != KERN_SUCCESS)
+			panic("Error %d trying to transition CPU %d to OFF",
+			      rc, cpu);
+	}
+
 	/* shutdown local APIC before passing control to BIOS */
 	lapic_shutdown();
diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c
index 1760eabf5..194a6576b 100644
--- a/osfmk/i386/cpu.c
+++ b/osfmk/i386/cpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -141,9 +141,13 @@ cpu_exit_wait(
 {
 	cpu_data_t	*cdp = cpu_datap(cpu);

+	/*
+	 * Wait until the CPU indicates that it has stopped.
+	 */
 	simple_lock(&x86_topo_lock);
 	while ((cdp->lcpu.state != LCPU_HALT)
-	       && (cdp->lcpu.state != LCPU_OFF)) {
+	       && (cdp->lcpu.state != LCPU_OFF)
+	       && !cdp->lcpu.stopped) {
 	    simple_unlock(&x86_topo_lock);
 	    cpu_pause();
 	    simple_lock(&x86_topo_lock);
diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c
index 88aa0f87b..8decbb943 100644
--- a/osfmk/i386/pmCPU.c
+++ b/osfmk/i386/pmCPU.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -354,6 +354,19 @@ pmCPUExitHalt(int cpu)
     return(rc);
 }

+kern_return_t
+pmCPUExitHaltToOff(int cpu)
+{
+    kern_return_t	rc = KERN_INVALID_ARGUMENT;
+
+    if (pmInitDone
+	&& pmDispatch != NULL
+	&& pmDispatch->exitHaltToOff != NULL)
+	rc = pmDispatch->exitHaltToOff(cpu_to_lcpu(cpu));
+
+    return(rc);
+}
+
 /*
  * Called to initialize the power management structures for the CPUs.
 */
diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h
index 6026f5ed6..cbfaebe65 100644
--- a/osfmk/i386/pmCPU.h
+++ b/osfmk/i386/pmCPU.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -37,7 +37,7 @@
 * This value should be changed each time that pmDispatch_t or pmCallBacks_t
 * changes.
 */
-#define PM_DISPATCH_VERSION	15
+#define PM_DISPATCH_VERSION	16

 /*
  * Dispatch table for functions that get installed when the power
@@ -68,6 +68,7 @@ typedef struct
     void		(*pmTimerStateSave)(void);
     void		(*pmTimerStateRestore)(void);
     kern_return_t	(*exitHalt)(x86_lcpu_t *lcpu);
+    kern_return_t	(*exitHaltToOff)(x86_lcpu_t *lcpu);
     void		(*markAllCPUsOff)(void);
     void		(*pmSetRunCount)(uint32_t count);
     boolean_t		(*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu);
@@ -112,6 +113,7 @@ void pmCPUHalt(uint32_t reason);
 void pmTimerSave(void);
 void pmTimerRestore(void);
 kern_return_t pmCPUExitHalt(int cpu);
+kern_return_t pmCPUExitHaltToOff(int cpu);

 #define PM_HALT_NORMAL	0	/* normal halt path */
 #define PM_HALT_DEBUG	1	/* debug code wants to halt */
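
A minimal sketch of how a power-management driver might supply the new hook, given the PM_DISPATCH_VERSION bump above (the pmDispatch_t slot name and x86_lcpu_t come from this header; the callback body and the partial initializer are illustrative assumptions, not part of this change):

	#include <i386/pmCPU.h>	/* pmDispatch_t, x86_lcpu_t (kernel-only header) */

	static kern_return_t
	my_exit_halt_to_off(x86_lcpu_t *lcpu)
	{
		(void)lcpu;
		/* driver-specific work to take the halted logical CPU to OFF */
		return KERN_SUCCESS;
	}

	/*
	 * Only the new slot is shown; a real table built against
	 * PM_DISPATCH_VERSION 16 fills in every callback, and
	 * pmCPUExitHaltToOff() returns KERN_INVALID_ARGUMENT until
	 * such a table is installed.
	 */
	static pmDispatch_t pm_dispatch_sketch = {
		.exitHaltToOff = my_exit_halt_to_off,
	};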