From: Apple Date: Fri, 7 Aug 2009 20:02:49 +0000 (+0000) Subject: xnu-1228.15.4.tar.gz X-Git-Tag: mac-os-x-1058^0 X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/e2fac8b15b12a7979f72090454d850e612fc5b13?ds=sidebyside;hp=c910b4d9d2451126ae3917b931cd4390c11e1d52 xnu-1228.15.4.tar.gz --- diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index f3c12bb41..5bb4ec920 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -272,8 +273,34 @@ typedef struct hfsmount { /* Resize variables: */ u_int32_t hfs_resize_filesmoved; u_int32_t hfs_resize_totalfiles; + + /* + * About the sync counters: + * hfs_sync_scheduled keeps track whether a timer was scheduled but we + * haven't started processing the callback (i.e. we + * haven't begun the flush). This will be non-zero + * even if the callback has been invoked, before we + * start the flush. + * hfs_sync_incomplete keeps track of the number of callbacks that have + * not completed yet (including callbacks not yet + * invoked). We cannot safely unmount until this + * drops to zero. + * + * In both cases, we use counters, not flags, so that we can avoid + * taking locks. + */ + int32_t hfs_sync_scheduled; + int32_t hfs_sync_incomplete; + u_int64_t hfs_last_sync_request_time; + u_int64_t hfs_last_sync_time; + uint32_t hfs_active_threads; + thread_call_t hfs_syncer; // removeable devices get sync'ed by this guy + } hfsmount_t; +#define HFS_META_DELAY (100) +#define HFS_MILLISEC_SCALE (1000*1000) + typedef hfsmount_t ExtendedVCB; /* Aliases for legacy (Mac OS 9) field names */ @@ -689,6 +716,7 @@ extern int hfs_virtualmetafile(struct cnode *); extern int hfs_start_transaction(struct hfsmount *hfsmp); extern int hfs_end_transaction(struct hfsmount *hfsmp); +extern void hfs_sync_ejectable(struct hfsmount *hfsmp); /***************************************************************************** diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 1e836052f..e5ab6c8b9 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1836,12 +1836,20 @@ fail_change_next_allocation: } case HFS_GET_MOUNT_TIME: - return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time)); - break; + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_mount_time; + } else { + *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_mount_time; + } + return 0; case HFS_GET_LAST_MTIME: - return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime)); - break; + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_last_mounted_mtime; + } else { + *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_last_mounted_mtime; + } + return 0; case HFS_SET_BOOT_INFO: if (!vnode_isvroot(vp)) diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index b2e71a034..6f5c3eb53 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -827,6 +827,99 @@ hfs_reload(struct mount *mountp) return (0); } +int hfs_last_io_wait_time = 125000; +SYSCTL_INT (_kern, OID_AUTO, hfs_last_io_wait_time, CTLFLAG_RW, &hfs_last_io_wait_time, 0, "number of usecs to wait after an i/o before syncing ejectable media"); + +static void +hfs_syncer(void *arg0, void *unused) +{ +#pragma unused(unused) + + struct hfsmount *hfsmp = arg0; + uint32_t secs, usecs, delay = HFS_META_DELAY; + uint64_t now; + struct timeval nowtv, last_io; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000LL) + usecs; 
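+
+	// (Descriptive note: `now' is wall-clock time in microseconds --
+	// secs * 1000000 + usecs -- so the 5000000LL comparison below is
+	// 5 seconds and the 100000LL comparisons are 100 milliseconds,
+	// matching HFS_META_DELAY, which clock_interval_to_deadline()
+	// interprets in milliseconds via HFS_MILLISEC_SCALE.)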
+ // + // If we have put off the last sync for more than + // 5 seconds, force it so that we don't let too + // much i/o queue up (since flushing the journal + // causes the i/o queue to drain) + // + if ((now - hfsmp->hfs_last_sync_time) >= 5000000LL) { + goto doit; + } + + // + // Find out when the last i/o was done to this device (read or write). + // + throttle_info_get_last_io_time(hfsmp->hfs_mp, &last_io); + microuptime(&nowtv); + timevalsub(&nowtv, &last_io); + + // + // If the last i/o was too recent, defer this sync until later. + // The limit chosen (125 milli-seconds) was picked based on + // some experiments copying data to an SD card and seems to + // prevent us from issuing too many syncs. + // + if (nowtv.tv_sec >= 0 && nowtv.tv_usec > 0 && nowtv.tv_usec < hfs_last_io_wait_time) { + delay /= 2; + goto resched; + } + + // + // If there's pending i/o, also skip the sync. + // + if (hfsmp->hfs_devvp && hfsmp->hfs_devvp->v_numoutput > 0) { + goto resched; + } + + + // + // Only flush the journal if we have not sync'ed recently + // and the last sync request time was more than 100 milli + // seconds ago and there is no one in the middle of a + // transaction right now. Else we defer the sync and + // reschedule it for later. + // + if ( ((now - hfsmp->hfs_last_sync_time) >= 100000LL) + && ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL) + && (hfsmp->hfs_active_threads == 0) + && (hfsmp->hfs_global_lock_nesting == 0)) { + + doit: + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + + clock_get_calendar_microtime(&secs, &usecs); + hfsmp->hfs_last_sync_time = ((int64_t)secs * 1000000) + usecs; + + } else if (hfsmp->hfs_active_threads == 0) { + uint64_t deadline; + + resched: + clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline); + thread_call_enter_delayed(hfsmp->hfs_syncer, deadline); + return; + } + + // + // NOTE: we decrement these *after* we're done the journal_flush() since + // it can take a significant amount of time and so we don't want more + // callbacks scheduled until we're done this one. + // + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + wakeup((caddr_t)&hfsmp->hfs_sync_incomplete); +} + +extern int IOBSDIsMediaEjectable( const char *cdev_name ); /* * Common code for mount and mountroot @@ -855,12 +948,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, u_int32_t iswritable; daddr64_t mdb_offset; int isvirtual = 0; + int isroot = 0; ronly = vfs_isrdonly(mp); dev = vnode_specrdev(devvp); cred = p ? 
vfs_context_ucred(context) : NOCRED; mntwrapper = 0; + if (args == NULL) { + /* only hfs_mountroot passes us NULL as the 'args' argument */ + isroot = 1; + } + bp = NULL; hfsmp = NULL; mdbp = NULL; @@ -1379,6 +1478,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } + /* ejectability checks will time out when the device is root_device, so skip them */ + if (isroot == 0) { + if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 && + IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) { + hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp); + if (hfsmp->hfs_syncer == NULL) { + printf("hfs: failed to allocate syncer thread callback for %s (%s)\n", + mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); + } + } + } + /* * Start looking for free space to drop below this level and generate a * warning immediately if needed: @@ -1451,6 +1562,38 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) if (hfsmp->hfs_flags & HFS_METADATA_ZONE) (void) hfs_recording_suspend(hfsmp); + /* + * Cancel any pending timers for this volume. Then wait for any timers + * which have fired, but whose callbacks have not yet completed. + */ + if (hfsmp->hfs_syncer) + { + struct timespec ts = {0, 100000000}; /* 0.1 seconds */ + + /* + * Cancel any timers that have been scheduled, but have not + * fired yet. NOTE: The kernel considers a timer complete as + * soon as it starts your callback, so the kernel does not + * keep track of the number of callbacks in progress. + */ + if (thread_call_cancel(hfsmp->hfs_syncer)) + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + thread_call_free(hfsmp->hfs_syncer); + hfsmp->hfs_syncer = NULL; + + /* + * This waits for all of the callbacks that were entered before + * we did thread_call_cancel above, but have not completed yet. + */ + while(hfsmp->hfs_sync_incomplete > 0) + { + msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts); + } + + if (hfsmp->hfs_sync_incomplete < 0) + printf("hfs_unmount: pm_sync_incomplete underflow (%d)!\n", hfsmp->hfs_sync_incomplete); + } + /* * Flush out the b-trees, volume bitmap and Volume Header */ @@ -1931,6 +2074,15 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) journal_flush(hfsmp->jnl); } + { + uint32_t secs, usecs; + uint64_t now; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000LL) + usecs; + hfsmp->hfs_last_sync_time = now; + } + lck_rw_unlock_shared(&hfsmp->hfs_insync); return (allerror); } diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index ce577ec74..d6dc1e356 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -2347,6 +2347,46 @@ hfs_virtualmetafile(struct cnode *cp) } + +// +// Fire off a timed callback to sync the disk if the +// volume is on ejectable media. +// + __private_extern__ +void +hfs_sync_ejectable(struct hfsmount *hfsmp) +{ + if (hfsmp->hfs_syncer) { + uint32_t secs, usecs; + uint64_t now; + + clock_get_calendar_microtime(&secs, &usecs); + now = ((uint64_t)secs * 1000000) + usecs; + + if (hfsmp->hfs_sync_scheduled == 0) { + uint64_t deadline; + + hfsmp->hfs_last_sync_request_time = now; + + clock_interval_to_deadline(HFS_META_DELAY, HFS_MILLISEC_SCALE, &deadline); + + /* + * Increment hfs_sync_scheduled on the assumption that we're the + * first thread to schedule the timer. If some other thread beat + * us, then we'll decrement it. 
If we *were* the first to + * schedule the timer, then we need to keep track that the + * callback is waiting to complete. + */ + OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + if (thread_call_enter_delayed(hfsmp->hfs_syncer, deadline)) + OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled); + else + OSIncrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete); + } + } +} + + __private_extern__ int hfs_start_transaction(struct hfsmount *hfsmp) @@ -2374,6 +2414,7 @@ hfs_start_transaction(struct hfsmount *hfsmp) if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { lck_rw_lock_shared(&hfsmp->hfs_global_lock); + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); unlock_on_err = 1; } @@ -2399,6 +2440,7 @@ hfs_start_transaction(struct hfsmount *hfsmp) out: if (ret != 0 && unlock_on_err) { lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); } return ret; @@ -2424,7 +2466,9 @@ hfs_end_transaction(struct hfsmount *hfsmp) } if (need_unlock) { + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_sync_ejectable(hfsmp); } return ret; diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 61875f626..6d8d6ad33 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -367,6 +367,11 @@ hfs_vnop_close(ap) } hfs_unlock(cp); + + if (ap->a_fflag & FWASWRITTEN) { + hfs_sync_ejectable(hfsmp); + } + return (0); } @@ -2619,6 +2624,16 @@ hfs_vnop_rename(ap) skip_rm: /* * All done with tvp and fvp + * + * We also jump to this point if there was no destination observed during lookup and namei. + * However, because only iocounts are held at the VFS layer, there is nothing preventing a + * competing thread from racing us and creating a file or dir at the destination of this rename + * operation. If this occurs, it may cause us to get a spurious EEXIST out of the cat_rename + * call below. To preserve rename's atomicity, we need to signal VFS to re-drive the + * namei/lookup and restart the rename operation. EEXIST is an allowable errno to be bubbled + * out of the rename syscall, but not for this reason, since it is a synonym errno for ENOTEMPTY. + * To signal VFS, we return ERECYCLE (which is also used for lookup restarts). This errno + * will be swallowed and it will restart the operation. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); @@ -2626,6 +2641,9 @@ skip_rm: hfs_systemfile_unlock(hfsmp, lockflags); if (error) { + if (error == EEXIST) { + error = ERECYCLE; + } goto out; } diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index d7711096f..f7bd0e5d9 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -106,6 +106,8 @@ #include +#include /* fd); AUDIT_ARG(cmd, uap->cmd); @@ -604,7 +607,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) } context.vc_thread = current_thread(); context.vc_ucred = fp->f_cred; - if (proc_is64bit(p)) { + + is64bit = proc_is64bit(p); + if (is64bit) { argp = uap->arg; } else { @@ -1482,13 +1487,17 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) } default: - if (uap->cmd < FCNTL_FS_SPECIFIC_BASE) { - error = EINVAL; + /* + * This is an fcntl() that we d not recognize at this level; + * if this is a vnode, we send it down into the VNOP_IOCTL + * for this vnode; this can include special devices, and will + * effectively overload fcntl() to send ioctl()'s. 
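+		 *
+		 * As a hypothetical illustration (not part of this change),
+		 * userland could then drive an fs-specific command straight
+		 * through fcntl(2); HFS_GET_MOUNT_TIME is the real command
+		 * special-cased below, the surrounding code is sketch only:
+		 *
+		 *	time_t mount_time;
+		 *	if (fcntl(fd, HFS_GET_MOUNT_TIME, &mount_time) == 0)
+		 *		printf("mounted at %ld\n", (long)mount_time);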
+		 */
+		if ((uap->cmd & IOC_VOID) && (uap->cmd & IOC_INOUT)) {
+			error = EINVAL;
 			goto out;
 		}
-
-		// if it's a fs-specific fcntl() then just pass it through
-
+
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			goto out;
@@ -1497,12 +1506,103 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval)
 		proc_fdunlock(p);

 		if ( (error = vnode_getwithref(vp)) == 0 ) {
-			error = VNOP_IOCTL(vp, uap->cmd, CAST_DOWN(caddr_t, argp), 0, &context);
+#define STK_PARAMS 128
+			char stkbuf[STK_PARAMS];
+			unsigned int size;
+			caddr_t data, memp;
+			int fix_cmd = uap->cmd;
+
+			/*
+			 * For this to work properly, we have to copy in the
+			 * ioctl() cmd argument if there is one; we must also
+			 * check that a command parameter, if present, does
+			 * not exceed the maximum command length dictated by
+			 * the number of bits we have available in the command
+			 * to represent a structure length.  Finally, we have
+			 * to copy the results back out, if it is that type of
+			 * ioctl().
+			 */
+			size = IOCPARM_LEN(uap->cmd);
+			if (size > IOCPARM_MAX) {
+				(void)vnode_put(vp);
+				error = EINVAL;
+				break;
+			}
+
+			/*
+			 * Fix up the command we should have received via
+			 * fcntl with one with a valid size and copy out
+			 * argument.
+			 */
+			if (fix_cmd == HFS_GET_MOUNT_TIME ||
+			    fix_cmd == HFS_GET_LAST_MTIME) {
+				if (is64bit)
+					size = sizeof(user_time_t);
+				else
+					size = sizeof(time_t);
+				fix_cmd |= IOC_OUT;
+			}
+
+			memp = NULL;
+			if (size > sizeof (stkbuf)) {
+				if ((memp = (caddr_t)kalloc(size)) == 0) {
+					(void)vnode_put(vp);
+					error = ENOMEM;
+					goto outdrop;
+				}
+				data = memp;
+			} else {
+				data = &stkbuf[0];
+			}
+
+			if (fix_cmd & IOC_IN) {
+				if (size) {
+					/* structure */
+					error = copyin(argp, data, size);
+					if (error) {
+						(void)vnode_put(vp);
+						if (memp)
+							kfree(memp, size);
+						goto outdrop;
+					}
+				} else {
+					/* int */
+					if (is64bit) {
+						*(user_addr_t *)data = argp;
+					} else {
+						*(uint32_t *)data = (uint32_t)argp;
+					}
+				}
+			} else if ((fix_cmd & IOC_OUT) && size) {
+				/*
+				 * Zero the buffer so the user always
+				 * gets back something deterministic.
+				 */
+				bzero(data, size);
+			} else if (fix_cmd & IOC_VOID) {
+				if (is64bit) {
+					*(user_addr_t *)data = argp;
+				} else {
+					*(uint32_t *)data = (uint32_t)argp;
+				}
+			}
+
+			/*
+			 * We pass the unmodified uap->cmd
+			 * to the underlying VNOP so that we don't confuse it;
+			 * but we are going to handle its copyout() when it
+			 * gets back.
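+			 * (For reference: IOCPARM_LEN() reads the parameter
+			 * size that ioctl(2) encodes in the high bits of the
+			 * command word -- ((cmd >> 16) & IOCPARM_MASK) --
+			 * which is the `size' bounded against IOCPARM_MAX
+			 * above.)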
+ */ + error = VNOP_IOCTL(vp, uap->cmd, CAST_DOWN(caddr_t, data), 0, &context); (void)vnode_put(vp); + + /* Copy any output data to user */ + if (error == 0 && (fix_cmd & IOC_OUT) && size) + error = copyout(data, argp, size); + if (memp) + kfree(memp, size); } break; - } outdrop: @@ -3871,9 +3971,12 @@ closef_locked(struct fileproc *fp, struct fileglob *fg, proc_t p) fg->fg_lflags |= FG_TERM; lck_mtx_unlock(&fg->fg_lock); - proc_fdunlock(p); + if (p) + proc_fdunlock(p); error = closef_finish(fp, fg, p, &context); - proc_fdlock(p); + + if (p) + proc_fdlock(p); return(error); } diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 9b568d24c..4c1b6c6eb 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -132,7 +132,7 @@ static int unp_connect(struct socket *, struct sockaddr *, proc_t); static void unp_disconnect(struct unpcb *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); -static void unp_gc(void); +__private_extern__ void unp_gc(void); static void unp_scan(struct mbuf *, void (*)(struct fileglob *)); static void unp_mark(struct fileglob *); static void unp_discard(struct fileglob *); @@ -749,7 +749,11 @@ unp_detach(struct unpcb *unp) * gets them (resulting in a "panic: closef: count < 0"). */ sorflush(unp->unp_socket); + + /* Per domain mutex deadlock avoidance */ + socket_unlock(unp->unp_socket, 0); unp_gc(); + socket_lock(unp->unp_socket, 0); } if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); @@ -1362,11 +1366,15 @@ unp_internalize(struct mbuf *control, proc_t p) } static int unp_defer, unp_gcing, unp_gcwait; +static thread_t unp_gcthread = NULL; /* always called under uipc_lock */ void unp_gc_wait(void) { + if (unp_gcthread == current_thread()) + return; + while (unp_gcing != 0) { unp_gcwait = 1; msleep(&unp_gcing, uipc_lock, 0 , "unp_gc_wait", NULL); @@ -1374,12 +1382,13 @@ unp_gc_wait(void) } -static void +__private_extern__ void unp_gc(void) { struct fileglob *fg, *nextfg; struct socket *so; - struct fileglob **extra_ref, **fpp; + static struct fileglob **extra_ref; + struct fileglob **fpp; int nunref, i; int need_gcwakeup = 0; @@ -1390,6 +1399,7 @@ unp_gc(void) } unp_gcing = 1; unp_defer = 0; + unp_gcthread = current_thread(); lck_mtx_unlock(uipc_lock); /* * before going through all this, set all FDs to @@ -1484,9 +1494,13 @@ unp_gc(void) * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. + * + * In case a file is passed onto itself we need to + * release the file lock. 
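+	 * (Releasing fg_lock first matters because unp_scan() invokes
+	 * unp_mark() on every fileglob it finds in the receive buffer;
+	 * a socket passed onto itself would otherwise bring unp_mark()
+	 * back to the very fg_lock already held here.)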
*/ - unp_scan(so->so_rcv.sb_mb, unp_mark); lck_mtx_unlock(&fg->fg_lock); + + unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); /* @@ -1564,20 +1578,13 @@ unp_gc(void) tfg = *fpp; if (tfg->fg_type == DTYPE_SOCKET && tfg->fg_data != NULL) { - int locked = 0; - so = (struct socket *)(tfg->fg_data); - /* XXXX */ - /* Assume local sockets use a global lock */ - if (so->so_proto->pr_domain->dom_family != PF_LOCAL) { - socket_lock(so, 0); - locked = 1; - } + socket_lock(so, 0); + sorflush(so); - if (locked) - socket_unlock(so, 0); + socket_unlock(so, 0); } } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) @@ -1585,6 +1592,7 @@ unp_gc(void) lck_mtx_lock(uipc_lock); unp_gcing = 0; + unp_gcthread = NULL; if (unp_gcwait != 0) { unp_gcwait = 0; diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index aad1b0250..6c26b1799 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -600,6 +600,7 @@ void IOSleep(int); struct _throttle_io_info_t { struct timeval last_normal_IO_timestamp; + struct timeval last_IO_timestamp; SInt32 numthreads_throttling; }; @@ -614,6 +615,32 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +void +throttle_info_get_last_io_time(mount_t mp, struct timeval *tv) +{ + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; + + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + *tv = _throttle_io_info[devbsdunit].last_IO_timestamp; + } else { + memset(tv, 0, sizeof(*tv)); + } +} + +void +update_last_io_time(mount_t mp) +{ + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; + + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } +} + int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit) { struct timeval elapsed; @@ -784,6 +811,18 @@ spec_strategy(struct vnop_strategy_args *ap) } } } + + if ((bflags & B_READ) == 0) { + size_t devbsdunit; + + if (buf_vnode(bp)->v_mount != NULL) + devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } + (*bdevsw[major(bdev)].d_strategy)(bp); return (0); diff --git a/bsd/netat/atp_read.c b/bsd/netat/atp_read.c index a3853eae9..940a58ae3 100644 --- a/bsd/netat/atp_read.c +++ b/bsd/netat/atp_read.c @@ -58,6 +58,8 @@ #include #include +__private_extern__ int atp_resp_seqno2big = 0; + static void atp_trans_complete(struct atp_trans *); void atp_x_done_locked(void *); void atp_treq_event(void *); @@ -139,8 +141,8 @@ gbuf_t *m; case ATP_CMD_TRESP: { register struct atp_trans *trp; - register int seqno; - register at_ddp_t *ddp; + register unsigned int seqno; + register at_ddp_t *ddp; /* * we just got a response, find the trans record @@ -155,10 +157,20 @@ gbuf_t *m; * If we can't find one then ignore the message */ seqno = athp->bitmap; + if (seqno > 7) { + atp_resp_seqno2big++; + ddp = AT_DDP_HDR(m); + dPrintf(D_M_ATP_LOW, (D_L_INPUT|D_L_ERROR), + ("atp_rput: dropping TRESP seqno too big, tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", + UAS_VALUE_NTOH(athp->tid), + ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); + gbuf_freem(m); + return; + } if (trp == NULL) { ddp = AT_DDP_HDR(m); dPrintf(D_M_ATP_LOW, 
(D_L_INPUT|D_L_ERROR), - ("atp_rput: dropping TRESP, no trp,tid=%d,loc=%d,rem=%d.%d,seqno=%d\n", + ("atp_rput: dropping TRESP, no trp,tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", UAS_VALUE_NTOH(athp->tid), ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); gbuf_freem(m); @@ -184,7 +196,7 @@ gbuf_t *m; if (!(trp->tr_bitmap&atp_mask[seqno]) || trp->tr_rcv[seqno]) { ddp = AT_DDP_HDR(m); dPrintf(D_M_ATP_LOW, (D_L_INPUT|D_L_ERROR), - ("atp_rput: dropping TRESP, duplicate,tid=%d,loc=%d,rem=%d.%d,seqno=%d\n", + ("atp_rput: dropping TRESP, duplicate,tid=%d,loc=%d,rem=%d.%d,seqno=%u\n", UAS_VALUE_NTOH(athp->tid), ddp->dst_socket, ddp->src_node, ddp->src_socket, seqno)); gbuf_freem(m); diff --git a/bsd/netat/sys_glue.c b/bsd/netat/sys_glue.c index a1d2a402c..dd22563be 100644 --- a/bsd/netat/sys_glue.c +++ b/bsd/netat/sys_glue.c @@ -99,6 +99,9 @@ SYSCTL_INT(_net_appletalk, OID_AUTO, routermix, CTLFLAG_WR, at_ddp_stats_t at_ddp_stats; /* DDP statistics */ SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); +extern int atp_resp_seqno2big; +SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD, + &atp_resp_seqno2big, 0, "Appletalk ATP seqno too big count"); static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ); static void ioccmd_t_64_to_32( user_ioccmd_t *from_p, ioccmd_t *to_p ); diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index e847a3319..db0662895 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -121,7 +121,8 @@ static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; extern struct inpcbinfo ripcbinfo; extern lck_mtx_t *ip6_mutex; -extern lck_mtx_t *nd6_mutex; +extern lck_mtx_t *nd6_mutex; +extern lck_mtx_t *inet6_domain_mutex; static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); @@ -515,8 +516,15 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ + goto rate_limit_checked; break; } if ((n->m_flags & M_EXT) != 0 @@ -531,6 +539,7 @@ icmp6_input(mp, offp) if (maxlen >= MCLBYTES) { /* Give up remote */ m_freem(n0); + goto rate_limit_checked; break; } MGETHDR(n, M_DONTWAIT, n0->m_type); /* MAC-OK */ @@ -544,6 +553,7 @@ icmp6_input(mp, offp) if (n == NULL) { /* Give up remote */ m_freem(n0); + goto rate_limit_checked; break; } M_COPY_PKTHDR(n, n0); @@ -578,6 +588,7 @@ icmp6_input(mp, offp) icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } + goto rate_limit_checked; break; case ICMP6_ECHO_REPLY: @@ -594,6 +605,12 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); @@ -602,6 +619,7 @@ icmp6_input(mp, offp) } mld6_input(n, off); /* m stays. 
*/ + goto rate_limit_checked; break; case MLD6_LISTENER_DONE: @@ -631,6 +649,11 @@ icmp6_input(mp, offp) IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), return IPPROTO_DONE); #endif + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); @@ -640,6 +663,7 @@ icmp6_input(mp, offp) icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } + goto rate_limit_checked; break; case ICMP6_WRUREPLY: @@ -653,6 +677,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); @@ -661,6 +691,7 @@ icmp6_input(mp, offp) } nd6_rs_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_ROUTER_ADVERT: @@ -669,6 +700,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); @@ -677,6 +714,7 @@ icmp6_input(mp, offp) } nd6_ra_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_NEIGHBOR_SOLICIT: @@ -685,6 +723,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); @@ -693,6 +737,7 @@ icmp6_input(mp, offp) } nd6_ns_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_NEIGHBOR_ADVERT: @@ -701,6 +746,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); @@ -709,6 +760,7 @@ icmp6_input(mp, offp) } nd6_na_input(n, off, icmp6len); /* m stays. */ + goto rate_limit_checked; break; case ND_REDIRECT: @@ -717,6 +769,12 @@ icmp6_input(mp, offp) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; + + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); @@ -725,6 +783,7 @@ icmp6_input(mp, offp) } icmp6_redirect_input(n, off); /* m stays. 
*/ + goto rate_limit_checked; break; case ICMP6_ROUTER_RENUMBERING: @@ -736,6 +795,11 @@ icmp6_input(mp, offp) break; default: + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), @@ -747,9 +811,15 @@ icmp6_input(mp, offp) /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ + goto rate_limit_checked; break; } deliver: + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } + if (icmp6_notify_error(m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return(IPPROTO_DONE); @@ -765,6 +835,11 @@ icmp6_input(mp, offp) break; } + if (icmp6_ratelimit(&ip6->ip6_dst, icmp6->icmp6_type, code)) { + icmp6stat.icp6s_toofreq++; + goto freeit; + } +rate_limit_checked: /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); @@ -2331,7 +2406,16 @@ icmp6_redirect_input(m, off) sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); + + /* + * Radar 6843900 + * Release the IPv6 domain lock because we are going to take domain_proto_mtx + * and could otherwise cause a deadlock with other threads taking these locks + * in the reverse order -- e.g. frag6_slowtimo() from pfslowtimo() + */ + lck_mtx_unlock(inet6_domain_mutex); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); + lck_mtx_lock(inet6_domain_mutex); #if IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index 29188674f..a84715b19 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -545,9 +545,14 @@ struct in6_pktinfo { #define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_NEIGHBORGCTHRESH 46 +#define IPV6CTL_MAXIFPREFIXES 47 +#define IPV6CTL_MAXIFDEFROUTERS 48 +#define IPV6CTL_MAXDYNROUTES 49 + /* New entries should be added here from current IPV6CTL_MAXID value. */ /* to define items, should talk with KAME guys first, for *BSD compatibility */ -#define IPV6CTL_MAXID 42 +#define IPV6CTL_MAXID 50 #ifdef KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index c27a77892..a937bbf35 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -378,6 +378,11 @@ int ip6_rr_prune = 5; /* router renumbering prefix * walk list every 5 sec. 
*/ int ip6_v6only = 0; /* Mapped addresses on by default - Radar 3347718 */ +int ip6_neighborgcthresh = 2048; /* Threshold # of NDP entries for GC */ +int ip6_maxifprefixes = 16; /* Max acceptable prefixes via RA per IF */ +int ip6_maxifdefrouters = 16; /* Max acceptable def routers via RA */ +int ip6_maxdynroutes = 4096; /* Max # of routes created via redirect */ + u_int32_t ip6_id = 0UL; int ip6_keepfaith = 0; time_t ip6_log_time = (time_t)0L; @@ -515,6 +520,14 @@ SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, &rip6stat, rip6stat, ""); SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, &mrt6stat, mrt6stat, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH, + neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES, + maxifprefixes, CTLFLAG_RW, &ip6_maxifprefixes, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFDEFROUTERS, + maxifdefrouters, CTLFLAG_RW, &ip6_maxifdefrouters, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES, + maxdynroutes, CTLFLAG_RW, &ip6_maxdynroutes, 0, ""); /* net.inet6.icmp6 */ diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index c4a2fb286..f42089272 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -266,6 +266,11 @@ extern int ip6_rr_prune; /* router renumbering prefix #define ip6_mapped_addr_on (!ip6_v6only) extern int ip6_v6only; +extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ +extern int ip6_maxifprefixes; /* Max acceptable prefixes via RA per IF */ +extern int ip6_maxifdefrouters; /* Max acceptable def routers via RA */ +extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ + extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index f0c838d58..66f4cd2b9 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -119,6 +119,7 @@ extern lck_mtx_t *nd6_mutex; static void nd6_slowtimo(void *ignored_arg); + void nd6_init() { @@ -415,11 +416,11 @@ nd6_timer( struct in6_ifaddr *ia6, *nia6; struct in6_addrlifetime *lt6; struct timeval timenow; + int count = 0; getmicrotime(&timenow); - ln = llinfo_nd6.ln_next; while (ln && ln != &llinfo_nd6) { struct rtentry *rt; @@ -439,9 +440,34 @@ nd6_timer( ndi = &nd_ifinfo[ifp->if_index]; dst = (struct sockaddr_in6 *)rt_key(rt); + count++; + if (ln->ln_expire > timenow.tv_sec) { - ln = next; - continue; + + /* Radar 6871508 Check if we have too many cache entries. + * In that case purge 20% of the table to make space + * for the new entries. + * This is a bit crude but keeps the deletion in timer + * thread only. 
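+	 * (The "20%" figure comes from the (count % 5) == 0 sampling
+	 * below: roughly every fifth entry walked is forced to STALE or
+	 * PURGE with an already-expired ln_expire, so it falls through
+	 * to nd6_free() and is reclaimed.)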
+ */ + + if ((ip6_neighborgcthresh >= 0 && + nd6_inuse >= ip6_neighborgcthresh) && + ((count % 5) == 0)) { + + if (ln->ln_state > ND6_LLINFO_INCOMPLETE) + ln->ln_state = ND6_LLINFO_STALE; + else + ln->ln_state = ND6_LLINFO_PURGE; + ln->ln_expire = timenow.tv_sec; + + /* fallthrough and call nd6_free() */ + } + + else { + ln = next; + continue; + } } /* sanity check */ @@ -499,6 +525,7 @@ nd6_timer( break; case ND6_LLINFO_STALE: + case ND6_LLINFO_PURGE: /* Garbage Collection(RFC 2461 5.3) */ if (ln->ln_expire) next = nd6_free(rt); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index b85be0157..4b66e6be6 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -55,6 +55,7 @@ struct llinfo_nd6 { }; #endif /* KERNEL_PRIVATE */ +#define ND6_LLINFO_PURGE -3 #define ND6_LLINFO_NOSTATE -2 /* * We don't need the WAITDELETE state any more, but we keep the definition @@ -86,6 +87,9 @@ struct nd_ifinfo { u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ + /* keep track of routers and prefixes on this link */ + int32_t nprefixes; + int32_t ndefrouters; }; #define ND6_IFF_PERFORMNUD 0x1 diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 8ca259b74..5af29d31e 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -631,6 +631,7 @@ defrtrlist_del( struct nd_defrouter *dr, int nd6locked) { struct nd_defrouter *deldr = NULL; + struct nd_ifinfo *ndi = &nd_ifinfo[dr->ifp->if_index]; struct nd_prefix *pr; /* @@ -667,6 +668,12 @@ defrtrlist_del( if (deldr) defrouter_select(); + ndi->ndefrouters--; + if (ndi->ndefrouters < 0) { + log(LOG_WARNING, "defrtrlist_del: negative count on %s\n", + if_name(dr->ifp)); + } + if (nd6locked == 0) lck_mtx_unlock(nd6_mutex); @@ -760,6 +767,7 @@ defrtrlist_update( struct nd_defrouter *new) { struct nd_defrouter *dr, *n; + struct nd_ifinfo *ndi = &nd_ifinfo[new->ifp->if_index]; lck_mtx_lock(nd6_mutex); if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { @@ -783,6 +791,12 @@ defrtrlist_update( return(NULL); } + if (ip6_maxifdefrouters >= 0 && + ndi->ndefrouters >= ip6_maxifdefrouters) { + lck_mtx_unlock(nd6_mutex); + return (NULL); + } + n = (struct nd_defrouter *)_MALLOC(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) { lck_mtx_unlock(nd6_mutex); @@ -799,6 +813,8 @@ defrtrlist_update( TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); if (TAILQ_FIRST(&nd_defrouter) == n) defrouter_select(); + + ndi->ndefrouters++; lck_mtx_unlock(nd6_mutex); return(n); @@ -905,6 +921,40 @@ ndpr_rele(struct nd_prefix *pr, boolean_t locked) lck_mtx_unlock(nd6_mutex); } +static void +purge_detached(struct ifnet *ifp) +{ + struct nd_prefix *pr, *pr_next; + struct in6_ifaddr *ia; + struct ifaddr *ifa, *ifa_next; + + lck_mtx_lock(nd6_mutex); + + for (pr = nd_prefix.lh_first; pr; pr = pr_next) { + pr_next = pr->ndpr_next; + if (pr->ndpr_ifp != ifp || + IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && + !LIST_EMPTY(&pr->ndpr_advrtrs))) + continue; + + for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa_next) { + ifa_next = ifa->ifa_list.tqe_next; + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + ia = (struct in6_ifaddr *)ifa; + if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == + IN6_IFF_AUTOCONF && ia->ia6_ndpr == pr) { + in6_purgeaddr(ifa, 1); + } + } + if (pr->ndpr_refcnt == 0) + prelist_remove(pr, 1); + } + + lck_mtx_unlock(nd6_mutex); +} + int nd6_prelist_add( struct nd_prefix 
*pr, @@ -913,6 +963,14 @@ nd6_prelist_add( { struct nd_prefix *new = NULL; int i; + struct nd_ifinfo *ndi = &nd_ifinfo[pr->ndpr_ifp->if_index]; + + if (ip6_maxifprefixes >= 0) { + if (ndi->nprefixes >= ip6_maxifprefixes / 2) + purge_detached(pr->ndpr_ifp); + if (ndi->nprefixes >= ip6_maxifprefixes) + return(ENOMEM); + } new = (struct nd_prefix *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) @@ -953,6 +1011,9 @@ nd6_prelist_add( if (dr) { pfxrtr_add(new, dr); } + + ndi->nprefixes++; + lck_mtx_unlock(nd6_mutex); return 0; @@ -964,6 +1025,7 @@ prelist_remove( { struct nd_pfxrouter *pfr, *next; int e; + struct nd_ifinfo *ndi = &nd_ifinfo[pr->ndpr_ifp->if_index]; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; @@ -1001,6 +1063,12 @@ prelist_remove( FREE(pfr, M_IP6NDP); } + ndi->nprefixes--; + if (ndi->nprefixes < 0) { + log(LOG_WARNING, "prelist_remove: negative count on %s\n", + if_name(pr->ndpr_ifp)); + } + FREE(pr, M_IP6NDP); pfxlist_onlink_check(1); diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 0e1d6dc69..75e405b3c 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -781,6 +781,8 @@ key_checkrequest(isr, saidx, sav) * OUT: NULL: not found. * others: found and return the pointer. */ +u_int32_t sah_search_calls = 0; +u_int32_t sah_search_count = 0; struct secasvar * key_allocsa_policy(saidx) struct secasindex *saidx; @@ -794,7 +796,9 @@ key_allocsa_policy(saidx) u_int16_t dstport; lck_mtx_lock(sadb_mutex); + sah_search_calls++; LIST_FOREACH(sah, &sahtree, chain) { + sah_search_count++; if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE | CMP_REQID)) @@ -4630,7 +4634,9 @@ key_bbcmp(p1, p2, bits) * and do to remove or to expire. * XXX: year 2038 problem may remain. */ - +int key_timehandler_debug = 0; +u_int32_t spd_count = 0, sah_count = 0, dead_sah_count = 0, empty_sah_count = 0, larval_sav_count = 0, mature_sav_count = 0, dying_sav_count = 0, dead_sav_count = 0; +u_int64_t total_sav_count = 0; void key_timehandler(void) { @@ -4671,6 +4677,7 @@ key_timehandler(void) sp != NULL; sp = nextsp) { + spd_count++; nextsp = LIST_NEXT(sp, chain); if (sp->state == IPSEC_SPSTATE_DEAD) { @@ -4706,11 +4713,22 @@ key_timehandler(void) sah != NULL; sah = nextsah) { + sah_count++; nextsah = LIST_NEXT(sah, chain); /* if sah has been dead, then delete it and process next sah. */ if (sah->state == SADB_SASTATE_DEAD) { key_delsah(sah); + dead_sah_count++; + continue; + } + + if (LIST_FIRST(&sah->savtree[SADB_SASTATE_LARVAL]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_MATURE]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DYING]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DEAD]) == NULL) { + key_delsah(sah); + empty_sah_count++; continue; } @@ -4719,6 +4737,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + larval_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); if (tv.tv_sec - sav->created > key_larval_lifetime) { @@ -4755,6 +4775,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + mature_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* we don't need to check. */ @@ -4816,6 +4838,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + dying_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* we don't need to check. 
*/ @@ -4869,6 +4893,8 @@ key_timehandler(void) sav != NULL; sav = nextsav) { + dead_sav_count++; + total_sav_count++; nextsav = LIST_NEXT(sav, chain); /* sanity check */ @@ -4890,6 +4916,32 @@ key_timehandler(void) } } + if (++key_timehandler_debug >= 300) { + if (key_debug_level) { + printf("%s: total stats for %u calls\n", __FUNCTION__, key_timehandler_debug); + printf("%s: walked %u SPDs\n", __FUNCTION__, spd_count); + printf("%s: walked %llu SAs: LARVAL SAs %u, MATURE SAs %u, DYING SAs %u, DEAD SAs %u\n", __FUNCTION__, + total_sav_count, larval_sav_count, mature_sav_count, dying_sav_count, dead_sav_count); + printf("%s: walked %u SAHs: DEAD SAHs %u, EMPTY SAHs %u\n", __FUNCTION__, + sah_count, dead_sah_count, empty_sah_count); + if (sah_search_calls) { + printf("%s: SAH search cost %d iters per call\n", __FUNCTION__, + (sah_search_count/sah_search_calls)); + } + } + spd_count = 0; + sah_count = 0; + dead_sah_count = 0; + empty_sah_count = 0; + larval_sav_count = 0; + mature_sav_count = 0; + dying_sav_count = 0; + dead_sav_count = 0; + total_sav_count = 0; + sah_search_count = 0; + sah_search_calls = 0; + key_timehandler_debug = 0; + } #ifndef IPSEC_NONBLOCK_ACQUIRE /* ACQ tree */ { diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 368f4f3e6..d482714ef 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -230,6 +230,8 @@ void get_procrustime(struct time_value *tv); void load_init_program(struct proc *p); void __pthread_testcancel(int presyscall); void syscall_exit_funnelcheck(void); +void throttle_info_get_last_io_time(mount_t mp, struct timeval *tv); +void update_last_io_time(mount_t mp); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 3871c2a3b..026109d21 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -3154,6 +3154,10 @@ buf_biodone(buf_t bp) fslog_io_error(bp); } + if (bp->b_vp && bp->b_vp->v_mount && (bp->b_flags & B_READ) == 0) { + update_last_io_time(bp->b_vp->v_mount); + } + if (kdebug_enable) { int code = DKIO_DONE; diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index bfee0d8b4..643d79c07 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -4986,15 +4986,21 @@ out: * - Neither the node nor the directory are immutable. * - The user is not the superuser. * - * Deletion is not permitted if the directory is sticky and the caller is not owner of the - * node or directory. + * Deletion is not permitted if the directory is sticky and the caller is + * not owner of the node or directory. * - * If either the node grants DELETE, or the directory grants DELETE_CHILD, the node may be - * deleted. If neither denies the permission, and the caller has Posix write access to the - * directory, then the node may be deleted. + * If either the node grants DELETE, or the directory grants DELETE_CHILD, + * the node may be deleted. If neither denies the permission, and the + * caller has Posix write access to the directory, then the node may be + * deleted. + * + * As an optimization, we cache whether or not delete child is permitted + * on directories without the sticky bit set. 
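+ * (Sticky-bit directories are the exception because there the verdict
+ * depends on which child is being deleted -- only the file's owner or
+ * the directory's owner may remove it -- so a single cached
+ * per-directory answer would be unsound.)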
*/ -static int -vnode_authorize_delete(vauth_ctx vcp) +int +vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); +/*static*/ int +vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) { struct vnode_attr *vap = vcp->vap; struct vnode_attr *dvap = vcp->dvap; @@ -5004,7 +5010,7 @@ vnode_authorize_delete(vauth_ctx vcp) /* check the ACL on the directory */ delete_child_denied = 0; - if (VATTR_IS_NOT(dvap, va_acl, NULL)) { + if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; eval.ae_acl = &dvap->va_acl->acl_ace[0]; eval.ae_count = dvap->va_acl->acl_entrycount; @@ -5070,15 +5076,20 @@ vnode_authorize_delete(vauth_ctx vcp) return(EACCES); } - /* enforce sticky bit behaviour */ - if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { + /* + * enforce sticky bit behaviour; the cached_delete_child property will + * be false and the dvap contents valis for sticky bit directories; + * this makes us check the directory each time, but it's unavoidable, + * as sticky bit is an exception to caching. + */ + if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid); return(EACCES); } /* check the directory */ - if ((error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { + if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp); return(error); } @@ -5476,7 +5487,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i int result; int *errorp; int noimmutable; - boolean_t parent_authorized_for_delete = FALSE; + boolean_t parent_authorized_for_delete_child = FALSE; boolean_t found_deny = FALSE; boolean_t parent_ref= FALSE; @@ -5541,8 +5552,8 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i * can skip a whole bunch of work... we will still have to * authorize that this specific child can be removed */ - if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE) == TRUE) - parent_authorized_for_delete = TRUE; + if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) + parent_authorized_for_delete_child = TRUE; } else { dvp = NULL; } @@ -5589,7 +5600,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); goto out; } - if (dvp && parent_authorized_for_delete == FALSE) { + if (dvp && parent_authorized_for_delete_child == FALSE) { VATTR_WANTED(&dva, va_mode); VATTR_WANTED(&dva, va_uid); VATTR_WANTED(&dva, va_gid); @@ -5645,7 +5656,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0) goto out; if ((rights & KAUTH_VNODE_DELETE) && - parent_authorized_for_delete == FALSE && + parent_authorized_for_delete_child == FALSE && ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0)) goto out; @@ -5658,13 +5669,14 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i goto out; /* - * If we're not the superuser, authorize based on file properties. 
+	 * If we're not the superuser, authorize based on file properties;
+	 * note that even if parent_authorized_for_delete_child is TRUE, we
+	 * need to check on the node itself.
 	 */
 	if (!vfs_context_issuser(ctx)) {
 		/* process delete rights */
 		if ((rights & KAUTH_VNODE_DELETE) &&
-		    parent_authorized_for_delete == FALSE &&
-		    ((result = vnode_authorize_delete(vcp)) != 0))
+		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
 			goto out;

 		/* process remaining rights */
@@ -5715,12 +5727,20 @@ out:
 			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
 		}
 	}
-	if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete == FALSE) {
+	if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
 		/*
-		 * parent was successfully and newly authorized for deletions
-		 * add it to the cache
+		 * parent was successfully and newly authorized for content deletions;
+		 * add it to the cache, but only if it doesn't have the sticky
+		 * bit set on it.  This same check is done earlier guarding
+		 * fetching of dva, and if we jumped to out without having done
+		 * this, we will have returned already because of a non-zero
+		 * 'result' value.
 		 */
-		vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE);
+		if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
+		    !(dva.va_mode & (S_ISVTX))) {
+			/* OK to cache delete rights */
+			vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
+		}
 	}
 	if (parent_ref)
 		vnode_put(vp);
diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c
index 869f3f5b3..ee31d2c82 100644
--- a/bsd/vfs/vfs_syscalls.c
+++ b/bsd/vfs/vfs_syscalls.c
@@ -5120,10 +5120,11 @@ auth_exit:
 	/*
 	 * We may encounter a race in the VNOP where the destination didn't
 	 * exist when we did the namei, but it does by the time we go and
-	 * try to create the entry.  In this case, we should re-drive this rename
-	 * call from the top again.
-	 */
-	if (error == EEXIST) {
+	 * try to create the entry.  In this case, we should re-drive this rename
+	 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
+	 * but other filesystems susceptible to this race could return it, too.
+	 */
+	if (error == ERECYCLE) {
 		do_retry = 1;
 	}
diff --git a/config/MasterVersion b/config/MasterVersion
index 58454343e..6ce3d2f9d 100644
--- a/config/MasterVersion
+++ b/config/MasterVersion
@@ -1,4 +1,4 @@
-9.7.0
+9.8.0

 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
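
A minimal sketch of the retry contract that the ERECYCLE convention in the vfs_syscalls.c hunk above implies (illustrative only: `rename_one_pass' is a hypothetical stand-in for the namei + VNOP_RENAME sequence, not a real xnu function; do_retry and error mirror the variables the syscall uses):

	int do_retry;

	do {
		do_retry = 0;
		error = rename_one_pass(fromname, toname, ctx);
		if (error == ERECYCLE) {
			/*
			 * A competing create won the race window; re-drive
			 * the lookup and the VNOP from the top.  ERECYCLE
			 * is swallowed here and never reaches userland.
			 */
			do_retry = 1;
			error = 0;
		}
	} while (do_retry);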
diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp
index 6346d0c2d..3ec11b1f1 100644
--- a/iokit/bsddev/IOKitBSDInit.cpp
+++ b/iokit/bsddev/IOKitBSDInit.cpp
@@ -857,4 +857,41 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout )
     return KERN_SUCCESS;
 }

+
+int IOBSDIsMediaEjectable( const char *cdev_name )
+{
+    int ret = 0;
+    OSDictionary *dictionary;
+    OSString *dev_name;
+
+    if (strncmp(cdev_name, "/dev/", 5) == 0) {
+        cdev_name += 5;
+    }
+
+    dictionary = IOService::serviceMatching( "IOMedia" );
+    if( dictionary ) {
+        dev_name = OSString::withCString( cdev_name );
+        if( dev_name ) {
+            IOService *service;
+            mach_timespec_t tv = { 5, 0 };    // wait up to "timeout" seconds for the device
+
+            dictionary->setObject( kIOBSDNameKey, dev_name );
+            dictionary->retain();
+            service = IOService::waitForService( dictionary, &tv );
+            if( service ) {
+                OSBoolean *ejectable = (OSBoolean *) service->getProperty( "Ejectable" );
+
+                if( ejectable ) {
+                    ret = (int)ejectable->getValue();
+                }
+
+            }
+            dev_name->release();
+        }
+        dictionary->release();
+    }
+
+    return ret;
+}
+
 } /* extern "C" */
diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c
index cd5bdbb71..17143604c 100644
--- a/osfmk/i386/acpi.c
+++ b/osfmk/i386/acpi.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -136,10 +136,24 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
 	acpi_hibernate_callback_data_t data;
 	boolean_t did_hibernate;
 #endif
+	unsigned int	cpu;
+	kern_return_t	rc;
+	unsigned int	my_cpu;

 	kprintf("acpi_sleep_kernel hib=%d\n",
 			current_cpu_datap()->cpu_hibernate);

+	/* Get all CPUs to be in the "off" state */
+	my_cpu = cpu_number();
+	for (cpu = 0; cpu < real_ncpus; cpu += 1) {
+		if (cpu == my_cpu)
+			continue;
+		rc = pmCPUExitHaltToOff(cpu);
+		if (rc != KERN_SUCCESS)
+			panic("Error %d trying to transition CPU %d to OFF",
+			      rc, cpu);
+	}
+
 	/* shutdown local APIC before passing control to BIOS */
 	lapic_shutdown();
diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c
index 1760eabf5..194a6576b 100644
--- a/osfmk/i386/cpu.c
+++ b/osfmk/i386/cpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -141,9 +141,13 @@ cpu_exit_wait(
 {
 	cpu_data_t	*cdp = cpu_datap(cpu);

+	/*
+	 * Wait until the CPU indicates that it has stopped.
+	 */
 	simple_lock(&x86_topo_lock);
 	while ((cdp->lcpu.state != LCPU_HALT)
-	       && (cdp->lcpu.state != LCPU_OFF)) {
+	       && (cdp->lcpu.state != LCPU_OFF)
+	       && !cdp->lcpu.stopped) {
 	    simple_unlock(&x86_topo_lock);
 	    cpu_pause();
 	    simple_lock(&x86_topo_lock);
diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c
index 88aa0f87b..8decbb943 100644
--- a/osfmk/i386/pmCPU.c
+++ b/osfmk/i386/pmCPU.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -354,6 +354,19 @@ pmCPUExitHalt(int cpu)
     return(rc);
 }

+kern_return_t
+pmCPUExitHaltToOff(int cpu)
+{
+    kern_return_t	rc = KERN_INVALID_ARGUMENT;
+
+    if (pmInitDone
+	&& pmDispatch != NULL
+	&& pmDispatch->exitHaltToOff != NULL)
+	rc = pmDispatch->exitHaltToOff(cpu_to_lcpu(cpu));
+
+    return(rc);
+}
+
 /*
  * Called to initialize the power management structures for the CPUs.
 */
diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h
index 6026f5ed6..cbfaebe65 100644
--- a/osfmk/i386/pmCPU.h
+++ b/osfmk/i386/pmCPU.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -37,7 +37,7 @@
 * This value should be changed each time that pmDispatch_t or pmCallBacks_t
 * changes.
 */
-#define PM_DISPATCH_VERSION	15
+#define PM_DISPATCH_VERSION	16

 /*
  * Dispatch table for functions that get installed when the power
@@ -68,6 +68,7 @@ typedef struct
     void		(*pmTimerStateSave)(void);
     void		(*pmTimerStateRestore)(void);
     kern_return_t	(*exitHalt)(x86_lcpu_t *lcpu);
+    kern_return_t	(*exitHaltToOff)(x86_lcpu_t *lcpu);
     void		(*markAllCPUsOff)(void);
     void		(*pmSetRunCount)(uint32_t count);
     boolean_t		(*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu);
@@ -112,6 +113,7 @@ void pmCPUHalt(uint32_t reason);
 void pmTimerSave(void);
 void pmTimerRestore(void);
 kern_return_t pmCPUExitHalt(int cpu);
+kern_return_t pmCPUExitHaltToOff(int cpu);

 #define PM_HALT_NORMAL	0	/* normal halt path */
 #define PM_HALT_DEBUG	1	/* debug code wants to halt */
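
A minimal sketch of how a power-management driver might supply the new hook, given the PM_DISPATCH_VERSION bump above (the pmDispatch_t slot name and x86_lcpu_t come from this header; the callback body and the partial initializer are illustrative assumptions, not part of this change):

	#include <i386/pmCPU.h>	/* pmDispatch_t, x86_lcpu_t (kernel-only header) */

	static kern_return_t
	my_exit_halt_to_off(x86_lcpu_t *lcpu)
	{
		(void)lcpu;
		/* driver-specific work to take the halted logical CPU to OFF */
		return KERN_SUCCESS;
	}

	/*
	 * Only the new slot is shown; a real table built against
	 * PM_DISPATCH_VERSION 16 fills in every callback, and
	 * pmCPUExitHaltToOff() returns KERN_INVALID_ARGUMENT until
	 * such a table is installed.
	 */
	static pmDispatch_t pm_dispatch_sketch = {
		.exitHaltToOff = my_exit_halt_to_off,
	};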