]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/vfs/vfs_bio.c
xnu-201.5.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_bio.c
index e11f6cb1dccaee99bcc98e99470295c8e6ecae8b..3e9b0a09badfe201681aefdc7aa49df9039afe53 100644 (file)
@@ -100,6 +100,7 @@ extern void reassignbuf(struct buf *, struct vnode *);
 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
 
 extern int niobuf;             /* The number of IO buffer headers for cluster IO */
+int blaundrycnt;
 
 #if TRACE
 struct proc *traceproc;
@@ -152,7 +153,7 @@ int need_iobuffer;
 
 #define BHASHENTCHECK(bp)      \
        if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
-               panic("%x: b_hash.le_prev is deadb", (bp));
+               panic("%x: b_hash.le_prev is not deadbeef", (bp));
 
 #define BLISTNONE(bp)  \
        (bp)->b_hash.le_next = (struct buf *)0; \
@@ -267,6 +268,19 @@ bremfree(bp)
        bp->b_timestamp = 0; 
 }
 
+static __inline__ void
+bufhdrinit(struct buf *bp)
+{
+       bzero((char *)bp, sizeof *bp);
+       bp->b_dev = NODEV;
+       bp->b_rcred = NOCRED;
+       bp->b_wcred = NOCRED;
+       bp->b_vnbufs.le_next = NOLIST;
+       bp->b_flags = B_INVAL;
+
+       return;
+}
+
 /*
  * Initialize buffers and hash links for buffers.
  */
@@ -278,9 +292,8 @@ bufinit()
        register int i;
        int metabuf;
        long whichq;
-#if ZALLOC_METADATA
        static void bufzoneinit();
-#endif /* ZALLOC_METADATA */
+       static void bcleanbuf_thread_init();
 
        /* Initialize the buffer queues ('freelists') and the hash table */
        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
@@ -294,17 +307,13 @@ bufinit()
        /* Initialize the buffer headers */
        for (i = 0; i < nbuf; i++) {
                bp = &buf[i];
-               bzero((char *)bp, sizeof *bp);
-               bp->b_dev = NODEV;
-               bp->b_rcred = NOCRED;
-               bp->b_wcred = NOCRED;
-               bp->b_vnbufs.le_next = NOLIST;
-               bp->b_flags = B_INVAL;
+               bufhdrinit(bp);
+
                /*
                 * metabuf buffer headers on the meta-data list and
                 * rest of the buffer headers on the empty list
                 */
-               if (--metabuf 
+               if (--metabuf) 
                        whichq = BQ_META;
                else 
                        whichq = BQ_EMPTY;
@@ -317,24 +326,20 @@ bufinit()
 
        for (; i < nbuf + niobuf; i++) {
                bp = &buf[i];
-               bzero((char *)bp, sizeof *bp);
-               bp->b_dev = NODEV;
-               bp->b_rcred = NOCRED;
-               bp->b_wcred = NOCRED;
-               bp->b_vnbufs.le_next = NOLIST;
-               bp->b_flags = B_INVAL;
+               bufhdrinit(bp);
                binsheadfree(bp, &iobufqueue, -1);
        }
 
        printf("using %d buffer headers and %d cluster IO buffer headers\n",
                nbuf, niobuf);
 
-#if ZALLOC_METADATA
-       /* Set up zones for meta-data */
+       /* Set up zones used by the buffer cache */
        bufzoneinit();
-#endif
 
-#if XXX
+       /* start the bcleanbuf() thread */
+       bcleanbuf_thread_init();
+
+#if 0  /* notyet */
        /* create a thread to do dynamic buffer queue balancing */
        bufq_balance_thread_init();
 #endif /* XXX */
@@ -364,8 +369,11 @@ bio_doread(vp, blkno, size, cred, async, queuetype)
                /* Start I/O for the buffer (keeping credentials). */
                SET(bp->b_flags, B_READ | async);
                if (cred != NOCRED && bp->b_rcred == NOCRED) {
-                       crhold(cred);
-                       bp->b_rcred = cred;
+                       /*
+                        * NFS has embedded ucred.
+                        * Can not crhold() here as that causes zone corruption
+                        */
+                       bp->b_rcred = crdup(cred);
                }
                VOP_STRATEGY(bp);
 
@@ -633,60 +641,45 @@ brelse(bp)
                && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
                kern_return_t kret;
                upl_t         upl;
-               upl_page_info_t *pl;
                int           upl_flags;
 
                if ( !ISSET(bp->b_flags, B_PAGELIST)) {
                        if ( !ISSET(bp->b_flags, B_INVAL)) {
-                               void  *object;
-                               off_t  file_offset;
-
-                               object = ubc_getobject(bp->b_vp, UBC_NOREACTIVATE);
-                               if (object == (void *)NULL)
-                                       panic("vmobject for vp is null");
-                               if (bp->b_bufsize & 0xfff)
-                                       panic("list request is with less than 4k");
-
-                               file_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
-
-                               kret = vm_fault_list_request(object, 
-                                                            (vm_object_offset_t)file_offset, bp->b_bufsize, 
-                                                            &upl, NULL, 0,  
-                                               (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS
-                                               | UPL_SET_INTERNAL));
+                               kret = ubc_create_upl(bp->b_vp, 
+                                                               ubc_blktooff(bp->b_vp, bp->b_lblkno),
+                                                               bp->b_bufsize, 
+                                                           &upl,
+                                                               NULL,
+                                                               UPL_PRECIOUS);
                                if (kret != KERN_SUCCESS)
                                        panic("brelse: Failed to get pagelists");
 #ifdef  UBC_DEBUG
                                upl_ubc_alias_set(upl, bp, 5);
 #endif /* UBC_DEBUG */
                        } else
-                               upl = (upl_t) 0;
+                               upl = (upl_t) 0;
                } else {
-                       upl = bp->b_pagelist;
-                       kret = kernel_upl_unmap(kernel_map, upl);
+                       upl = bp->b_pagelist;
+                       kret = ubc_upl_unmap(upl);
 
                        if (kret != KERN_SUCCESS)
                                panic("kernel_upl_unmap failed");
                        bp->b_data = 0;
                }
                if (upl) {
-                       pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
-
                        if (bp->b_flags & (B_ERROR | B_INVAL)) {
-                               if (bp->b_flags & (B_READ | B_INVAL))
+                           if (bp->b_flags & (B_READ | B_INVAL))
                                        upl_flags = UPL_ABORT_DUMP_PAGES;
                                else
                                        upl_flags = 0;
-                               kernel_upl_abort(upl, upl_flags);
+                               ubc_upl_abort(upl, upl_flags);
                        } else {
-                               if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
-                                       upl_flags = UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
+                           if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
+                                       upl_flags = UPL_COMMIT_SET_DIRTY ;
                                else
-                                       upl_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
-                               kernel_upl_commit_range(upl, 0, bp->b_bufsize,
-                                       upl_flags 
-                                               | UPL_COMMIT_INACTIVATE, 
-                                       pl, MAX_UPL_TRANSFER);
+                                   upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
+                               ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
+                                       UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                        }
                        s = splbio();
                        CLR(bp->b_flags, B_PAGELIST);
@@ -817,16 +810,13 @@ getblk(vp, blkno, size, slpflag, slptimeo, operation)
        int s, err;
        upl_t upl;
        upl_page_info_t *pl;
-       void * object;
        kern_return_t kret;
-       void *pager;
-       off_t file_offset;
        int error=0;
        int pagedirty = 0;
 
-start:
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
                     blkno * PAGE_SIZE, size, operation, 0, 0);
+start:
 
        s = splbio();
        if (bp = incore(vp, blkno)) {
@@ -887,29 +877,18 @@ start:
                        case BLK_READ:
                        case BLK_WRITE:
                                if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
-
-                                       if (bp->b_bufsize & 0xfff)
-                                               panic("list request is with less than 4k");
-
-                                       object = ubc_getobject(vp, UBC_NOREACTIVATE);
-                                       if (object == (void *)NULL)
-                                               panic("vmobject for vp is null");
-
-                                       file_offset = ubc_blktooff(vp, bp->b_lblkno);
-                                       
-                                       kret = vm_fault_list_request(object, 
-                                                                    (vm_object_offset_t)file_offset, bp->b_bufsize, 
-                                                                    &upl, NULL, 0,  
-                                                                    (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL));
-
+                                       kret = ubc_create_upl(vp,
+                                                                       ubc_blktooff(vp, bp->b_lblkno), 
+                                                                       bp->b_bufsize, 
+                                                                       &upl, 
+                                                                       &pl,
+                                                                       UPL_PRECIOUS);
                                        if (kret != KERN_SUCCESS)
                                                panic("Failed to get pagelists");
 
                                        SET(bp->b_flags, B_PAGELIST);
                                        bp->b_pagelist = upl;
 
-                                       pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
-
                                        if ( !upl_valid_page(pl, 0))
                                                panic("getblk: incore buffer without valid page");
 
@@ -918,12 +897,12 @@ start:
                                        else
                                                CLR(bp->b_flags, B_WASDIRTY);
 
-                                       kret = kernel_upl_map(kernel_map, upl, (vm_address_t *)&(bp->b_data));
+                                       kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
                                        if (kret != KERN_SUCCESS) {
-                                               panic("getblk: kernel_upl_map() "
-                                                     "failed with (%d)", kret);
+                                               panic("getblk: ubc_upl_map() failed with (%d)",
+                                                                 kret);
                                        }
-                                       if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
+                                       if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
                                }
                                break;
 
@@ -957,6 +936,13 @@ start:
                }
                if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
                        goto start;
+               if (incore(vp, blkno)) {
+                       SET(bp->b_flags, B_INVAL);
+                       binshash(bp, &invalhash);
+                       brelse(bp);
+                       goto start;
+               }
+
                /*
                 * if it is meta, the queue may be set to other 
                 * type so reset as well as mark it to be B_META
@@ -967,16 +953,17 @@ start:
                        SET(bp->b_flags, B_META);
                        queue = BQ_META;
                }
+               /*
+                * Insert in the hash so that incore() can find it 
+                */
+               binshash(bp, BUFHASH(vp, blkno)); 
+
                allocbuf(bp, size);
 
                switch (operation) {
                case BLK_META:
                        /* buffer data is invalid */
 
-                       /*
-                        * Insert in the hash so that incore() can find it 
-                        */
-                       binshash(bp, BUFHASH(vp, blkno)); 
 #if !ZALLOC_METADATA
                        if (bp->b_data)
                                panic("bp->b_data is not nul; %x",bp);
@@ -1004,27 +991,16 @@ start:
 
                case BLK_READ:
                case BLK_WRITE:
-                       /*
-                        * Insert in the hash so that incore() can find it 
-                        */
-                       binshash(bp, BUFHASH(vp, blkno)); 
-                       pager = ubc_getpager(vp);
-                       file_offset = ubc_blktooff(vp, blkno);
-
-                       object = ubc_getobject(vp, UBC_NOREACTIVATE);
-                       if (object == (void *)NULL)
-                               panic("vmobject for vp is null");
-                       if (bp->b_bufsize & 0xfff)
-                               panic("list request is with less than 4k");
 
                        if (ISSET(bp->b_flags, B_PAGELIST))
                                panic("B_PAGELIST in bp=%x",bp);
 
-                       kret = vm_fault_list_request(object, 
-                                       (vm_object_offset_t)file_offset, bp->b_bufsize, 
-                                       &upl, NULL, 0,  
-                                       (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL));
-
+                       kret = ubc_create_upl(vp,
+                                                       ubc_blktooff(vp, blkno),
+                                                       bp->b_bufsize, 
+                                                       &upl,
+                                                       &pl,
+                                                       UPL_PRECIOUS);
                        if (kret != KERN_SUCCESS)
                                panic("Failed to get pagelists");
 
@@ -1035,7 +1011,6 @@ start:
                        bp->b_pagelist = upl;
 
                        SET(bp->b_flags, B_PAGELIST);
-                       pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 
                        if (upl_valid_page(pl, 0)) {
                                SET(bp->b_flags, B_CACHE | B_DONE);
@@ -1102,9 +1077,9 @@ start:
                        } else {
                                bufstats.bufs_miss++;
                        }
-                       kret = kernel_upl_map(kernel_map, upl, (vm_address_t *)&(bp->b_data));
+                       kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
                        if (kret != KERN_SUCCESS) {
-                               panic("getblk: kernel_upl_map() "
+                               panic("getblk: ubc_upl_map() "
                                      "failed with (%d)", kret);
                        }
                        if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
@@ -1212,6 +1187,10 @@ struct meta_zone_entry meta_zones[] = {
        {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
        {NULL, 0, 0, "" } /* End */
 };
+#endif /* ZALLOC_METADATA */
+
+zone_t buf_hdr_zone;
+int buf_hdr_count;
 
 /*
  * Initialize the meta data zones
@@ -1219,6 +1198,7 @@ struct meta_zone_entry meta_zones[] = {
 static void
 bufzoneinit(void)
 {
+#if ZALLOC_METADATA
        int i;
 
        for (i = 0; meta_zones[i].mz_size != 0; i++) {
@@ -1228,8 +1208,11 @@ bufzoneinit(void)
                                        PAGE_SIZE,
                                        meta_zones[i].mz_name);
        }
+#endif /* ZALLOC_METADATA */
+       buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
 }
 
+#if ZALLOC_METADATA
 static zone_t
 getbufzone(size_t size)
 {
@@ -1372,7 +1355,8 @@ start:
        s = splbio();
        
        /* invalid request gets empty queue */
-       if ((*queue > BQUEUES) || (*queue < 0))
+       if ((*queue > BQUEUES) || (*queue < 0)
+               || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
                *queue = BQ_EMPTY;
 
        /* (*queue == BQUEUES) means no preference */
@@ -1395,14 +1379,24 @@ start:
                        *queue = BQ_EMPTY;
                        goto found;
                }
-#if DIAGNOSTIC
-               /* with UBC this is a fatal condition */
-               panic("getnewbuf: No useful buffers");
-#else
+
+               /* Create a new temporary buffer header */
+               bp = (struct buf *)zalloc(buf_hdr_zone);
+       
+               if (bp) {
+                       bufhdrinit(bp);
+                       BLISTNONE(bp);
+                       binshash(bp, &invalhash);
+                       SET(bp->b_flags, B_HDRALLOC);
+                       *queue = BQ_EMPTY;
+                       binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
+                       buf_hdr_count++;
+                       goto found;
+               }
+
                /* Log this error condition */
                printf("getnewbuf: No useful buffers");
-#endif  /* DIAGNOSTIC */
-       
+
                /* wait for a free buffer of any kind */
                needbuffer = 1;
                bufstats.bufs_sleeps++;
@@ -1510,10 +1504,15 @@ bcleanbuf(struct buf *bp)
        if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) 
                panic("bcleanbuf: le_prev is deadbeef");
 
-       /* If buffer was a delayed write, start it, and return 1 */
+       /*
+        * If buffer was a delayed write, start the IO by queuing
+        * it on the LAUNDRY queue, and return 1
+        */
        if (ISSET(bp->b_flags, B_DELWRI)) {
                splx(s);
-               bawrite (bp);
+               binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
+               blaundrycnt++;
+               wakeup(&blaundrycnt);
                return (1);
        }
 
@@ -1558,6 +1557,7 @@ bcleanbuf(struct buf *bp)
        s = splbio();
 
        /* clear out various other fields */
+       bp->b_bufsize = 0;
        bp->b_data = 0;
        bp->b_flags = B_BUSY;
        bp->b_dev = NODEV;
@@ -1712,7 +1712,8 @@ vfs_bufstats()
        register struct buf *bp;
        register struct bqueues *dp;
        int counts[MAXBSIZE/CLBYTES+1];
-       static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
+       static char *bname[BQUEUES] =
+               { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
 
        for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
                count = 0;
@@ -1733,21 +1734,30 @@ vfs_bufstats()
 }
 #endif /* DIAGNOSTIC */
 
+#define        NRESERVEDIOBUFS 16
 
 struct buf *
-alloc_io_buf(vp)
+alloc_io_buf(vp, priv)
        struct vnode *vp;
+       int priv;
 {
        register struct buf *bp;
        int s;
 
        s = splbio();
 
+       while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
+               need_iobuffer = 1;
+               bufstats.bufs_iobufsleeps++;
+               (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
+       }
+
        while ((bp = iobufqueue.tqh_first) == NULL) {
                need_iobuffer = 1;
                bufstats.bufs_iobufsleeps++;
-               tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
+               (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
        }
+
        TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
        bp->b_timestamp = 0; 
 
@@ -1944,6 +1954,12 @@ bufq_balance_thread_init()
                bufqlim[BQ_META].bl_target = nbuftarget/4;
                bufqlim[BQ_META].bl_stale = META_IS_STALE;
 
+               /* LAUNDRY queue */
+               bufqlim[BQ_LAUNDRY].bl_nlow = 0;
+               bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
+               bufqlim[BQ_LAUNDRY].bl_target = 0;
+               bufqlim[BQ_LAUNDRY].bl_stale = 30;
+
                buqlimprt(1);
        }
 
@@ -2012,8 +2028,8 @@ balancebufq(int q)
        if ((q < 0) || (q >= BQUEUES))
                goto out;
 
-       /* LOCKED queue MUST not be balanced */
-       if (q == BQ_LOCKED)
+       /* LOCKED or LAUNDRY queue MUST not be balanced */
+       if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
                goto out;
 
        n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
@@ -2092,7 +2108,8 @@ void
 buqlimprt(int all)
 {
        int i;
-    static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
+    static char *bname[BQUEUES] =
+               { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
 
        if (all)
                for (i = 0; i < BQUEUES; i++) {
@@ -2109,3 +2126,42 @@ buqlimprt(int all)
                        printf("cur = %d, ", (long)bufqlim[i].bl_num);
                }
 }
+
+/*
+ * If the getnewbuf() calls bcleanbuf() on the same thread
+ * there is a potential for stack overrun and deadlocks.
+ * So we always handoff the work to worker thread for completion
+ */
+
+static void
+bcleanbuf_thread_init()
+{
+       static void bcleanbuf_thread();
+
+       /* create worker thread */
+       kernel_thread(kernel_task, bcleanbuf_thread);
+}
+
+static void
+bcleanbuf_thread()
+{
+       boolean_t       funnel_state;
+       struct buf *bp;
+
+       funnel_state = thread_funnel_set(kernel_flock, TRUE);
+
+doit:
+       while (blaundrycnt == 0)
+               (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
+       bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
+       /* Remove from the queue */
+       bremfree(bp);
+       blaundrycnt--;
+       /* do the IO */
+       bawrite(bp);
+       /* start again */
+       goto doit;
+
+       (void) thread_funnel_set(kernel_flock, funnel_state);
+}
+