git.saurik.com Git - apple/xnu.git/commitdiff
xnu-344.12.2.tar.gz mac-os-x-1022 v344.12.2
authorApple <opensource@apple.com>
Tue, 12 Aug 2003 21:04:55 +0000 (21:04 +0000)
committerApple <opensource@apple.com>
Tue, 12 Aug 2003 21:04:55 +0000 (21:04 +0000)
58 files changed:
bsd/conf/files
bsd/conf/version.minor
bsd/hfs/hfs.h
bsd/hfs/hfs_attrlist.c
bsd/hfs/hfs_btreeio.c
bsd/hfs/hfs_catalog.c
bsd/hfs/hfs_cnode.c
bsd/hfs/hfs_format.h
bsd/hfs/hfs_link.c
bsd/hfs/hfs_lookup.c
bsd/hfs/hfs_mount.h
bsd/hfs/hfs_readwrite.c
bsd/hfs/hfs_search.c
bsd/hfs/hfs_vfsops.c
bsd/hfs/hfs_vfsutils.c
bsd/hfs/hfs_vnops.c
bsd/hfs/hfscommon/BTree/BTree.c
bsd/hfs/hfscommon/BTree/BTreeAllocate.c
bsd/hfs/hfscommon/BTree/BTreeMiscOps.c
bsd/hfs/hfscommon/BTree/BTreeScanner.c
bsd/hfs/hfscommon/BTree/BTreeTreeOps.c
bsd/hfs/hfscommon/Catalog/FileIDsServices.c
bsd/hfs/hfscommon/Misc/FileExtentMapping.c
bsd/hfs/hfscommon/Misc/VolumeAllocation.c
bsd/hfs/hfscommon/headers/BTreesInternal.h
bsd/hfs/hfscommon/headers/BTreesPrivate.h
bsd/kern/kern_mman.c
bsd/kern/qsort.c
bsd/kern/ubc_subr.c
bsd/miscfs/specfs/spec_vnops.c
bsd/nfs/nfs_bio.c
bsd/nfs/nfs_socket.c
bsd/nfs/nfs_vnops.c
bsd/sys/buf.h
bsd/sys/disk.h
bsd/sys/malloc.h
bsd/sys/mount.h
bsd/sys/ubc.h
bsd/vfs/Makefile
bsd/vfs/vfs_bio.c
bsd/vfs/vfs_cluster.c
bsd/vfs/vfs_journal.c [new file with mode: 0644]
bsd/vfs/vfs_journal.h [new file with mode: 0644]
bsd/vfs/vfs_subr.c
iokit/IOKit/IOKitKeys.h
iokit/KernelConfigTables.cpp
iokit/conf/version.minor
libkern/conf/version.minor
libsa/conf/version.minor
osfmk/conf/kernelversion.minor
osfmk/conf/version.minor
osfmk/i386/loose_ends.c
osfmk/ppc/cswtch.s
osfmk/ppc/mappings.c
osfmk/ppc/pmap.c
osfmk/vm/vm_init.c
osfmk/vm/vm_kern.c
pexpert/conf/version.minor

index 7012205faeab42c488309d26baf8c59cd5b4e7fe..817d99f42882c3fde203f74a8c1670f331df1ed5 100644 (file)
@@ -137,6 +137,7 @@ bsd/vfs/vfs_support.c                       standard
 bsd/vfs/vfs_utfconv.c                  standard
 bsd/vfs/vfs_vnops.c                    standard
 bsd/vfs/vnode_if.c                     standard
+bsd/vfs/vfs_journal.c                  standard
 
 bsd/miscfs/deadfs/dead_vnops.c         standard
 bsd/miscfs/fdesc/fdesc_vfsops.c                optional fdesc
@@ -501,6 +502,8 @@ bsd/kern/mach_header.c                      standard
 bsd/kern/mach_loader.c                 standard
 bsd/kern/posix_sem.c                   standard
 bsd/kern/posix_shm.c                   standard
+# XXXdbg - I need this in the journaling and block cache code
+bsd/kern/qsort.c                       standard
 
 bsd/vm/vnode_pager.c                   standard
 bsd/vm/vm_unix.c                       standard
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index 9086981b097bd22232c44e2a5346e3a4533747ad..b82adcf2e16dad50436f6316fc2f525671d5dff2 100644 (file)
@@ -36,6 +36,8 @@
 #include <sys/quota.h>
 #include <sys/dirent.h>
 
+#include <vfs/vfs_journal.h>
+
 #include <hfs/hfs_format.h>
 #include <hfs/hfs_catalog.h>
 #include <hfs/hfs_cnode.h>
@@ -108,6 +110,7 @@ struct vcb_t {
     int16_t                    vcbAtrb;
     int16_t                    vcbFlags;
     int16_t                    vcbspare;
+    u_int32_t                  vcbJinfoBlock;
 
     u_int32_t                  vcbCrDate;
     u_int32_t                  vcbLsMod;
@@ -180,6 +183,7 @@ typedef struct hfsmount {
        u_int8_t                        hfs_fs_ronly;                   /* Whether this was mounted as read-initially  */
        u_int8_t                        hfs_unknownpermissions; /* Whether this was mounted with MNT_UNKNOWNPERMISSIONS */
        u_int8_t                        hfs_media_writeable;
+       u_int8_t                        hfs_orphans_cleaned;
        
        /* Physical Description */
        u_long                          hfs_phys_block_count;   /* Num of PHYSICAL blocks of volume */
@@ -211,10 +215,55 @@ typedef struct hfsmount {
        unicode_to_hfs_func_t   hfs_get_hfsname;
  
        struct quotafile        hfs_qfiles[MAXQUOTAS];    /* quota files */
+
+       // XXXdbg
+       void                *jnl;           // the journal for this volume (if one exists)
+       struct vnode        *jvp;           // device where the journal lives (may be equal to devvp)
+       u_int32_t            jnl_start;     // start block of the journal file (so we don't delete it)
+       u_int32_t            hfs_jnlfileid;
+       u_int32_t            hfs_jnlinfoblkid;
+    volatile int         readers;
+       volatile int         blocker;
 } hfsmount_t;
 
 #define hfs_private_metadata_dir       hfs_privdir_desc.cd_cnid
 
+#define hfs_global_shared_lock_acquire(hfsmp)    \
+    do { \
+       if (hfsmp->blocker) { \
+              tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+           continue; \
+          } \
+          hfsmp->readers++; \
+       break; \
+       } while (1)
+
+#define hfs_global_shared_lock_release(hfsmp)    \
+    do { \
+           hfsmp->readers--; \
+           if (hfsmp->readers == 0) { \
+               wakeup((caddr_t)&hfsmp->readers); \
+        } \
+    } while (0)
+
+#define hfs_global_exclusive_lock_acquire(hfsmp) \
+    do { \
+       if (hfsmp->blocker) { \
+              tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+           continue; \
+          } \
+       if (hfsmp->readers != 0) { \
+              tsleep((caddr_t)&hfsmp->readers, PRIBIO, "journal_enable/disble", 0); \
+           continue; \
+       } \
+       hfsmp->blocker = 1; \
+       break; \
+       } while (1)
+     
+#define hfs_global_exclusive_lock_release(hfsmp) \
+    hfsmp->blocker = 0; \
+       wakeup((caddr_t)&hfsmp->blocker)
+
 #define MAXHFSVNODELEN         31
 
 
@@ -325,6 +374,7 @@ enum { kdirentMaxNameBytes = NAME_MAX };
 #define VTOHFS(VP) ((struct hfsmount *)((VP)->v_mount->mnt_data))
 #define        VFSTOHFS(MP) ((struct hfsmount *)(MP)->mnt_data)        
 #define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp)
+#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data)
 
 /*
  * Various ways to acquire a VCB pointer:
@@ -332,6 +382,7 @@ enum { kdirentMaxNameBytes = NAME_MAX };
 #define VTOVCB(VP) (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
 #define VFSTOVCB(MP) (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb))
 #define HFSTOVCB(HFSMP) (&(HFSMP)->hfs_vcb.vcb_vcb)
+#define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
 
 
 #define E_NONE 0
@@ -376,6 +427,8 @@ extern int hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flag
 
 extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve);
 
+extern void hfs_remove_orphans(struct hfsmount *);
+
 
 short MacToVFSError(OSErr err);
 
@@ -388,6 +441,8 @@ u_long FindMetaDataDirectory(ExtendedVCB *vcb);
 #define  HFS_SYNCTRANS         1
 
 extern int hfs_btsync(struct vnode *vp, int sync_transaction);
+// used as a callback by the journaling code
+extern void hfs_sync_metadata(void *arg);
 
 short make_dir_entry(FCB **fileptr, char *name, u_int32_t fileID);
 
@@ -399,7 +454,13 @@ unsigned long BestBlockSizeFit(unsigned long allocationBlockSize,
 OSErr  hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb,
                struct proc *p);
 OSErr  hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
-               off_t embeddedOffset, u_int64_t disksize, struct proc *p);
+               off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args);
+
+extern int     hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+                                                          void *_args, int embeddedOffset, int mdb_offset,
+                                                          HFSMasterDirectoryBlock *mdbp, struct ucred *cred);
+extern u_long  GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+                                       struct cat_attr *fattr, struct cat_fork *forkinfo);
 
 int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode,
                     unicode_to_hfs_func_t *get_hfsname);
index 650d6fa4a67bb54420d4a7f916cae89860ac8979..f53d05e3fdac0368525211a2f3af3fb8e8fc5860 100644 (file)
@@ -194,15 +194,35 @@ hfs_getattrlist(ap)
                if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0)
                        return (error);
 
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                   if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                               hfs_global_shared_lock_release(hfsmp);
+                               return error;
+                   }
+               }
+
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p);
-               if (error)
-                       return (error);
+               if (error) {
+                   if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                   return (error);
+               }
 
                error = cat_insertfilethread(hfsmp, &cp->c_desc);
 
                /* Unlock catalog b-tree */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p);
+
+               if (hfsmp->jnl) {
+                   journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                if (error)
                        return (error);
        }
@@ -350,6 +370,17 @@ hfs_setattrlist(ap)
        }
        if (cp->c_flag & (C_NOEXISTS | C_DELETED))
                return (ENOENT);
+
+       // XXXdbg - don't allow modifying the journal or journal_info_block
+       if (hfsmp->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+               
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        /*
         * Ownership of a file is required in one of two classes of calls:
         *
@@ -447,14 +478,12 @@ hfs_setattrlist(ap)
         * If any cnode attributes changed then do an update.
         */
        if (alist->volattr == 0) {
-               struct timeval atime, mtime;
+               struct timeval tv;
 
-               atime.tv_sec = cp->c_atime;
-               atime.tv_usec = 0;
-               mtime.tv_sec = cp->c_mtime;
-               mtime.tv_usec = cp->c_mtime_nsec / 1000;
                cp->c_flag |= C_MODIFIED;
-               if ((error = VOP_UPDATE(vp, &atime, &mtime, 1)))
+               tv = time;
+               CTIMES(cp, &tv, &tv);
+               if ((error = VOP_UPDATE(vp, &tv, &tv, 1)))
                        goto ErrorExit;
        }
        /* Volume Rename */
@@ -482,9 +511,28 @@ hfs_setattrlist(ap)
                        to_desc.cd_cnid = cp->c_cnid;
                        to_desc.cd_flags = CD_ISDIR;
 
+                       // XXXdbg
+                       hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       hfs_global_shared_lock_release(hfsmp);
+                                       error = EINVAL;
+                                       /* Restore the old name in the VCB */
+                                       copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
+                                       vcb->vcbFlags |= 0xFF00;
+                                       goto ErrorExit;
+                           }
+                       }
+
+
                        /* Lock catalog b-tree */
                        error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
                        if (error) {
+                               if (hfsmp->jnl) {
+                                   journal_end_transaction(hfsmp->jnl);
+                               }
+                               hfs_global_shared_lock_release(hfsmp);
+
                                /* Restore the old name in the VCB */
                                copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
                                vcb->vcbFlags |= 0xFF00;
@@ -495,7 +543,12 @@ hfs_setattrlist(ap)
 
                        /* Unlock the Catalog */
                        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-
+                       
+                       if (hfsmp->jnl) {
+                           journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                       
                        if (error) {
                                /* Restore the old name in the VCB */
                                copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
@@ -601,12 +654,17 @@ hfs_readdirattr(ap)
        int error = 0;
        int depleted = 0;
        int index, startindex;
-       int i;
+       int i, dir_entries;
        struct cat_desc *lastdescp = NULL;
        struct cat_desc prevdesc;
        char * prevnamebuf = NULL;
        struct cat_entrylist *ce_list = NULL;
 
+       dir_entries = dcp->c_entries;
+       if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) {
+               dir_entries -= 3;
+       }
+
        *(ap->a_actualcount) = 0;
        *(ap->a_eofflag) = 0;
        
@@ -639,7 +697,7 @@ hfs_readdirattr(ap)
 
        /* Convert uio_offset into a directory index. */
        startindex = index = uio->uio_offset / sizeof(struct dirent);
-       if ((index + 1) > dcp->c_entries) {
+       if ((index + 1) > dir_entries) {
                *(ap->a_eofflag) = 1;
                error = 0;
                goto exit;
@@ -781,7 +839,7 @@ hfs_readdirattr(ap)
                                /* Termination checks */
                                if ((--maxcount <= 0) ||
                                    (uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) ||
-                                   (index >= dcp->c_entries)) {
+                                   (index >= dir_entries)) {
                                        depleted = 1;
                                        break;
                                }
@@ -789,7 +847,7 @@ hfs_readdirattr(ap)
                } /* for each catalog entry */
 
                /* If there are more entries then save the last name. */
-               if (index < dcp->c_entries
+               if (index < dir_entries
                &&  !(*(ap->a_eofflag))
                &&  lastdescp != NULL) {
                        if (prevnamebuf == NULL)
@@ -1408,9 +1466,12 @@ packdirattr(
        if (ATTR_DIR_ENTRYCOUNT & attr) {
                u_long entries = cattrp->ca_entries;
 
-               if ((descp->cd_parentcnid == kRootParID) &&
-                   (hfsmp->hfs_private_metadata_dir != 0))
-                       --entries;      /* hide private dir */
+               if (descp->cd_parentcnid == kRootParID) {
+                       if (hfsmp->hfs_private_metadata_dir != 0)
+                               --entries;          /* hide private dir */
+                       if (hfsmp->jnl)
+                               entries -= 2;   /* hide the journal files */
+               }
 
                *((u_long *)attrbufptr)++ = entries;
        }
index 6947a695a3fc77828517c1263b3430b36be794fa..a70290d05b909fe756aca2939b8d9a0425a010a5 100644 (file)
@@ -68,7 +68,7 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
        if (options & kGetEmptyBlock)
                bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META);
        else
-       retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
+               retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
 
     DBG_ASSERT(bp != NULL);
     DBG_ASSERT(bp->b_data != NULL);
@@ -83,6 +83,9 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
         block->buffer = bp->b_data;
         block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0;       /* not found in cache ==> came from disk */
 
+               // XXXdbg 
+               block->isModified = 0;
+
 #if BYTE_ORDER == LITTLE_ENDIAN
         /* Endian swap B-Tree node (only if it's a valid block) */
         if (!(options & kGetEmptyBlock)) {
@@ -116,9 +119,31 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option
 }
 
 
+__private_extern__
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
+{
+       struct hfsmount *hfsmp = VTOHFS(vp);
+    struct buf *bp = NULL;
+
+       if (hfsmp->jnl == NULL) {
+               return;
+       }
+       
+    bp = (struct buf *) blockPtr->blockHeader;
+    if (bp == NULL) {
+               panic("ModifyBlockStart: null bp  for blockdescptr 0x%x?!?\n", blockPtr);
+               return;
+    }
+
+       journal_modify_block_start(hfsmp->jnl, bp);
+       blockPtr->isModified = 1;
+}
+
+
 __private_extern__
 OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
 {
+    struct hfsmount    *hfsmp = VTOHFS(vp);
     extern int bdwrite_internal(struct buf *, int);
     OSStatus   retval = E_NONE;
     struct buf *bp = NULL;
@@ -131,16 +156,25 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock
     }
 
     if (options & kTrashBlock) {
-        bp->b_flags |= B_INVAL;
-       brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+               bp->b_flags |= B_INVAL;
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       journal_kill_block(hfsmp->jnl, bp);
+               } else {
+                       brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+               }
     } else {
         if (options & kForceWriteBlock) {
-            retval = VOP_BWRITE(bp);
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but forcewrite set! bp 0x%x\n", bp);
+                               }
+                               retval = journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               retval = VOP_BWRITE(bp);
+                       }
         } else if (options & kMarkBlockDirty) {
-#if FORCESYNCBTREEWRITES
-            VOP_BWRITE(bp);
-#else
-            if (options & kLockTransaction) {
+            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
                 /*
                  *
                  * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
@@ -156,24 +190,44 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock
                      /* Rollback sync time to cause a sync on lock release... */
                      (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 }
-                bp->b_flags |= B_LOCKED;
-           }
+
+                               bp->b_flags |= B_LOCKED;
+            }
+
             /* 
              * Delay-write this block.
              * If the maximum delayed buffers has been exceeded then
              * free up some buffers and fall back to an asynchronous write.
              */
-            if (bdwrite_internal(bp, 1) != 0) {
+                       if (hfsmp->jnl) {
+                               if (blockPtr->isModified == 0) {
+                                       panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp);
+                               }
+                               retval = journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else if (bdwrite_internal(bp, 1) != 0) {
                 hfs_btsync(vp, 0);
                 /* Rollback sync time to cause a sync on lock release... */
                 (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
                 bp->b_flags &= ~B_LOCKED;
                 bawrite(bp);
             }
-
-#endif
         } else {
-               brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       // check if we had previously called journal_modify_block_start() 
+                       // on this block and if so, abort it (which will call brelse()).
+                       if (hfsmp->jnl && blockPtr->isModified) {
+                               // XXXdbg - I don't want to call modify_block_abort()
+                               //          because I think it may be screwing up the
+                               //          journal and blowing away a block that has
+                               //          valid data in it.
+                               //   
+                               //    journal_modify_block_abort(hfsmp->jnl, bp);
+                               //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
+                               journal_modify_block_end(hfsmp->jnl, bp);
+                               blockPtr->isModified = 0;
+                       } else {
+                               brelse(bp);     /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+                       }
         };
     };
 
@@ -187,17 +241,16 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 {
 #pragma unused (maxEOF)
 
-       OSStatus        retval;
-       UInt64          actualBytesAdded;
+       OSStatus        retval, ret;
+       UInt64          actualBytesAdded, origSize;
        UInt64          bytesToAdd;
-    UInt32             extendFlags;
        u_int32_t       startAllocation;
        u_int32_t       fileblocks;
        BTreeInfoRec btInfo;
        ExtendedVCB     *vcb;
        FCB                     *filePtr;
     struct proc *p = NULL;
-
+       UInt64          trim = 0;       
 
        filePtr = GetFileControlBlock(vp);
 
@@ -225,13 +278,14 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
        {
                p = current_proc();
                /* lock extents b-tree (also protects volume bitmap) */
-               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
+               retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
                if (retval)
                        return (retval);
        }
 
     (void) BTGetInformation(filePtr, 0, &btInfo);
 
+#if 0  // XXXdbg
        /*
         * The b-tree code expects nodes to be contiguous. So when
         * the allocation block size is less than the b-tree node
@@ -241,14 +295,38 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
                extendFlags = 0;
        } else {
                /* Ensure that all b-tree nodes are contiguous on disk */
-               extendFlags = kEFAllMask | kEFContigMask;
+               extendFlags = kEFContigMask;
        }
+#endif
 
+       origSize = filePtr->fcbEOF;
        fileblocks = filePtr->ff_blocks;
        startAllocation = vcb->nextAllocation;
 
-       retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded);
-
+       // loop trying to get a contiguous chunk that's an integer multiple
+       // of the btree node size.  if we can't get a contiguous chunk that
+       // is at least the node size then we break out of the loop and let
+       // the error propagate back up.
+       do {
+               retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded);
+               if (retval == dskFulErr && actualBytesAdded == 0) {
+
+                       if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) {
+                               // if we're here there's nothing else to try, we're out
+                               // of space so we break and bail out.
+                               break;
+                       } else {
+                               bytesToAdd >>= 1;
+                               if (bytesToAdd < btInfo.nodeSize) {
+                                       bytesToAdd = btInfo.nodeSize;
+                               } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
+                                       // make sure it's an integer multiple of the nodeSize
+                                       bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
+                               }
+                       }
+               }
+       } while (retval == dskFulErr && actualBytesAdded == 0);
+       
        /*
         * If a new extent was added then move the roving allocator
         * reference forward by the current b-tree file size so 
@@ -260,25 +338,74 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
                vcb->nextAllocation += fileblocks;
        }
                
+       filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+       // XXXdbg ExtendFileC() could have returned an error even though
+       // it grew the file to be big enough for our needs.  If this is
+       // the case, we don't care about retval so we blow it away.
+       //
+       if (filePtr->fcbEOF >= minEOF && retval != 0) {
+               retval = 0;
+       }
+
+       // XXXdbg if the file grew but isn't large enough or isn't an
+       // even multiple of the nodeSize then trim things back.  if
+       // the file isn't large enough we trim back to the original
+       // size.  otherwise we trim back to be an even multiple of the
+       // btree node size.
+       //
+       if ((filePtr->fcbEOF < minEOF) || (actualBytesAdded % btInfo.nodeSize) != 0) {
+
+               if (filePtr->fcbEOF < minEOF) {
+                       retval = dskFulErr;
+                       
+                       if (filePtr->fcbEOF < origSize) {
+                               panic("hfs: btree file eof %lld less than orig size %lld!\n",
+                                         filePtr->fcbEOF, origSize);
+                       }
+                       
+                       trim = filePtr->fcbEOF - origSize;
+                       if (trim != actualBytesAdded) {
+                               panic("hfs: trim == %lld but actualBytesAdded == %lld\n",
+                                         trim, actualBytesAdded);
+                       }
+               } else {
+                       trim = (actualBytesAdded % btInfo.nodeSize);
+               }
+
+               ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0);
+               filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+               // XXXdbg - panic if the file didn't get trimmed back properly
+               if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+                       panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb 0x%x\n",
+                                 filePtr->fcbEOF, btInfo.nodeSize, filePtr);
+               }
+
+               if (ret) {
+                       // XXXdbg - this probably doesn't need to be a panic()
+                       panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n",
+                                 filePtr->fcbEOF, trim, ret);
+                       return ret;
+               }
+               actualBytesAdded -= trim;
+       }
+
        if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
                /*
                 * Get any extents overflow b-tree changes to disk ASAP!
                 */
-               if (retval == 0) {
-                       (void) BTFlushPath(VTOF(vcb->extentsRefNum));
-                       (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
-               }
+               (void) BTFlushPath(VTOF(vcb->extentsRefNum));
+               (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
+
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
        }
-       if (retval)
-               return (retval);
-       
-       filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
 
-       retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);       
-       if (retval)
-               return (retval);
-       
+       if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+               panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
+                         filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
+       }
+
        /*
         * Update the Alternate MDB or Alternate VolumeHeader
         */
@@ -287,8 +414,12 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
            (VTOC(vp)->c_fileid == kHFSAttributesFileID)
           ) {
                MarkVCBDirty( vcb );
-               retval = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
+               ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
        }
+
+       ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
+       if (ret)
+               return (ret);
        
        return retval;
 }
@@ -300,6 +431,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
 static int
 ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
 {
+       struct hfsmount *hfsmp = VTOHFS(vp);
        struct buf *bp = NULL;
        daddr_t blk;
        daddr_t blkcnt;
@@ -311,14 +443,36 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
                bp = getblk(vp, blk, blksize, 0, 0, BLK_META);
                if (bp == NULL)
                        continue;
+
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                   //journal_modify_block_start(hfsmp->jnl, bp);
+               }
+
                bzero((char *)bp->b_data, blksize);
                bp->b_flags |= B_AGE;
 
-                /* wait/yield every 32 blocks so we don't hog all the buffers */
-               if ((blk % 32) == 0)
-                       VOP_BWRITE(bp);
-               else
-                       bawrite(bp);
+        // XXXdbg
+               if (hfsmp->jnl) {
+                       // XXXdbg -- skipping this for now since it makes a transaction
+                       //           become *way* too large
+                       //journal_modify_block_end(hfsmp->jnl, bp);
+
+                       // XXXdbg - remove this once we decide what to do with the
+                       //          writes to the journal
+                       if ((blk % 32) == 0)
+                           VOP_BWRITE(bp);
+                       else
+                           bawrite(bp);
+               } else {
+                       /* wait/yield every 32 blocks so we don't hog all the buffers */
+                       if ((blk % 32) == 0)
+                               VOP_BWRITE(bp);
+                       else
+                               bawrite(bp);
+               }
                --blkcnt;
                ++blk;
        }
index 7d6999e6528b25563aa0e60a9452802e8c93e882..769576d7ecf3d4edc8e4954af96732eaacc192ae 100644 (file)
@@ -261,6 +261,11 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp)
        if (result)
                goto exit;
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        BDINIT(file_data, &file_rec);
        result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]);
        if (result) 
@@ -288,6 +293,7 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp)
                (void) BTFlushPath(fcb);
        }       
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
@@ -426,6 +432,15 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantr
        encoding = getencoding(recp);
        hint = iterator->hint.nodeNum;
 
+       /* Hide the journal files (if any) */
+       if (hfsmp->jnl &&
+               ((cnid == hfsmp->hfs_jnlfileid) ||
+                (cnid == hfsmp->hfs_jnlinfoblkid))) {
+
+               result = ENOENT;
+               goto exit;
+       }
+
        /*
         * When a hardlink link is encountered, auto resolve it
         */
@@ -529,6 +544,11 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
                hfs_setencodingbits(hfsmp, encoding);
        }
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        /*
         * Insert the thread record first
         */
@@ -617,9 +637,8 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
        vcb->vcbNxtCNID = nextCNID;
        vcb->vcbFlags |= 0xFF00;
 
-       (void) BTFlushPath(fcb);
-
 exit:
+       (void) BTFlushPath(fcb);
        FREE(bto, M_TEMP);
 
        return MacToVFSError(result);
@@ -678,6 +697,11 @@ cat_rename (
        if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0)))
                goto exit;      
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        to_key = (HFSPlusCatalogKey *)&to_iterator->key;
        MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK);
        BDINIT(btdata, recp);
@@ -781,7 +805,17 @@ cat_rename (
                result = BTInsertRecord(fcb, to_iterator, &btdata, datasize);
                if (result) {
                        /* Try and restore original before leaving */
+                   // XXXdbg
+                   #if 1
+                      {
+                       int err;
+                       err = BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+                       if (err)
+                               panic("cat_create: could not undo (BTInsert = %d)", err);
+                      }
+                   #else
                        (void) BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+                   #endif
                        goto exit;
                }
                sourcegone = 1;
@@ -794,7 +828,17 @@ cat_rename (
                result = BTDeleteRecord(fcb, from_iterator);
                if (result) {
                        /* Try and delete new record before leaving */
+                 // XXXdbg
+                 #if 1
+                    {
+                       int err;
+                       err = BTDeleteRecord(fcb, to_iterator);
+                       if (err)
+                               panic("cat_create: could not undo (BTDelete = %d)", err);
+                    }                  
+                 #else
                        (void) BTDeleteRecord(fcb, to_iterator);
+                 #endif
                        goto exit;
                }
        }
@@ -834,8 +878,8 @@ cat_rename (
                        FREE(pluskey, M_TEMP);
                }
        }
-       (void) BTFlushPath(fcb);
 exit:
+       (void) BTFlushPath(fcb);
        if (from_iterator)
                FREE(from_iterator, M_TEMP);
        if (to_iterator)
@@ -874,7 +918,6 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
         * A directory must be empty
         * A file must be zero length (no blocks)
         */
-
        if (descp->cd_cnid < kHFSFirstUserCatalogNodeID ||
            descp->cd_parentcnid == kRootParID)
                return (EINVAL);
@@ -899,6 +942,11 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
        if (result)
                goto exit;
 
+       // XXXdbg - preflight all btree operations to make sure there's enough space
+       result = BTCheckFreeSpace(fcb);
+       if (result)
+               goto exit;
+
        /* Delete record */
        result = BTDeleteRecord(fcb, iterator);
        if (result)
@@ -910,8 +958,8 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
 
        TrashCatalogIterator(vcb, descp->cd_parentcnid);
 
-       (void) BTFlushPath(fcb);
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
@@ -973,9 +1021,8 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr
        /* Update the node hint. */
        descp->cd_hint = iterator->hint.nodeNum;
 
-       (void) BTFlushPath(fcb);
-
 exit:
+       (void) BTFlushPath(fcb);
        FREE(iterator, M_TEMP);
 
        return MacToVFSError(result);
@@ -1242,13 +1289,22 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec,
                return (0);     /* stop */
        }
 
-       /* Hide the private meta data directory. */
-       if (parentcnid == kRootDirID  &&
-           rec->recordType == kHFSPlusFolderRecord &&
-           rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir) {
-               return (1);     /* continue */
+       /* Hide the private meta data directory and journal files */
+       if (parentcnid == kRootDirID) {
+               if ((rec->recordType == kHFSPlusFolderRecord) &&
+                   (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) {
+                       return (1);     /* continue */
+               }
+               if (hfsmp->jnl &&
+                   (rec->recordType == kHFSPlusFileRecord) &&
+                   ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) ||
+                    (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) {
+
+                       return (1);     /* continue */
+               }
        }
 
+
        cep = &list->entry[list->realentries++];
 
        if (state->stdhfs) {
@@ -1408,6 +1464,8 @@ exit:
 struct read_state {
        u_int32_t       cbs_parentID;
        u_int32_t       cbs_hiddenDirID;
+       u_int32_t       cbs_hiddenJournalID;
+       u_int32_t       cbs_hiddenInfoBlkID;
        off_t           cbs_lastoffset;
        struct uio *    cbs_uio;
        ExtendedVCB *   cbs_vcb;
@@ -1517,6 +1575,15 @@ lastitem:
            catent.d_type == DT_DIR)
                goto lastitem;
 
+       /* Hide the journal files */
+       if ((curID == kRootDirID) &&
+           (catent.d_type == DT_REG) &&
+           ((catent.d_fileno == state->cbs_hiddenJournalID) ||
+            (catent.d_fileno == state->cbs_hiddenInfoBlkID))) {
+
+               return (1);     /* skip and continue */
+       }
+
        state->cbs_lastoffset = state->cbs_uio->uio_offset;
 
        /* if this entry won't fit then we're done */
@@ -1565,6 +1632,11 @@ cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp,
                goto cleanup;
 
        state.cbs_hiddenDirID = hfsmp->hfs_private_metadata_dir;
+       if (hfsmp->jnl) {
+               state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid;
+               state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid;
+       }
+
        state.cbs_lastoffset = cip->currentOffset;
        state.cbs_vcb = vcb;
        state.cbs_uio = uio;
@@ -2203,7 +2275,11 @@ getcnid(const CatalogRecord *crp)
        case kHFSPlusFileRecord:
                cnid = crp->hfsPlusFile.fileID;
                break;
+       default:
+               panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp);
+               break;
        }
+
        return (cnid);
 }
 
@@ -2225,7 +2301,11 @@ getparentcnid(const CatalogRecord *recp)
        case kHFSPlusFolderThreadRecord:
                cnid = recp->hfsPlusThread.parentID;
                break;
+       default:
+               panic("hfs: getparentcnid: unknown recordType (crp @ 0x%x)\n", recp);
+               break;
        }
+
        return (cnid);
 }
 
index d59163ab59ee0c803b40bb190fda4adb69cba911..65617595f965aaaac498e34fb142f5f71f4b0e9d 100644 (file)
@@ -62,6 +62,7 @@ hfs_inactive(ap)
        int recycle = 0;
        int forkcount = 0;
        int truncated = 0;
+       int started_tr = 0, grabbed_lock = 0;
 
        if (prtactive && vp->v_usecount != 0)
                vprint("hfs_inactive: pushing active", vp);
@@ -85,9 +86,11 @@ hfs_inactive(ap)
            vp->v_type == VREG &&
            (VTOF(vp)->ff_blocks != 0)) {                       
                error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p);
-               if (error) goto out;
                truncated = 1;
+               // have to do this to prevent the lost ubc_info panic
+               SET(cp->c_flag, C_TRANSIT);
                recycle = 1;
+               if (error) goto out;
        }
 
        /*
@@ -103,6 +106,17 @@ hfs_inactive(ap)
                cp->c_flag &= ~C_DELETED;
                cp->c_rdev = 0;
                
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+               if (hfsmp->jnl) {
+                   if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               error = EINVAL;
+                               goto out;
+                   }
+                   started_tr = 1;
+               }
+
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
                if (error) goto out;
@@ -148,11 +162,21 @@ hfs_inactive(ap)
                if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)
                        cp->c_flag |= C_MODIFIED;
        }
-        if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
-                tv = time;
-                VOP_UPDATE(vp, &tv, &tv, 0);
-        }
+
+       if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
+               tv = time;
+               VOP_UPDATE(vp, &tv, &tv, 0);
+       }
 out:
+       // XXXdbg - have to do this because a goto could have come here
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+           started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        VOP_UNLOCK(vp, 0, p);
        /*
         * If we are done with the vnode, reclaim it
@@ -313,6 +337,16 @@ hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, int wa
                        retval = ENOENT;
                        goto exit;
                }
+
+               /* Hide private journal files */
+               if (hfsmp->jnl &&
+                       (cp->c_parentcnid == kRootDirID) &&
+                       ((cp->c_cnid == hfsmp->hfs_jnlfileid) ||
+                       (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) {
+                   retval = ENOENT;
+                       goto exit;
+               }
+        
                if (wantrsrc && rvp != NULL) {
                        vp = rvp;
                        rvp = NULL;
index ffbef0fb925d6054f260a120c8b3c4b78d7a4dff..a8833dedd85c15d258bbd4dcb02d0cc52d34e89d 100644 (file)
@@ -45,9 +45,11 @@ extern "C" {
 enum {
        kHFSSigWord             = 0x4244,       /* 'BD' in ASCII */
        kHFSPlusSigWord         = 0x482B,       /* 'H+' in ASCII */
+       kHFSJSigWord            = 0x484a,       /* 'HJ' in ASCII */
        kHFSPlusVersion         = 0x0004,       /* will change as format changes */
                                                /* version 4 shipped with Mac OS 8.1 */
-       kHFSPlusMountVersion    = 0x31302E30    /* '10.0' for Mac OS X */
+       kHFSPlusMountVersion    = 0x31302E30,   /* '10.0' for Mac OS X */
+       kHFSJMountVersion       = 0x4846534a    /* 'HFSJ' for journaled HFS+ on OS X */
 };
 
 
@@ -452,7 +454,8 @@ enum {
        kHFSVolumeNoCacheRequiredBit = 10,              /* don't cache volume blocks (i.e. RAM or ROM disk) */
        kHFSBootVolumeInconsistentBit = 11,             /* boot volume is inconsistent (System 7.6 and later) */
        kHFSCatalogNodeIDsReusedBit = 12,
-                                                       /* Bits 13-14 are reserved for future use */
+       kHFSVolumeJournaledBit = 13,                    /* this volume has a journal on it */
+                                                       /* Bit 14 is reserved for future use */
        kHFSVolumeSoftwareLockBit       = 15,           /* volume is locked by software */
 
        kHFSVolumeHardwareLockMask      = 1 << kHFSVolumeHardwareLockBit,
@@ -461,6 +464,7 @@ enum {
        kHFSVolumeNoCacheRequiredMask = 1 << kHFSVolumeNoCacheRequiredBit,
        kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit,
        kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit,
+       kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit,
        kHFSVolumeSoftwareLockMask      = 1 << kHFSVolumeSoftwareLockBit,
        kHFSMDBAttributesMask           = 0x8380
 };
@@ -509,7 +513,8 @@ struct HFSPlusVolumeHeader {
        u_int16_t       version;                /* == kHFSPlusVersion */
        u_int32_t       attributes;             /* volume attributes */
        u_int32_t       lastMountedVersion;     /* implementation version which last mounted volume */
-       u_int32_t       reserved;               /* reserved - initialized as zero */
+//XXXdbg       u_int32_t       reserved;               /* reserved - initialized as zero */
+       u_int32_t       journalInfoBlock;       /* block addr of journal info (if volume is journaled, zero otherwise) */
 
        u_int32_t       createDate;             /* date and time of volume creation */
        u_int32_t       modifyDate;             /* date and time of last modification */
@@ -601,6 +606,23 @@ enum {
        kBTVariableIndexKeysMask = 0x00000004   /* keys in index nodes are variable length */
 };
 
+/* JournalInfoBlock - Structure that describes where our journal lives */
+struct JournalInfoBlock {
+       u_int32_t       flags;
+       u_int32_t       device_signature[8];  // signature used to locate our device.
+       u_int64_t       offset;               // byte offset to the journal on the device
+       u_int64_t       size;                 // size in bytes of the journal
+       u_int32_t       reserved[32];
+};
+typedef struct JournalInfoBlock JournalInfoBlock;
+
+enum {
+    kJIJournalInFSMask          = 0x00000001,
+    kJIJournalOnOtherDeviceMask = 0x00000002,
+    kJIJournalNeedInitMask      = 0x00000004
+};
+
+
 #pragma options align=reset
 
 #ifdef __cplusplus
index 6a78cd75217cb276b29cdd10b28a5f9689a55ec7..97a36516cf4579d150d35c3b92326acfca74cede 100644 (file)
@@ -72,12 +72,25 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum,
        fip->fdCreator = SWAP_BE32 (kHFSPlusCreator);   /* 'hfs+' */
        fip->fdFlags   = SWAP_BE16 (kHasBeenInited);
 
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return EINVAL;
+           }
+       }
+
        /* Create the indirect link directly in the catalog */
        result = cat_create(hfsmp, &desc, &attr, NULL);
 
-       if (linkcnid != NULL)
+       if (result == 0 && linkcnid != NULL)
                *linkcnid = attr.ca_fileid;
 
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
        return (result);
 }
 
@@ -111,8 +124,9 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
 
        /* Lock catalog b-tree */
        retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-       if (retval)
-               return retval;
+       if (retval) {
+           return retval;
+       }
 
        /*
         * If this is a new hardlink then we need to create the data
@@ -123,6 +137,7 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
                bzero(&to_desc, sizeof(to_desc));
                to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
                to_desc.cd_cnid = cp->c_fileid;
+
                do {
                        /* get a unique indirect node number */
                        indnodeno = ((random() & 0x3fffffff) + 100);
@@ -144,7 +159,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
                                cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid);
                if (retval) {
                        /* put it source file back */
+               // XXXdbg
+               #if 1
+                   {
+                       int err;
+                               err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+                               if (err)
+                                       panic("hfs_makelink: error %d from cat_rename backout 1", err);
+                   }
+               #else
                        (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+               #endif
                        goto out;
                }
                cp->c_rdev = indnodeno;
@@ -161,7 +186,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp,
                (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
 
                /* Put the source file back */
+       // XXXdbg
+       #if 1
+               {
+                       int err;
+                       err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+                       if (err)
+                               panic("hfs_makelink: error %d from cat_rename backout 2", err);
+               }
+       #else
                (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+       #endif
                goto out;
        }
 
@@ -205,6 +240,7 @@ hfs_link(ap)
                struct componentname *a_cnp;
        } */ *ap;
 {
+       struct hfsmount *hfsmp;
        struct vnode *vp = ap->a_vp;
        struct vnode *tdvp = ap->a_tdvp;
        struct componentname *cnp = ap->a_cnp;
@@ -214,6 +250,8 @@ hfs_link(ap)
        struct timeval tv;
        int error;
 
+       hfsmp = VTOHFS(vp);
+       
 #if HFS_DIAGNOSTIC
        if ((cnp->cn_flags & HASBUF) == 0)
                panic("hfs_link: no name");
@@ -226,7 +264,7 @@ hfs_link(ap)
        if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord)
                return err_link(ap);    /* hfs disks don't support hard links */
        
-       if (VTOHFS(vp)->hfs_private_metadata_dir == 0)
+       if (hfsmp->hfs_private_metadata_dir == 0)
                return err_link(ap);    /* no private metadata dir, no links possible */
 
        if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) {
@@ -252,12 +290,22 @@ hfs_link(ap)
                goto out1;
        }
 
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return EINVAL;
+           }
+       }
+
        cp->c_nlink++;
        cp->c_flag |= C_CHANGE;
        tv = time;
+
        error = VOP_UPDATE(vp, &tv, &tv, 1);
-       if (!error)
-               error = hfs_makelink(VTOHFS(vp), cp, tdcp, cnp);
+       if (!error) {
+               error = hfs_makelink(hfsmp, cp, tdcp, cnp);
+       }
        if (error) {
                cp->c_nlink--;
                cp->c_flag |= C_CHANGE;
@@ -268,10 +316,21 @@ hfs_link(ap)
                tdcp->c_flag |= C_CHANGE | C_UPDATE;
                tv = time;
                (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
-               hfs_volupdate(VTOHFS(vp), VOL_MKFILE,
+
+               hfs_volupdate(hfsmp, VOL_MKFILE,
                        (tdcp->c_cnid == kHFSRootFolderID));
        }
+
+       // XXXdbg - need to do this here as well because cp could have changed
+       error = VOP_UPDATE(vp, &tv, &tv, 1);
+
        FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
+
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
 out1:
        if (tdvp != vp)
                VOP_UNLOCK(vp, 0, p);
index 824f615dc70275c11158657f7040fc5d854bcd5b..db88b99c045fe853d137424d98ca1ec558c34a57 100644 (file)
@@ -261,8 +261,9 @@ notfound:
                         * creation of files in the directory.
                         */
                        retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc);
-                       if (retval)
+                       if (retval) {
                                goto exit;
+                       }
                
                        cnp->cn_flags |= SAVENAME;
                        if (!(flags & LOCKPARENT))
index 06afe6df853bd51ccd4ba1db49b91220c5a65ab2..502926a427e70aa569fad238ec5ce181bef07a5b 100644 (file)
@@ -52,10 +52,15 @@ struct hfs_mount_args {
        u_long  hfs_encoding;           /* encoding for this volume (standard HFS only) */
        struct  timezone hfs_timezone;  /* user time zone info (standard HFS only) */
        int     flags;                  /* mounting flags, see below */
+       int     journal_tbuffer_size;   /* size in bytes of the journal transaction buffer */
+       int     journal_flags;          /* flags to pass to journal_open/create */
+       int     journal_disable;        /* don't use journaling (potentially dangerous) */
 };
 
 #define HFSFSMNT_NOXONFILES    0x1     /* disable execute permissions for files */
 #define HFSFSMNT_WRAPPER       0x2     /* mount HFS wrapper (if it exists) */
+#define HFSFSMNT_EXTENDED_ARGS  0x4     /* indicates new fields after "flags" are valid */
+
 #endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_MOUNT_H_ */
index 4544a768558eb7658d3ed8dc1510cf04569b26f2..6f0311411a8af16e6a32141c64f3c232562ea658 100644 (file)
@@ -267,6 +267,8 @@ hfs_write(ap)
     int                                retval;
        off_t filebytes;
        u_long fileblocks;
+       struct hfsmount *hfsmp;
+       int started_tr = 0, grabbed_lock = 0;
 
        ioflag = ap->a_ioflag;
 
@@ -288,6 +290,16 @@ hfs_write(ap)
        if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size)
                return (EPERM);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        writelimit = uio->uio_offset + uio->uio_resid;
 
        /*
@@ -333,13 +345,26 @@ hfs_write(ap)
        if(writelimit > filebytes) {
                bytesToAdd = writelimit - filebytes;
 
-               retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, fp->ff_clumpsize)), 
+               retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), 
                                   ap->a_cred, 0);
                if (retval)
                        return (retval);
        }
 #endif /* QUOTA */
 
+       hfsmp = VTOHFS(vp);
+       if (writelimit > filebytes) {
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+       }
+       if (hfsmp->jnl && (writelimit > filebytes)) {
+               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return EINVAL;
+               }
+               started_tr = 1;
+       }
+
        while (writelimit > filebytes) {
        
                bytesToAdd = writelimit - filebytes;
@@ -364,6 +389,17 @@ hfs_write(ap)
                        (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size,  (int)filebytes, 0);
        }
 
+       // XXXdbg
+       if (started_tr) {
+               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        if (UBCISVALID(vp) && retval == E_NONE) {
                off_t filesize;
                off_t zero_off;
@@ -952,6 +988,7 @@ hfs_cmap(ap)
     struct proc                *p = NULL;
     struct rl_entry *invalid_range;
     enum rl_overlaptype overlaptype;
+    int started_tr = 0, grabbed_lock = 0;
 
        /*
         * Check for underlying vnode requests and ensure that logical
@@ -960,12 +997,37 @@ hfs_cmap(ap)
        if (ap->a_bpn == NULL)
                return (0);
 
-       if (overflow_extents(fp) || fp->ff_unallocblocks) {
+       p = current_proc();
+       if (fp->ff_unallocblocks) {
                lockExtBtree = 1;
-               p = current_proc();
+
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               hfs_global_shared_lock_release(hfsmp);
+                               return EINVAL;
+                       } else {
+                               started_tr = 1;
+                       }
+               } 
+
                if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+                       if (started_tr) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       if (grabbed_lock) {
+                               hfs_global_shared_lock_release(hfsmp);
+                       }
                        return (retval);
-               }
+               }
+       } else if (overflow_extents(fp)) {
+               lockExtBtree = 1;
+               if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+                       return retval;
+               }
        }
 
        /*
@@ -1007,9 +1069,16 @@ hfs_cmap(ap)
                }
 
                if (retval) {
-                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
-                       return (retval);
-               }
+                       (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+                       if (started_tr) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       if (grabbed_lock) {
+                               hfs_global_shared_lock_release(hfsmp);
+                       }
+                       return (retval);
+               }
                VTOC(ap->a_vp)->c_flag |= C_MODIFIED;
        }
 
@@ -1024,6 +1093,17 @@ hfs_cmap(ap)
        if (lockExtBtree)
                (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
 
+       // XXXdbg
+       if (started_tr) {
+               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+                       
     if (retval == E_NONE) {
         /* Adjust the mapping information for invalid file ranges: */
         overlaptype = rl_scan(&fp->ff_invalidranges,
@@ -1153,6 +1233,11 @@ hfs_strategy_fragmented(struct buf *bp)
        }
        
        frag->b_vp = NULL;
+       //
+       // XXXdbg - in the case that this is a meta-data block, it won't affect
+       //          the journal because this bp is for a physical disk block,
+       //          not a logical block that is part of the catalog or extents
+       //          files.
        SET(frag->b_flags, B_INVAL);
        brelse(frag);
        
@@ -1291,6 +1376,7 @@ int hfs_truncate(ap)
        off_t filebytes;
        u_long fileblocks;
        int blksize;
+       struct hfsmount *hfsmp;
 
        if (vp->v_type != VREG && vp->v_type != VLNK)
                return (EISDIR);        /* cannot truncate an HFS directory! */
@@ -1309,6 +1395,7 @@ int hfs_truncate(ap)
        if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
                return (EFBIG);
 
+       hfsmp = VTOHFS(vp);
 
        tv = time;
        retval = E_NONE;
@@ -1329,7 +1416,7 @@ int hfs_truncate(ap)
         */
        if (length > fp->ff_size) {
 #if QUOTA
-               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, fp->ff_clumpsize)),
+               retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
                                ap->a_cred, 0);
                if (retval)
                        goto Err_Exit;
@@ -1347,10 +1434,25 @@ int hfs_truncate(ap)
                        if (suser(ap->a_cred, NULL) != 0)
                                eflags |= kEFReserveMask;  /* keep a reserve */
 
+                       // XXXdbg
+                       hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       retval = EINVAL;
+                                       goto Err_Exit;
+                               }
+                       }
+
                        /* lock extents b-tree (also protects volume bitmap) */
                        retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval)
+                       if (retval) {
+                               if (hfsmp->jnl) {
+                                       journal_end_transaction(hfsmp->jnl);
+                               } 
+                               hfs_global_shared_lock_release(hfsmp);
+
                                goto Err_Exit;
+                       }
 
                        while ((length > filebytes) && (retval == E_NONE)) {
                                bytesToAdd = length - filebytes;
@@ -1368,7 +1470,16 @@ int hfs_truncate(ap)
                                        break;
                                }
                        } /* endwhile */
+
                        (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       } 
+                       hfs_global_shared_lock_release(hfsmp);
+
                        if (retval)
                                goto Err_Exit;
 
@@ -1484,16 +1595,38 @@ int hfs_truncate(ap)
 #if QUOTA
                  off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
 #endif /* QUOTA */
+                 // XXXdbg
+                 hfs_global_shared_lock_acquire(hfsmp);
+                       if (hfsmp->jnl) {
+                               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                                       retval = EINVAL;
+                                       goto Err_Exit;
+                               }
+                       }
+
                        /* lock extents b-tree (also protects volume bitmap) */
                        retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-                       if (retval)
+                       if (retval) {
+                               if (hfsmp->jnl) {
+                                       journal_end_transaction(hfsmp->jnl);
+                               }
+                               hfs_global_shared_lock_release(hfsmp);
                                goto Err_Exit;
+                       }
                        
                        if (fp->ff_unallocblocks == 0)
                                retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
                                                (FCB*)fp, length, false));
 
                        (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+
                        filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
                        if (retval)
                                goto Err_Exit;
@@ -1564,6 +1697,9 @@ int hfs_allocate(ap)
        int retval, retval2;
        UInt32 blockHint;
        UInt32 extendFlags =0;   /* For call to ExtendFileC */
+       struct hfsmount *hfsmp;
+
+       hfsmp = VTOHFS(vp);
 
        *(ap->a_bytesallocated) = 0;
        fileblocks = fp->ff_blocks;
@@ -1610,15 +1746,31 @@ int hfs_allocate(ap)
                moreBytesRequested = length - filebytes;
                
 #if QUOTA
-               retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, fp->ff_clumpsize)), 
+               retval = hfs_chkdq(cp,
+                               (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), 
                                ap->a_cred, 0);
                if (retval)
                        return (retval);
 
 #endif /* QUOTA */
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               retval = EINVAL;
+                               goto Err_Exit;
+                       }
+               }
+
                /* lock extents b-tree (also protects volume bitmap) */
                retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-               if (retval) goto Err_Exit;
+               if (retval) {
+                       if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+                       goto Err_Exit;
+               }
 
                retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
                                                (FCB*)fp,
@@ -1629,8 +1781,16 @@ int hfs_allocate(ap)
 
                *(ap->a_bytesallocated) = actualBytesAdded;
                filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
 
+               // XXXdbg
+               if (hfsmp->jnl) {
+                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                /*
                 * if we get an error and no changes were made then exit
                 * otherwise we must do the VOP_UPDATE to reflect the changes
@@ -1661,9 +1821,25 @@ int hfs_allocate(ap)
                        (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
                }
 
+               // XXXdbg
+               hfs_global_shared_lock_acquire(hfsmp);
+               if (hfsmp->jnl) {
+                       if (journal_start_transaction(hfsmp->jnl) != 0) {
+                               retval = EINVAL;
+                               goto Err_Exit;
+                       }
+               }
+
                /* lock extents b-tree (also protects volume bitmap) */
                retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
-               if (retval) goto Err_Exit;
+               if (retval) {
+                       if (hfsmp->jnl) {
+                               journal_end_transaction(hfsmp->jnl);
+                       }
+                       hfs_global_shared_lock_release(hfsmp);
+
+                       goto Err_Exit;
+               }                       
 
                retval = MacToVFSError(
                             TruncateFileC(
@@ -1673,6 +1849,14 @@ int hfs_allocate(ap)
                                             false));
                (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
                filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
+               if (hfsmp->jnl) {
+                       hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+               
+
                /*
                 * if we get an error and no changes were made then exit
                 * otherwise we must do the VOP_UPDATE to reflect the changes
@@ -1794,9 +1978,9 @@ hfs_bwrite(ap)
        } */ *ap;
 {
        int retval = 0;
-#if BYTE_ORDER == LITTLE_ENDIAN
        register struct buf *bp = ap->a_bp;
        register struct vnode *vp = bp->b_vp;
+#if BYTE_ORDER == LITTLE_ENDIAN
        BlockDescriptor block;
 
        /* Trap B-Tree writes */
@@ -1820,8 +2004,12 @@ hfs_bwrite(ap)
        }
 #endif
        /* This buffer shouldn't be locked anymore but if it is clear it */
-       if (ISSET(ap->a_bp->b_flags, B_LOCKED)) {
-               CLR(ap->a_bp->b_flags, B_LOCKED);
+       if (ISSET(bp->b_flags, B_LOCKED)) {
+           // XXXdbg
+           if (VTOHFS(vp)->jnl) {
+                       panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
+           }
+               CLR(bp->b_flags, B_LOCKED);
                printf("hfs_bwrite: called with lock bit set\n");
        }
        retval = vn_bwrite (ap);
index 0c7638fbe86a4f05fdec7b2cf8d7e8e9b16da248..84aecbb016dd7c3090fc5a70172f15509f47fe63 100644 (file)
@@ -193,6 +193,8 @@ hfs_search( ap )
        CatalogRecord * myCurrentDataPtr;
        CatPosition * myCatPositionPtr;
        BTScanState myBTScanState;
+       void *user_start = NULL;
+       int   user_len;
 
        /* XXX Parameter check a_searchattrs? */
 
@@ -223,6 +225,20 @@ hfs_search( ap )
        MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK );
        variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize);
 
+       // XXXdbg - have to lock the user's buffer so we don't fault
+       // while holding the shared catalog file lock.  see the comment
+       // in hfs_readdir() for more details.
+       //
+       if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) {
+               user_start = ap->a_uio->uio_iov->iov_base;
+               user_len   = ap->a_uio->uio_iov->iov_len;
+
+               if ((err = vslock(user_start, user_len)) != 0) {
+                       user_start = NULL;
+                       goto ExitThisRoutine;
+               }
+       }
+
        /* Lock catalog b-tree */
        err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p);
        if (err)
@@ -383,6 +399,10 @@ QuickExit:
 ExitThisRoutine:
         FREE( attributesBuffer, M_TEMP );
 
+       if (VTOHFS(ap->a_vp)->jnl && user_start) {
+               vsunlock(user_start, user_len, TRUE);
+       }
+
        return (MacToVFSError(err));
 }
 
@@ -858,6 +878,14 @@ InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec,
                goto exit;
        }
 
+       /* Hide the private journal files */
+       if (VTOHFS(root_vp)->jnl &&
+           ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) ||
+            (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) {
+               err = 0;
+               goto exit;
+       }
+
        if (returnAttrList->commonattr & ATTR_CMN_NAME) {
                cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc);
        } else {
index c92af136d5d8fe2d8347f78b35c9e59b01a005ab..cff8a45ec7c8ceee331e4367ba5e27eec7a0dad9 100644 (file)
@@ -77,6 +77,9 @@
 #include <sys/quota.h>
 #include <sys/disk.h>
 
+// XXXdbg
+#include <vfs/vfs_journal.h>
+
 #include <miscfs/specfs/specdev.h>
 #include <hfs/hfs_mount.h>
 
@@ -259,6 +262,8 @@ hfs_mount(mp, path, data, ndp, p)
                    (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) {
                        /* setup private/hidden directory for unlinked files */
                        hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp));
+                       if (hfsmp->jnl)
+                               hfs_remove_orphans(hfsmp);
                }
 
                if (args.fspec == 0) {
@@ -325,7 +330,6 @@ hfs_mount(mp, path, data, ndp, p)
                goto error_exit;
        }
 
-       
        /* Set the mount flag to indicate that we support volfs  */
        mp->mnt_flag |= MNT_DOVOLFS;
     if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) {
@@ -333,6 +337,7 @@ hfs_mount(mp, path, data, ndp, p)
        mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING;
     }
        (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size);
+
        bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
        (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
@@ -641,6 +646,7 @@ loop:
 
        vcb->vcbLsMod           = to_bsd_time(SWAP_BE32(vhp->modifyDate));
        vcb->vcbAtrb            = (UInt16) SWAP_BE32 (vhp->attributes); /* VCB only uses lower 16 bits */
+       vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
        vcb->vcbClpSiz          = SWAP_BE32 (vhp->rsrcClumpSize);
        vcb->vcbNxtCNID         = SWAP_BE32 (vhp->nextCatalogID);
        vcb->vcbVolBkUp         = to_bsd_time(SWAP_BE32(vhp->backupDate));
@@ -720,6 +726,84 @@ loop:
 }
 
 
+static int
+get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p)
+{
+       char            *rawbuf;
+       char            *dp;
+       size_t           namelen;
+       struct nameidata nd;
+       int               retval;
+
+       *rvp = NULL;
+
+       MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK);
+       if (rawbuf == NULL) {
+               retval = ENOMEM;
+               goto error_exit;
+       }
+
+       if (is_user) {
+               retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 1, &namelen);
+               if (retval != E_NONE) {
+                       FREE(rawbuf, M_HFSMNT);
+                       goto error_exit;
+               }
+       } else {
+               strcpy(rawbuf, fspec);
+               namelen = strlen(rawbuf);
+       }
+
+       /* make sure it's null terminated */
+       rawbuf[MAXPATHLEN-1] = '\0';   
+
+       dp = &rawbuf[namelen-1];
+       while(dp >= rawbuf && *dp != '/') {
+               dp--;
+       }
+                       
+       if (dp != NULL) {
+               dp++;
+       } else {
+               dp = rawbuf;
+       }
+                       
+       /* make room for and insert the 'r' for the raw device */
+       memmove(dp+1, dp, strlen(dp)+1);
+       *dp = 'r';
+
+       NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p);
+       retval = namei(&nd);
+       if (retval != E_NONE) {
+               DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s, %x\n", rawbuf, nd.ni_vp->v_rdev));
+               FREE(rawbuf, M_HFSMNT);
+               goto error_exit;
+       }
+
+       *rvp = nd.ni_vp;
+       if ((retval = VOP_OPEN(*rvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) {
+               *rvp = NULL;
+               goto error_exit;
+       }
+
+       // don't need this any more
+       FREE(rawbuf, M_HFSMNT);
+
+       return 0;
+
+  error_exit:
+       if (*rvp) {
+           (void)VOP_CLOSE(*rvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+       }
+
+       if (rawbuf) {
+               FREE(rawbuf, M_HFSMNT);
+       }
+       return retval;
+}
+
+
+
 /*
  * Common code for mount and mountroot
  */
@@ -741,6 +825,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
        u_int32_t blksize;
        u_int32_t minblksize;
        u_int32_t iswritable;
+       daddr_t   mdb_offset;
 
        dev = devvp->v_rdev;
        cred = p ? p->p_ucred : NOCRED;
@@ -825,6 +910,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
                return (retval);
        }
 
+       mdb_offset = HFS_PRI_SECTOR(blksize);
        if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) {
                goto error_exit;
        }
@@ -837,7 +923,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
        bzero(hfsmp, sizeof(struct hfsmount));
 
        simple_lock_init(&hfsmp->hfs_renamelock);
-
+       
        /*
        *  Init the volume information structure
        */
@@ -932,6 +1018,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
        } else /* Mount an HFS Plus disk */ {
                HFSPlusVolumeHeader *vhp;
                off_t embeddedOffset;
+               int   jnl_disable = 0;
        
                /* Get the embedded Volume Header */
                if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
@@ -973,8 +1060,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
 
                        hfsmp->hfs_phys_block_count = disksize / blksize;
        
-                       retval = meta_bread(devvp, (embeddedOffset / blksize) +
-                                       HFS_PRI_SECTOR(blksize), blksize, cred, &bp);
+                       mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize);
+                       retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
                        if (retval)
                                goto error_exit;
                        bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
@@ -987,9 +1074,42 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
                        vhp = (HFSPlusVolumeHeader*) mdbp;
                }
 
+               // XXXdbg
+               //
+               hfsmp->jnl = NULL;
+               hfsmp->jvp = NULL;
+               if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && args->journal_disable) {
+                   jnl_disable = 1;
+               }
+                               
+               //
+               // We only initialize the journal here if the last person
+               // to mount this volume was journaling aware.  Otherwise
+               // we delay journal initialization until later at the end
+               // of hfs_MountHFSPlusVolume() because the last person who
+               // mounted it could have messed things up behind our back
+               // (so we need to go find the .journal file, make sure it's
+               // the right size, re-sync up if it was moved, etc).
+               //
+               if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
+                       && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
+                       && !jnl_disable) {
+                       
+                       // if we're able to init the journal, mark the mount
+                       // point as journaled.
+                       //
+                       if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
+                               mp->mnt_flag |= MNT_JOURNALED;
+                       } else {
+                               retval = EINVAL;
+                               goto error_exit;
+                       }
+               }
+               // XXXdbg
+       
                (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
 
-               retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+               retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
                /*
                 * If the backend didn't like our physical blocksize
                 * then retry with physical blocksize of 512.
@@ -1012,7 +1132,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p,
                        hfsmp->hfs_phys_block_size = blksize;
  
                        /* Try again with a smaller block size... */
-                       retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+                       retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
                }
                if (retval)
                        (void) hfs_relconverter(0);
@@ -1039,6 +1159,10 @@ error_exit:
        if (mdbp)
                FREE(mdbp, M_TEMP);
        (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+       if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+           (void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+               hfsmp->jvp = NULL;
+       }
        if (hfsmp) {
                FREE(hfsmp, M_HFSMNT);
                mp->mnt_data = (qaddr_t)0;
@@ -1075,6 +1199,7 @@ hfs_unmount(mp, mntflags, p)
        int retval = E_NONE;
        int flags;
        int force;
+       int started_tr = 0, grabbed_lock = 0;
 
        flags = 0;
        force = 0;
@@ -1090,17 +1215,33 @@ hfs_unmount(mp, mntflags, p)
         * Flush out the b-trees, volume bitmap and Volume Header
         */
        if (hfsmp->hfs_fs_ronly == 0) {
+               hfs_global_shared_lock_acquire(hfsmp);
+               grabbed_lock = 1;
+           if (hfsmp->jnl) {
+                       journal_start_transaction(hfsmp->jnl);
+                       started_tr = 1;
+               }
+               
                retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p);
                if (retval && !force)
-                       return (retval);
-
+                       goto err_exit;
+               
                retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p);
                if (retval && !force)
-                       return (retval);
+                       goto err_exit;
+                       
+               // if we have an allocation file, sync it too so we don't leave dirty
+               // blocks around
+               if (HFSTOVCB(hfsmp)->allocationsRefNum) {
+                   if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) {
+                       if (!force)
+                           goto err_exit;
+                   }
+               }
 
                if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) {
                        if (!force)
-                               return (retval);
+                               goto err_exit;
                }
                
                /* See if this volume is damaged, is so do not unmount cleanly */
@@ -1110,14 +1251,27 @@ hfs_unmount(mp, mntflags, p)
                        HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
                }
 
-               retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+               retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
                if (retval) {
                        HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
                        if (!force)
-                               return (retval);        /* could not flush everything */
+                               goto err_exit;  /* could not flush everything */
+               }
+
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+                       started_tr = 0;
+               }
+               if (grabbed_lock) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       grabbed_lock = 0;
                }
        }
 
+       if (hfsmp->jnl) {
+               journal_flush(hfsmp->jnl);
+       }
+       
        /*
         *      Invalidate our caches and release metadata vnodes
         */
@@ -1126,6 +1280,19 @@ hfs_unmount(mp, mntflags, p)
        if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
                (void) hfs_relconverter(hfsmp->hfs_encoding);
 
+       // XXXdbg
+       if (hfsmp->jnl) {
+           journal_close(hfsmp->jnl);
+       }
+
+       if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+           retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
+                              NOCRED, p);
+           vrele(hfsmp->jvp);
+               hfsmp->jvp = NULL;
+       }
+       // XXXdbg
+
        hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON;
        retval = VOP_CLOSE(hfsmp->hfs_devvp,
                    hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
@@ -1137,6 +1304,15 @@ hfs_unmount(mp, mntflags, p)
        FREE(hfsmp, M_HFSMNT);
        mp->mnt_data = (qaddr_t)0;
        return (0);
+
+  err_exit:
+       if (hfsmp->jnl && started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+       return retval;
 }
 
 
@@ -1241,6 +1417,8 @@ hfs_quotactl(mp, cmds, uid, arg, p)
 }
 
 
+
+
 /*
  * Get file system statistics.
  */
@@ -1276,6 +1454,70 @@ hfs_statfs(mp, sbp, p)
 }
 
 
+//
+// XXXdbg -- this is a callback to be used by the journal to
+//           get meta data blocks flushed out to disk.
+//
+// XXXdbg -- be smarter and don't flush *every* block on each
+//           call.  try to only flush some so we don't wind up
+//           being too synchronous.
+//
+__private_extern__
+void
+hfs_sync_metadata(void *arg)
+{
+       struct mount *mp = (struct mount *)arg;
+       struct cnode *cp;
+       struct hfsmount *hfsmp;
+       ExtendedVCB *vcb;
+       struct vnode *meta_vp[3];
+       struct buf *bp;
+       int i, sectorsize, priIDSector, altIDSector, retval;
+       int error, allerror = 0;
+
+       hfsmp = VFSTOHFS(mp);
+       vcb = HFSTOVCB(hfsmp);
+
+       bflushq(BQ_META, mp);
+
+
+#if 1     // XXXdbg - I do not believe this is necessary...
+          //          but if I pull it out, then the journal
+             //          does not seem to get flushed properly
+             //          when it is closed....
+       
+       // now make sure the super block is flushed
+       sectorsize = hfsmp->hfs_phys_block_size;
+       priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+                  HFS_PRI_SECTOR(sectorsize);
+       retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
+       if (retval != 0) {
+               panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector)\n",
+                         retval, priIDSector);
+       }
+
+       if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+           bwrite(bp);
+       } else if (bp) {
+           brelse(bp);
+       }
+
+       // the alternate super block...
+       // XXXdbg - we probably don't need to do this each and every time.
+       //          hfs_btreeio.c:FlushAlternate() should flag when it was
+       //          written...
+       altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+                       HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
+       retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp);
+       if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+           bwrite(bp);
+       } else if (bp) {
+           brelse(bp);
+       }
+#endif
+       
+}
+
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
@@ -1310,6 +1552,17 @@ hfs_sync(mp, waitfor, cred, p)
                panic("update: rofs mod");
        };
 
+#if 0
+       // XXXdbg first go through and flush out any modified
+       //        meta data blocks so they go out in order...
+       bflushq(BQ_META, mp);
+       bflushq(BQ_LRU,  mp);
+       // only flush locked blocks if we're not doing journaling
+       if (hfsmp->jnl == NULL) {
+           bflushq(BQ_LOCKED, mp);
+       }
+#endif
+
        /*
         * Write back each 'modified' vnode
         */
@@ -1326,10 +1579,19 @@ loop:
                        simple_unlock(&mntvnode_slock);
                        goto loop;
                }
+
                simple_lock(&vp->v_interlock);
                nvp = vp->v_mntvnodes.le_next;
+
                cp = VTOC(vp);
 
+               // restart our whole search if this guy is locked
+               // or being reclaimed.
+               if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) {
+                       simple_unlock(&vp->v_interlock);
+                       continue;
+               }
+
                if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) ||
                    (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
                    (vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) {
@@ -1372,6 +1634,7 @@ loop:
                btvp = btvp = meta_vp[i];;
                if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp))
                        continue;
+
                simple_lock(&btvp->v_interlock);
                cp = VTOC(btvp);
                if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
@@ -1409,11 +1672,22 @@ loop:
         */
 
        if (IsVCBDirty(vcb)) {
+               // XXXdbg - debugging, remove
+               if (hfsmp->jnl) {
+                       //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n",
+                       //        hfsmp->jnl, hfsmp);
+               }
+
                error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
-               if (error)
-                       allerror = error;
+               if (error)
+                       allerror = error;
        }
 
+       if (hfsmp->jnl) {
+           journal_flush(hfsmp->jnl);
+       }
+       
+  err_exit:
        return (allerror);
 }
 
@@ -1534,6 +1808,10 @@ hfs_init(vfsp)
 }
 
 
+// XXXdbg
+#include <sys/filedesc.h>
+
+
 /*
  * HFS filesystem related variables.
  */
@@ -1550,12 +1828,133 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
        extern u_int32_t hfs_encodingbias;
 
        /* all sysctl names at this level are terminal */
-       if (namelen != 1)
-               return (ENOTDIR);       /* overloaded */
 
        if (name[0] == HFS_ENCODINGBIAS)
                return (sysctl_int(oldp, oldlenp, newp, newlen,
                                &hfs_encodingbias));
+       else if (name[0] == 0x082969) {
+               // make the file system journaled...
+               struct vnode *vp = p->p_fd->fd_cdir, *jvp;
+               struct hfsmount *hfsmp;
+               ExtendedVCB *vcb;
+               int retval;
+               struct cat_attr jnl_attr, jinfo_attr;
+               struct cat_fork jnl_fork, jinfo_fork;
+               void *jnl = NULL;
+               
+               hfsmp = VTOHFS(vp);
+               if (hfsmp->hfs_fs_ronly) {
+                       return EROFS;
+               }
+               if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
+                       printf("hfs: can't make a plain hfs volume journaled.\n");
+                       return EINVAL;
+               }
+
+               if (hfsmp->jnl) {
+                   printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount);
+                   return EAGAIN;
+               }
+
+               vcb = HFSTOVCB(hfsmp);
+               if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
+                       BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
+
+                       printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
+                       return EINVAL;
+               }
+
+               // make sure these both exist!
+               if (   GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
+                       || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) {
+
+                       return EINVAL;
+               }
+
+               hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p);
+               bflushq(BQ_META);
+
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          (off_t)name[2], (off_t)name[3]);
+
+               jvp = hfsmp->hfs_devvp;
+               jnl = journal_create(jvp,
+                                                        (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
+                                                        + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
+                                                        (off_t)name[3],
+                                                        hfsmp->hfs_devvp,
+                                                        hfsmp->hfs_phys_block_size,
+                                                        0,
+                                                        0,
+                                                        hfs_sync_metadata, hfsmp->hfs_mp);
+
+               if (jnl == NULL) {
+                       printf("hfs: FAILED to create the journal!\n");
+                       if (jvp && jvp != hfsmp->hfs_devvp) {
+                               VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+                       }
+                       jvp = NULL;
+
+                       return EINVAL;
+               } 
+
+               hfs_global_exclusive_lock_acquire(hfsmp);
+               
+               HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
+               HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
+               hfsmp->jvp = jvp;
+               hfsmp->jnl = jnl;
+
+               // save this off for the hack-y check in hfs_remove()
+               hfsmp->jnl_start        = (u_int32_t)name[2];
+               hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
+               hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
+
+               hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+
+               hfs_global_exclusive_lock_release(hfsmp);
+               hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+               return 0;
+       } else if (name[0] == 0x031272) {
+               // clear the journaling bit 
+               struct vnode *vp = p->p_fd->fd_cdir;
+               struct hfsmount *hfsmp;
+               void *jnl;
+               int retval;
+               
+               hfsmp = VTOHFS(vp);
+               if (hfsmp->jnl == NULL) {
+                       return EINVAL;
+               }
+
+               printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount);
+
+               jnl = hfsmp->jnl;
+               
+               hfs_global_exclusive_lock_acquire(hfsmp);
+
+               // Lights out for you buddy!
+               hfsmp->jnl = NULL;
+               journal_close(jnl);
+
+               if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+                       VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+               }
+               hfsmp->jnl = NULL;
+               hfsmp->jvp = NULL;
+               hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED;
+               hfsmp->jnl_start        = 0;
+               hfsmp->hfs_jnlinfoblkid = 0;
+               hfsmp->hfs_jnlfileid    = 0;
+               
+               HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               
+               hfs_global_exclusive_lock_release(hfsmp);
+               hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+               return 0;
+       }
 
        return (EOPNOTSUPP);
 }
@@ -1688,6 +2087,11 @@ hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
                        --vcb->vcbNmFls;
                break;
        }
+
+       if (hfsmp->jnl) {
+               hfs_flushvolumeheader(hfsmp, 0, 0);
+       }
+
        return (0);
 }
 
@@ -1704,7 +2108,6 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
        ByteCount namelen;
 
        sectorsize = hfsmp->hfs_phys_block_size;
-
        retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp);
        if (retval) {
                if (bp)
@@ -1716,6 +2119,10 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
        DBG_ASSERT(bp->b_data != NULL);
        DBG_ASSERT(bp->b_bcount == size);
 
+       if (hfsmp->jnl) {
+               panic("hfs: standard hfs volumes should not be journaled!\n");
+       }
+
        mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize));
     
        mdb->drCrDate   = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate)));
@@ -1770,6 +2177,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
 
                if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
                        bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
+
                        (void) VOP_BWRITE(alt_bp);
                } else if (alt_bp)
                        brelse(alt_bp);
@@ -1777,7 +2185,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
 
        if (waitfor != MNT_WAIT)
                bawrite(bp);
-       else
+       else 
                retval = VOP_BWRITE(bp);
  
        MarkVCBClean( vcb );
@@ -1809,13 +2217,32 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
        priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
                        HFS_PRI_SECTOR(sectorsize);
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+               if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                   return EINVAL;
+           }
+       }
+
        retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
        if (retval) {
                if (bp)
                        brelse(bp);
+
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
+
                return (retval);
        }
 
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, bp);
+       }
+
        volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize));
 
        /*
@@ -1839,9 +2266,19 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
 
                        if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
                          {
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                   journal_modify_block_start(hfsmp->jnl, bp2);
+                               }
+
                                mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);       /* pick up the new create date */
 
-                               (void) VOP_BWRITE(bp2);         /* write out the changes */
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                       journal_modify_block_end(hfsmp->jnl, bp2);
+                               } else {
+                                       (void) VOP_BWRITE(bp2);         /* write out the changes */
+                               }
                          }
                        else
                          {
@@ -1850,9 +2287,36 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
                  }     
        }
 
+// XXXdbg - only monkey around with the volume signature on non-root volumes
+//
+#if 0
+       if (hfsmp->jnl &&
+               hfsmp->hfs_fs_ronly == 0 &&
+               (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) {
+               
+               int old_sig = volumeHeader->signature;
+
+               if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) {
+                       volumeHeader->signature = kHFSPlusSigWord;
+               } else {
+                       volumeHeader->signature = kHFSJSigWord;
+               }
+
+               if (old_sig != volumeHeader->signature) {
+                       altflush = 1;
+               }
+       }
+#endif
+// XXXdbg
+
        /* Note: only update the lower 16 bits worth of attributes */
        volumeHeader->attributes        = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb);
-       volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+       volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock);
+       if (hfsmp->jnl) {
+               volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
+       } else {
+               volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+       }
        volumeHeader->createDate        = SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
        volumeHeader->modifyDate        = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
        volumeHeader->backupDate        = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
@@ -1918,22 +2382,38 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
                        HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
 
                if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, alt_bp);
+                       }
+
                        bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
-                       (void) VOP_BWRITE(alt_bp);
+
+                       if (hfsmp->jnl) {
+                               journal_modify_block_end(hfsmp->jnl, alt_bp);
+                       } else {
+                               (void) VOP_BWRITE(alt_bp);
+                       }
                } else if (alt_bp)
                        brelse(alt_bp);
        }
 
-       if (waitfor != MNT_WAIT)
-               bawrite(bp);
-       else {
-               retval = VOP_BWRITE(bp);
-               /* When critical data changes, flush the device cache */
-               if (critical && (retval == 0)) {
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_end(hfsmp->jnl, bp);
+               journal_end_transaction(hfsmp->jnl);
+       } else {
+               if (waitfor != MNT_WAIT)
+                       bawrite(bp);
+               else {
+                   retval = VOP_BWRITE(bp);
+                   /* When critical data changes, flush the device cache */
+                   if (critical && (retval == 0)) {
                        (void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
-                                       NULL, FWRITE, NOCRED, current_proc());
+                                        NULL, FWRITE, NOCRED, current_proc());
+                   }
                }
        }
+       hfs_global_shared_lock_release(hfsmp);
  
        vcb->vcbFlags &= 0x00FF;
        return (retval);
index c45f8a898cb04cbdb99732df5e63635e65ade57f..386acae02507bf36778e8f948c27a8a8648bf439 100644 (file)
@@ -55,6 +55,7 @@ extern uid_t console_user;
 
 
 static void ReleaseMetaFileVNode(struct vnode *vp);
+static int  hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args);
 
 u_int32_t GetLogicalBlockSize(struct vnode *vp);
 
@@ -246,7 +247,7 @@ CmdDone:
 //*******************************************************************************
 
 OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
-       off_t embeddedOffset, u_int64_t disksize, struct proc *p)
+       off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args)
 {
        register ExtendedVCB *vcb;
        struct cat_desc cndesc;
@@ -254,9 +255,15 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
        UInt32 blockSize;
        OSErr retval;
 
-       if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord ||
-           SWAP_BE16(vhp->version) != kHFSPlusVersion)
-               return (EINVAL);
+       // XXXdbg - added the kHFSJSigWord case
+       if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
+                SWAP_BE16(vhp->signature) != kHFSJSigWord) ||
+           SWAP_BE16(vhp->version) != kHFSPlusVersion) {
+               // XXXdbg
+               printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n",
+                          vhp->signature, vhp->version);
+               return (EINVAL);
+       }
 
        /* Block size must be at least 512 and a power of 2 */
        blockSize = SWAP_BE32(vhp->blockSize);
@@ -264,7 +271,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
                return (EINVAL);
    
        /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */
-       if (hfsmp->hfs_fs_ronly == 0 && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
+       if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
                return (EINVAL);
 
        /* Make sure we can live with the physical block size. */
@@ -280,6 +287,13 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
        vcb = HFSTOVCB(hfsmp);
 
        vcb->vcbSigWord = SWAP_BE16(vhp->signature);
+
+       // XXXdbg - remap this in case we've mounted a dirty journaled volume
+       if (vcb->vcbSigWord == kHFSJSigWord) {
+               vcb->vcbSigWord = kHFSPlusSigWord;
+       }
+
+       vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock);
        vcb->vcbLsMod   = to_bsd_time(SWAP_BE32(vhp->modifyDate));
        vcb->vcbAtrb    = (UInt16)SWAP_BE32(vhp->attributes);
        vcb->vcbClpSiz  = SWAP_BE32(vhp->rsrcClumpSize);
@@ -413,6 +427,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
 
        /* mark the volume dirty (clear clean unmount bit) */
        vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask;
+       if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) {
+               hfs_flushvolumeheader(hfsmp, TRUE, TRUE);
+       }
 
        /*
         * all done with metadata files so we can unlock now...
@@ -423,12 +440,46 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
 
        /* setup private/hidden directory for unlinked files */
        hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb);
+       if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0))
+               hfs_remove_orphans(hfsmp);
 
        if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) )     // if the disk is not write protected
        {
                MarkVCBDirty( vcb );    // mark VCB dirty so it will be written
        }
 
+
+       //
+       // Check if we need to do late journal initialization.  This only
+       // happens if a previous version of MacOS X (or 9) touched the disk.
+       // In that case hfs_late_journal_init() will go re-locate the journal 
+       // and journal_info_block files and validate that they're still kosher.
+       //
+       if (   (vcb->vcbAtrb & kHFSVolumeJournaledMask)
+               && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion)
+               && (hfsmp->jnl == NULL)) {
+
+               retval = hfs_late_journal_init(hfsmp, vhp, args);
+               if (retval != 0) {
+                       hfsmp->jnl = NULL;
+                       goto ErrorExit;
+               } else if (hfsmp->jnl) {
+                       hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+               }
+       } else if (hfsmp->jnl) {
+               struct cat_attr jinfo_attr, jnl_attr;
+               
+               // if we're here we need to fill in the fileid's for the
+               // journal and journal_info_block.
+               hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL);
+               hfsmp->hfs_jnlfileid    = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL);
+               if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) {
+                       printf("hfs: danger! couldn't find the file-id's for the journal or journal_info_block\n");
+                       printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid);
+               }
+       }
+
+
        return (0);
 
 ErrorExit:
@@ -759,13 +810,28 @@ FindMetaDataDirectory(ExtendedVCB *vcb)
        fndrinfo->frLocation.h = SWAP_BE16 (22460);
        fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked);            
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return (0);
+           }
+       }
+
        error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc,
                        &hfsmp->hfs_privdir_attr, &out_desc);
 
        /* Unlock catalog b-tree */
        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
-       if (error)
-               return (0);
+       if (error) {
+           if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+           }
+               hfs_global_shared_lock_release(hfsmp);
+
+           return (0);
+       }
 
        hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint;
        hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid;
@@ -783,11 +849,209 @@ FindMetaDataDirectory(ExtendedVCB *vcb)
                vput(dvp);
        }
        hfs_volupdate(hfsmp, VOL_MKDIR, 1);
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       } 
+       hfs_global_shared_lock_release(hfsmp);
+
        cat_releasedesc(&out_desc);
 
        return (out_desc.cd_cnid);
 }
 
+/*
+ * GetFileInfo - look up a file by name in the volume's root directory
+ * and return its catalog node ID (file ID).
+ *
+ * On success, *fattr (and *forkinfo, if non-NULL) are filled in from the
+ * catalog record.  Returns 0 if the volume is not HFS Plus, if the
+ * catalog b-tree lock cannot be taken, or if the lookup fails.
+ *
+ * NOTE(review): the dirid parameter is currently ignored -- the lookup is
+ * always rooted at kRootDirID.  All callers in this change pass kRootDirID,
+ * but confirm before passing anything else.
+ */
+__private_extern__
+u_long
+GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+                       struct cat_attr *fattr, struct cat_fork *forkinfo)
+{
+       struct hfsmount * hfsmp;
+       struct cat_desc jdesc;
+       int error;
+       
+       /* Only HFS Plus volumes have the catalog layout we search here. */
+       if (vcb->vcbSigWord != kHFSPlusSigWord)
+               return (0);
+
+       hfsmp = VCBTOHFS(vcb);
+
+       memset(&jdesc, 0, sizeof(struct cat_desc));
+       jdesc.cd_parentcnid = kRootDirID;
+       jdesc.cd_nameptr = name;
+       jdesc.cd_namelen = strlen(name);
+
+       /* Lock catalog b-tree */
+       error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());    
+       if (error)
+               return (0);
+
+       error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo);
+
+       (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+       if (error == 0) {
+               return (fattr->ca_fileid);
+       }
+
+       /*
+        * Lookup failed.  The original code fell off the end of the
+        * function here (undefined return value) when the volume was
+        * writable; always return 0 to signal "not found".
+        */
+       return (0);
+}
+
+
+/*
+ * On Journaled HFS, there can be orphaned files.  These
+ * are files that were unlinked while busy. If the volume
+ * was not cleanly unmounted then some of these files may
+ * have persisted and need to be removed.
+ */
+/*
+ * hfs_remove_orphans - scan the HFS+ private ("metadata") directory and
+ * delete leftover "temp<cnid>" files: files that were unlinked while busy
+ * but persisted because the volume was not cleanly unmounted.
+ *
+ * Runs at most once per mount (guarded by hfs_orphans_cleaned).  Takes the
+ * global shared lock, brackets the work in a journal transaction when the
+ * volume is journaled, and holds the catalog b-tree exclusively while
+ * iterating the private directory's children.
+ */
+__private_extern__
+void
+hfs_remove_orphans(struct hfsmount * hfsmp)
+{
+       struct BTreeIterator * iterator = NULL;
+       struct FSBufferDescriptor btdata;
+       struct HFSPlusCatalogFile filerec;
+       struct HFSPlusCatalogKey * keyp;
+       FCB *fcb;
+       ExtendedVCB *vcb;
+       char filename[32];
+       char tempname[32];
+       size_t namelen;
+       int catlock = 0;
+       int result, started_tr = 0;
+       
+       if (hfsmp->hfs_orphans_cleaned)
+               return;
+
+       vcb = HFSTOVCB(hfsmp);
+       fcb = VTOF(vcb->catalogRefNum);
+
+       btdata.bufferAddress = &filerec;
+       btdata.itemSize = sizeof(filerec);
+       btdata.itemCount = 1;
+
+       /* M_WAITOK: MALLOC cannot fail here, it sleeps until memory is available. */
+       MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+       bzero(iterator, sizeof(*iterator));
+       keyp = (HFSPlusCatalogKey*)&iterator->key;
+       keyp->parentID = hfsmp->hfs_private_metadata_dir;
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if (journal_start_transaction(hfsmp->jnl) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       /* Fix: the iterator was leaked on this early-return path. */
+                       FREE(iterator, M_TEMP);
+                       return;
+           }
+               started_tr = 1;
+       }
+
+       /* Lock catalog b-tree */
+       result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());   
+       if (result)
+               goto exit;
+       catlock = 1;
+
+       /*
+        * Position the iterator at the folder thread record.
+        * (i.e. one record before first child)
+        */
+       result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator);
+       if (result)
+               goto exit;
+
+       /* Visit all the children in the HFS+ private directory. */
+       for (;;) {
+               result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+               if (result)
+                       break;
+               /* Stop once iteration leaves the private directory's key range. */
+               if (keyp->parentID != hfsmp->hfs_private_metadata_dir)
+                       break;
+               if (filerec.recordType != kHFSPlusFileRecord)
+                       continue;
+               
+               (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2,
+                                     filename, &namelen, sizeof(filename), 0, 0);
+               
+               (void) sprintf(tempname, "%s%d", HFS_DELETE_PREFIX, filerec.fileID);
+               
+               /*
+                * Delete all files named "tempxxx", where
+                * xxx is the file's cnid in decimal.
+                *
+                * Delete all files named "iNodexxx", that
+                * have a link count of zero.
+                */
+               // NOTE(review): this compares only the first namelen bytes of the
+               // generated name -- a prefix match, not strict equality; confirm
+               // namelen here always equals strlen(tempname) for true orphans.
+               if (bcmp(tempname, filename, namelen) == 0) {
+                       struct filefork fork = {0};
+                       struct cnode cnode = {0};
+
+                       // XXXdebug
+                       //printf("hfs_remove_orphans: removing %s\n", filename);
+
+                       /* Build a fake cnode */
+                       cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       cnode.c_desc.cd_nameptr = filename;
+                       cnode.c_desc.cd_namelen = namelen;
+                       cnode.c_desc.cd_cnid = filerec.fileID;
+                       cnode.c_attr.ca_fileid = filerec.fileID;
+                       cnode.c_blocks = filerec.dataFork.totalBlocks +
+                                        filerec.resourceFork.totalBlocks;
+
+                       /* Position iterator at previous entry, since the delete below
+                          invalidates the record the iterator currently points at. */
+                       if (BTIterateRecord(fcb, kBTreePrevRecord, iterator,
+                           NULL, NULL) != 0)
+                               break;
+                       
+                       /* Truncate the file to zero (both forks) */
+                       if (filerec.dataFork.totalBlocks > 0) {
+                               fork.ff_cp = &cnode;
+                               cnode.c_datafork = &fork;
+                               bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork));
+                               if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+                                       printf("error truncting data fork!\n");
+                                       break;
+                               }
+                       }
+                       if (filerec.resourceFork.totalBlocks > 0) {
+                               fork.ff_cp = &cnode;
+                               cnode.c_datafork = NULL;
+                               cnode.c_rsrcfork = &fork;
+                               bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork));
+                               if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+                                       printf("error truncting rsrc fork!\n");
+                                       break;
+                               }
+                       }
+
+                       /* Remove the file record from the Catalog */   
+                       if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) {
+                               printf("error deleting cat rec!\n");
+                               break;
+                       }
+                       
+                       /* Update parent and volume counts */   
+                       hfsmp->hfs_privdir_attr.ca_entries--;
+                       (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
+                                        &hfsmp->hfs_privdir_attr, NULL, NULL);
+                       hfs_volupdate(hfsmp, VOL_RMFILE, 0);
+               }
+       }
+       
+exit:
+       /* Unlock catalog b-tree */
+       if (catlock)
+               (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+       if (started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
+
+       FREE(iterator, M_TEMP);
+       /* Marked done even if the scan bailed early; the flag only prevents
+          re-running within this mount. */
+       hfsmp->hfs_orphans_cleaned = 1;
+}
+
 
 /*
  * This will return the correct logical block size for a given vnode.
@@ -860,12 +1124,14 @@ short MacToVFSError(OSErr err)
 
        switch (err) {
        case dskFulErr:                 /*    -34 */
-       case btNoSpaceAvail:            /* -32733 */
+               return ENOSPC;
+       case btNoSpaceAvail:    /* -32733 */
+               return EFBIG;
        case fxOvFlErr:                 /* -32750 */
-               return ENOSPC;          /*    +28 */
+               return EOVERFLOW;
        
        case btBadNode:                 /* -32731 */
-               return EIO;             /*    +5 */
+               return EBADF;
        
        case memFullErr:                /*  -108 */
                return ENOMEM;          /*   +12 */
@@ -885,7 +1151,7 @@ short MacToVFSError(OSErr err)
                return EISDIR;          /*     21 */
        
        case fxRangeErr:                /* -32751 */
-               return EIO;             /*      5 */
+               return ERANGE;
        
        case bdNamErr:                  /*   -37 */
                return ENAMETOOLONG;    /*    63 */
@@ -995,4 +1261,299 @@ hfs_relnamehints(struct cnode *dcp)
 }
 
 
+/*
+ * hfs_early_journal_init - open (or create) the volume's journal at mount
+ * time, before the catalog is usable, using the journalInfoBlock location
+ * recorded in the volume header.
+ *
+ * _args may carry extended mount arguments (journal flags / tbuffer size).
+ * embeddedOffset is the byte offset of the embedded HFS+ volume; mdb_offset
+ * and mdbp, when non-NULL, let us re-read the volume header after journal
+ * replay may have changed it.  Returns 0 on success or an errno; on failure
+ * the caller must not treat the volume as journaled.
+ */
+__private_extern__
+int
+hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+                                          void *_args, int embeddedOffset, int mdb_offset,
+                                          HFSMasterDirectoryBlock *mdbp, struct ucred *cred)
+{
+       JournalInfoBlock *jibp;
+       struct buf       *jinfo_bp, *bp;
+       int               sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+       int               retval, blksize = hfsmp->hfs_phys_block_size;
+       struct vnode     *devvp;
+       struct hfs_mount_args *args = _args;
+
+       devvp = hfsmp->hfs_devvp;
+
+       /* Pick up journal tuning from extended mount args, if supplied. */
+       if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+               arg_flags  = args->journal_flags;
+               arg_tbufsz = args->journal_tbuffer_size;
+       }
+
+       sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize;
+                               
+       /* Read the journal info block (one allocation block). */
+       retval = meta_bread(devvp,
+                                               embeddedOffset/blksize + 
+                                               (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+                                               SWAP_BE32(vhp->blockSize), cred, &jinfo_bp);
+       // NOTE(review): other error paths in this change do "if (bp) brelse(bp)"
+       // after a failed bread; this path returns without releasing jinfo_bp --
+       // confirm whether a brelse is needed here.
+       if (retval)
+               return retval;
+
+       /* Swap the on-disk (big-endian) fields we use to host order in place. */
+       jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+       jibp->flags  = SWAP_BE32(jibp->flags);
+       jibp->offset = SWAP_BE64(jibp->offset);
+       jibp->size   = SWAP_BE64(jibp->size);
+
+       if (jibp->flags & kJIJournalInFSMask) {
+               hfsmp->jvp = hfsmp->hfs_devvp;
+       } else {
+               /* External journal devices are not supported by this code path. */
+               printf("hfs: journal not stored in fs! don't know what to do.\n");
+               brelse(jinfo_bp);
+               return EINVAL;
+       }
+
+       // save this off for the hack-y check in hfs_remove()
+       hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+       if (jibp->flags & kJIJournalNeedInitMask) {
+               /* Fresh journal: create it in place. */
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          jibp->offset + (off_t)embeddedOffset, jibp->size);
+               hfsmp->jnl = journal_create(hfsmp->jvp,
+                                                                       jibp->offset + (off_t)embeddedOffset,
+                                                                       jibp->size,
+                                                                       devvp,
+                                                                       blksize,
+                                                                       arg_flags,
+                                                                       arg_tbufsz,
+                                                                       hfs_sync_metadata, hfsmp->hfs_mp);
+
+               // no need to start a transaction here... if this were to fail
+               // we'd just re-init it on the next mount.
+               // NOTE(review): the NeedInit flag is cleared and written back even
+               // when journal_create() failed (checked only further below), and
+               // only the flags field is swapped back before bwrite -- offset and
+               // size remain in host order.  Verify both against the on-disk format.
+               jibp->flags &= ~kJIJournalNeedInitMask;
+               jibp->flags  = SWAP_BE32(jibp->flags);
+               bwrite(jinfo_bp);
+               jinfo_bp = NULL;
+               jibp     = NULL;
+       } else { 
+               //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+               //         jibp->offset + (off_t)embeddedOffset,
+               //         jibp->size, SWAP_BE32(vhp->blockSize));
+                               
+               /* Existing journal: open it; this replays any pending transactions. */
+               hfsmp->jnl = journal_open(hfsmp->jvp,
+                                                                 jibp->offset + (off_t)embeddedOffset,
+                                                                 jibp->size,
+                                                                 devvp,
+                                                                 blksize,
+                                                                 arg_flags,
+                                                                 arg_tbufsz,
+                                                                 hfs_sync_metadata, hfsmp->hfs_mp);
+
+               brelse(jinfo_bp);
+               jinfo_bp = NULL;
+               jibp     = NULL;
+
+               if (hfsmp->jnl && mdbp) {
+                       // reload the mdb because it could have changed
+                       // if the journal had to be replayed.
+                       retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
+                       if (retval) {
+                               brelse(bp);
+                               printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n",
+                                          retval);
+                               return retval;
+                       }
+                       bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
+                       brelse(bp);
+                       bp = NULL;
+               }
+       }
+
+
+       //printf("journal @ 0x%x\n", hfsmp->jnl);
+       
+       // if we expected the journal to be there and we couldn't
+       // create it or open it then we have to bail out.
+       if (hfsmp->jnl == NULL) {
+               hfsmp->jnl_start = 0;
+               
+               printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+               return EINVAL;
+       }

+       return 0;
+}
+
+
+//
+// This function will go and re-locate the .journal_info_block and
+// the .journal files in case they moved (which can happen if you
+// run Norton SpeedDisk).  If we fail to find either file we just
+// disable journaling for this volume and return.  We turn off the
+// journaling bit in the vcb and assume it will get written to disk
+// later (if it doesn't on the next mount we'd do the same thing
+// again which is harmless).  If we disable journaling we don't
+// return an error so that the volume is still mountable.
+//
+// If the info we find for the .journal_info_block and .journal files
+// isn't what we had stored, we re-set our cached info and proceed
+// with opening the journal normally.
+//
+static int
+hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args)
+{
+       JournalInfoBlock *jibp;
+       struct buf       *jinfo_bp, *bp;
+       int               sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+       int               retval, need_flush = 0, write_jibp = 0;
+       struct vnode     *devvp;
+       struct cat_attr   jib_attr, jattr;
+       struct cat_fork   jib_fork, jfork;
+       ExtendedVCB      *vcb;
+       u_long            fid;
+       struct hfs_mount_args *args = _args;
+       
+       devvp = hfsmp->hfs_devvp;
+       vcb = HFSTOVCB(hfsmp);
+       
+       if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+               if (args->journal_disable) {
+                       return 0;
+               }
+
+               arg_flags  = args->journal_flags;
+               arg_tbufsz = args->journal_tbuffer_size;
+       }
+
+       fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork);
+       if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) {
+               printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n",
+                          jib_fork.cf_extents[0].startBlock);
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+       hfsmp->hfs_jnlinfoblkid = fid;
+
+       // make sure the journal_info_block begins where we think it should.
+       if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) {
+               printf("hfs: The journal_info_block moved (was: %d; is: %d).  Fixing up\n",
+                          SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock);
+
+               vcb->vcbJinfoBlock    = jib_fork.cf_extents[0].startBlock;
+               vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock);
+       }
+
+
+       sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size;
+       retval = meta_bread(devvp,
+                                               vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + 
+                                               (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+                                               SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp);
+       if (retval) {
+               printf("hfs: can't read journal info block. disabling journaling.\n");
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+
+       jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+       jibp->flags  = SWAP_BE32(jibp->flags);
+       jibp->offset = SWAP_BE64(jibp->offset);
+       jibp->size   = SWAP_BE64(jibp->size);
+
+       fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork);
+       if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) {
+               printf("hfs: can't find the journal file! disabling journaling (start: %d)\n",
+                          jfork.cf_extents[0].startBlock);
+               brelse(jinfo_bp);
+               vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+               return 0;
+       }
+       hfsmp->hfs_jnlfileid = fid;
+
+       // make sure the journal file begins where we think it should.
+       if ((jibp->offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) {
+               printf("hfs: The journal file moved (was: %lld; is: %d).  Fixing up\n",
+                          (jibp->offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock);
+
+               jibp->offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize;
+               write_jibp   = 1;
+       }
+
+       // check the size of the journal file.
+       if (jibp->size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) {
+               printf("hfs: The journal file changed size! (was %lld; is %lld).  Fixing up.\n",
+                          jibp->size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize);
+               
+               jibp->size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize;
+               write_jibp = 1;
+       }
+       
+       if (jibp->flags & kJIJournalInFSMask) {
+               hfsmp->jvp = hfsmp->hfs_devvp;
+       } else {
+               printf("hfs: journal not stored in fs! don't know what to do.\n");
+               brelse(jinfo_bp);
+               return EINVAL;
+       }
+
+       // save this off for the hack-y check in hfs_remove()
+       hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+       if (jibp->flags & kJIJournalNeedInitMask) {
+               printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+                          jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, jibp->size);
+               hfsmp->jnl = journal_create(hfsmp->jvp,
+                                                                       jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+                                                                       jibp->size,
+                                                                       devvp,
+                                                                       hfsmp->hfs_phys_block_size,
+                                                                       arg_flags,
+                                                                       arg_tbufsz,
+                                                                       hfs_sync_metadata, hfsmp->hfs_mp);
+
+               // no need to start a transaction here... if this were to fail
+               // we'd just re-init it on the next mount.
+               jibp->flags &= ~kJIJournalNeedInitMask;
+               write_jibp   = 1;
+
+       } else { 
+               //
+               // if we weren't the last person to mount this volume
+               // then we need to throw away the journal because it
+               // is likely that someone else mucked with the disk.
+               // if the journal is empty this is no big deal.  if the
+               // disk is dirty this prevents us from replaying the
+               // journal over top of changes that someone else made.
+               //
+               arg_flags |= JOURNAL_RESET;
+               
+               //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+               //         jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+               //         jibp->size, SWAP_BE32(vhp->blockSize));
+                               
+               hfsmp->jnl = journal_open(hfsmp->jvp,
+                                                                 jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+                                                                 jibp->size,
+                                                                 devvp,
+                                                                 hfsmp->hfs_phys_block_size,
+                                                                 arg_flags,
+                                                                 arg_tbufsz,
+                                                                 hfs_sync_metadata, hfsmp->hfs_mp);
+       }
+                       
+
+       if (write_jibp) {
+               jibp->flags  = SWAP_BE32(jibp->flags);
+               jibp->offset = SWAP_BE64(jibp->offset);
+               jibp->size   = SWAP_BE64(jibp->size);
+
+               bwrite(jinfo_bp);
+       } else {
+               brelse(jinfo_bp);
+       } 
+       jinfo_bp = NULL;
+       jibp     = NULL;
+
+       //printf("journal @ 0x%x\n", hfsmp->jnl);
+       
+       // if we expected the journal to be there and we couldn't
+       // create it or open it then we have to bail out.
+       if (hfsmp->jnl == NULL) {
+               hfsmp->jnl_start = 0;
+               
+               printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+               return EINVAL;
+       }
+
+       return 0;
+}
index 19006da0ea82e25b227cd4441b311913df24e88a..0080c14008fcff282a5628e4fcd3a8e0072b7a9f 100644 (file)
@@ -561,6 +561,17 @@ hfs_setattr(ap)
 
        if (cp->c_flags & (IMMUTABLE | APPEND))
                return (EPERM);
+
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
        /*
         * Go through the fields and update iff not VNOVAL.
         */
@@ -649,6 +660,16 @@ hfs_chmod(vp, mode, cred, p)
        if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord)
                return (0);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (VTOHFS(vp)->jnl && cp && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+                       return EPERM;
+               }
+       }
+
 #if OVERRIDE_UNKNOWN_PERMISSIONS
        if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) {
                return (0);
@@ -915,7 +936,7 @@ hfs_exchange(ap)
        struct hfsmount *hfsmp = VTOHFS(from_vp);
        struct cat_desc tempdesc;
        struct cat_attr tempattr;
-       int error = 0;
+       int error = 0, started_tr = 0, grabbed_lock = 0;
 
        /* The files must be on the same volume. */
        if (from_vp->v_mount != to_vp->v_mount)
@@ -927,6 +948,25 @@ hfs_exchange(ap)
            VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp))
                return (EINVAL);
 
+       // XXXdbg - don't allow modification of the journal or journal_info_block
+       if (hfsmp->jnl) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               if (from_cp->c_datafork) {
+                       extd = &from_cp->c_datafork->ff_data.cf_extents[0];
+                       if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                               return EPERM;
+                       }
+               }
+
+               if (to_cp->c_datafork) {
+                       extd = &to_cp->c_datafork->ff_data.cf_extents[0];
+                       if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                               return EPERM;
+                       }
+               }
+       }
+
        from_rvp = from_cp->c_rsrc_vp;
        to_rvp = to_cp->c_rsrc_vp;
 
@@ -952,6 +992,16 @@ hfs_exchange(ap)
        if (to_rvp)
                (void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto Err_Exit;
+           }
+               started_tr = 1;
+       }
+       
        /* Lock catalog b-tree */
        error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p);
        if (error) goto Err_Exit;
@@ -994,6 +1044,7 @@ hfs_exchange(ap)
         * (except the modify date)
         */
        bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc));
+
        from_cp->c_hint = 0;
        from_cp->c_fileid = from_cp->c_cnid;
        from_cp->c_itime = to_cp->c_itime;
@@ -1031,6 +1082,14 @@ Err_Exit:
        if (from_rvp)
                vrele(from_rvp);
 
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
@@ -1046,7 +1105,6 @@ Err_Exit:
      IN struct proc *p;
 
      */
-
 static int
 hfs_fsync(ap)
        struct vop_fsync_args /* {
@@ -1063,6 +1121,7 @@ hfs_fsync(ap)
        register struct buf *bp;
        struct timeval tv;
        struct buf *nbp;
+       struct hfsmount *hfsmp = VTOHFS(ap->a_vp);
        int s;
        int wait;
        int retry = 0;
@@ -1078,8 +1137,17 @@ hfs_fsync(ap)
         * for regular files write out any clusters
         */
        if (vp->v_flag & VSYSTEM) {
-               if (VTOF(vp)->fcbBTCBPtr != NULL)
-                       BTFlushPath(VTOF(vp));
+           if (VTOF(vp)->fcbBTCBPtr != NULL) {
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               if (BTIsDirty(VTOF(vp))) {
+                                       panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n",
+                                                 vp, hfsmp->jnl);
+                               }
+                       } else {
+                               BTFlushPath(VTOF(vp));
+                       }
+           }
        } else if (UBCINFOEXISTS(vp))
                (void) cluster_push(vp);
 
@@ -1139,11 +1207,27 @@ loop:
                if ((bp->b_flags & B_BUSY))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
-                       panic("hfs_fsync: not dirty");
+                       panic("hfs_fsync: bp 0x%x not dirty (hfsmp 0x%x)", bp, hfsmp);
+               // XXXdbg
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       if ((bp->b_flags & B_META) == 0) {
+                               panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+                                         bp, hfsmp->jnl);
+                       }
+                       // if journal_active() returns >= 0 then the journal is ok and we 
+                       // shouldn't do anything to this locked block (because it is part 
+                       // of a transaction).  otherwise we'll just go through the normal 
+                       // code path and flush the buffer.
+                       if (journal_active(hfsmp->jnl) >= 0) {
+                               continue;
+                       }
+               }
+
                bremfree(bp);
                bp->b_flags |= B_BUSY;
                /* Clear B_LOCKED, should only be set on meta files */
                bp->b_flags &= ~B_LOCKED;
+
                splx(s);
                /*
                 * Wait for I/O associated with indirect blocks to complete,
@@ -1162,7 +1246,9 @@ loop:
                        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0);
                }
 
-               if (vp->v_dirtyblkhd.lh_first) {
+               // XXXdbg -- is checking for hfsmp->jnl == NULL the right
+               //           thing to do?
+               if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) {
                        /* still have some dirty buffers */
                        if (retry++ > 10) {
                                vprint("hfs_fsync: dirty", vp);
@@ -1216,6 +1302,11 @@ hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p)
 
        vp = HFSTOVCB(hfsmp)->catalogRefNum;
 
+       // XXXdbg - don't need to do this on a journaled volume
+       if (hfsmp->jnl) {
+               return 0;
+       }
+
        if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0)
                return (0);
 
@@ -1254,6 +1345,7 @@ hfs_btsync(struct vnode *vp, int sync_transaction)
        register struct buf *bp;
        struct timeval tv;
        struct buf *nbp;
+       struct hfsmount *hfsmp = VTOHFS(vp);
        int s;
 
        /*
@@ -1267,13 +1359,30 @@ loop:
                if ((bp->b_flags & B_BUSY))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
-                       panic("hfs_fsync: not dirty");
+                       panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp);
+
+               // XXXdbg
+               if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+                       if ((bp->b_flags & B_META) == 0) {
+                               panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+                                         bp, hfsmp->jnl);
+                       }
+                       // if journal_active() returns >= 0 then the journal is ok and we 
+                       // shouldn't do anything to this locked block (because it is part 
+                       // of a transaction).  otherwise we'll just go through the normal 
+                       // code path and flush the buffer.
+                       if (journal_active(hfsmp->jnl) >= 0) {
+                           continue;
+                       }
+               }
+
                if (sync_transaction && !(bp->b_flags & B_LOCKED))
                        continue;
 
                bremfree(bp);
                bp->b_flags |= B_BUSY;
                bp->b_flags &= ~B_LOCKED;
+
                splx(s);
 
                (void) bawrite(bp);
@@ -1316,7 +1425,7 @@ hfs_rmdir(ap)
        struct cnode *dcp;
        struct hfsmount * hfsmp;
        struct timeval tv;
-       int error = 0;
+       int error = 0, started_tr = 0, grabbed_lock = 0;
 
        cp = VTOC(vp);
        dcp = VTOC(dvp);
@@ -1327,6 +1436,17 @@ hfs_rmdir(ap)
                vput(vp);
                return (EINVAL);        /* cannot remove "." */
        }
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto out;
+           }
+               started_tr = 1;
+       }
+
        /*
         * Verify the directory is empty (and valid).
         * (Rmdir ".." won't be valid since
@@ -1372,6 +1492,7 @@ hfs_rmdir(ap)
        dcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        (void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID));
 
        cp->c_mode = 0;  /* Makes the vnode go away...see inactive */
@@ -1380,6 +1501,15 @@ out:
        if (dvp) 
                vput(dvp);
        vput(vp);
+
+       // XXXdbg
+       if (started_tr) { 
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
@@ -1415,6 +1545,7 @@ hfs_remove(ap)
        int truncated = 0;
        struct timeval tv;
        int error = 0;
+       int started_tr = 0, grabbed_lock = 0;
 
        /* Redirect directories to rmdir */
        if (vp->v_type == VDIR)
@@ -1435,7 +1566,7 @@ hfs_remove(ap)
            VNODE_IS_RSRC(vp)) {
                error = EPERM;
                goto out;
-        }
+       }
 
        /*
         * Aquire a vnode for a non-empty resource fork.
@@ -1447,6 +1578,17 @@ hfs_remove(ap)
                        goto out;
        }
 
+       // XXXdbg - don't allow deleting the journal or journal_info_block
+       if (hfsmp->jnl && cp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+
+               extd = &cp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       error = EPERM;
+                       goto out;
+               }
+       }
+
        /*
         * Check if this file is being used.
         *
@@ -1470,9 +1612,48 @@ hfs_remove(ap)
                goto out;
        }
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto out;
+           }
+           started_tr = 1;
+       }
+
        /* Remove our entry from the namei cache. */
        cache_purge(vp);
 
+       // XXXdbg - if we're journaled, kill any dirty symlink buffers 
+       if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) {
+           struct buf *bp, *nbp;
+
+         recheck:
+           for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+                       nbp = bp->b_vnbufs.le_next;
+                       
+                       if ((bp->b_flags & B_BUSY)) {
+                               // if it was busy, someone else must be dealing
+                               // with it so just move on.
+                               continue;
+                       }
+
+                       if (!(bp->b_flags & B_META)) {
+                               panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp);
+                       }
+
+                       // if it's part of the current transaction, kill it.
+                       if (bp->b_flags & B_LOCKED) {
+                               bremfree(bp);
+                               bp->b_flags |= B_BUSY;
+                               journal_kill_block(hfsmp->jnl, bp);
+                               goto recheck;
+                       }
+           }
+       }
+       // XXXdbg
+
        /*
         * Truncate any non-busy forks.  Busy forks will
         * get trucated when their vnode goes inactive.
@@ -1535,8 +1716,42 @@ hfs_remove(ap)
                if (error)
                        goto out;
 
+               /* Delete the link record */
                error = cat_delete(hfsmp, &desc, &cp->c_attr);
 
+               if ((error == 0) && (--cp->c_nlink < 1)) {
+                       char inodename[32];
+                       char delname[32];
+                       struct cat_desc to_desc;
+                       struct cat_desc from_desc;
+
+                       /*
+                        * This is now esentially an open deleted file.
+                        * Rename it to reflect this state which makes
+                        * orphan file cleanup easier (see hfs_remove_orphans).
+                        * Note: a rename failure here is not fatal.
+                        */     
+                       MAKE_INODE_NAME(inodename, cp->c_rdev);
+                       bzero(&from_desc, sizeof(from_desc));
+                       from_desc.cd_nameptr = inodename;
+                       from_desc.cd_namelen = strlen(inodename);
+                       from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       from_desc.cd_flags = 0;
+                       from_desc.cd_cnid = cp->c_fileid;
+
+                       MAKE_DELETED_NAME(delname, cp->c_fileid);               
+                       bzero(&to_desc, sizeof(to_desc));
+                       to_desc.cd_nameptr = delname;
+                       to_desc.cd_namelen = strlen(delname);
+                       to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+                       to_desc.cd_flags = 0;
+                       to_desc.cd_cnid = cp->c_fileid;
+       
+                       (void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc,
+                                         &to_desc, (struct cat_desc *)NULL);
+                       cp->c_flag |= C_DELETED;
+               }
+
                /* Unlock the Catalog */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
@@ -1548,8 +1763,9 @@ hfs_remove(ap)
                        goto out;
 
                cp->c_flag |= C_CHANGE;
-                if (--cp->c_nlink < 1)
-                       cp->c_flag |= C_DELETED;
+               tv = time;
+               (void) VOP_UPDATE(vp, &tv, &tv, 0);
+
                hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID));
 
        } else if (dataforkbusy || rsrcforkbusy) {
@@ -1573,12 +1789,16 @@ hfs_remove(ap)
 
                /* Lock catalog b-tree */
                error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-               if (error) goto out;
+               if (error)
+                       goto out;
 
                error = cat_rename(hfsmp, &cp->c_desc, &todir_desc,
                                &to_desc, (struct cat_desc *)NULL);
 
-               hfsmp->hfs_privdir_attr.ca_entries++;
+               // XXXdbg - only bump this count if we were successful
+               if (error == 0) {
+                       hfsmp->hfs_privdir_attr.ca_entries++;
+               }
                (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
                                &hfsmp->hfs_privdir_attr, NULL, NULL);
 
@@ -1588,22 +1808,33 @@ hfs_remove(ap)
 
                cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS;
                --cp->c_nlink;
+               tv = time;
+               (void) VOP_UPDATE(vp, &tv, &tv, 0);
 
        } else /* Not busy */ {
 
-               /* Lock catalog b-tree */
-               error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
-               if (error) goto out;
-
                if (vp->v_type == VDIR && cp->c_entries > 0)
                        panic("hfs_remove: attempting to delete a non-empty directory!");
                if (vp->v_type != VDIR && cp->c_blocks > 0)
                        panic("hfs_remove: attempting to delete a non-empty file!");
 
+               /* Lock catalog b-tree */
+               error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
+               if (error)
+                       goto out;
+
                error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
 
-               if (error && truncated)
-                       panic("hfs_remove: couldn't delete a truncated file!");
+               if (error && error != ENXIO && truncated) {
+                       if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) ||
+                               (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) {
+                               panic("hfs: remove: couldn't delete a truncated file! (%d, data sz %lld; rsrc sz %lld)",
+                                         error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size);
+                       } else {
+                               printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n",
+                                          cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error);
+                       }
+               }
 
                /* Unlock the Catalog */
                (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
@@ -1642,10 +1873,23 @@ hfs_remove(ap)
        if (rvp)
                vrele(rvp);
        VOP_UNLOCK(vp, 0, p);
-       (void) ubc_uncache(vp);
+       // XXXdbg - try to prevent the lost ubc_info panic
+       if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) {
+               (void) ubc_uncache(vp);
+       }
        vrele(vp);
        vput(dvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (0);
+
 out:
        if (rvp)
                vrele(rvp);
@@ -1658,6 +1902,15 @@ out:
        }
        vput(vp);
        vput(dvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (error);
 }
 
@@ -1736,10 +1989,20 @@ hfs_rename(ap)
        struct hfsmount *hfsmp;
        struct proc *p = fcnp->cn_proc;
        struct timeval tv;
-       int retval = 0;
+       int retval = 0, started_tr = 0, grabbed_lock = 0;
+       int fdvp_locked = 0;
+       int fvp_locked = 0;
        cnid_t oldparent = 0;
        cnid_t newparent = 0;
 
+       // XXXdbg
+       if (fvp) 
+           hfsmp = VTOHFS(fvp);
+       else if (tvp)
+           hfsmp = VTOHFS(tvp);
+       else
+           hfsmp = NULL;
+       
 #if HFS_DIAGNOSTIC
     if ((tcnp->cn_flags & HASBUF) == 0 ||
         (fcnp->cn_flags & HASBUF) == 0)
@@ -1780,9 +2043,6 @@ hfs_rename(ap)
                goto abortop;
        }
 
-       if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
-               goto abortop;
-
        /*
         * Make sure "from" vnode and its parent are changeable.
         */
@@ -1790,13 +2050,11 @@ hfs_rename(ap)
        fcp = VTOC(fvp);
        oldparent = fdcp->c_cnid;
        if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = EPERM;
                goto abortop;
        }
 
        if (fcp->c_parentcnid != fdcp->c_cnid) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = EINVAL;
                goto abortop;
        }
@@ -1812,7 +2070,6 @@ hfs_rename(ap)
        if (fvp == ap->a_tvp &&
            (bcmp(fcp->c_desc.cd_nameptr, tcnp->cn_nameptr,
             fcp->c_desc.cd_namelen) == 0)) {
-               VOP_UNLOCK(fvp, 0, p);
                retval = 0;
                goto abortop;
        }
@@ -1829,7 +2086,6 @@ hfs_rename(ap)
                        || fdcp == fcp
                        || (fcnp->cn_flags&ISDOTDOT)
                        || (fcp->c_flag & C_RENAME)) {
-                       VOP_UNLOCK(fvp, 0, p);
                        retval = EINVAL;
                        goto abortop;
                }
@@ -1846,6 +2102,27 @@ hfs_rename(ap)
 
        newparent = tdcp->c_cnid;
        
+       // XXXdbg - don't allow renaming the journal or journal_info_block
+       if (hfsmp->jnl && fcp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+                       
+               extd = &fcp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       retval = EPERM;
+                       goto bad;
+               }
+       }
+
+       if (hfsmp->jnl && tcp && tcp->c_datafork) {
+               struct HFSPlusExtentDescriptor *extd;
+                       
+               extd = &tcp->c_datafork->ff_data.cf_extents[0];
+               if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+                       retval = EPERM;
+                       goto bad;
+               }
+       }
+
        retval = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
        if ((fvp->v_type == VDIR) && (newparent != oldparent)) {
                if (retval)             /* write access check above */
@@ -1853,6 +2130,42 @@ hfs_rename(ap)
        }
        retval = 0;  /* Reset value from above, we dont care about it anymore */
        
+       /* XXX
+        * Prevent lock hierarchy violation (deadlock):
+        *
+        * If fdvp is the parent of tdvp then we must drop
+        * tdvp lock before acquiring the lock for fdvp.
+        *
+        * XXXdbg - moved this to happen up here *before* we
+        *          start a transaction.  otherwise we can
+        *          deadlock because the vnode layer may get
+        *          this lock for someone else and then they'll
+        *          never be able to start a transaction.
+        */
+       if (newparent != oldparent) {
+           if (fdcp->c_cnid == tdcp->c_parentcnid) {
+                       vput(tdvp);
+                       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+                       vget(tdvp, LK_EXCLUSIVE | LK_RETRY, p);
+           } else {
+                       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+               }
+       }
+       fdvp_locked = 1;
+       if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
+               goto bad;
+       fvp_locked = 1;
+       
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((retval = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto bad;
+           }
+               started_tr = 1;
+       }
+
        /*
         * If the destination exists, then be sure its type (file or dir)
         * matches that of the source.  And, if it is a directory make sure
@@ -1904,19 +2217,9 @@ hfs_rename(ap)
 
        }
 
-       /* XXX
-        * Prevent lock heirarchy violation (deadlock):
-        *
-        * If fdvp is the parent of tdvp then we must drop
-        * tdvp lock before aquiring the lock for fdvp.
-        */
-       if (newparent != oldparent)
-               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
-
        /* remove the existing entry from the namei cache: */
        cache_purge(fvp);
 
-       hfsmp = VTOHFS(fvp);
        bzero(&from_desc, sizeof(from_desc));
        from_desc.cd_nameptr = fcnp->cn_nameptr;
        from_desc.cd_namelen = fcnp->cn_namelen;
@@ -1933,18 +2236,18 @@ hfs_rename(ap)
        /* Lock catalog b-tree */
        retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
        if (retval) {
-               if (newparent != oldparent)  /* unlock the lock we just got */
-                       VOP_UNLOCK(fdvp, 0, p);
                 goto bad;
        }
-       retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
-                       &to_desc, &out_desc);
+       retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
+                                               &to_desc, &out_desc);
 
        /* Unlock catalog b-tree */
        (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
-       if (newparent != oldparent)
+       if (newparent != oldparent) {
                VOP_UNLOCK(fdvp, 0, p);
+               fdvp_locked = 0;
+       }
 
        if (retval)  goto bad;
 
@@ -1965,13 +2268,19 @@ hfs_rename(ap)
                fdcp->c_entries--;
        tdcp->c_nlink++;
        tdcp->c_entries++;
-       fdcp->c_flag |= C_UPDATE;
-       tdcp->c_flag |= C_UPDATE;
+       fdcp->c_flag |= C_CHANGE | C_UPDATE;
+       tdcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        CTIMES(fdcp, &tv, &tv);
        CTIMES(tdcp, &tv, &tv);
        tdcp->c_childhint = out_desc.cd_hint;   /* Cache directory's location */
 
+       // make sure both directories get updated on disk.
+       if (fdvp != tdvp) {
+               (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+       }
+       (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE,
                (fdcp->c_cnid == kHFSRootFolderID));
        hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE,
@@ -1980,23 +2289,52 @@ hfs_rename(ap)
        vput(tdvp);
        vrele(fdvp);
        vput(fvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (0);
 
 bad:
        if (fcp)
                fcp->c_flag &= ~C_RENAME;
+
+       // XXXdbg make sure both directories get updated on disk.
+       if (fdvp != tdvp) {
+               (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+       }
+       (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
        if (tdvp == tvp)
                vrele(tdvp);
        else
                vput(tdvp);
        if (tvp)
                vput(tvp);
-       vrele(fdvp);
 
-       if (VOP_ISLOCKED(fvp))
+       if (fdvp_locked)
+               vput(fdvp);
+       else
+               vrele(fdvp);
+
+       if (fvp_locked)
                vput(fvp);
        else
                vrele(fvp);
+
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+       }
+
        return (retval);
 
 abortop:
@@ -2011,6 +2349,7 @@ abortop:
        VOP_ABORTOP(fdvp, fcnp);
        vrele(fdvp);
        vrele(fvp);
+
        return (retval);
 }
 
@@ -2079,6 +2418,7 @@ hfs_symlink(ap)
        } */ *ap;
 {
        register struct vnode *vp, **vpp = ap->a_vpp;
+       struct hfsmount *hfsmp;
        struct filefork *fp;
        int len, error;
        struct buf *bp = NULL;
@@ -2097,16 +2437,31 @@ hfs_symlink(ap)
                return (EINVAL);
        }
 
+
+       hfsmp = VTOHFS(ap->a_dvp);
+
        /* Create the vnode */
        if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode,
-           ap->a_dvp, vpp, ap->a_cnp)))
+                                                         ap->a_dvp, vpp, ap->a_cnp))) {
                return (error);
+       }
 
        vp = *vpp;
        len = strlen(ap->a_target);
        fp = VTOF(vp);
        fp->ff_clumpsize = VTOVCB(vp)->blockSize;
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
+                       vput(ap->a_dvp);
+                       return (error);
+           }
+       }
+
        /* Allocate space for the link */
        error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL,
                              ap->a_cnp->cn_cred, ap->a_cnp->cn_proc);
@@ -2116,10 +2471,21 @@ hfs_symlink(ap)
        /* Write the link to disk */
        bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size),
                        0, 0, BLK_META);
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, bp);
+       }
        bzero(bp->b_data, bp->b_bufsize);
        bcopy(ap->a_target, bp->b_data, len);
-       bawrite(bp);
+       if (hfsmp->jnl) {
+               journal_modify_block_end(hfsmp->jnl, bp);
+       } else {
+               bawrite(bp);
+       }
 out:
+       if (hfsmp->jnl) {
+               journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
        vput(vp);
        return (error);
 }
@@ -2207,11 +2573,41 @@ hfs_readdir(ap)
        off_t off = uio->uio_offset;
        int retval = 0;
        int eofflag = 0;
-
+       void *user_start = NULL;
+       int   user_len;
        /* We assume it's all one big buffer... */
        if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE)
                return EINVAL;
 
+       // XXXdbg
+       // We have to lock the user's buffer here so that we won't
+       // fault on it after we've acquired a shared lock on the
+       // catalog file.  The issue is that you can get a 3-way
+       // deadlock if someone else starts a transaction and then
+       // tries to lock the catalog file but can't because we're
+       // here and we can't service our page fault because VM is
+       // blocked trying to start a transaction as a result of
+       // trying to free up pages for our page fault.  It's messy
+       // but it does happen on dual-procesors that are paging
+       // heavily (see radar 3082639 for more info).  By locking
+       // the buffer up-front we prevent ourselves from faulting
+       // while holding the shared catalog file lock.
+       //
+       // Fortunately this and hfs_search() are the only two places
+       // currently (10/30/02) that can fault on user data with a
+       // shared lock on the catalog file.
+       //
+       if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) {
+               user_start = uio->uio_iov->iov_base;
+               user_len   = uio->uio_iov->iov_len;
+
+               if ((retval = vslock(user_start, user_len)) != 0) {
+                       return retval;
+               }
+       }
+
+
        /* Create the entries for . and .. */
        if (uio->uio_offset < sizeof(rootdots)) {
                caddr_t dep;
@@ -2297,6 +2693,10 @@ hfs_readdir(ap)
        }
 
 Exit:;
+       if (hfsmp->jnl && user_start) {
+               vsunlock(user_start, user_len, TRUE);
+       }
+
        if (ap->a_eofflag)
                *ap->a_eofflag = eofflag;
 
@@ -2359,7 +2759,9 @@ hfs_readlink(ap)
                }
                bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size);
                if (bp) {
-                       bp->b_flags |= B_INVAL;         /* data no longer needed */
+                       if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) {
+                               bp->b_flags |= B_INVAL;         /* data no longer needed */
+                       }
                        brelse(bp);
                }
        }
@@ -2693,8 +3095,11 @@ hfs_update(ap)
        struct cat_fork *rsrcforkp = NULL;
        struct cat_fork datafork;
        int updateflag;
+       struct hfsmount *hfsmp;
        int error;
 
+       hfsmp = VTOHFS(vp);
+
        /* XXX do we really want to clear the sytem cnode flags here???? */
        if ((vp->v_flag & VSYSTEM) ||
            (VTOVFS(vp)->mnt_flag & MNT_RDONLY) ||
@@ -2706,11 +3111,13 @@ hfs_update(ap)
        updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE);
 
        /* Nothing to update. */
-       if (updateflag == 0)
+       if (updateflag == 0) {
                return (0);
+       }
        /* HFS standard doesn't have access times. */
-       if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord))
+       if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) {
                return (0);
+       }
        if (updateflag & C_ACCESS) {
                /*
                 * If only the access time is changing then defer
@@ -2764,12 +3171,24 @@ hfs_update(ap)
            (dataforkp && cp->c_datafork->ff_unallocblocks) ||
            (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) {
                if (updateflag & (C_CHANGE | C_UPDATE))
-                       hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);       
+                       hfs_volupdate(hfsmp, VOL_UPDATE, 0);    
                cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE);
                cp->c_flag |= C_MODIFIED;
+
                return (0);
        }
 
+
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       if (hfsmp->jnl) {
+               if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       hfs_global_shared_lock_release(hfsmp);
+                       return error;
+           }
+       }
+                       
+
        /*
         * For files with invalid ranges (holes) the on-disk
         * field representing the size of the file (cf_size)
@@ -2786,18 +3205,29 @@ hfs_update(ap)
         * A shared lock is sufficient since an update doesn't change
         * the tree and the lock on vp protects the cnode.
         */
-       error = hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_SHARED, p);
-       if (error)
+       error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p);
+       if (error) {
+               if (hfsmp->jnl) {
+                       journal_end_transaction(hfsmp->jnl);
+               }
+               hfs_global_shared_lock_release(hfsmp);
                return (error);
+       }
 
        /* XXX - waitfor is not enforced */
-       error = cat_update(VTOHFS(vp), &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
+       error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
 
         /* Unlock the Catalog b-tree file. */
-       (void) hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_RELEASE, p);
+       (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
 
        if (updateflag & (C_CHANGE | C_UPDATE))
-               hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);       
+               hfs_volupdate(hfsmp, VOL_UPDATE, 0);    
+
+       // XXXdbg
+       if (hfsmp->jnl) {
+           journal_end_transaction(hfsmp->jnl);
+       }
+       hfs_global_shared_lock_release(hfsmp);
 
        /* After the updates are finished, clear the flags */
        cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD);
@@ -2826,7 +3256,7 @@ hfs_makenode(mode, dvp, vpp, cnp)
        struct proc *p;
        struct cat_desc in_desc, out_desc;
        struct cat_attr attr;
-       int error;
+       int error, started_tr = 0, grabbed_lock = 0;
        enum vtype vnodetype;
 
        p = cnp->cn_proc;
@@ -2902,6 +3332,16 @@ hfs_makenode(mode, dvp, vpp, cnp)
        in_desc.cd_parentcnid = dcp->c_cnid;
        in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0;
 
+       // XXXdbg
+       hfs_global_shared_lock_acquire(hfsmp);
+       grabbed_lock = 1;
+       if (hfsmp->jnl) {
+           if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+                       goto exit;
+           }
+               started_tr = 1;
+       }
+
        /* Lock catalog b-tree */
        error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p);
        if (error)
@@ -2921,14 +3361,37 @@ hfs_makenode(mode, dvp, vpp, cnp)
        dcp->c_flag |= C_CHANGE | C_UPDATE;
        tv = time;
        (void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
        hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE,
                (dcp->c_cnid == kHFSRootFolderID));
 
+       // XXXdbg
+       // have to end the transaction here before we call hfs_getnewvnode()
+       // because that can cause us to try and reclaim a vnode on a different
+       // file system which could cause us to start a transaction which can
+       // deadlock with someone on that other file system (since we could be
+       // holding two transaction locks as well as various vnodes and we did
+       // not obtain the locks on them in the proper order).
+    //
+       // NOTE: this means that if the quota check fails or we have to update
+       //       the change time on a block-special device that those changes
+       //       will happen as part of independent transactions.
+       //
+       if (started_tr) {
+               journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        /* Create a vnode for the object just created: */
        error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp);
        if (error)
                goto exit;
 
+
 #if QUOTA
        cp = VTOC(tvp);
        /* 
@@ -2945,6 +3408,7 @@ hfs_makenode(mode, dvp, vpp, cnp)
                        VOP_RMDIR(dvp,tvp, cnp);
                else
                        VOP_REMOVE(dvp,tvp, cnp);
+
                return (error);
        }
 #endif /* QUOTA */
@@ -2960,8 +3424,8 @@ hfs_makenode(mode, dvp, vpp, cnp)
                tvp->v_type = IFTOVT(mode);
                cp->c_flag |= C_CHANGE;
                tv = time;
-               if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
-                       vput(tvp);
+               if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
+                       vput(tvp);
                        goto exit;
                }
        }
@@ -2974,6 +3438,16 @@ exit:
                FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
        vput(dvp);
 
+       // XXXdbg
+       if (started_tr) {
+           journal_end_transaction(hfsmp->jnl);
+               started_tr = 0;
+       }
+       if (grabbed_lock) {
+               hfs_global_shared_lock_release(hfsmp);
+               grabbed_lock = 0;
+       }
+
        return (error);
 }
 
index 12c2680afe2adcf32449fb4c2d025dfb6dd3dbf7..65c12839f4cd882fef81e68ccab1fb6a98204055 100644 (file)
@@ -339,6 +339,20 @@ OSStatus   BTOpenPath                      (FCB                                    *filePtr,
        err = ReleaseNode (btreePtr, &nodeRec);
        M_ExitOnError (err);
 
+       /*
+        * Under Mac OS, b-tree nodes can be non-contiguous on disk when the
+        * allocation block size is smaller than the b-tree node size.
+        *
+        * If journaling is turned on for this volume we can't deal with this
+        * situation and so we bail out.  If journaling isn't on it's ok as
+        * hfs_strategy_fragmented() deals with it.  Journaling can't support
+        * this because it assumes that if you give it a block that it's
+        * contiguous on disk.
+        */
+       if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) {
+               return fsBTInvalidNodeErr;
+       }
+
        //////////////////////////////// Success ////////////////////////////////////
 
        //\80\80 align LEOF to multiple of node size?       - just on close
@@ -456,6 +470,9 @@ OSStatus    BTSearchRecord          (FCB                                            *filePtr,
        if (filePtr == nil)                                                                     return  paramErr;
        if (searchIterator == nil)                                                      return  paramErr;
 
+       node.buffer = nil;
+       node.blockHeader = nil;
+
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
        if (btreePtr == nil)                                                            return  fsBTInvalidFileErr;
 
@@ -629,9 +646,12 @@ OSStatus   BTIterateRecord         (FCB                                            *filePtr,
 
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
-       left.buffer             = nil;
-       right.buffer    = nil;
-       node.buffer             = nil;
+       left.buffer               = nil;
+       left.blockHeader  = nil;
+       right.buffer      = nil;
+       right.blockHeader = nil;
+       node.buffer               = nil;
+       node.blockHeader  = nil;
 
 
        if (filePtr == nil)
@@ -928,9 +948,12 @@ BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator
 
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
-       left.buffer  = nil;
-       right.buffer = nil;
-       node.buffer  = nil;
+       left.buffer       = nil;
+       left.blockHeader  = nil;
+       right.buffer      = nil;
+       right.blockHeader = nil;
+       node.buffer       = nil;
+       node.blockHeader  = nil;
 
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
 
@@ -1201,10 +1224,10 @@ OSStatus        BTInsertRecord          (FCB                                            *filePtr,
        UInt16                                  index;
        Boolean                                 recordFit;
 
-
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        err = CheckInsertParams (filePtr, iterator, record, recordLen);
        if (err != noErr)
@@ -1241,6 +1264,9 @@ OSStatus  BTInsertRecord          (FCB                                            *filePtr,
                                                                err = GetNewNode (btreePtr, insertNodeNum, &nodeRec);
                                                                M_ExitOnError (err);
 
+                                                               // XXXdbg
+                                                               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                                                                ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode;
                                                                ((NodeDescPtr)nodeRec.buffer)->height   = 1;
 
@@ -1261,6 +1287,7 @@ OSStatus  BTInsertRecord          (FCB                                            *filePtr,
                                                                btreePtr->rootNode                      = insertNodeNum;
                                                                btreePtr->firstLeafNode         = insertNodeNum;
                                                                btreePtr->lastLeafNode          = insertNodeNum;
+
                                                                M_BTreeHeaderDirty (btreePtr);
 
                                                                goto Success;
@@ -1270,6 +1297,9 @@ OSStatus  BTInsertRecord          (FCB                                            *filePtr,
 
        if (index > 0)
        {
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index,
                                                                                &iterator->key, KeyLength(btreePtr, &iterator->key),
                                                                                record->bufferAddress, recordLen);
@@ -1308,7 +1338,7 @@ Success:
        ++btreePtr->writeCount;
        ++btreePtr->leafRecords;
        M_BTreeHeaderDirty (btreePtr);
-
+               
        // create hint
        iterator->hint.writeCount       = btreePtr->writeCount;
        iterator->hint.nodeNum          = insertNodeNum;
@@ -1359,6 +1389,7 @@ OSStatus  BTReplaceRecord         (FCB                                            *filePtr,
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        err = CheckInsertParams (filePtr, iterator, record, recordLen);
        if (err != noErr)
@@ -1380,6 +1411,9 @@ OSStatus  BTReplaceRecord         (FCB                                            *filePtr,
                err = GetNode (btreePtr, insertNodeNum, &nodeRec);
                if( err == noErr )
                {
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                        err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
                        M_ExitOnError (err);
 
@@ -1415,6 +1449,9 @@ OSStatus  BTReplaceRecord         (FCB                                            *filePtr,
        // optimization - if simple replace will work then don't extend btree
        // \80\80 if we tried this before, and failed because it wouldn't fit then we shouldn't try this again...
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
        err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
        M_ExitOnError (err);
 
@@ -1441,6 +1478,9 @@ OSStatus  BTReplaceRecord         (FCB                                            *filePtr,
        }
 
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
        DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record
 
        err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress,
@@ -1498,6 +1538,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator,
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
 
@@ -1521,6 +1562,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator,
                                err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
                                M_ExitOnError (err);
 
+                               // XXXdbg
+                               ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
                                err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
                                M_ExitOnError (err);
 
@@ -1553,6 +1597,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator,
        err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
        M_ExitOnError (err);
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+                                                               
        err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
        M_ExitOnError (err);
 
@@ -1600,6 +1647,7 @@ OSStatus  BTDeleteRecord          (FCB                                            *filePtr,
        ////////////////////////// Priliminary Checks ///////////////////////////////
 
        nodeRec.buffer = nil;                                   // so we can call ReleaseNode
+       nodeRec.blockHeader = nil;
 
        M_ReturnErrorIf (filePtr == nil,        paramErr);
        M_ReturnErrorIf (iterator == nil,       paramErr);
@@ -1630,7 +1678,7 @@ OSStatus  BTDeleteRecord          (FCB                                            *filePtr,
        ++btreePtr->writeCount;
        --btreePtr->leafRecords;
        M_BTreeHeaderDirty (btreePtr);
-
+               
        iterator->hint.nodeNum  = 0;
 
        return noErr;
@@ -1682,7 +1730,16 @@ OSStatus BTGetInformation        (FCB                                    *filePtr,
        return noErr;
 }
 
+// XXXdbg
+__private_extern__
+OSStatus
+BTIsDirty(FCB *filePtr)
+{
+       BTreeControlBlockPtr    btreePtr;
 
+       btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+       return TreeIsDirty(btreePtr);
+}
 
 /*-------------------------------------------------------------------------------
 Routine:       BTFlushPath     -       Flush BTreeControlBlock to Header Node.
@@ -1743,6 +1800,9 @@ BTReloadData(FCB *filePtr)
        BTHeaderRec *header;    
 
 
+       node.buffer = nil;
+       node.blockHeader = nil;
+
        btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
        if (btreePtr == nil)
                return (fsBTInvalidFileErr);
@@ -1877,3 +1937,62 @@ OSStatus BTSetLastSync           (FCB                                    *filePtr,
 }
 
 
+/*-------------------------------------------------------------------------------
+Routine:       BTCheckFreeSpace
+
+Function:      Makes sure there is enough free space so that a tree operation
+            will succeed.
+
+Input:         fcb     - pointer file control block
+
+Output:                none
+
+Result:                noErr                   - success
+            
+-------------------------------------------------------------------------------*/
+
+__private_extern__
+OSStatus       BTCheckFreeSpace                (FCB                                    *filePtr)
+{
+       BTreeControlBlockPtr    btreePtr;
+       int                                     nodesNeeded, err = noErr;
+
+
+       M_ReturnErrorIf (filePtr == nil,        paramErr);
+
+       btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+       
+       REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+       M_ReturnErrorIf (btreePtr == nil,       fsBTInvalidFileErr);
+
+       // XXXdbg this is highly conservative but so much better than
+       //        winding up with turds on your disk.
+       //
+       nodesNeeded = (btreePtr->treeDepth + 1) * 10;
+       
+       if (btreePtr->freeNodes < nodesNeeded) {
+               err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes);
+       }
+
+       return err;
+}
+
+
+__private_extern__
+OSStatus       BTHasContiguousNodes    (FCB                                    *filePtr)
+{
+       BTreeControlBlockPtr    btreePtr;
+       int                                     nodesNeeded, err = noErr;
+
+
+       M_ReturnErrorIf (filePtr == nil,        paramErr);
+
+       btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+       
+       REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+       M_ReturnErrorIf (btreePtr == nil,       fsBTInvalidFileErr);
+
+       return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize);
+}
index 60cfa06359546d76d8eb1aa048ad3e429e6732ce..a902d5087aef9ac162c5a802fe6883333f771171 100644 (file)
@@ -125,12 +125,16 @@ OSStatus  AllocateNode (BTreeControlBlockPtr              btreePtr, UInt32        *nodeNum)
        nodeNumber              = 0;                            // first node number of header map record
        node.buffer             = nil;                          // clear node.buffer to get header node
                                                                                //      - and for ErrorExit
+       node.blockHeader = nil;
        
        while (true)
        {
                err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize);
                M_ExitOnError (err);
                
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &node);
+                                                               
        //////////////////////// Find Word with Free Bit ////////////////////////////
 
                pos             = mapPtr;
@@ -233,6 +237,7 @@ OSStatus    FreeNode (BTreeControlBlockPtr          btreePtr, UInt32        nodeNum)
        //////////////////////////// Find Map Record ////////////////////////////////
        nodeIndex                       = 0;                            // first node number of header map record
        node.buffer                     = nil;                          // invalidate node.buffer to get header node
+       node.blockHeader    = nil;
        
        while (nodeNum >= nodeIndex)
        {
@@ -244,6 +249,9 @@ OSStatus    FreeNode (BTreeControlBlockPtr          btreePtr, UInt32        nodeNum)
        
        //////////////////////////// Mark Node Free /////////////////////////////////
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &node);
+                                                               
        nodeNum -= (nodeIndex - (mapSize << 3));                        // relative to this map record
        bitOffset = 15 - (nodeNum & 0x0000000F);                        // last 4 bits are bit offset
        mapPos += nodeNum >> 4;                                                         // point to word containing map bit
@@ -319,7 +327,9 @@ OSStatus    ExtendBTree     (BTreeControlBlockPtr   btreePtr,
        filePtr                         = GetFileControlBlock(btreePtr->fileRefNum);
        
        mapNode.buffer          = nil;
+       mapNode.blockHeader = nil;
        newNode.buffer          = nil;
+       newNode.blockHeader = nil;
 
        mapNodeRecSize  = nodeSize - sizeof(BTNodeDescriptor) - 6;      // 2 bytes of free space (see note)
 
@@ -379,6 +389,8 @@ OSStatus    ExtendBTree     (BTreeControlBlockPtr   btreePtr,
        
 
        /////////////////////// Initialize New Map Nodes ////////////////////////////
+       // XXXdbg - this is the correct place for this:
+       ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
 
        ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum;
 
@@ -388,6 +400,9 @@ OSStatus    ExtendBTree     (BTreeControlBlockPtr   btreePtr,
                err = GetNewNode (btreePtr, nodeNum, &newNode);
                M_ExitOnError (err);
                
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, &newNode);
+
                ((NodeDescPtr)newNode.buffer)->numRecords       = 1;
                ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode;
                
@@ -428,6 +443,9 @@ OSStatus    ExtendBTree     (BTreeControlBlockPtr   btreePtr,
                        err = GetNode (btreePtr, nextNodeNum, &mapNode);
                        M_ExitOnError (err);
                        
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
+
                        mapIndex = 0;
                        
                        mapStart         = (UInt16 *) GetRecordAddress (btreePtr, mapNode.buffer, mapIndex);
@@ -476,7 +494,7 @@ Success:
        ////////////////////////////// Error Exit ///////////////////////////////////
 
 ErrorExit:
-
+       
        (void) ReleaseNode (btreePtr, &mapNode);
        (void) ReleaseNode (btreePtr, &newNode);
        
index c71fab021edbb20f2adddc0a672c59be5c2d8158..7d56bf4f8f2801160db42fd476cd8c75f08b2f7b 100644 (file)
@@ -209,6 +209,14 @@ OSStatus   VerifyHeader    (FCB                            *filePtr,
 
 
 
+__private_extern__
+OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr)
+{
+    return (btreePtr->flags & kBTHeaderDirty);
+}
+
+
+
 /*-------------------------------------------------------------------------------
 Routine:       UpdateHeader    -       Write BTreeInfoRec fields to Header node.
 
@@ -229,15 +237,18 @@ OSStatus UpdateHeader(BTreeControlBlockPtr btreePtr, Boolean forceWrite)
        BTHeaderRec     *header;        
        UInt32 options;
 
-
        if ((btreePtr->flags & kBTHeaderDirty) == 0)                    // btree info already flushed
        return  noErr;
        
        
        err = GetNode (btreePtr, kHeaderNodeNum, &node );
-       if (err != noErr)
+       if (err != noErr) {
                return  err;
+       }
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &node);
+
        header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor));
        
        header->treeDepth               = btreePtr->treeDepth;
@@ -315,8 +326,11 @@ OSStatus   FindIteratorPosition    (BTreeControlBlockPtr    btreePtr,
        // assume foundRecord points to Boolean
        
        left->buffer            = nil;
+       left->blockHeader   = nil;
        middle->buffer          = nil;
+       middle->blockHeader     = nil;
        right->buffer           = nil;
+       right->blockHeader      = nil;
        
        foundIt                         = false;
        
index 014069807e94da6d338a15b17565f3d83770eb5d..8cc50aaa18d8d41629ac7088b34994a6081c0a0e 100644 (file)
@@ -221,7 +221,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr )
        // release old buffer if we have one
        if ( theScanStatePtr->bufferPtr != NULL )
        {
-               theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
+           theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
                brelse( theScanStatePtr->bufferPtr );
                theScanStatePtr->bufferPtr = NULL;
                theScanStatePtr->currentNodePtr = NULL;
@@ -249,10 +249,10 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr )
        
        // now read blocks from the device 
        myErr = bread(  myDevPtr, 
-                                       myPhyBlockNum, 
-                                       myBufferSize,  
-                                       NOCRED, 
-                                       &theScanStatePtr->bufferPtr );
+                                                       myPhyBlockNum, 
+                                                       myBufferSize,  
+                                                       NOCRED, 
+                                                       &theScanStatePtr->bufferPtr );
        if ( myErr != E_NONE )
        {
                goto ExitThisRoutine;
@@ -374,7 +374,7 @@ int  BTScanTerminate(       BTScanState *           scanState,
        if ( scanState->bufferPtr != NULL )
        {
                scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE);
-               brelse( scanState->bufferPtr ); 
+               brelse( scanState->bufferPtr );
                scanState->bufferPtr = NULL;
                scanState->currentNodePtr = NULL;
        }
index 2de28032106bc6526f1551ac41f3b6a563bb8843..3a8463911628ac01863caf5fa273401276c94a23 100644 (file)
@@ -395,13 +395,17 @@ OSStatus  InsertLevel (BTreeControlBlockPtr                btreePtr,
        PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), "\P InsertLevel: non-leaf at level 1! ");
 #endif
        leftNode.buffer = nil;
+       leftNode.blockHeader = nil;
        targetNodeNum = treePathTable [level].node;
 
        insertParent = false;
        updateParent = false;
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
        ////// process first insert //////
-       
+
        err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index,
                                          &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot );
        M_ExitOnError (err);
@@ -446,6 +450,9 @@ OSStatus    InsertLevel (BTreeControlBlockPtr                btreePtr,
                UInt8 *                         recPtr;
                UInt16                          recSize;
                
+               parentNode.buffer = nil;
+               parentNode.blockHeader = nil;
+
                secondaryKey = nil;
                
                PanicIf ( (level == btreePtr->treeDepth), "\p InsertLevel: unfinished insert!?");
@@ -468,6 +475,9 @@ OSStatus    InsertLevel (BTreeControlBlockPtr                btreePtr,
        
                if ( updateParent )
                {
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
                        //\80\80 debug: check if ptr == targetNodeNum
                        GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
                        PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p InsertLevel: parent ptr doesn't match target node!");
@@ -594,6 +604,8 @@ static OSErr        InsertNode      (BTreeControlBlockPtr    btreePtr,
                {
                        err = GetNode (btreePtr, leftNodeNum, leftNode);        // will be released by caller or a split below
                        M_ExitOnError (err);
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, leftNode);
                }
 
                PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, "\p InsertNode, RotateLeft: invalid sibling link!" );
@@ -642,7 +654,6 @@ static OSErr        InsertNode      (BTreeControlBlockPtr    btreePtr,
        return noErr;
 
 ErrorExit:
-
        (void) ReleaseNode (btreePtr, leftNode);
        return err;
        
@@ -678,7 +689,11 @@ OSStatus   DeleteTree                      (BTreeControlBlockPtr            btreePtr,
        Boolean                         deleteRequired;
        Boolean                         updateRequired;
 
-
+       // XXXdbg - initialize these to null in case we get an
+       //          error and try to exit before it's initialized
+       parentNode.buffer      = nil;   
+       parentNode.blockHeader = nil;
+       
        deleteRequired = false;
        updateRequired = false;
 
@@ -686,6 +701,9 @@ OSStatus    DeleteTree                      (BTreeControlBlockPtr            btreePtr,
        targetNodePtr = targetNode->buffer;
        PanicIf (targetNodePtr == nil, "\pDeleteTree: targetNode has nil buffer!");
 
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
        DeleteRecord (btreePtr, targetNodePtr, index);
                
        //\80\80 coalesce remaining records?
@@ -697,6 +715,9 @@ OSStatus    DeleteTree                      (BTreeControlBlockPtr            btreePtr,
 
                deleteRequired = true;
                
+               siblingNode.buffer = nil;
+               siblingNode.blockHeader = nil;
+
                ////////////////// Get Siblings & Update Links //////////////////////////
                
                siblingNodeNum = targetNodePtr->bLink;                          // Left Sibling Node
@@ -704,6 +725,10 @@ OSStatus   DeleteTree                      (BTreeControlBlockPtr            btreePtr,
                {
                        err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
                        M_ExitOnError (err);
+
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
                        ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink;
                        err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
                        M_ExitOnError (err);
@@ -718,6 +743,10 @@ OSStatus   DeleteTree                      (BTreeControlBlockPtr            btreePtr,
                {
                        err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
                        M_ExitOnError (err);
+
+                       // XXXdbg
+                       ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
                        ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink;
                        err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
                        M_ExitOnError (err);
@@ -733,6 +762,7 @@ OSStatus    DeleteTree                      (BTreeControlBlockPtr            btreePtr,
                
                err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction);
                M_ExitOnError (err);
+
                err = FreeNode (btreePtr, targetNodeNum);
                M_ExitOnError (err);
        }
@@ -776,6 +806,9 @@ OSStatus    DeleteTree                      (BTreeControlBlockPtr            btreePtr,
                         UInt16         recSize;
                         UInt32         insertNode;
                         
+                        // XXXdbg
+                        ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
                        //\80\80 debug: check if ptr == targetNodeNum
                        GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
                        PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p DeleteTree: parent ptr doesn't match targetNodeNum!!");
@@ -805,7 +838,7 @@ OSStatus    DeleteTree                      (BTreeControlBlockPtr            btreePtr,
        return  noErr;
 
 ErrorExit:
-       
+
        (void) ReleaseNode (btreePtr, targetNode);
        (void) ReleaseNode (btreePtr, &parentNode);
 
@@ -826,6 +859,9 @@ static OSStatus     CollapseTree    (BTreeControlBlockPtr           btreePtr,
        
        originalRoot    = btreePtr->rootNode;
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
+
        while (true)
        {
                if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1)
@@ -848,6 +884,9 @@ static OSStatus     CollapseTree    (BTreeControlBlockPtr           btreePtr,
                //// Get New Root Node
                err = GetNode (btreePtr, btreePtr->rootNode, blockPtr);
                M_ExitOnError (err);
+
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
        }
        
        if (btreePtr->rootNode != originalRoot)
@@ -1110,6 +1149,9 @@ static OSStatus   SplitLeft               (BTreeControlBlockPtr            btreePtr,
 
        if ( left != nil )
        {
+               // XXXdbg
+               ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
                left->fLink     = newNodeNum;
                err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction);
                M_ExitOnError (err);
@@ -1121,6 +1163,9 @@ static OSStatus   SplitLeft               (BTreeControlBlockPtr            btreePtr,
        err = GetNewNode (btreePtr, newNodeNum, leftNode);
        M_ExitOnError (err);
        
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
        left            = leftNode->buffer;
        left->fLink     = rightNodeNum;
        
@@ -1145,8 +1190,9 @@ static OSStatus   SplitLeft               (BTreeControlBlockPtr            btreePtr,
 
        err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize,
                                          insertIndex, insertNodeNum, &recordFit, recsRotated);
-       M_ExitOnError (err);
        
+       M_ExitOnError (err);
+
        return noErr;
        
 ErrorExit:
@@ -1202,6 +1248,9 @@ static OSStatus   AddNewRootNode  (BTreeControlBlockPtr    btreePtr,
        Boolean                         didItFit;
        UInt16                          keyLength;      
        
+       rootNode.buffer = nil;
+       rootNode.blockHeader = nil;
+
        PanicIf (leftNode == nil, "\pAddNewRootNode: leftNode == nil");
        PanicIf (rightNode == nil, "\pAddNewRootNode: rightNode == nil");
        
@@ -1214,6 +1263,9 @@ static OSStatus   AddNewRootNode  (BTreeControlBlockPtr    btreePtr,
        err = GetNewNode (btreePtr, rootNum, &rootNode);
        M_ExitOnError (err);
                
+       // XXXdbg
+       ModifyBlockStart(btreePtr->fileRefNum, &rootNode);
+
        ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode;
        ((NodeDescPtr)rootNode.buffer)->height  = ++btreePtr->treeDepth;
        
index 923e90334bf855d5d4ff84105419b9f17e8f70a2..44e5996a0ac604b43a27f814ff73be24fcec148b 100644 (file)
@@ -65,6 +65,9 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param
        err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL);
        ReturnIfError(err);
 
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       ReturnIfError(err);
+       
        if ( isHFSPlus )
        {
                //--    Step 1: Check the catalog nodes for extents
index b294edd9ada7c28ab253a7502d32a4e0fd7af9b1..6831d79c0c8898162afa9045fd8ecca1d4aef086 100644 (file)
@@ -495,6 +495,12 @@ static OSErr CreateExtentRecord(
        
        err = noErr;
        *hint = 0;
+
+       // XXXdbg - preflight that there's enough space
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       if (err)
+               return err;
+
        MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
        bzero(btIterator, sizeof(*btIterator));
        
@@ -530,6 +536,8 @@ static OSErr CreateExtentRecord(
        if (err == noErr)
                *hint = btIterator->hint.nodeNum;
 
+       (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+       
        FREE(btIterator, M_TEMP);       
        return err;
 }
@@ -545,6 +553,12 @@ OSErr DeleteExtentRecord(
        OSErr                           err;
        
        err = noErr;
+
+       // XXXdbg - preflight that there's enough space
+       err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+       if (err)
+               return err;
+
        MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
        bzero(btIterator, sizeof(*btIterator));
        
@@ -569,7 +583,8 @@ OSErr DeleteExtentRecord(
        }
 
        err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator);
-
+       (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+       
        FREE(btIterator, M_TEMP);       
        return err;
 }
@@ -1730,6 +1745,12 @@ static OSErr UpdateExtentRecord (
                //      Need to find and change a record in Extents BTree
                //
                btFCB = GetFileControlBlock(vcb->extentsRefNum);
+
+               // XXXdbg - preflight that there's enough space
+               err = BTCheckFreeSpace(btFCB);
+               if (err)
+                       return err;
+
                MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
                bzero(btIterator, sizeof(*btIterator));
 
@@ -1757,6 +1778,7 @@ static OSErr UpdateExtentRecord (
 
                        if (err == noErr)
                                err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
+                       (void) BTFlushPath(btFCB);
                }
                else {          //      HFS Plus volume
                        HFSPlusExtentRecord     foundData;              // The extent data actually found
@@ -1776,6 +1798,7 @@ static OSErr UpdateExtentRecord (
                                BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord));
                                err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
                        }
+                       (void) BTFlushPath(btFCB);
                }
                FREE(btIterator, M_TEMP);       
        }
@@ -1887,3 +1910,58 @@ static Boolean ExtentsAreIntegral(
        
        return true;
 }
+
+
+//_________________________________________________________________________________
+//
+// Routine:            NodesAreContiguous
+//
+// Purpose:            Ensure that all b-tree nodes are contiguous on disk
+//                             Called by BTOpenPath during volume mount
+//_________________________________________________________________________________
+
+Boolean NodesAreContiguous(
+       ExtendedVCB     *vcb,
+       FCB                     *fcb,
+       UInt32          nodeSize)
+{
+       UInt32                          mask;
+       UInt32                          startBlock;
+       UInt32                          blocksChecked;
+       UInt32                          hint;
+       HFSPlusExtentKey        key;
+       HFSPlusExtentRecord     extents;
+       OSErr                           result;
+       Boolean                         lastExtentReached;
+       
+
+       if (vcb->blockSize >= nodeSize)
+               return TRUE;
+
+       mask = (nodeSize / vcb->blockSize) - 1;
+
+       // check the local extents
+       (void) GetFCBExtentRecord(fcb, extents);
+       if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) )
+               return FALSE;
+
+       if (lastExtentReached || (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= fcb->ff_size)
+               return TRUE;
+
+       startBlock = blocksChecked;
+
+       // check the overflow extents (if any)
+       while ( !lastExtentReached )
+       {
+               result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint);
+               if (result) break;
+
+               if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) )
+                       return FALSE;
+
+               startBlock += blocksChecked;
+       }
+
+       return TRUE;
+}
+
index ae4fccf6f0bc38097538ef81945c4e331d1303b6..4fe6499214d2987b85693abafd71ddf6e88b18c8 100644 (file)
@@ -476,7 +476,14 @@ static OSErr ReleaseBitmapBlock(
 
        if (bp) {
                if (dirty) {
-                       bdwrite(bp);
+                       // XXXdbg
+                       struct hfsmount *hfsmp = VCBTOHFS(vcb);
+                       
+                       if (hfsmp->jnl) {
+                               journal_modify_block_end(hfsmp->jnl, bp);
+                       } else {
+                               bdwrite(bp);
+                       }
                } else {
                        brelse(bp);
                }
@@ -597,6 +604,7 @@ static OSErr BlockAllocateAny(
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
        Boolean dirty = false;
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //      Since this routine doesn't wrap around
        if (maxBlocks > (endingBlock - startingBlock)) {
@@ -678,6 +686,11 @@ static OSErr BlockAllocateAny(
                endingBlock = block + maxBlocks;        //      if we get this far, we've found enough
        }
        
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      Allocate all of the consecutive blocks
        //
@@ -709,6 +722,11 @@ static OSErr BlockAllocateAny(
                                if (err != noErr) goto Exit;
                 buffer = currCache;
 
+                               // XXXdbg
+                               if (hfsmp->jnl) {
+                                       journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                               }
+                               
                                wordsLeft = wordsPerBlock;
                        }
                        
@@ -845,6 +863,8 @@ static OSErr BlockMarkAllocated(
        UInt32  blockRef;
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
+       // XXXdbg
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //
        //      Pre-read the bitmap block containing the first word of allocation
@@ -866,6 +886,11 @@ static OSErr BlockMarkAllocated(
                wordsLeft = wordsPerBlock - wordIndexInBlock;
        }
        
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      If the first block to allocate doesn't start on a word
        //      boundary in the bitmap, then treat that first word
@@ -909,6 +934,11 @@ static OSErr BlockMarkAllocated(
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
@@ -942,6 +972,11 @@ static OSErr BlockMarkAllocated(
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+                       
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
@@ -995,6 +1030,8 @@ static OSErr BlockMarkFree(
        UInt32  blockRef;
        UInt32  bitsPerBlock;
        UInt32  wordsPerBlock;
+    // XXXdbg
+       struct hfsmount *hfsmp = VCBTOHFS(vcb);
 
        //
        //      Pre-read the bitmap block containing the first word of allocation
@@ -1002,6 +1039,11 @@ static OSErr BlockMarkFree(
 
        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
        if (err != noErr) goto Exit;
+       // XXXdbg
+       if (hfsmp->jnl) {
+               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+       }
+
        //
        //      Initialize currentWord, and wordsLeft.
        //
@@ -1058,6 +1100,11 @@ static OSErr BlockMarkFree(
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
@@ -1092,6 +1139,11 @@ static OSErr BlockMarkFree(
                        err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
                        if (err != noErr) goto Exit;
 
+                       // XXXdbg
+                       if (hfsmp->jnl) {
+                               journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+                       }
+                       
                        //      Readjust currentWord and wordsLeft
                        currentWord = buffer;
                        wordsLeft = wordsPerBlock;
index a473cfceb56070aa51e29813053363e7e6328b7b..4ae9e7ad31c3ddaeabd81165957bc52095e84619 100644 (file)
@@ -115,7 +115,8 @@ struct BlockDescriptor{
        void            *blockHeader;
        ByteCount        blockSize;
        Boolean          blockReadFromDisk;
-       Byte             reserved[3];
+       Byte         isModified;             // XXXdbg - for journaling
+       Byte             reserved[2];
 };
 typedef struct BlockDescriptor BlockDescriptor;
 typedef BlockDescriptor *BlockDescPtr;
@@ -338,6 +339,10 @@ extern OSStatus    BTGetLastSync           (FCB                                            *filePtr,
 extern OSStatus        BTSetLastSync           (FCB                                            *filePtr,
                                                                         UInt32                                         lastfsync );
 
+extern OSStatus        BTCheckFreeSpace        (FCB                                            *filePtr);
+
+extern OSStatus        BTHasContiguousNodes(FCB                                                *filePtr);
+
 #endif /* __APPLE_API_PRIVATE */
 #endif /* KERNEL */
 #endif // __BTREESINTERNAL__
index 4721f13a5741c14e299db995003472676ab145f4..805c86346b69ffbf7c10eb804171c3dda4ed8ece 100644 (file)
@@ -382,6 +382,10 @@ OSStatus   ReleaseNode                             (BTreeControlBlockPtr    btreePtr,
 OSStatus       TrashNode                               (BTreeControlBlockPtr    btreePtr,
                                                                         NodePtr                                 nodePtr );
 
+// XXXdbg
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr);
+// XXXdbg
+
 OSStatus       UpdateNode                              (BTreeControlBlockPtr    btreePtr,
                                                                         NodePtr                                 nodePtr,
                                                                         UInt32                                  transactionID,
index ed614c238d60f594c3144aaa2b0c9837ad007d40..3febc75bf148f077d4170a500f510a4288151304 100644 (file)
@@ -1086,6 +1086,10 @@ kern_return_t map_fd_funneled(
        
        if (fp->f_type != DTYPE_VNODE)
                return(KERN_INVALID_ARGUMENT);
+
+       if (!(fp->f_flag & FREAD))
+               return (KERN_PROTECTION_FAILURE);
+
        vp = (struct vnode *)fp->f_data;
 
        if (vp->v_type != VREG)
index 6ccb0411212622f9184f789853a414dc4dcb3b99..d1505f175c7bdd5d3e79ea02f846164c79576d6d 100644 (file)
@@ -58,7 +58,7 @@
 
 
 #include <sys/types.h>
-#include <stdlib.h>
+//#include <stdlib.h>
 
 static inline char     *med3 __P((char *, char *, char *, int (*)()));
 static inline void      swapfunc __P((char *, char *, int, int));
@@ -113,6 +113,7 @@ med3(a, b, c, cmp)
               :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
 }
 
+__private_extern__
 void
 qsort(a, n, es, cmp)
        void *a;
index 955b6b638e5702d987244782c36e872475bcfb14..47cb041ab19db3152d4c426a69cd2d0c41532813 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
@@ -78,6 +78,62 @@ ubc_unlock(struct vnode *vp)
        simple_unlock(&vp->v_interlock);
 }
 
+/*
+ * Serialize the requests to the VM
+ * Returns:
+ *             0       -       Failure
+ *             1       -       Sucessful in acquiring the lock
+ *             2       -       Sucessful in acquiring the lock recursively
+ *                             do not call ubc_unbusy()
+ *                             [This is strange, but saves 4 bytes in struct ubc_info]
+ */
+static int
+ubc_busy(struct vnode *vp)
+{
+       register struct ubc_info        *uip;
+
+       if (!UBCINFOEXISTS(vp))
+               return (0);
+
+       uip = vp->v_ubcinfo;
+
+       while (ISSET(uip->ui_flags, UI_BUSY)) {
+
+               if (uip->ui_owner == (void *)current_thread())
+                       return (2);
+
+               SET(uip->ui_flags, UI_WANTED);
+               (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0);
+
+               if (!UBCINFOEXISTS(vp))
+                       return (0);
+       }
+       uip->ui_owner = (void *)current_thread();
+
+       SET(uip->ui_flags, UI_BUSY);
+
+       return (1);
+}
+
+static void
+ubc_unbusy(struct vnode *vp)
+{
+       register struct ubc_info        *uip;
+
+       if (!UBCINFOEXISTS(vp)) {
+               wakeup((caddr_t)&vp->v_ubcinfo);
+               return;
+       }
+       uip = vp->v_ubcinfo;
+       CLR(uip->ui_flags, UI_BUSY);
+       uip->ui_owner = (void *)NULL;
+
+       if (ISSET(uip->ui_flags, UI_WANTED)) {
+               CLR(uip->ui_flags, UI_WANTED);
+               wakeup((caddr_t)&vp->v_ubcinfo);
+       }
+}
+
 /*
  *     Initialization of the zone for Unified Buffer Cache.
  */
@@ -139,6 +195,7 @@ ubc_info_init(struct vnode *vp)
                uip->ui_refcount = 1;
                uip->ui_size = 0;
                uip->ui_mapped = 0;
+               uip->ui_owner = (void *)NULL;
                ubc_lock(vp);
        }
 #if DIAGNOSTIC
@@ -232,10 +289,20 @@ ubc_info_free(struct ubc_info *uip)
 void
 ubc_info_deallocate(struct ubc_info *uip)
 {
+
        assert(uip->ui_refcount > 0);
 
-    if (uip->ui_refcount-- == 1)
+    if (uip->ui_refcount-- == 1) {
+               struct vnode *vp;
+
+               vp = uip->ui_vnode;
+               if (ISSET(uip->ui_flags, UI_WANTED)) {
+                       CLR(uip->ui_flags, UI_WANTED);
+                       wakeup((caddr_t)&vp->v_ubcinfo);
+               }
+
                ubc_info_free(uip);
+       }
 }
 
 /*
@@ -339,12 +406,16 @@ ubc_uncache(struct vnode *vp)
 {
        kern_return_t kret;
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
        memory_object_perf_info_data_t   perf;
 
        if (!UBCINFOEXISTS(vp))
                return (0);
 
+       if ((recursed = ubc_busy(vp)) == 0)
+               return (0);
+
        uip = vp->v_ubcinfo;
 
        assert(uip != UBC_INFO_NULL);
@@ -372,11 +443,15 @@ ubc_uncache(struct vnode *vp)
        if (kret != KERN_SUCCESS) {
                printf("ubc_uncache: memory_object_change_attributes_named "
                        "kret = %d", kret);
+               if (recursed == 1)
+                       ubc_unbusy(vp);
                return (0);
        }
 
        ubc_release_named(vp);
 
+       if (recursed == 1)
+               ubc_unbusy(vp);
        return (1);
 }
 
@@ -506,15 +581,16 @@ memory_object_control_t
 ubc_getobject(struct vnode *vp, int flags)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
 
-       uip = vp->v_ubcinfo;
-
        if (UBCINVALID(vp))
                return (0);
 
-       ubc_lock(vp);
+       if ((recursed = ubc_busy(vp)) == 0)
+               return (0);
 
+       uip = vp->v_ubcinfo;
        control = uip->ui_control;
 
        if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) {
@@ -523,19 +599,21 @@ ubc_getobject(struct vnode *vp, int flags)
                 * Take a temporary reference on the ubc info so that it won't go
                 * away during our recovery attempt.
                 */
+               ubc_lock(vp);
                uip->ui_refcount++;
                ubc_unlock(vp);
                if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) {
-                       ubc_lock(vp);
                        SET(uip->ui_flags, UI_HASOBJREF);
-                       ubc_unlock(vp);
                } else {
                        control = MEMORY_OBJECT_CONTROL_NULL;
                }
+               if (recursed == 1)
+                       ubc_unbusy(vp);
                ubc_info_deallocate(uip);
 
        } else {
-               ubc_unlock(vp);
+               if (recursed == 1)
+                       ubc_unbusy(vp);
        }
 
        return (control);
@@ -770,15 +848,16 @@ int
 ubc_hold(struct vnode *vp)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t object;
 
        if (UBCINVALID(vp))
                return (0);
 
-       if (!UBCINFOEXISTS(vp)) {
+       if ((recursed = ubc_busy(vp)) == 0) {
                /* must be invalid or dying vnode */
                assert(UBCINVALID(vp) ||
-                          ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
+                       ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
                return (0);
        }
 
@@ -787,21 +866,23 @@ ubc_hold(struct vnode *vp)
 
        ubc_lock(vp);
        uip->ui_refcount++;
+       ubc_unlock(vp);
 
        if (!ISSET(uip->ui_flags, UI_HASOBJREF)) {
-               ubc_unlock(vp);
-               if (memory_object_recover_named(uip->ui_control, TRUE) != KERN_SUCCESS) {
+               if (memory_object_recover_named(uip->ui_control, TRUE)
+                       != KERN_SUCCESS) {
+                       if (recursed == 1)
+                               ubc_unbusy(vp);
                        ubc_info_deallocate(uip);
                        return (0);
                }
-               ubc_lock(vp);
                SET(uip->ui_flags, UI_HASOBJREF);
-               ubc_unlock(vp);
-       } else {
-               ubc_unlock(vp);
        }
+       if (recursed == 1)
+               ubc_unbusy(vp);
 
        assert(uip->ui_refcount > 0);
+
        return (1);
 }
 
@@ -872,28 +953,30 @@ int
 ubc_release_named(struct vnode *vp)
 {
        struct ubc_info *uip;
+       int    recursed;
        memory_object_control_t control;
-       kern_return_t kret;
+       kern_return_t kret = KERN_FAILURE;
 
        if (UBCINVALID(vp))
                return (0);
 
-       if (!UBCINFOEXISTS(vp))
+       if ((recursed = ubc_busy(vp)) == 0)
                return (0);
-
        uip = vp->v_ubcinfo;
 
        /* can not release held or mapped vnodes */
        if (ISSET(uip->ui_flags, UI_HASOBJREF) && 
-           (uip->ui_refcount == 1) && !uip->ui_mapped) {
+               (uip->ui_refcount == 1) && !uip->ui_mapped) {
                control = uip->ui_control;
                assert(control);
                CLR(uip->ui_flags, UI_HASOBJREF);
                kret = memory_object_release_name(control,
                                MEMORY_OBJECT_RESPECT_CACHE);
-               return ((kret != KERN_SUCCESS) ? 0 : 1);
-       } else 
-               return (0);
+       }
+
+       if (recursed == 1)
+               ubc_unbusy(vp);
+       return ((kret != KERN_SUCCESS) ? 0 : 1);
 }
 
 /*
index 52eea7f8115fb3ef3652f3643beb7d0379bf4522..f44cd9323ad5e24a84d2e99f7f50b03a031e52e3 100644 (file)
@@ -555,7 +555,8 @@ loop:
        s = splbio();
        for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
                nbp = bp->b_vnbufs.le_next;
-               if ((bp->b_flags & B_BUSY))
+               // XXXdbg - don't flush locked blocks.  they may be journaled.
+               if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED))
                        continue;
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("spec_fsync: not dirty");
index 7a7394c7869f4d6be5396dcf550d21c4dae5ecbe..4b77d663733ddb4c5d212242b39db23281d2b1db 100644 (file)
@@ -115,7 +115,8 @@ nfs_bioread(vp, uio, ioflag, cred, getpages)
        int getpages;
 {
        register struct nfsnode *np = VTONFS(vp);
-       register int biosize, diff, i;
+       register int biosize, i;
+       off_t diff;
        struct buf *bp = 0, *rabp;
        struct vattr vattr;
        struct proc *p;
@@ -268,7 +269,7 @@ again:
                bufsize = biosize;
                if ((off_t)(lbn + 1) * biosize > np->n_size && 
                    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
-                       bufsize = np->n_size - lbn * biosize;
+                       bufsize = np->n_size - (off_t)lbn * biosize;
                        bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
                }
                bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
@@ -876,7 +877,7 @@ nfs_getcacheblk(vp, bn, size, p, operation)
                bp = getblk(vp, bn, size, 0, 0, operation);
 
        if( vp->v_type == VREG)
-               bp->b_blkno = (bn * biosize) / DEV_BSIZE;
+               bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;
 
        return (bp);
 }
index ef42d4683da81424ee6f16c7ae86751d2073a5f2..8038b43a62d58d45d924f75fbed8e3b23f10c700 100644 (file)
@@ -2204,7 +2204,7 @@ nfsrv_getstream(slp, waitflag)
        register struct mbuf *m, **mpp;
        register char *cp1, *cp2;
        register int len;
-       struct mbuf *om, *m2, *recm = 0;
+       struct mbuf *om, *m2, *recm;
        u_long recmark;
 
        if (slp->ns_flag & SLP_GETSTREAM)
@@ -2249,7 +2249,11 @@ nfsrv_getstream(slp, waitflag)
 
            /*
             * Now get the record part.
+            *
+            * Note that slp->ns_reclen may be 0.  Linux sometimes
+            * generates 0-length RPCs
             */
+           recm = NULL;
            if (slp->ns_cc == slp->ns_reclen) {
                recm = slp->ns_raw;
                slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
index 2d516acf24c19c93300cc48da40954f89e86c7cf..e8c78eee83e2f04ce212067c680c982471170cd5 100644 (file)
@@ -4512,8 +4512,8 @@ again:
 #if 0
                /* (removed for UBC) */
                bufsize = biosize;
-               if ((lbn + 1) * biosize > np->n_size) {
-                       bufsize = np->n_size - lbn * biosize;
+               if ((off_t)(lbn + 1) * biosize > np->n_size) {
+                       bufsize = np->n_size - (off_t)lbn * biosize;
                        bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
                }
 #endif
@@ -4618,7 +4618,7 @@ nfs_blktooff(ap)
 
        biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */
 
-       *ap->a_offset = (off_t)(ap->a_lblkno *  biosize);
+       *ap->a_offset = (off_t)ap->a_lblkno *  biosize;
 
        return (0);
 }
index fb456c5625ae4ff120e8fb2841df78bed9e7194c..d051d11f0b1c26bd873a7b0d999eee4ab0f347a6 100644 (file)
@@ -132,6 +132,15 @@ struct buf {
 #define b_trans_head b_freelist.tqe_prev
 #define b_trans_next b_freelist.tqe_next
 #define b_real_bp    b_saveaddr
+#define b_iostate    b_rcred
+
+/* journaling uses this cluster i/o field for its own
+ * purposes because meta data buf's should never go
+ * through the clustering code.
+ */
+#define b_transaction b_vectorlist
+
+   
 
 /*
  * These flags are kept in b_flags.
@@ -163,7 +172,7 @@ struct buf {
 #define        B_WRITE         0x00000000      /* Write buffer (pseudo flag). */
 #define        B_WRITEINPROG   0x01000000      /* Write in progress. */
 #define        B_HDRALLOC      0x02000000      /* zone allocated buffer header */
-#define        B_UNUSED1       0x04000000      /* Unused bit */
+#define        B_NORELSE       0x04000000      /* don't brelse() in bwrite() */
 #define B_NEED_IODONE   0x08000000
                                                                /* need to do a biodone on the */
                                                                /* real_bp associated with a cluster_io */
index 74b269c5869d075db4d54c4ff7d0cbc5bbe1ee4c..65a4bffdd6837f9ecb10ded48027c78a782ee750 100644 (file)
@@ -44,8 +44,12 @@ typedef struct
 
 #define DKIOCGETMAXBLOCKCOUNTREAD    _IOR('d', 64, u_int64_t)
 #define DKIOCGETMAXBLOCKCOUNTWRITE   _IOR('d', 65, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTREAD         _IOR('d', 70, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTWRITE        _IOR('d', 71, u_int64_t)
 #define DKIOCGETMAXSEGMENTCOUNTREAD  _IOR('d', 66, u_int64_t)
 #define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTREAD  _IOR('d', 68, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t)
 
 #ifdef KERNEL
 #define DKIOCSETBLOCKSIZE            _IOW('d', 24, u_int32_t)
index fb05b87343c8fb5309fa08f3da11a449c2892d88..751de10e53700202e1ea9e42e36f869a4ff47fb9 100644 (file)
 #define M_IP6MISC      88      /* IPv6 misc. memory */
 #define M_TSEGQ                89      /* TCP segment queue entry */
 #define M_IGMP         90
+#define M_JOURNAL       91      /* VFS Journaling code */
 
-#define        M_LAST          91      /* Must be last type + 1 */
+#define        M_LAST          92      /* Must be last type + 1 */
 
 /* Strings corresponding to types of memory */
 /* Must be in synch with the #defines above */
        "UDF mount"     /* 85 M_UDFMNT */ \
        "IPv6 NDP",     /* 86 M_IP6NDP */ \
        "IPv6 options", /* 87 M_IP6OPT */ \
-       "IPv6 Misc"     /* 88 M_IP6MISC */\
-       "TCP Segment Q" /* 89 M_TSEGQ */\
-       "IGMP state"    /* 90 M_IGMP */\
+       "IPv6 Misc",    /* 88 M_IP6MISC */\
+       "TCP Segment Q",/* 89 M_TSEGQ */\
+       "IGMP state",   /* 90 M_IGMP */\
+       "Journaling"    /* 91 M_JOURNAL */\
 }
 
 struct kmemstats {
index a2840d9bc5074cb39d187d952b650b3c0a6c969e..2b8e1e05c0001936675b2491ba61b570fa0ee578 100644 (file)
@@ -159,6 +159,7 @@ struct mount {
 #define MNT_DONTBROWSE 0x00100000      /* file system is not appropriate path to user data */
 #define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */
 #define MNT_AUTOMOUNTED 0x00400000     /* filesystem was mounted by automounter */
+#define MNT_JOURNALED   0x00800000  /* filesystem is journaled */
 
 /*
  * NFS export related mount flags.
@@ -188,7 +189,7 @@ struct mount {
                        MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB    | \
                        MNT_LOCAL       |               MNT_QUOTA       | \
                        MNT_ROOTFS      | MNT_DOVOLFS   | MNT_DONTBROWSE | \
-                       MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_FIXEDSCRIPTENCODING )
+                       MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING )
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
index d243bb97fa6c4863612ca97efb31bb8f427d1a4c..e6a2a189d0bb99cd0789589446aaf88364600207 100644 (file)
@@ -60,6 +60,7 @@ struct ubc_info {
        int                                             ui_refcount;/* ref count on the ubc_info */
        off_t                                   ui_size;        /* file size for the vnode */
        long                                    ui_mapped;      /* is it currently mapped */
+       void                                    *ui_owner;      /* for recursive ubc_busy */
 };
 
 /* Defines for ui_flags */
@@ -69,6 +70,8 @@ struct ubc_info {
 #define UI_HASOBJREF   0x00000004              /* hold a reference on object */
 #define UI_WASMAPPED   0x00000008              /* vnode was mapped */
 #define        UI_DONTCACHE    0x00000010              /* do not cache object */
+#define        UI_BUSY                 0x00000020              /* for VM synchronization */
+#define        UI_WANTED               0x00000040              /* for VM synchronization */
 
 #endif /* __APPLE_API_PRIVATE */
 
index ce2bd8753962e8d408edf35d99cc4a9a807123b7..1ed043ac222a6d95c6a60dc6c4daa9b1bf1f16f7 100644 (file)
@@ -20,7 +20,7 @@ EXPINC_SUBDIRS_PPC = \
 EXPINC_SUBDIRS_I386 = \
 
 DATAFILES = \
-        vfs_support.h  
+       vfs_support.h vfs_journal.h
 
 INSTALL_MI_LIST        = ${DATAFILES}
 
index c11c03bea18cd5a10a1cb13507f0c1a31114cce3..57c206760d1eb5aa2c5cac31e2cea63915a52fdd 100644 (file)
@@ -180,6 +180,7 @@ simple_lock_data_t bufhashlist_slock;               /* lock on buffer hash list */
 /* number of per vnode, "in flight" buffer writes */
 #define        BUFWRITE_THROTTLE       9
 
+
 /*
  * Time in seconds before a buffer on a list is 
  * considered as a stale buffer 
@@ -211,9 +212,9 @@ binshash(struct buf *bp, struct bufhashhdr *dp)
 
        simple_lock(&bufhashlist_slock);
 
-#if 0  
-       if(incore(bp->b_vp, bp->b_lblkno))
-               panic("binshash: already incore");
+#if 0
+       if((bad = incore(bp->b_vp, bp->b_lblkno)))
+               panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
 #endif /* 0 */
 
        BHASHENTCHECK(bp);
@@ -459,6 +460,7 @@ bio_doread(vp, blkno, size, cred, async, queuetype)
                         */
                        bp->b_rcred = crdup(cred);
                }
+
                VOP_STRATEGY(bp);
 
                trace(TR_BREADMISS, pack(vp, size), blkno);
@@ -627,7 +629,12 @@ bwrite(bp)
                        p->p_stats->p_ru.ru_oublock++;          /* XXX */
 
                /* Release the buffer. */
-               brelse(bp);
+               // XXXdbg - only if the unused bit is set
+               if (!ISSET(bp->b_flags, B_NORELSE)) {
+                   brelse(bp);
+               } else {
+                   CLR(bp->b_flags, B_NORELSE);
+               }
 
                return (rv);
        } else {
@@ -707,7 +714,10 @@ bdwrite_internal(bp, return_error)
        if (nbdwrite < 0)
                panic("bdwrite: Negative nbdwrite");
 
-       if (nbdwrite > ((nbuf/4)*3)) {
+       // can't do a bawrite() if the LOCKED bit is set because the
+       // buffer is part of a transaction and can't go to disk until
+       // the LOCKED bit is cleared.
+       if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
                if (return_error)
                        return (EAGAIN);
                else
@@ -807,6 +817,27 @@ brelse(bp)
 
        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 
+       // if we're invalidating a buffer that has the B_CALL bit
+       // set then call the b_iodone function so it gets cleaned
+       // up properly.
+       //
+       if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
+               if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
+                       panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
+               }
+               if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
+                       void    (*iodone_func)(struct buf *) = bp->b_iodone;
+
+                       CLR(bp->b_flags, B_CALL);       /* but note callout done */
+                       bp->b_iodone = NULL;
+
+                       if (iodone_func == NULL) {
+                               panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
+                       }
+                       (*iodone_func)(bp);
+               }
+       }
+       
        /* IO is done. Cleanup the UPL state */
        if (!ISSET(bp->b_flags, B_META)
                && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
@@ -1121,6 +1152,10 @@ start:
                        brelse(bp);
                        goto start;
                }
+               /*
+                * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
+                *       CALLED!  BE CAREFUL.
+                */
 
                /*
                 * if it is meta, the queue may be set to other 
@@ -1451,7 +1486,7 @@ allocbuf(bp, size)
        }
 
        if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
-               panic("allocbuf: bp->b_data is NULL");
+               panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
 
        bp->b_bufsize = desired_size;
        bp->b_bcount = size;
@@ -1603,11 +1638,15 @@ start:
                panic("getnewbuf: null bp");
 
 found:
+       if (ISSET(bp->b_flags, B_LOCKED)) {
+           panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
+       }
+       
        if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) 
-               panic("getnewbuf: le_prev is deadbeef");
+               panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
 
        if(ISSET(bp->b_flags, B_BUSY))
-               panic("getnewbuf reusing BUSY buf");
+               panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
 
        /* Clean it */
        if (bcleanbuf(bp)) {
@@ -1822,8 +1861,16 @@ biodone(bp)
        }
 
        if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
+               void    (*iodone_func)(struct buf *) = bp->b_iodone;
+
                CLR(bp->b_flags, B_CALL);       /* but note callout done */
-               (*bp->b_iodone)(bp);
+               bp->b_iodone = NULL;
+
+               if (iodone_func == NULL) {
+                       panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);                   
+               } else { 
+                       (*iodone_func)(bp);
+               }
        } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
                brelse(bp);
        else {                                  /* or just wakeup the buffer */ 
@@ -1932,6 +1979,7 @@ alloc_io_buf(vp, priv)
        /* clear out various fields */
        bp->b_flags = B_BUSY;
        bp->b_blkno = bp->b_lblkno = 0;
+
        bp->b_iodone = 0;
        bp->b_error = 0;
        bp->b_resid = 0;
@@ -2344,3 +2392,76 @@ doit:
 
        (void) thread_funnel_set(kernel_flock, funnel_state);
 }
+
+
+static int
+bp_cmp(void *a, void *b)
+{
+    struct buf *bp_a = *(struct buf **)a,
+               *bp_b = *(struct buf **)b;
+    daddr_t res;
+
+    // don't have to worry about negative block
+    // numbers so this is ok to do.
+    //
+    res = (bp_a->b_blkno - bp_b->b_blkno);
+
+    return (int)res;
+}
+
+#define NFLUSH 32
+
+int
+bflushq(int whichq, struct mount *mp)
+{
+       struct buf *bp, *next;
+       int         i, buf_count, s;
+       int         counter=0, total_writes=0;
+       static struct buf *flush_table[NFLUSH];
+
+       if (whichq < 0 || whichq >= BQUEUES) {
+           return;
+       }
+
+
+  restart:
+       bp = TAILQ_FIRST(&bufqueues[whichq]);
+       for(buf_count=0; bp; bp=next) {
+           next = bp->b_freelist.tqe_next;
+                       
+           if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
+               continue;
+           }
+
+           if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
+               if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
+                   panic("bflushq: bp @ 0x%x is locked!\n", bp);
+               }
+               
+               bremfree(bp);
+               bp->b_flags |= B_BUSY;
+               flush_table[buf_count] = bp;
+               buf_count++;
+               total_writes++;
+
+               if (buf_count >= NFLUSH) {
+                   qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+
+                   for(i=0; i < buf_count; i++) {
+                       bawrite(flush_table[i]);
+                   }
+
+                   goto restart;
+               }
+           }
+       }
+
+       if (buf_count > 0) {
+           qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+           for(i=0; i < buf_count; i++) {
+               bawrite(flush_table[i]);
+           }
+       }
+
+       return total_writes;
+}
index df2e7375144c2142f50f58f754aeed59057af4ee..49b0938bbd113488bb7f48b6e0e791375ca6f344 100644 (file)
@@ -1,4 +1,3 @@
-
 /*
  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
  *
 #define CL_NOZERO    0x80
 #define CL_PAGEIN    0x100
 #define CL_DEV_MEMORY 0x200
+#define CL_PRESERVE   0x400
+
+struct clios {
+        u_int  io_completed;
+        u_int  io_issued;
+        off_t  io_offset;
+        int    io_error;
+        int    io_wanted;
+};
+
 
 static void cluster_zero(upl_t upl, vm_offset_t   upl_offset,
                int size, struct buf *bp);
@@ -93,8 +102,11 @@ static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
-               off_t filesize);
-static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF);
+               off_t filesize, int devblocksize, int flags);
+static int cluster_phys_write(struct vnode *vp, struct uio *uio,
+               off_t newEOF, int devblocksize, int flags);
+static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
+                vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
 
@@ -116,12 +128,14 @@ cluster_iodone(bp)
        int         total_resid;
        int         upl_offset;
        int         zero_offset;
+       int         l_blkno;
        upl_t       upl;
        struct buf *cbp;
        struct buf *cbp_head;
        struct buf *cbp_next;
        struct buf *real_bp;
        struct vnode *vp;
+       struct clios *iostate;
        int         commit_size;
        int         pg_offset;
 
@@ -155,6 +169,8 @@ cluster_iodone(bp)
        real_bp    = cbp->b_real_bp;
        vp         = cbp->b_vp;
        zero_offset= cbp->b_validend;
+       l_blkno    = cbp->b_lblkno;
+       iostate    = (struct clios *)cbp->b_iostate;
 
        while (cbp) {
                if (cbp->b_vectorcount > 1)
@@ -172,13 +188,34 @@ cluster_iodone(bp)
 
                cbp = cbp_next;
        }
+       if (zero_offset)
+               cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+
        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
                vp->v_flag &= ~VTHROTTLED;
                wakeup((caddr_t)&vp->v_numoutput);
        }
-       if (zero_offset)
-               cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+       if (iostate) {
+               if (error) {
+                       off_t   error_offset;
+
+                       error_offset = (off_t)l_blkno * PAGE_SIZE_64;
 
+                       if (iostate->io_error == 0) {
+                               iostate->io_error = error;
+                               iostate->io_offset = error_offset;
+                       } else {
+                               if (error_offset < iostate->io_offset)
+                                       iostate->io_offset = error_offset;
+                       }
+               }
+               iostate->io_completed += total_size;
+
+               if (iostate->io_wanted) {
+                       iostate->io_wanted = 0;
+                       wakeup((caddr_t)&iostate->io_wanted);
+               }
+       }
        if ((b_flags & B_NEED_IODONE) && real_bp) {
                if (error) {
                        real_bp->b_flags |= B_ERROR;
@@ -192,13 +229,15 @@ cluster_iodone(bp)
                error = EIO;
 
        if (b_flags & B_COMMIT_UPL) {
-               pg_offset   = upl_offset & PAGE_MASK;
+               pg_offset   = upl_offset & PAGE_MASK;
                commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 
-               if (error || (b_flags & B_NOCACHE)) {
+               if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
                        int upl_abort_code;
 
-                       if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
+                       if (b_flags & B_PHYS)
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+                       else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (b_flags & B_PGIN)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
@@ -215,7 +254,9 @@ cluster_iodone(bp)
                } else {
                        int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 
-                       if ( !(b_flags & B_PAGEOUT))
+                       if (b_flags & B_PHYS)
+                               upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
+                       else if ( !(b_flags & B_PAGEOUT))
                                upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
                        if (b_flags & B_AGE)
                                upl_commit_flags |= UPL_COMMIT_INACTIVATE;
@@ -271,7 +312,7 @@ cluster_zero(upl, upl_offset, size, bp)
 }
 
 static int
-cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
+cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
        struct vnode *vp;
        upl_t         upl;
        vm_offset_t   upl_offset;
@@ -280,10 +321,12 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
        int           devblocksize;
        int           flags;
        struct buf   *real_bp;
+       struct clios *iostate;
 {
        struct buf   *cbp;
        struct iovec *iovp;
-       u_int           size;
+       u_int         size;
+       u_int         io_size;
        int           io_flags;
        int           error = 0;
        int           retval = 0;
@@ -297,6 +340,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
        u_int max_vectors;
        int priv;
        int zero_offset = 0;
+       u_int  first_lblkno;
 
        if (flags & CL_READ) {
                io_flags = (B_VECTORLIST | B_READ);
@@ -309,14 +353,18 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
        }
        pl = ubc_upl_pageinfo(upl);
 
-       if (flags & CL_ASYNC)
-               io_flags |= (B_CALL | B_ASYNC);
        if (flags & CL_AGE)
                io_flags |= B_AGE;
        if (flags & CL_DUMP)
                io_flags |= B_NOCACHE;
        if (flags & CL_PAGEIN)
                io_flags |= B_PGIN;
+       if (flags & CL_PAGEOUT)
+               io_flags |= B_PAGEOUT;
+       if (flags & CL_COMMIT)
+               io_flags |= B_COMMIT_UPL;
+       if (flags & CL_PRESERVE)
+               io_flags |= B_PHYS;
 
        if (devblocksize)
                size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
@@ -338,7 +386,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
                zero_offset = upl_offset + non_rounded_size;
        }
        while (size) {
-               size_t io_size;
                int vsize;
                int i;
                int pl_index;
@@ -352,7 +399,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
                else
                        io_size = size;
 
-               if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
+               if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
                        if (error == EOPNOTSUPP)
                                panic("VOP_CMAP Unimplemented");
                        break;
@@ -587,8 +634,10 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
                if (error)
                        break;
 
-               if (flags & CL_ASYNC)
-                       cbp->b_iodone = (void *)cluster_iodone;
+               if (flags & CL_ASYNC) {
+                       cbp->b_flags |= (B_CALL | B_ASYNC);
+                       cbp->b_iodone = (void *)cluster_iodone;
+               }
                cbp->b_flags |= io_flags;
 
                cbp->b_lblkno = lblkno;
@@ -598,6 +647,9 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
                cbp->b_uploffset = upl_offset;
                cbp->b_trans_next = (struct buf *)0;
 
+               if (cbp->b_iostate = (void *)iostate)
+                       iostate->io_issued += io_size;
+
                if (flags & CL_READ)
                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
@@ -631,13 +683,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags,
                         * then go ahead and issue the I/O
                         */
 start_io:              
-                       if (flags & CL_COMMIT)
-                               cbp_head->b_flags |= B_COMMIT_UPL;
-                       if (flags & CL_PAGEOUT)
-                               cbp_head->b_flags |= B_PAGEOUT;
-                       if (flags & CL_PAGEIN)
-                               cbp_head->b_flags |= B_PGIN;
-
                        if (real_bp) {
                                cbp_head->b_flags |= B_NEED_IODONE;
                                cbp_head->b_real_bp = real_bp;
@@ -687,6 +732,8 @@ start_io:
        if (error) {
                int abort_size;
 
+               io_size = 0;
+               
                for (cbp = cbp_head; cbp;) {
                        struct buf * cbp_next;
  
@@ -694,21 +741,36 @@ start_io:
                                _FREE(cbp->b_vectorlist, M_SEGMENT);
                        upl_offset -= cbp->b_bcount;
                        size       += cbp->b_bcount;
+                       io_size    += cbp->b_bcount;
 
                        cbp_next = cbp->b_trans_next;
                        free_io_buf(cbp);
                        cbp = cbp_next;
                }
+               if (iostate) {
+                       if (iostate->io_error == 0) {
+                               iostate->io_error = error;
+                               iostate->io_offset = f_offset - (off_t)io_size;
+                       }
+                       iostate->io_issued -= io_size;
+
+                       if (iostate->io_wanted) {
+                               iostate->io_wanted = 0;
+                               wakeup((caddr_t)&iostate->io_wanted);
+                       }
+               }
                pg_offset  = upl_offset & PAGE_MASK;
                abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 
                if (flags & CL_COMMIT) {
                        int upl_abort_code;
 
-                       if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
+                       if (flags & CL_PRESERVE)
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+                       else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (flags & CL_PAGEIN)
-                           upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
+                               upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                        else
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 
@@ -910,7 +972,7 @@ cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, fla
        }
 
        return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
-                          local_flags, (struct buf *)0));
+                          local_flags, (struct buf *)0, (struct clios *)0));
 }
 
 int
@@ -968,7 +1030,7 @@ cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flag
                                    size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
        
        retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
-                          local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
+                          local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
 
        if (retval == 0) {
                int b_lblkno;
@@ -1010,7 +1072,7 @@ cluster_bp(bp)
 
        f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
 
-        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
+        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
 }
 
 int
@@ -1037,7 +1099,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
        int           retval = 0;
 
 
-       if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
+       if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
          {
            retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
            return(retval);
@@ -1074,14 +1136,6 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
 
             if (upl_flags & UPL_PHYS_CONTIG)
              {
-               /*
-                * since the interface to the IOKit below us uses physical block #'s and
-                * block counts to specify the I/O, we can't handle anything that isn't
-                * devblocksize aligned 
-                */
-               if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
-                   return(EINVAL);
-
                if (flags & IO_HEADZEROFILL)
                  {
                    flags &= ~IO_HEADZEROFILL;
@@ -1090,7 +1144,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
                        return(retval);
                  }
 
-               retval = cluster_phys_write(vp, uio, newEOF);
+               retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
 
                if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
                  {
@@ -1172,6 +1226,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
        return(retval);
 }
 
+
 static int
 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
        struct vnode *vp;
@@ -1326,7 +1381,7 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
                       (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
 
          error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                            io_size, devblocksize, 0, (struct buf *)0);
+                            io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0);
 
          if (error == 0) {
            /*
@@ -1361,14 +1416,20 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
        return (error);
 }
 
+
 static int
-cluster_phys_write(vp, uio, newEOF)
+cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
        struct vnode *vp;
        struct uio   *uio;
        off_t        newEOF;
+       int          devblocksize;
+       int          flags;
 {
+       upl_page_info_t *pl;
+       vm_offset_t      src_paddr;
        upl_t            upl;
        vm_offset_t      upl_offset;
+       int              tail_size;
        int              io_size;
        int              upl_size;
        int              upl_needed_size;
@@ -1399,49 +1460,78 @@ cluster_phys_write(vp, uio, newEOF)
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
 
-       if (kret != KERN_SUCCESS)
-         {
-           /* cluster_phys_write: failed to get pagelist */
-             /* note: return kret here */
+       if (kret != KERN_SUCCESS) {
+               /*
+                * cluster_phys_write: failed to get pagelist
+                * note: return kret here
+                */
              return(EINVAL);
-         }
-
+       }
        /*
         * Consider the possibility that upl_size wasn't satisfied.
         * This is a failure in the physical memory case.
         */
-       if (upl_size < upl_needed_size)
-         {
-           kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
-           return(EINVAL);
-         }
+       if (upl_size < upl_needed_size) {
+               kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+               return(EINVAL);
+       }
+       pl = ubc_upl_pageinfo(upl);
 
-       /*
-        * issue a synchronous write to cluster_io
-        */
+       src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
 
-       error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                          io_size, 0, CL_DEV_MEMORY, (struct buf *)0);
+       while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+               int   head_size;
 
-       if (error == 0) {
-         /*
-          * The cluster_io write completed successfully,
-          * update the uio structure and commit.
-          */
+               head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
 
-         ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-           
-         iov->iov_base += io_size;
-         iov->iov_len -= io_size;
-         uio->uio_resid -= io_size;
-         uio->uio_offset += io_size;
+               if (head_size > io_size)
+                       head_size = io_size;
+
+               error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
+
+               if (error) {
+                       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+                       return(EINVAL);
+               }
+               upl_offset += head_size;
+               src_paddr  += head_size;
+               io_size    -= head_size;
        }
-       else
-         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+       tail_size = io_size & (devblocksize - 1);
+       io_size  -= tail_size;
+
+       if (io_size) {
+               /*
+                * issue a synchronous write to cluster_io
+                */
+               error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
+                                  io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
+       }
+       if (error == 0) {
+               /*
+                * The cluster_io write completed successfully,
+                * update the uio structure
+                */
+               uio->uio_resid  -= io_size;
+               iov->iov_len    -= io_size;
+               iov->iov_base   += io_size;
+               uio->uio_offset += io_size;
+               src_paddr       += io_size;
+
+               if (tail_size)
+                       error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
+       }
+       /*
+        * just release our hold on the physically contiguous
+        * region without changing any state
+        */
+       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
 
        return (error);
 }
 
+
 static int
 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
        struct vnode *vp;
@@ -1593,7 +1683,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
                                read_size = newEOF - upl_f_offset;
 
                        retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
-                                           CL_READ, (struct buf *)0);
+                                           CL_READ, (struct buf *)0, (struct clios *)0);
                        if (retval) {
                                /*
                                 * we had an error during the read which causes us to abort
@@ -1627,7 +1717,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
                                        read_size = newEOF - (upl_f_offset + upl_offset);
 
                                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
-                                                   CL_READ, (struct buf *)0);
+                                                   CL_READ, (struct buf *)0, (struct clios *)0);
                                if (retval) {
                                        /*
                                         * we had an error during the read which causes us to abort
@@ -1934,7 +2024,7 @@ delay_io:
                        if (last_blkno > vp->v_lastw)
                                vp->v_lastw = last_blkno;
 
-                       ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
+                       ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                        continue;
 issue_io:
                        /*
@@ -1963,7 +2053,7 @@ issue_io:
                                tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
                        }       
                        retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
-                                           io_flags, (struct buf *)0);
+                                           io_flags, (struct buf *)0, (struct clios *)0);
                }
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
@@ -2039,7 +2129,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags)
 
            if (upl_flags & UPL_PHYS_CONTIG)
              {
-               retval = cluster_phys_read(vp, uio, filesize);
+               retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
              }
            else if (uio->uio_resid < 4 * PAGE_SIZE)
              {
@@ -2119,6 +2209,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags)
        return(retval);
 }
 
+
 static int
 cluster_read_x(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
@@ -2288,7 +2379,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags)
                         */
 
                        error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
-                                          io_size, devblocksize, CL_READ, (struct buf *)0);
+                                          io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
                }
                if (error == 0) {
                        /*
@@ -2481,6 +2572,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags)
        return (retval);
 }
 
+
 static int
 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
@@ -2687,7 +2779,7 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
                       (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
 
          error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
-                            io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
+                            io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0,  (struct clios *)0);
 
          if (error == 0) {
            /*
@@ -2724,22 +2816,29 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
 }
 
 
+
 static int
-cluster_phys_read(vp, uio, filesize)
+cluster_phys_read(vp, uio, filesize, devblocksize, flags)
        struct vnode *vp;
        struct uio   *uio;
        off_t        filesize;
+       int          devblocksize;
+       int          flags;
 {
+       upl_page_info_t *pl;
        upl_t            upl;
        vm_offset_t      upl_offset;
+       vm_offset_t      dst_paddr;
        off_t            max_size;
        int              io_size;
+       int              tail_size;
        int              upl_size;
        int              upl_needed_size;
        int              pages_in_pl;
        int              upl_flags;
        kern_return_t    kret;
        struct iovec     *iov;
+       struct clios     iostate;
        int              error;
 
        /*
@@ -2752,14 +2851,15 @@ cluster_phys_read(vp, uio, filesize)
 
        max_size = filesize - uio->uio_offset;
 
-       if (max_size < (off_t)((unsigned int)iov->iov_len))
-           io_size = max_size;
+       if (max_size > (off_t)((unsigned int)iov->iov_len))
+               io_size = iov->iov_len;
        else
-           io_size = iov->iov_len;
+               io_size = max_size;
 
        upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
        upl_needed_size = upl_offset + io_size;
 
+       error       = 0;
        pages_in_pl = 0;
        upl_size = upl_needed_size;
        upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
@@ -2768,48 +2868,112 @@ cluster_phys_read(vp, uio, filesize)
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
 
-       if (kret != KERN_SUCCESS)
-         {
-           /* cluster_phys_read: failed to get pagelist */
-           return(EINVAL);
-         }
+       if (kret != KERN_SUCCESS) {
+               /*
+                * cluster_phys_read: failed to get pagelist
+                */
+               return(EINVAL);
+       }
+       if (upl_size < upl_needed_size) {
+               /*
+                * The upl_size wasn't satisfied.
+                */
+               ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+               return(EINVAL);
+       }
+       pl = ubc_upl_pageinfo(upl);
+
+       dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
 
+       while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+               int   head_size;
+
+               head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
+
+               if (head_size > io_size)
+                       head_size = io_size;
+
+               error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
+
+               if (error) {
+                       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+                       return(EINVAL);
+               }
+               upl_offset += head_size;
+               dst_paddr  += head_size;
+               io_size    -= head_size;
+       }
+       tail_size = io_size & (devblocksize - 1);
+       io_size  -= tail_size;
+
+       iostate.io_completed = 0;
+       iostate.io_issued = 0;
+       iostate.io_error = 0;
+       iostate.io_wanted = 0;
+
+       while (io_size && error == 0) {
+               int  xsize;
+
+               if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
+                       xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
+               else
+                       xsize = io_size;
+               /*
+                * request asynchronously so that we can overlap
+                * the preparation of the next I/O... we'll do
+                * the commit after all the I/O has completed
+                * since its all issued against the same UPL
+                * if there are already too many outstanding reads
+                * throttle back until we reach a more reasonable level
+                */
+               while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
+                       iostate.io_wanted = 1;
+                       tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+               }       
+
+               error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0, 
+                                  CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
+                                  (struct buf *)0, &iostate);
+               /*
+                * The cluster_io read was issued successfully,
+                * update the uio structure
+                */
+               if (error == 0) {
+                       uio->uio_resid  -= xsize;
+                       iov->iov_len    -= xsize;
+                       iov->iov_base   += xsize;
+                       uio->uio_offset += xsize;
+                       dst_paddr       += xsize;
+                       upl_offset      += xsize;
+                       io_size         -= xsize;
+               }
+       }
        /*
-        * Consider the possibility that upl_size wasn't satisfied.
+        * make sure any async reads have completed before
+        * we proceed
         */
-       if (upl_size < upl_needed_size)
-         {
-           ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
-           return(EINVAL);
-         }
+       while (iostate.io_issued != iostate.io_completed) {
+               iostate.io_wanted = 1;
+               tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+       }       
+       if (iostate.io_error) {
+               error = iostate.io_error;
+       }
+       if (error == 0 && tail_size)
+               error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
 
        /*
-        * issue a synchronous read to cluster_io
+        * just release our hold on the physically contiguous
+        * region without changing any state
         */
-
-       error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
-                          io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
-
-       if (error == 0)
-         {
-           /*
-            * The cluster_io read completed successfully,
-            * update the uio structure and commit.
-            */
-
-           ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-           
-           iov->iov_base += io_size;
-           iov->iov_len -= io_size;
-           uio->uio_resid -= io_size;
-           uio->uio_offset += io_size;
-         }
-       else
-           ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+       ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
        
        return (error);
 }
 
+
 /*
  * generate advisory I/O's in the largest chunks possible
  * the completed pages will be released into the VM cache
@@ -2932,7 +3096,7 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize)
                                 * issue an asynchronous read to cluster_io
                                 */
                                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
-                                                   CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
+                                                   CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
 
                                issued_io = 1;
                        }
@@ -3228,7 +3392,7 @@ cluster_push_x(vp, EOF, first, last, can_delay)
                        vp->v_flag |= VTHROTTLED;
                        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
                }
-               cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
+               cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
 
                size -= io_size;
        }
@@ -3236,3 +3400,64 @@ cluster_push_x(vp, EOF, first, last, can_delay)
 
        return(1);
 }
+
+
+
+static int
+cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
+{
+        struct iovec     *iov;
+        upl_page_info_t  *pl;
+        upl_t            upl;
+        vm_offset_t      ubc_paddr;
+        kern_return_t    kret;
+        int              error = 0;
+
+        iov = uio->uio_iov;
+
+        kret = ubc_create_upl(vp,
+                              uio->uio_offset & ~PAGE_MASK_64,
+                              PAGE_SIZE,
+                              &upl,
+                              &pl,
+                              UPL_FLAGS_NONE);
+
+        if (kret != KERN_SUCCESS)
+                return(EINVAL);
+
+        if (!upl_valid_page(pl, 0)) {
+                /*
+                 * issue a synchronous read to cluster_io
+                 */
+                error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+                                  CL_READ, (struct buf *)0, (struct clios *)0);
+                if (error) {
+                          ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+                          return(error);
+                }
+        }
+        ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
+
+       if (flags & CL_READ)
+               copyp2p(ubc_paddr, usr_paddr, xsize, 2);
+       else
+               copyp2p(usr_paddr, ubc_paddr, xsize, 1);
+
+       if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
+                /*
+                 * issue a synchronous write to cluster_io
+                 */
+                error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+                                  0, (struct buf *)0, (struct clios *)0);
+       }
+       if (error == 0) {
+               uio->uio_offset += xsize;
+               iov->iov_base   += xsize;
+               iov->iov_len    -= xsize;
+               uio->uio_resid  -= xsize;
+       }
+       ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+        return (error);
+}
diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c
new file mode 100644 (file)
index 0000000..2acb4fa
--- /dev/null
@@ -0,0 +1,2067 @@
+/*
+ * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+//
+// This file implements a simple write-ahead journaling layer.  
+// In theory any file system can make use of it by calling these 
+// functions when the fs wants to modify meta-data blocks.  See
+// vfs_journal.h for a more detailed description of the api and
+// data structures.
+//
+// Dominic Giampaolo (dbg@apple.com)
+//
+
+#ifdef KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/ioctl.h>
+#include <sys/tty.h>
+#include <sys/ubc.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <kern/thread_act.h>
+#include <sys/disk.h>
+#include <miscfs/specfs/specdev.h>
+
+extern task_t kernel_task;
+
+#else
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include "compat.h"
+
+#endif   /* KERNEL */
+
+#include "vfs_journal.h"
+
+
+// number of bytes to checksum in a block_list_header
+// NOTE: this should be enough to clear out the header
+//       fields as well as the first entry of binfo[]
+#define BLHDR_CHECKSUM_SIZE 32
+
+
+
+static int  end_transaction(transaction *tr, int force_it);
+static void abort_transaction(journal *jnl, transaction *tr);
+static void dump_journal(journal *jnl);
+
+
+#define CHECK_JOURNAL(jnl) \
+    do { \
+    if (jnl == NULL) {\
+       panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
+    }\
+    if (jnl->jdev == NULL) { \
+       panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
+    } \
+    if (jnl->fsdev == NULL) { \
+       panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
+    } \
+    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
+       panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
+       __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
+    }\
+    if (   jnl->jhdr->start <= 0 \
+       || jnl->jhdr->start > jnl->jhdr->size\
+       || jnl->jhdr->start > 128*1024*1024) {\
+       panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
+       __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
+    }\
+    if (   jnl->jhdr->end <= 0 \
+       || jnl->jhdr->end > jnl->jhdr->size\
+       || jnl->jhdr->end > 128*1024*1024) {\
+       panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
+       __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
+    }\
+    if (jnl->jhdr->size > 128*1024*1024) {\
+       panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
+       __FILE__, __LINE__, jnl->jhdr->size);\
+    } \
+    } while(0)
+
+#define CHECK_TRANSACTION(tr) \
+    do {\
+    if (tr == NULL) {\
+       panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
+    }\
+    if (tr->jnl == NULL) {\
+       panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
+    }\
+    if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
+       panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
+    }\
+    if (tr->total_bytes < 0) {\
+       panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
+    }\
+    if (tr->journal_start < 0 || tr->journal_start > 128*1024*1024) {\
+       panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
+    }\
+    if (tr->journal_end < 0 || tr->journal_end > 128*1024*1024) {\
+       panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
+    }\
+    if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > 2048)) {\
+       panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
+    }\
+    } while(0)
+
+
+
+//
+// this isn't a great checksum routine but it will do for now.
+// we use it to checksum the journal header and the block list
+// headers that are at the start of each transaction.
+//
+static int
+calc_checksum(char *ptr, int len)
+{
+    int i, cksum=0;
+
+    // this is a lame checksum but for now it'll do
+    for(i=0; i < len; i++, ptr++) {
+               cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
+    }
+
+    return (~cksum);
+}
+
+
+#define JNL_WRITE 1
+#define JNL_READ  2
+
+//
+// This function sets up a fake buf and passes it directly to the
+// journal device strategy routine (so that it won't get cached in
+// the block cache.
+//
+// It also handles range checking the i/o so that we don't write
+// outside the journal boundaries and it will wrap the i/o back
+// to the beginning if necessary (skipping over the journal header)
+// 
+static size_t
+do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
+{
+    int         err, io_sz=0, curlen=len;
+    struct buf *bp;
+       int max_iosize=0, max_vectors;
+
+    if (*offset < 0 || *offset > jnl->jhdr->size) {
+               panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
+    }
+
+  again:
+    bp = alloc_io_buf(jnl->jdev, 1);
+
+    if (direction == JNL_WRITE) {
+               bp->b_flags  |= 0;   // don't have to set any flags (was: B_WRITEINPROG)
+               jnl->jdev->v_numoutput++;
+               vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors);
+    } else if (direction == JNL_READ) {
+               bp->b_flags  |= B_READ;
+               vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors);
+    }
+
+       if (max_iosize == 0) {
+               max_iosize = 128 * 1024;
+       }
+
+    if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
+               if (*offset == jnl->jhdr->size) {
+                       *offset = jnl->jhdr->jhdr_size;
+               } else {
+                       curlen = (off_t)jnl->jhdr->size - *offset;
+               }
+    }
+
+       if (curlen > max_iosize) {
+               curlen = max_iosize;
+       }
+
+    if (curlen <= 0) {
+               panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
+    }
+
+    bp->b_bufsize = curlen;
+    bp->b_bcount  = curlen;
+    bp->b_data    = data;
+    bp->b_blkno   = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+    bp->b_lblkno  = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+
+    err = VOP_STRATEGY(bp);
+    if (!err) {
+               err = biowait(bp);
+    }
+    
+    bp->b_data    = NULL;
+    bp->b_bufsize = bp->b_bcount = 0;
+    bp->b_blkno   = bp->b_lblkno = -1;
+
+    free_io_buf(bp);
+
+    if (err) {
+               printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
+               return 0;
+    }
+
+    *offset += curlen;
+    io_sz   += curlen;
+    if (io_sz != len) {
+               // handle wrap-around
+               data    = (char *)data + curlen;
+               curlen  = len - io_sz;
+               if (*offset >= jnl->jhdr->size) {
+                       *offset = jnl->jhdr->jhdr_size;
+               }
+               goto again;
+    }
+
+    return io_sz;
+}
+
+static size_t
+read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    return do_journal_io(jnl, offset, data, len, JNL_READ);
+}
+
+static size_t
+write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+    return do_journal_io(jnl, offset, data, len, JNL_WRITE);
+}
+
+
+static int
+write_journal_header(journal *jnl)
+{
+    int ret;
+    off_t jhdr_offset = 0;
+    
+    // 
+    // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
+    //
+    ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
+    if (ret != 0) {
+               printf("jnl: flushing fs disk buffer returned 0x%x\n", ret);
+    }
+
+
+    jnl->jhdr->checksum = 0;
+    jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
+    if (write_journal_data(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size) != jnl->jhdr->jhdr_size) {
+               printf("jnl: write_journal_header: error writing the journal header!\n");
+               jnl->flags |= JOURNAL_INVALID;
+               return -1;
+    }  
+
+    return 0;
+}
+
+
+
+//
+// this is a work function used to free up transactions that
+// completed. they can't be free'd from buffer_flushed_callback
+// because it is called from deep with the disk driver stack
+// and thus can't do something that would potentially cause
+// paging.  it gets called by each of the journal api entry
+// points so stuff shouldn't hang around for too long.
+//
+static void
+free_old_stuff(journal *jnl)
+{
+    transaction *tr, *next;
+
+    for(tr=jnl->tr_freeme; tr; tr=next) {
+               next = tr->next;
+               kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+    }
+
+    jnl->tr_freeme = NULL;
+}
+
+
+
+//
+// This is our callback that lets us know when a buffer has been
+// flushed to disk.  It's called from deep within the driver stack
+// and thus is quite limited in what it can do.  Notably, it can
+// not initiate any new i/o's or allocate/free memory.
+//
+static void
+buffer_flushed_callback(struct buf *bp)
+{
+    transaction  *tr;
+    journal      *jnl;
+    transaction  *ctr, *prev=NULL, *next;
+    int           i, bufsize;
+
+
+    //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
+    //    bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
+
+    // snarf out the bits we want
+    bufsize = bp->b_bufsize;
+    tr      = bp->b_transaction;
+
+    bp->b_iodone      = NULL;   // don't call us for this guy again
+    bp->b_transaction = NULL;
+
+    //
+    // This is what biodone() would do if it didn't call us.
+    // NOTE: THIS CODE *HAS* TO BE HERE!
+    //
+    if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
+               brelse(bp);
+    } else {                                   /* or just wakeup the buffer */ 
+               CLR(bp->b_flags, B_WANTED);
+               wakeup(bp);
+    }
+
+    // NOTE: from here on out we do *NOT* touch bp anymore.
+
+
+    // then we've already seen it
+    if (tr == NULL) {
+               return;
+    }
+
+    CHECK_TRANSACTION(tr);
+
+    jnl = tr->jnl;
+    if (jnl->flags & JOURNAL_INVALID) {
+               return;
+    }
+
+    CHECK_JOURNAL(jnl);
+
+    // update the number of blocks that have been flushed.
+    // this buf may represent more than one block so take
+    // that into account.
+    tr->num_flushed += bufsize;
+
+
+    // if this transaction isn't done yet, just return as
+    // there is nothing to do.
+    if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) {
+               return;
+    }
+
+    //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
+    //   tr, tr->journal_start, tr->journal_end, jnl);
+
+       // find this entry in the old_start[] index and mark it completed
+       simple_lock(&jnl->old_start_lock);
+       for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+
+               if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) {
+                       jnl->old_start[i] &= ~(0x8000000000000000LL);
+                       break;
+               }
+       }
+       if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+               panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
+                         tr->journal_start, tr, jnl);
+       }
+       simple_unlock(&jnl->old_start_lock);
+
+
+    // if we are here then we need to update the journal header
+    // to reflect that this transaction is complete
+    if (tr->journal_start == jnl->active_start) {
+               jnl->active_start = tr->journal_end;
+               tr->journal_start = tr->journal_end = (off_t)0;
+    }
+
+    // go through the completed_trs list and try to coalesce
+    // entries, restarting back at the beginning if we have to.
+    for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
+               if (ctr->journal_start == jnl->active_start) {
+                       jnl->active_start = ctr->journal_end;
+                       if (prev) {
+                               prev->next = ctr->next;
+                       }
+                       if (ctr == jnl->completed_trs) {
+                               jnl->completed_trs = ctr->next;
+                       }
+           
+                       next           = jnl->completed_trs;   // this starts us over again
+                       ctr->next      = jnl->tr_freeme;
+                       jnl->tr_freeme = ctr;
+                       ctr            = NULL;
+               } else if (tr->journal_end == ctr->journal_start) {
+                       ctr->journal_start = tr->journal_start;
+                       next               = jnl->completed_trs;  // this starts us over again
+                       ctr                = NULL;
+                       tr->journal_start  = tr->journal_end = (off_t)0;
+               } else if (tr->journal_start == ctr->journal_end) {
+                       ctr->journal_end  = tr->journal_end;
+                       next              = ctr->next;
+                       tr->journal_start = tr->journal_end = (off_t)0;
+               } else {
+                       next = ctr->next;
+               }
+    }
+    
+    // at this point no one should be using this guy anymore
+    tr->total_bytes = 0xfbadc0de;
+
+    // if this is true then we didn't merge with anyone
+    // so link ourselves in at the head of the completed
+    // transaction list.
+    if (tr->journal_start != 0) {
+               // put this entry into the correct sorted place
+               // in the list instead of just at the head.
+               //
+       
+               prev = NULL;
+               for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
+                       // just keep looping
+               }
+
+               if (ctr == NULL && prev == NULL) {
+                       jnl->completed_trs = tr;
+                       tr->next = NULL;
+               } else if (ctr == jnl->completed_trs) {
+                       tr->next = jnl->completed_trs;
+                       jnl->completed_trs = tr;
+               } else {
+                       tr->next = prev->next;
+                       prev->next = tr;
+               }
+    } else {
+               // if we're here this tr got merged with someone else so
+               // put it on the list to be free'd
+               tr->next       = jnl->tr_freeme;
+               jnl->tr_freeme = tr;
+    }
+}
+
+// Write a block's journaled contents back to its home location in the
+// file system.  Used only during journal replay.
+//
+//  jnl       - the journal being replayed
+//  block_ptr - the saved copy of the block (bsize bytes), read from the journal
+//  fs_block  - destination block number on jnl->fsdev
+//  bsize     - size of the block in bytes
+//
+// Returns 0 on success, non-zero on failure.
+static int
+update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
+{
+    int         ret;
+    struct buf *oblock_bp=NULL;
+    
+    // first read the block we want.
+    ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+    if (ret != 0) {
+               printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret);
+
+               if (oblock_bp) {
+                       brelse(oblock_bp);
+                       oblock_bp = NULL;
+               }
+
+               // let's try to be aggressive here and just re-write the block
+               // (the buffer's current contents don't matter - the memcpy
+               // below overwrites all bsize bytes of it)
+               oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META);
+               if (oblock_bp == NULL) {
+                       printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block);
+                       return -1;
+               }
+    }
+           
+    // make sure it's the correct size.
+    if (oblock_bp->b_bufsize != bsize) {
+               brelse(oblock_bp);
+               return -1;
+    }
+
+    // copy the journal data over top of it
+    memcpy(oblock_bp->b_data, block_ptr, bsize);
+
+    // write it out synchronously; if this fails the replay is aborted
+    if ((ret = VOP_BWRITE(oblock_bp)) != 0) {
+               printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret);
+               brelse(oblock_bp);
+               return ret;
+    }
+
+    // and now invalidate it so that if someone else wants to read
+    // it in a different size they'll be able to do it.
+    ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+    if (oblock_bp) {
+               oblock_bp->b_flags |= B_INVAL;
+               brelse(oblock_bp);
+    }
+           
+    return 0;
+}
+
+
+// Replay all transactions currently recorded in the journal: walk from
+// jhdr->start to jhdr->end, validate each block_list_header (checksum
+// and sanity checks), and re-write every saved block to its home
+// location in the file system via update_fs_block().
+//
+// The on-disk journal header is only rewritten once a transaction's
+// final block list (binfo[0].bnum == 0) has been fully applied, so a
+// crash during replay restarts at a transaction boundary.
+//
+// Returns 0 on success (including an already-empty journal), -1 on error.
+static int
+replay_journal(journal *jnl)
+{
+    int i, ret, checksum, max_bsize;
+    block_list_header *blhdr;
+    off_t offset;
+    char *buf, *block_ptr=NULL;
+    
+    // wrap the start ptr if it points to the very end of the journal
+    if (jnl->jhdr->start == jnl->jhdr->size) {
+               jnl->jhdr->start = jnl->jhdr->jhdr_size;
+    }
+    if (jnl->jhdr->end == jnl->jhdr->size) {
+               jnl->jhdr->end = jnl->jhdr->jhdr_size;
+    }
+
+    // start == end means the journal is empty: nothing to replay
+    if (jnl->jhdr->start == jnl->jhdr->end) {
+               return 0;
+    }
+
+    // allocate memory for the header_block.  we'll read each blhdr into this
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) {
+               printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
+                          jnl->jhdr->blhdr_size);
+               return -1;
+    }
+    
+
+    printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
+                  jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
+
+    while(jnl->jhdr->start != jnl->jhdr->end) {
+               offset = jnl->jhdr->start;
+               ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size);
+               if (ret != jnl->jhdr->blhdr_size) {
+                       printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
+                       goto bad_replay;
+               }
+
+               blhdr = (block_list_header *)buf;
+
+               // the checksum was computed with the checksum field zeroed,
+               // so zero it before re-computing for comparison
+               checksum = blhdr->checksum;
+               blhdr->checksum = 0;
+               if (checksum != calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)) {
+                       printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
+                                  offset, checksum, calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE));
+                       goto bad_replay;
+               }
+               if (   blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
+                          || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
+                       printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
+                                  blhdr->max_blocks, blhdr->num_blocks);
+                       goto bad_replay;
+               }
+       
+               // scan the block list (slot 0 is bookkeeping; real blocks start
+               // at index 1) to sanity-check block numbers and find the largest
+               // block size so one buffer can hold any block in this list
+               for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) {
+                       if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
+                               printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
+                               goto bad_replay;
+                       }
+                       if (blhdr->binfo[i].bsize > max_bsize) {
+                               max_bsize = blhdr->binfo[i].bsize;
+                       }
+               }
+
+               // round max_bsize up to a multiple of PAGE_SIZE
+               if (max_bsize & (PAGE_SIZE - 1)) {
+                       max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
+               }
+
+               if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
+                       goto bad_replay;
+               }
+
+               //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1,
+               //         jnl->jhdr->start);
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       int size;
+
+                       size = blhdr->binfo[i].bsize;
+
+                       ret = read_journal_data(jnl, &offset, block_ptr, size);
+                       if (ret != size) {
+                               printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
+                               goto bad_replay;
+                       }
+
+                       // don't replay "killed" blocks (bnum == -1 marks a block
+                       // that was deleted after being journaled)
+                       if (blhdr->binfo[i].bnum == (off_t)-1) {
+                               // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i);
+                       } else {
+                               //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n",
+                               //         blhdr->binfo[i].bnum, blhdr->binfo[i].bsize);
+
+                               if (update_fs_block(jnl, block_ptr, blhdr->binfo[i].bnum, blhdr->binfo[i].bsize) != 0) {
+                                       goto bad_replay;
+                               }
+                       }
+
+                       // check if we need to wrap offset back to the beginning
+                       // (which is just past the journal header)
+                       //
+                       if (offset >= jnl->jhdr->size) {
+                               offset = jnl->jhdr->jhdr_size;
+                       }
+               }
+
+               kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+               block_ptr = NULL;
+
+               jnl->jhdr->start += blhdr->bytes_used;
+               if (jnl->jhdr->start >= jnl->jhdr->size) {
+                       // wrap around and skip the journal header block
+                       jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
+               }
+
+               // only update the on-disk journal header if we've reached the
+               // last chunk of updates from this transaction.  if binfo[0].bnum
+               // is zero then we know we're at the end.
+               if (blhdr->binfo[0].bnum == 0) {
+                       if (write_journal_header(jnl) != 0) {
+                               goto bad_replay;
+                       }
+               }
+    }
+
+    kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+    return 0;
+
+  bad_replay:
+    if (block_ptr) {
+               kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+    }
+    kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+    return -1;
+}
+
+
+// Default and hard-maximum sizes (in bytes) for a transaction buffer.
+#define DEFAULT_TRANSACTION_BUFFER_SIZE  (128*1024)
+//#define DEFAULT_TRANSACTION_BUFFER_SIZE  (256*1024)  // better performance but uses more mem
+#define MAX_TRANSACTION_BUFFER_SIZE      (512*1024)
+
+// XXXdbg - so I can change it in the debugger
+// 0 means "not yet initialized"; size_up_tbuffer() picks a value based
+// on the amount of physical memory the first time it runs.
+int def_tbuffer_size = 0;
+
+
+//
+// This function sets the size of the tbuffer and the
+// size of the blhdr.  It assumes that jnl->jhdr->size
+// and jnl->jhdr->jhdr_size are already valid.
+//
+// Compute jnl->tbuffer_size and jnl->jhdr->blhdr_size.
+//
+//  tbuffer_size - caller-requested transaction buffer size, or 0 to use
+//                 the memory-scaled default
+//  phys_blksz   - the device's physical block size; blhdr_size is never
+//                 allowed to be smaller than this
+static void
+size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
+{
+       //
+       // one-time initialization based on how much memory 
+       // there is in the machine.
+       //
+       if (def_tbuffer_size == 0) {
+               if (mem_size < (256*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
+               } else if (mem_size < (512*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
+               } else if (mem_size < (1024*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
+               } else if (mem_size >= (1024*1024*1024)) {
+                       def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
+               }
+       }
+
+    // size up the transaction buffer... can't be larger than the number
+    // of blocks that can fit in a block_list_header block.
+    if (tbuffer_size == 0) {
+               jnl->tbuffer_size = def_tbuffer_size;
+    } else {
+               // make sure that the specified tbuffer_size isn't too small
+               if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
+                       tbuffer_size = jnl->jhdr->blhdr_size * 2;
+               }
+               // and make sure it's an even multiple of the block size
+               if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
+                       tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
+               }
+
+               jnl->tbuffer_size = tbuffer_size;
+    }
+
+    // clamp: never use more than half the journal...
+    if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
+               jnl->tbuffer_size = (jnl->jhdr->size / 2);
+    }
+    
+    // ...and never more than the absolute maximum
+    if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
+               jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
+    }
+
+    // the block list header must be able to describe every block a full
+    // tbuffer can hold: one block_info per jhdr_size-sized block
+    jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
+       if (jnl->jhdr->blhdr_size < phys_blksz) {
+               jnl->jhdr->blhdr_size = phys_blksz;
+       }
+}
+
+
+
+// Create a brand-new (empty) journal at the given device/offset and
+// write out its initial header.
+//
+//  jvp           - vnode of the device the journal lives on
+//  offset        - byte offset on jvp where the journal starts
+//  journal_size  - size of the journal in bytes; must be an even
+//                  multiple of the device's physical block size
+//  fsvp          - vnode of the file system device (replay target)
+//  min_fs_blksz  - smallest block size the fs uses; must be >= the
+//                  device's physical block size
+//  flags         - JOURNAL_OPTION_* flags
+//  tbuffer_size  - requested transaction buffer size (0 == default)
+//  flush / arg   - optional callback used to push dirty buffers out
+//
+// Returns the new journal, or NULL on failure.
+journal *
+journal_create(struct vnode *jvp,
+                          off_t         offset,
+                          off_t         journal_size,
+                          struct vnode *fsvp,
+                          size_t        min_fs_blksz,
+                          int32_t       flags,
+                          int32_t       tbuffer_size,
+                          void        (*flush)(void *arg),
+                          void         *arg)
+{
+    journal *jnl;
+    int      ret, phys_blksz;
+
+    /* Get the real physical block size. */
+    if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+               return NULL;
+    }
+
+    if (phys_blksz > min_fs_blksz) {
+               printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
+                          phys_blksz, min_fs_blksz);
+               return NULL;
+    }
+
+    if ((journal_size % phys_blksz) != 0) {
+               printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+                          journal_size, phys_blksz);
+               return NULL;
+    }
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+               return NULL;
+    }
+    memset(jnl, 0, sizeof(*jnl));
+
+    jnl->jdev         = jvp;
+    jnl->jdev_offset  = offset;
+    jnl->fsdev        = fsvp;
+    jnl->flush        = flush;
+    jnl->flush_arg    = arg;
+    jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
+       simple_lock_init(&jnl->old_start_lock);
+       
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+               printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+               goto bad_kmem_alloc;
+    }
+
+    memset(jnl->header_buf, 0, phys_blksz);
+    
+    jnl->jhdr             = (journal_header *)jnl->header_buf;
+    jnl->jhdr->magic      = JOURNAL_HEADER_MAGIC;
+    jnl->jhdr->endian     = ENDIAN_MAGIC;
+    jnl->jhdr->start      = phys_blksz;    // start at block #1, block #0 is for the jhdr itself
+    jnl->jhdr->end        = phys_blksz;
+    jnl->jhdr->size       = journal_size;
+    jnl->jhdr->jhdr_size  = phys_blksz;
+    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+       jnl->active_start     = jnl->jhdr->start;
+
+    // XXXdbg  - for testing you can force the journal to wrap around
+    // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
+    // jnl->jhdr->end   = jnl->jhdr->size - (phys_blksz*3);
+    
+    if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+               printf("jnl: journal_create: failed to create journal semaphore..\n");
+               goto bad_sem;
+    }
+
+    if (write_journal_header(jnl) != 0) {
+               printf("jnl: journal_create: failed to write journal header.\n");
+               goto bad_write;
+    }
+
+    return jnl;
+
+
+    // error unwind: labels release resources in the reverse order
+    // they were acquired above
+  bad_write:
+    semaphore_destroy(kernel_task, jnl->jsem);
+  bad_sem:
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+  bad_kmem_alloc:
+    jnl->jhdr = NULL;
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+    return NULL;
+}
+
+
+// Open an existing journal and, unless JOURNAL_RESET was requested,
+// replay any transactions it contains before returning.
+//
+//  jvp           - vnode of the device the journal lives on
+//  offset        - byte offset on jvp where the journal starts
+//  journal_size  - size of the journal in bytes; must be an even
+//                  multiple of the device's physical block size
+//  fsvp          - vnode of the file system device (replay target)
+//  min_fs_blksz  - smallest block size the fs uses; must be >= the
+//                  device's physical block size
+//  flags         - JOURNAL_OPTION_* flags (JOURNAL_RESET discards the
+//                  journal contents instead of replaying them)
+//  tbuffer_size  - requested transaction buffer size (0 == default)
+//  flush / arg   - optional callback used to push dirty buffers out
+//
+// Returns the opened journal, or NULL on failure.
+journal *
+journal_open(struct vnode *jvp,
+                        off_t         offset,
+                        off_t         journal_size,
+                        struct vnode *fsvp,
+                        size_t        min_fs_blksz,
+                        int32_t       flags,
+                        int32_t       tbuffer_size,
+                        void        (*flush)(void *arg),
+                        void         *arg)
+{
+    journal *jnl;
+    int      orig_blksz=0, phys_blksz;
+    off_t    hdr_offset=0;
+
+    /* Get the real physical block size. */
+    if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+               return NULL;
+    }
+
+    if (phys_blksz > min_fs_blksz) {
+               printf("jnl: open: error: phys blksize %d bigger than min fs blksize %d\n",
+                          phys_blksz, min_fs_blksz);
+               return NULL;
+    }
+
+    if ((journal_size % phys_blksz) != 0) {
+               printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+                          journal_size, phys_blksz);
+               return NULL;
+    }
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+               return NULL;
+    }
+    memset(jnl, 0, sizeof(*jnl));
+
+    jnl->jdev         = jvp;
+    jnl->jdev_offset  = offset;
+    jnl->fsdev        = fsvp;
+    jnl->flush        = flush;
+    jnl->flush_arg    = arg;
+    jnl->flags        = (flags & JOURNAL_OPTION_FLAGS_MASK);
+       simple_lock_init(&jnl->old_start_lock);
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+               printf("jnl: open: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+               goto bad_kmem_alloc;
+    }
+
+    jnl->jhdr = (journal_header *)jnl->header_buf;
+    memset(jnl->jhdr, 0, sizeof(journal_header)+4);
+
+    // we have to set this up here so that do_journal_io() will work
+    jnl->jhdr->jhdr_size = phys_blksz;
+
+    if (read_journal_data(jnl, &hdr_offset, jnl->jhdr, phys_blksz) != phys_blksz) {
+               printf("jnl: open: could not read %d bytes for the journal header.\n",
+                          phys_blksz);
+               goto bad_journal;
+    }
+
+    if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
+               printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
+                          jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
+               goto bad_journal;
+    }
+
+       // only check if we're the current journal header magic value
+       if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
+               int orig_checksum = jnl->jhdr->checksum;
+
+               // checksum is computed over the header with the field zeroed;
+               // a mismatch is logged but deliberately not treated as fatal
+               jnl->jhdr->checksum = 0;
+               if (orig_checksum != calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))) {
+                       printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum,
+                                  calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header)));
+                       //goto bad_journal;
+               }
+       }
+
+       // XXXdbg - convert old style magic numbers to the new one
+       if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
+               jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
+       }
+
+    // if the journal was written with a different block size, temporarily
+    // switch the device to that size for the duration of the replay
+    if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+               printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
+                          phys_blksz, jnl->jhdr->jhdr_size);
+
+               orig_blksz = phys_blksz;
+               phys_blksz = jnl->jhdr->jhdr_size;
+               if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) {
+                       printf("jnl: could not set block size to %d bytes.\n", phys_blksz);
+                       goto bad_journal;
+               }
+//             goto bad_journal;
+    }
+
+    if (   jnl->jhdr->start <= 0
+                  || jnl->jhdr->start > jnl->jhdr->size
+                  || jnl->jhdr->start > 128*1024*1024) {
+               printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
+                          jnl->jhdr->start, jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+    if (   jnl->jhdr->end <= 0
+                  || jnl->jhdr->end > jnl->jhdr->size
+                  || jnl->jhdr->end > 128*1024*1024) {
+               printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
+                          jnl->jhdr->end, jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+    if (jnl->jhdr->size > 128*1024*1024) {
+               printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
+               goto bad_journal;
+    }
+
+// XXXdbg - can't do these checks because hfs writes all kinds of
+//          non-uniform sized blocks even on devices that have a block size
+//          that is larger than 512 bytes (i.e. optical media w/2k blocks).
+//          therefore these checks will fail and so we just have to punt and
+//          do more relaxed checking...
+// XXXdbg    if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
+    if ((jnl->jhdr->start % 512) != 0) {
+               printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
+                          jnl->jhdr->start);
+               goto bad_journal;
+    }
+
+//XXXdbg    if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
+    if ((jnl->jhdr->end % 512) != 0) {
+               printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
+                          jnl->jhdr->end, jnl->jhdr->jhdr_size);
+               goto bad_journal;
+    }
+
+    // take care of replaying the journal if necessary
+       if (flags & JOURNAL_RESET) {
+               printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
+                          jnl, jnl->jhdr->start, jnl->jhdr->end);
+               jnl->jhdr->start = jnl->jhdr->end;
+       } else if (replay_journal(jnl) != 0) {
+               printf("jnl: journal_open: Error replaying the journal!\n");
+               goto bad_journal;
+    }
+
+       // restore the device's original block size now that replay is done
+       if (orig_blksz != 0) {
+               VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+               phys_blksz = orig_blksz;
+       }
+
+       // make sure this is in sync!
+       jnl->active_start = jnl->jhdr->start;
+
+    // set this now, after we've replayed the journal
+    size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+    if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+               printf("jnl: journal_open: failed to create journal semaphore..\n");
+               goto bad_journal;
+    }
+
+    return jnl;
+
+  bad_journal:
+       if (orig_blksz != 0) {
+               // restore the device block size (and the size we free below)
+               phys_blksz = orig_blksz;
+               VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+       }
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+  bad_kmem_alloc:
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+    return NULL;    
+}
+
+// Close the journal: end/flush any active or buffered transaction, wait
+// (bounded) for the buffer cache to drain all journaled blocks, write a
+// final journal header, and release all journal resources.  If the
+// journal has been marked JOURNAL_INVALID, outstanding transactions are
+// aborted instead of flushed.
+void
+journal_close(journal *jnl)
+{
+    volatile off_t *start, *end;
+    int             counter=0;
+
+    CHECK_JOURNAL(jnl);
+
+       // set this before doing anything that would block so that
+       // we start tearing things down properly.
+       //
+       jnl->flags |= JOURNAL_CLOSE_PENDING;
+
+    // acquire the journal semaphore unless this thread already owns it
+    if (jnl->owner != current_act()) {
+               int ret;
+
+               while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+                       // just keep trying if we've been ^C'ed
+               }
+               if (ret != 0) {
+                       printf("jnl: close: sem wait failed.\n");
+                       return;
+               }
+    }
+
+    //
+    // only write stuff to disk if the journal is still valid
+    //
+    if ((jnl->flags & JOURNAL_INVALID) == 0) {
+
+               if (jnl->active_tr) {
+                       journal_end_transaction(jnl);
+               }
+               
+               // flush any buffered transactions
+               if (jnl->cur_tr) {
+                       transaction *tr = jnl->cur_tr;
+
+                       jnl->cur_tr = NULL;
+                       end_transaction(tr, 1);   // force it to get flushed
+               }
+    
+               //start = &jnl->jhdr->start;
+               start = &jnl->active_start;
+               end   = &jnl->jhdr->end;
+    
+               // keep calling the fs flush callback until everything between
+               // active_start and end has drained (bounded at 500 tries so a
+               // wedged buffer cache can't hang the close forever)
+               while (*start != *end && counter++ < 500) {
+                       printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
+                       if (jnl->flush) {
+                               jnl->flush(jnl->flush_arg);
+                       }
+       
+               }
+
+               if (*start != *end) {
+                       printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
+                                  *start, *end);
+               }
+
+               // make sure this is in sync when we close the journal
+               jnl->jhdr->start = jnl->active_start;
+
+               // if this fails there's not much we can do at this point...
+               write_journal_header(jnl);
+    } else {
+               // if we're here the journal isn't valid any more.
+               // so make sure we don't leave any locked blocks lying around
+               printf("jnl: close: journal 0x%x, is invalid.  aborting outstanding transactions\n", jnl);
+               if (jnl->active_tr || jnl->cur_tr) {
+                       transaction *tr;
+                       if (jnl->active_tr) {
+                               tr = jnl->active_tr;
+                               jnl->active_tr = NULL;
+                       } else {
+                               tr = jnl->cur_tr;
+                               jnl->cur_tr = NULL;
+                       }
+
+                       abort_transaction(jnl, tr);
+                       if (jnl->active_tr || jnl->cur_tr) {
+                               panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
+                       }
+               }
+    }
+
+    free_old_stuff(jnl);
+
+    // NOTE(review): header_buf was allocated with the device's phys_blksz;
+    // if journal_open() adopted a different on-disk jhdr_size, this free
+    // size may not match the allocation size - verify against journal_open.
+    kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
+    // poison the pointer so use-after-close is easy to spot
+    jnl->jhdr = (void *)0xbeefbabe;
+
+    semaphore_destroy(kernel_task, jnl->jsem);
+       kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+}
+
+// Debug helper: print the in-memory journal header fields and the list
+// of completed-but-not-yet-reclaimed transactions.
+static void
+dump_journal(journal *jnl)
+{
+    transaction *t;
+
+    // header fields first
+    printf("journal:");
+    printf("  jdev_offset %.8llx\n", jnl->jdev_offset);
+    printf("  magic: 0x%.8x\n", jnl->jhdr->magic);
+    printf("  start: 0x%.8llx\n", jnl->jhdr->start);
+    printf("  end:   0x%.8llx\n", jnl->jhdr->end);
+    printf("  size:  0x%.8llx\n", jnl->jhdr->size);
+    printf("  blhdr size: %d\n", jnl->jhdr->blhdr_size);
+    printf("  jhdr size: %d\n", jnl->jhdr->jhdr_size);
+    printf("  chksum: 0x%.8x\n", jnl->jhdr->checksum);
+    
+    // then walk the completed-transaction list
+    printf("  completed transactions:\n");
+    t = jnl->completed_trs;
+    while (t != NULL) {
+               printf("    0x%.8llx - 0x%.8llx\n", t->journal_start, t->journal_end);
+               t = t->next;
+    }
+}
+
+
+
+// Return the number of free bytes in the journal.  The journal header
+// block (jhdr_size bytes) is never counted as free space.
+static off_t
+free_space(journal *jnl)
+{
+    // start == end means the journal is completely empty
+    if (jnl->jhdr->start == jnl->jhdr->end) {
+               return jnl->jhdr->size - jnl->jhdr->jhdr_size;
+    }
+
+    if (jnl->jhdr->start < jnl->jhdr->end) {
+               // active region doesn't wrap: free space is whatever is left
+               return jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
+    }
+
+    // active region wraps past the end of the journal
+    return jnl->jhdr->start - jnl->jhdr->end;
+}
+
+
+//
+// The journal must be locked on entry to this function.
+// The "desired_size" is in bytes.
+//
+static int
+check_free_space(journal *jnl, int desired_size)
+{
+    int    i, counter=0;
+
+    //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
+//        desired_size, free_space(jnl));
+    
+    // loop until there is room; each pass either reclaims space from a
+    // completed transaction or sleeps ~10ms waiting for flushes
+    while (1) {
+               if (counter++ == 5000) {
+                       dump_journal(jnl);
+                       panic("jnl: check_free_space: buffer flushing isn't working "
+                                 "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
+                                 jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
+               }
+               if (counter > 7500) {
+                       printf("jnl: check_free_space: giving up waiting for free space.\n");
+                       return ENOSPC;
+               }
+
+               // make sure there's space in the journal to hold this transaction
+               if (free_space(jnl) > desired_size) {
+                       break;
+               }
+
+               //
+               // here's where we lazily bump up jnl->jhdr->start.  we'll consume
+               // entries until there is enough space for the next transaction.
+               //
+               simple_lock(&jnl->old_start_lock);
+               for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+                       // note: this inner counter intentionally shadows the outer
+                       // one - it bounds the wait for a single in-flight entry
+                       int   counter;
+
+                       counter = 0;
+                       // the high bit of old_start[i] marks a transaction whose
+                       // buffers haven't all been flushed yet (buffer_flushed()
+                       // clears it); drop the lock and wait for it to complete
+                       while (jnl->old_start[i] & 0x8000000000000000LL) {
+                               if (counter++ > 100) {
+                                       panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
+                                                 jnl->old_start[i], jnl);
+                               }
+                               
+                               simple_unlock(&jnl->old_start_lock);
+                               if (jnl->flush) {
+                                       jnl->flush(jnl->flush_arg);
+                               }
+                               tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
+                               simple_lock(&jnl->old_start_lock);
+                       }
+
+                       if (jnl->old_start[i] == 0) {
+                               continue;
+                       }
+
+                       // reclaim this completed transaction's journal space by
+                       // advancing the on-disk start pointer past it
+                       jnl->jhdr->start  = jnl->old_start[i];
+                       jnl->old_start[i] = 0;
+                       if (free_space(jnl) > desired_size) {
+                               write_journal_header(jnl);
+                               break;
+                       }
+               }
+               simple_unlock(&jnl->old_start_lock);
+               
+               // if we bumped the start, loop and try again
+               if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+                       continue;
+               }
+
+
+               // if the file system gave us a flush function, call it to so that
+               // it can flush some blocks which hopefully will cause some transactions
+               // to complete and thus free up space in the journal.
+               if (jnl->flush) {
+                       jnl->flush(jnl->flush_arg);
+               }
+       
+               // wait for a while to avoid being cpu-bound (this will
+               // put us to sleep for 10 milliseconds)
+               tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
+    }
+
+    return 0;
+}
+
+/*
+ * Open a transaction on the journal.  If the calling thread already
+ * owns the journal this just bumps the nesting count; otherwise it
+ * takes the transaction semaphore, reserves space in the journal and
+ * sets up (or re-uses) an in-memory transaction buffer.
+ *
+ * Returns 0 on success, EINVAL if the journal is invalid or the
+ * semaphore wait fails, ENOSPC if the journal can't make room, or
+ * ENOMEM if the transaction buffers can't be allocated.
+ */
+int
+journal_start_transaction(journal *jnl)
+{
+    int ret;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+    
+    if (jnl->flags & JOURNAL_INVALID) {
+               return EINVAL;
+    }
+
+    // re-entry by the owning thread just nests; the matching
+    // journal_end_transaction() decrements nested_count
+    if (jnl->owner == current_act()) {
+               if (jnl->active_tr == NULL) {
+                       panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
+                                 jnl, jnl->owner, current_act());
+               }
+               jnl->nested_count++;
+               return 0;
+    }
+
+    // serialize with other transaction starters; KERN_ABORTED just
+    // means we were interrupted by a signal, so retry
+    while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+               // just keep looping if we've been ^C'ed
+    }
+    if (ret != 0) {
+               printf("jnl: start_tr: sem wait failed.\n");
+               return EINVAL;
+    }
+
+    // holding jsem means nobody else should have a transaction open
+    if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+               panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
+                         jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
+    }
+
+    jnl->owner        = current_act();
+    jnl->nested_count = 1;
+
+    free_old_stuff(jnl);
+
+    // make sure there's room in the journal
+    if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
+               printf("jnl: start transaction failed: no space\n");
+               ret = ENOSPC;
+               goto bad_start;
+    }
+
+    // if there's a buffered transaction, use it.
+    if (jnl->cur_tr) {
+               jnl->active_tr = jnl->cur_tr;
+               jnl->cur_tr    = NULL;
+
+               return 0;
+    }
+
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&tr, sizeof(transaction))) {
+               printf("jnl: start transaction failed: no mem\n");
+               ret = ENOMEM;
+               goto bad_start;
+    }
+    memset(tr, 0, sizeof(transaction));
+
+    tr->tbuffer_size = jnl->tbuffer_size;
+    if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
+               kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+               printf("jnl: start transaction failed: no tbuffer mem\n");
+               ret = ENOMEM;
+               goto bad_start;
+    }
+
+    // journal replay code checksum check depends on this.
+    memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
+
+    // the first block_list_header lives at the front of the tbuffer
+    tr->blhdr = (block_list_header *)tr->tbuffer;
+    tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+    tr->blhdr->num_blocks = 1;      // accounts for this header block
+    tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+    tr->num_blhdrs  = 1;
+    tr->total_bytes = jnl->jhdr->blhdr_size;
+    tr->jnl         = jnl;
+
+    jnl->active_tr    = tr;
+
+    // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
+
+    return 0;
+
+  bad_start:
+       // undo ownership and release the semaphore on any failure path
+       jnl->owner        = NULL;
+       jnl->nested_count = 0;
+       semaphore_signal(jnl->jsem);
+       return ret;
+}
+
+
+/*
+ * Declare intent to modify a meta-data block inside the current
+ * transaction.  Validates the block, flushes it first if it carries
+ * dirty data belonging to an earlier transaction, and pins it in the
+ * buffer cache with B_LOCKED until the transaction completes.
+ *
+ * Returns 0 on success, EINVAL if the journal is invalid, -1 on a
+ * size/limit violation (though the preceding panic()s normally fire
+ * first).
+ */
+int
+journal_modify_block_start(journal *jnl, struct buf *bp)
+{
+    transaction *tr;
+    
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+               return EINVAL;
+    }
+
+    // XXXdbg - for debugging I want this to be true.  later it may
+    //          not be necessary.
+    if ((bp->b_flags & B_META) == 0) {
+               panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl);
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+               panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+                         jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n",
+    //   bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
+
+    // can't allow blocks that aren't an even multiple of the
+    // underlying block size.
+    if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) {
+               panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
+                         bp->b_bufsize, jnl->jhdr->jhdr_size);
+               return -1;
+    }
+
+    // make sure that this transaction isn't bigger than the whole journal
+    if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
+               panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
+                         tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp);
+               return -1;
+    }
+
+    // if the block is dirty and not already locked we have to write
+    // it out before we muck with it because it has data that belongs
+    // (presumably) to another transaction.
+    //
+    if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+
+               // this will cause it to not be brelse()'d
+               bp->b_flags |= B_NORELSE;
+               VOP_BWRITE(bp);
+    }
+
+    // keep the buffer pinned until journal_modify_block_end/abort
+    bp->b_flags |= B_LOCKED;
+       
+    return 0;
+}
+
+/*
+ * Undo journal_modify_block_start() for a block that was not actually
+ * modified.  If the block never made it into the transaction's block
+ * list (i.e. journal_modify_block_end() was never called on it) the
+ * LOCKED bit is cleared before the buffer is released; otherwise it
+ * must stay locked in memory until the transaction commits.
+ *
+ * Returns 0 on success, EINVAL if the journal is invalid.
+ * (Cleanup vs. the original: removed the unused local "j".)
+ */
+int
+journal_modify_block_abort(journal *jnl, struct buf *bp)
+{
+    transaction       *tr;
+    block_list_header *blhdr;
+    int                i;
+
+    CHECK_JOURNAL(jnl);
+
+    tr = jnl->active_tr;
+
+    //
+    // if there's no active transaction then we just want to
+    // call brelse() and return since this is just a block
+    // that happened to be modified as part of another tr.
+    //
+    if (tr == NULL) {
+        brelse(bp);
+        return 0;
+    }
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return EINVAL;
+    }
+
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+        panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    // first check if it's already part of this transaction
+    for(blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+        for(i = 1; i < blhdr->num_blocks; i++) {
+            if (bp == blhdr->binfo[i].bp) {
+                if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+                    panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+                          bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+                }
+                break;
+            }
+        }
+
+        // inner loop broke early only if the block was found
+        if (i < blhdr->num_blocks) {
+            break;
+        }
+    }
+
+    //
+    // if blhdr is null, then this block has only had modify_block_start
+    // called on it as part of the current transaction.  that means that
+    // it is ok to clear the LOCKED bit since it hasn't actually been
+    // modified.  if blhdr is non-null then modify_block_end was called
+    // on it and so we need to keep it locked in memory.
+    //
+    if (blhdr == NULL) {
+        bp->b_flags &= ~(B_LOCKED);
+    }
+
+    brelse(bp);
+    return 0;
+}
+
+
+/*
+ * Finish modifying a block: copy its (new) contents into the active
+ * transaction's in-memory buffer and record it in the transaction's
+ * block list.  If the block is already part of the transaction its
+ * copy is simply overwritten; otherwise a new block_info entry is
+ * added (allocating and chaining a new tbuffer via binfo[0].bnum when
+ * the current one is full).  The buffer is then bdwrite()'n so the
+ * cache keeps it dirty until end_transaction() pushes it out.
+ *
+ * Returns 0 on success, EINVAL if the journal is invalid.
+ * (Cleanup vs. the original: removed the unused local "j".)
+ */
+int
+journal_modify_block_end(journal *jnl, struct buf *bp)
+{
+    int                i, tbuffer_offset;
+    char              *blkptr;
+    block_list_header *blhdr, *prev = NULL;
+    transaction       *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+        panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    //printf("jnl: mod block end:  (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n",
+    //   bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
+
+    // journal_modify_block_start() must have pinned this buffer
+    if ((bp->b_flags & B_LOCKED) == 0) {
+        panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+        bp->b_flags |= B_LOCKED;
+    }
+
+    // first check if it's already part of this transaction.  on exit,
+    // "i" and "tbuffer_offset" locate either the existing entry or the
+    // append position in the last header scanned.
+    for(blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+        tbuffer_offset = jnl->jhdr->blhdr_size;
+
+        for(i = 1; i < blhdr->num_blocks; i++) {
+            if (bp == blhdr->binfo[i].bp) {
+                if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+                    panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+                          bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+                }
+                break;
+            }
+            tbuffer_offset += blhdr->binfo[i].bsize;
+        }
+
+        if (i < blhdr->num_blocks) {
+            break;
+        }
+    }
+
+    if (blhdr == NULL
+        && prev
+        && (prev->num_blocks+1) <= prev->max_blocks
+        && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) {
+        // not found, but the last header still has room: append there
+        blhdr = prev;
+    } else if (blhdr == NULL) {
+        block_list_header *nblhdr;
+
+        if (prev == NULL) {
+            panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
+        }
+
+        // we got to the end of the list, didn't find the block and there's
+        // no room in the block_list_header pointed to by prev
+
+        // we allocate another tbuffer and link it in at the end of the list
+        // through prev->binfo[0].bnum.  that's a skanky way to do things but
+        // avoids having yet another linked list of small data structures to manage.
+        if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
+            panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
+                  tr, tr->total_bytes);
+        }
+
+        // journal replay code checksum check depends on this.
+        memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
+
+        // initialize the new guy
+        nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+        nblhdr->num_blocks = 1;      // accounts for this header block
+        nblhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+        tr->num_blhdrs++;
+        tr->total_bytes += jnl->jhdr->blhdr_size;
+
+        // then link him in at the end
+        prev->binfo[0].bnum = (off_t)((long)nblhdr);
+
+        // and finally switch to using the new guy
+        blhdr          = nblhdr;
+        tbuffer_offset = jnl->jhdr->blhdr_size;
+        i              = 1;
+    }
+
+    if ((i+1) > blhdr->max_blocks) {
+        panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
+    }
+
+    // copy the data into the in-memory transaction buffer
+    blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
+    memcpy(blkptr, bp->b_data, bp->b_bufsize);
+
+    // if this is true then this is a new block we haven't seen before
+    if (i >= blhdr->num_blocks) {
+        // hold a vnode ref until the block is flushed or killed
+        vget(bp->b_vp, 0, current_proc());
+
+        blhdr->binfo[i].bnum  = bp->b_blkno;
+        blhdr->binfo[i].bsize = bp->b_bufsize;
+        blhdr->binfo[i].bp    = bp;
+
+        blhdr->bytes_used += bp->b_bufsize;
+        tr->total_bytes   += bp->b_bufsize;
+
+        blhdr->num_blocks++;
+    }
+
+    bdwrite(bp);
+
+    return 0;
+}
+
+/*
+ * Remove a block from the active transaction so its contents are
+ * never written to the journal or to disk (e.g. the block was freed
+ * again within the same transaction).  Undoes the B_LOCKED pin and
+ * the vget() taken by journal_modify_block_end().
+ *
+ * Returns 0 on success, EINVAL if the journal is invalid.
+ * (Fix vs. the original: the ownership panic message mis-identified
+ * this function as "modify_block_end".)
+ */
+int
+journal_kill_block(journal *jnl, struct buf *bp)
+{
+    int                i;
+    block_list_header *blhdr;
+    transaction       *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+        panic("jnl: kill_block: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    if ((bp->b_flags & B_LOCKED) == 0) {
+        panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+    }
+
+    // find the block in the transaction's block lists and drop it
+    for(blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+        for(i = 1; i < blhdr->num_blocks; i++) {
+            if (bp == blhdr->binfo[i].bp) {
+                bp->b_flags &= ~B_LOCKED;
+
+                // this undoes the vget() in journal_modify_block_end()
+                vrele(bp->b_vp);
+
+                // if the block has the DELWRI and CALL bits sets, then
+                // things are seriously weird.  if it was part of another
+                // transaction then journal_modify_block_start() should
+                // have force it to be written.
+                //
+                if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
+                    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
+                } else {
+                    tr->num_killed += bp->b_bufsize;
+                }
+
+                if (bp->b_flags & B_BUSY) {
+                    brelse(bp);
+                }
+
+                // mark the slot as "killed" so end_transaction() skips it
+                blhdr->binfo[i].bp   = NULL;
+                blhdr->binfo[i].bnum = (off_t)-1;
+                break;
+            }
+        }
+
+        if (i < blhdr->num_blocks) {
+            break;
+        }
+    }
+
+    return 0;
+}
+
+
+/*
+ * qsort() comparator for block_info entries: orders by physical block
+ * number, with "killed" entries (NULL bp) sorting to the end.
+ *
+ * Fix vs. the original: compare instead of subtracting.  The old code
+ * narrowed a daddr_t difference to int, which can truncate/overflow
+ * and yield an inconsistent ordering for widely-separated block
+ * numbers.  Also untangled the mangled declaration formatting.
+ */
+static int
+journal_binfo_cmp(void *a, void *b)
+{
+    block_info *bi_a = (struct block_info *)a;
+    block_info *bi_b = (struct block_info *)b;
+
+    // killed blocks (NULL bp) sort after everything else
+    if (bi_a->bp == NULL) {
+        return 1;
+    }
+    if (bi_b->bp == NULL) {
+        return -1;
+    }
+
+    if (bi_a->bp->b_blkno < bi_b->bp->b_blkno) {
+        return -1;
+    } else if (bi_a->bp->b_blkno > bi_b->bp->b_blkno) {
+        return 1;
+    }
+    return 0;
+}
+
+
+/*
+ * Commit a transaction: write its block lists and block copies into
+ * the on-disk journal, update the journal header, then hand every
+ * modified buffer to the buffer cache for asynchronous write-back to
+ * its real location (buffer_flushed_callback fires as each completes).
+ *
+ * Small transactions may simply be parked in jnl->cur_tr ("group
+ * commit") unless force_it is set.
+ *
+ * Returns 0 on success (including the group-commit case), -1 if a
+ * journal write fails (the journal is marked JOURNAL_INVALID and the
+ * transaction aborted).
+ *
+ * Fixes vs. the original: the two bare "return;" statements in this
+ * int-returning function now "return 0;" (callers consume the value,
+ * e.g. "ret = end_transaction(tr, 0)"); removed a redundant nested
+ * "if (jnl->flush)" check; removed the unused local "j" and stale
+ * commented-out test code.
+ */
+static int
+end_transaction(transaction *tr, int force_it)
+{
+    int                 i, ret, amt;
+    off_t               end;
+    journal            *jnl = tr->jnl;
+    struct buf         *bp;
+    block_list_header  *blhdr = NULL, *next = NULL;
+
+    if (jnl->cur_tr) {
+        panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
+              jnl, jnl->cur_tr, tr);
+    }
+
+    // if there weren't any modified blocks in the transaction
+    // just save off the transaction pointer and return.
+    if (tr->total_bytes == jnl->jhdr->blhdr_size) {
+        jnl->cur_tr = tr;
+        return 0;
+    }
+
+    // if our transaction buffer isn't very full, just hang
+    // on to it and don't actually flush anything.  this is
+    // what is known as "group commit".  we will flush the
+    // transaction buffer if it's full or if we have more than
+    // one of them so we don't start hogging too much memory.
+    //
+    if (   force_it == 0
+        && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
+        && tr->num_blhdrs < 3
+        && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
+
+        jnl->cur_tr = tr;
+        return 0;
+    }
+
+    // if we're here we're going to flush the transaction buffer to disk.
+    // make sure there is room in the journal first.
+    check_free_space(jnl, tr->total_bytes);
+
+    // range check the end index
+    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
+        panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
+              jnl->jhdr->end, jnl->jhdr->size);
+    }
+
+    // this transaction starts where the current journal ends
+    tr->journal_start = jnl->jhdr->end;
+    end               = jnl->jhdr->end;
+
+    //
+    // if the first entry in old_start[] isn't free yet, loop calling the
+    // file system flush routine until it is (or we panic).
+    //
+    i = 0;
+    simple_lock(&jnl->old_start_lock);
+    while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
+        if (jnl->flush) {
+            simple_unlock(&jnl->old_start_lock);
+
+            jnl->flush(jnl->flush_arg);
+
+            // yield the cpu so others can get in to clear the lock bit
+            (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+
+            simple_lock(&jnl->old_start_lock);
+        }
+        if (i++ >= 100) {
+            panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
+                  jnl->old_start[0] & (~0x8000000000000000LL), jnl);
+        }
+    }
+
+    //
+    // slide everyone else down and put our latest guy in the last
+    // entry in the old_start array
+    //
+    memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
+    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
+
+    simple_unlock(&jnl->old_start_lock);
+
+    // for each block, make sure that the physical block # is set
+    for(blhdr = tr->blhdr; blhdr; blhdr = next) {
+
+        for(i = 1; i < blhdr->num_blocks; i++) {
+
+            bp = blhdr->binfo[i].bp;
+            if (bp == NULL) {   // only true if a block was "killed"
+                if (blhdr->binfo[i].bnum != (off_t)-1) {
+                    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
+                          blhdr->binfo[i].bnum, jnl, tr);
+                }
+                continue;
+            }
+
+            if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
+                panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
+                      bp, bp->b_lblkno, bp->b_blkno);
+            }
+
+            // if the lblkno is the same as blkno and this bp isn't
+            // associated with the underlying file system device then
+            // we need to call bmap() to get the actual physical block.
+            //
+            if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
+                if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
+                    printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
+                    goto bad_journal;
+                }
+            }
+
+            // update this so we write out the correct physical block number!
+            blhdr->binfo[i].bnum = bp->b_blkno;
+        }
+
+        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+    }
+
+    // write each block list (header + block copies) into the journal
+    for(blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+        amt = blhdr->bytes_used;
+
+        blhdr->checksum = 0;
+        blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
+
+        ret = write_journal_data(jnl, &end, blhdr, amt);
+        if (ret != amt) {
+            printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
+                   ret, amt);
+            goto bad_journal;
+        }
+    }
+
+    jnl->jhdr->end  = end;    // update where the journal now ends
+    tr->journal_end = end;    // the transaction ends here too
+    if (tr->journal_start == 0 || tr->journal_end == 0) {
+        panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
+              tr->journal_start, tr->journal_end);
+    }
+
+    if (write_journal_header(jnl) != 0) {
+        goto bad_journal;
+    }
+
+    //
+    // setup for looping through all the blhdr's.  we null out the
+    // tbuffer and blhdr fields so that they're not used any more.
+    //
+    blhdr       = tr->blhdr;
+    tr->tbuffer = NULL;
+    tr->blhdr   = NULL;
+
+    // the buffer_flushed_callback will only be called for the
+    // real blocks that get flushed so we have to account for
+    // the block_list_headers here.
+    //
+    tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
+
+    // for each block, set the iodone callback and unlock it
+    for(; blhdr; blhdr = next) {
+
+        // we can re-order the buf ptrs because everything is written out already
+        qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
+
+        for(i = 1; i < blhdr->num_blocks; i++) {
+            if (blhdr->binfo[i].bp == NULL) {
+                continue;
+            }
+
+            ret = meta_bread(blhdr->binfo[i].bp->b_vp,
+                             (daddr_t)blhdr->binfo[i].bp->b_lblkno,
+                             blhdr->binfo[i].bp->b_bufsize,
+                             NOCRED,
+                             &bp);
+            if (ret == 0 && bp != NULL) {
+                struct vnode *save_vp;
+
+                if (bp != blhdr->binfo[i].bp) {
+                    panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+                          bp, blhdr->binfo[i].bp, jnl);
+                }
+
+                if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
+                    if (jnl->flags & JOURNAL_CLOSE_PENDING) {
+                        brelse(bp);
+                        continue;
+                    } else {
+                        panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags);
+                    }
+                }
+
+                if (bp->b_iodone != NULL) {
+                    panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
+                          bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback);
+                }
+
+                save_vp = bp->b_vp;
+
+                bp->b_iodone       = buffer_flushed_callback;
+                bp->b_transaction  = tr;
+                bp->b_flags       |= B_CALL;
+                bp->b_flags       &= ~(B_LOCKED);
+
+                // kicking off the write here helps performance
+                bawrite(bp);
+
+                // this undoes the vget() in journal_modify_block_end()
+                vrele(save_vp);
+            } else {
+                printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
+                       blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
+            }
+        }
+
+        next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+
+        // we can free blhdr here since we won't need it any more
+        blhdr->binfo[0].bnum = 0xdeadc0de;
+        kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
+    }
+
+    //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
+    //   tr, tr->journal_start, tr->journal_end);
+    return 0;
+
+  bad_journal:
+    jnl->flags |= JOURNAL_INVALID;
+    abort_transaction(jnl, tr);
+    return -1;
+}
+
+/*
+ * Throw away a transaction: re-find each buffer it touched, clear the
+ * LOCKED/DELWRI bits (so nothing goes to disk), mark it invalid and
+ * release it, then free every block_list_header and finally the
+ * transaction structure itself.  Called when the journal has gone bad
+ * (JOURNAL_INVALID) or a commit failed.
+ */
+static void
+abort_transaction(journal *jnl, transaction *tr)
+{
+    int                i, ret;
+    block_list_header *blhdr, *next;
+    struct buf        *bp;
+
+    // for each block list header, iterate over the blocks then
+    // free up the memory associated with the block list.
+    //
+    // for each block, clear the lock bit and release it.
+    //
+    for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+               for(i=1; i < blhdr->num_blocks; i++) {
+                       if (blhdr->binfo[i].bp == NULL) {
+                               continue;
+                       }
+           
+                       ret = meta_bread(blhdr->binfo[i].bp->b_vp,
+                                                        (daddr_t)blhdr->binfo[i].bp->b_lblkno,
+                                                        blhdr->binfo[i].bp->b_bufsize,
+                                                        NOCRED,
+                                                        &bp);
+                       if (ret == 0 && bp != NULL) {
+                               if (bp != blhdr->binfo[i].bp) {
+                                       panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+                                                 bp, blhdr->binfo[i].bp, jnl);
+                               }
+
+                               // clear the locked bit and the delayed-write bit.  we
+                               // don't want these blocks going to disk.
+                               bp->b_flags &= ~(B_LOCKED|B_DELWRI);
+                               bp->b_flags |= B_INVAL;
+
+                               brelse(bp);
+
+                       } else {
+                               printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
+                                          blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
+                       }
+               }
+
+               // grab the link before we free the header that holds it
+               next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+
+               // we can free blhdr here since we won't need it any more
+               blhdr->binfo[0].bnum = 0xdeadc0de;
+               kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
+    }
+
+    // poison the fields so any stale use of this transaction is obvious
+    tr->tbuffer     = NULL;
+    tr->blhdr       = NULL;
+    tr->total_bytes = 0xdbadc0de;
+       kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+}
+
+
+/*
+ * Close the current transaction.  Decrements the nesting count; only
+ * the outermost close actually hands the transaction to
+ * end_transaction() (which may buffer it for group commit) and then
+ * releases journal ownership and the transaction semaphore.
+ *
+ * Returns 0 on success (or while still nested), EINVAL if the journal
+ * has gone invalid (any active transaction is aborted), otherwise
+ * end_transaction()'s result.
+ *
+ * Cleanup vs. the original: the JOURNAL_INVALID path declared a second
+ * "transaction *tr" that shadowed the outer one; the outer variable is
+ * now used for both.
+ */
+int
+journal_end_transaction(journal *jnl)
+{
+    int ret;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
+        return 0;
+    }
+
+    if (jnl->owner != current_act()) {
+        panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+              jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    jnl->nested_count--;
+    if (jnl->nested_count > 0) {
+        return 0;
+    } else if (jnl->nested_count < 0) {
+        panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
+    }
+
+    if (jnl->flags & JOURNAL_INVALID) {
+        if (jnl->active_tr) {
+            if (jnl->cur_tr != NULL) {
+                panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
+                      jnl, jnl->active_tr, jnl->cur_tr);
+            }
+
+            tr             = jnl->active_tr;
+            jnl->active_tr = NULL;
+            abort_transaction(jnl, tr);
+        }
+
+        jnl->owner = NULL;
+        semaphore_signal(jnl->jsem);
+
+        return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    // clear this out here so that when check_free_space() calls
+    // the FS flush function, we don't panic in journal_flush()
+    // if the FS were to call that.  note: check_free_space() is
+    // called from end_transaction().
+    //
+    jnl->active_tr = NULL;
+    ret = end_transaction(tr, 0);
+
+    jnl->owner = NULL;
+    semaphore_signal(jnl->jsem);
+
+    return ret;
+}
+
+
+/*
+ * Force any buffered (group-commit) transaction out to the journal.
+ * If the caller does not already own the journal, the transaction
+ * semaphore is taken (and released afterwards) around the flush.
+ *
+ * Returns 0 on success, -1 if the journal is invalid or the
+ * semaphore wait fails.
+ */
+int
+journal_flush(journal *jnl)
+{
+    int need_signal = 0;
+    
+    CHECK_JOURNAL(jnl);
+    
+    if (jnl->flags & JOURNAL_INVALID) {
+               return -1;
+    }
+
+    // if we don't own the journal, take the semaphore so nobody
+    // starts a transaction while we flush
+    if (jnl->owner != current_act()) {
+               int ret;
+
+               while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+                       // just keep looping if we've been ^C'ed
+               }
+               if (ret != 0) {
+                       printf("jnl: flush: sem wait failed.\n");
+                       return -1;
+               }
+               need_signal = 1;
+    }
+
+    free_old_stuff(jnl);
+
+    // if we're not active, flush any buffered transactions
+    if (jnl->active_tr == NULL && jnl->cur_tr) {
+               transaction *tr = jnl->cur_tr;
+
+               jnl->cur_tr = NULL;
+               end_transaction(tr, 1);   // force it to get flushed
+    }
+
+    if (need_signal) {
+               semaphore_signal(jnl->jsem);
+    }
+
+    return 0;
+}
+
+/*
+ * Report whether the journal currently has an open transaction.
+ * Returns 1 if a transaction is active, 0 if not, and -1 if the
+ * journal has been marked invalid.
+ */
+int
+journal_active(journal *jnl)
+{
+    if (jnl->flags & JOURNAL_INVALID) {
+        return -1;
+    }
+
+    return (jnl->active_tr != NULL);
+}
diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h
new file mode 100644 (file)
index 0000000..523ba7d
--- /dev/null
@@ -0,0 +1,238 @@
+
+/*
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ * 
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * This header contains the structures and function prototypes
+ * for the vfs journaling code.  The data types are not meant
+ * to be modified by user code.  Just use the functions and do
+ * not mess around with the structs.
+ */ 
+#ifndef _SYS_VFS_JOURNAL_H_
+#define _SYS_VFS_JOURNAL_H_
+
+#include <sys/appleapiopts.h>
+
+#ifdef __APPLE_API_UNSTABLE
+
+#include <sys/types.h>
+
+typedef struct block_info {
+    off_t       bnum;                // block # on the file system device
+    size_t      bsize;               // in bytes
+    struct buf *bp;
+} block_info;
+
+typedef struct block_list_header {
+    u_int16_t   max_blocks;          // max number of blocks in this chunk
+    u_int16_t   num_blocks;          // number of valid block numbers in block_nums
+    int32_t     bytes_used;          // how many bytes of this tbuffer are used
+    int32_t     checksum;            // on-disk: checksum of this header and binfo[0]
+    int32_t     pad;                 // pad out to 16 bytes
+    block_info  binfo[1];            // so we can reference them by name
+} block_list_header;
+
+
+struct journal;
+
+typedef struct transaction {
+    int                 tbuffer_size;  // in bytes
+    char               *tbuffer;       // memory copy of the transaction
+    block_list_header  *blhdr;         // points to the first byte of tbuffer
+    int                 num_blhdrs;    // how many buffers we've allocated
+    int                 total_bytes;   // total # of bytes in transaction
+    int                 num_flushed;   // how many bytes have been flushed
+    int                 num_killed;    // how many bytes were "killed"
+    off_t               journal_start; // where in the journal this transaction starts
+    off_t               journal_end;   // where in the journal this transaction ends
+    struct journal     *jnl;           // ptr back to the journal structure
+    struct transaction *next;          // list of tr's (either completed or to be free'd)
+} transaction;
+
+
+/*
+ * This is written to block zero of the journal and it
+ * maintains overall state about the journal.
+ */
+typedef struct journal_header {
+    int32_t        magic;
+    int32_t        endian;
+    volatile off_t start;         // zero-based byte offset of the start of the first transaction
+    volatile off_t end;           // zero-based byte offset of where free space begins
+    off_t          size;          // size in bytes of the entire journal
+    int32_t        blhdr_size;    // size in bytes of each block_list_header in the journal
+    int32_t        checksum;
+    int32_t        jhdr_size;     // block size (in bytes) of the journal header
+} journal_header;
+
+#define JOURNAL_HEADER_MAGIC  0x4a4e4c78   // 'JNLx'
+#define ENDIAN_MAGIC          0x12345678
+
+#define OLD_JOURNAL_HEADER_MAGIC  0x4a484452   // 'JHDR'
+
+
+/*
+ * In memory structure about the journal.
+ */
+typedef struct journal {
+    struct vnode       *jdev;              // vnode of the device where the journal lives
+    off_t               jdev_offset;       // byte offset to the start of the journal
+
+    struct vnode       *fsdev;             // vnode of the file system device
+    
+    void              (*flush)(void *arg); // fs callback to flush meta data blocks
+    void               *flush_arg;         // arg that's passed to flush()
+
+    int32_t             flags;
+    int32_t             tbuffer_size;      // default transaction buffer size
+
+    char               *header_buf;        // in-memory copy of the journal header
+    journal_header     *jhdr;              // points to the first byte of header_buf
+
+    transaction        *cur_tr;            // for group-commit
+    transaction        *completed_trs;     // out-of-order transactions that completed
+    transaction        *active_tr;         // for nested transactions
+    int32_t             nested_count;      // for nested transactions
+    void               *owner;             // a ptr that's unique to the calling process
+
+    transaction        *tr_freeme;         // transaction structs that need to be free'd
+
+       volatile off_t      active_start;      // the active start that we only keep in memory
+       simple_lock_data_t  old_start_lock;    // guard access
+       volatile off_t      old_start[16];     // this is how we do lazy start update
+
+    semaphore_t         jsem;
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING     0x00010000
+#define JOURNAL_INVALID           0x00020000
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff
+
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_create() to create a new journal.  You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system.  It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write.  Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal. If you specify zero, the journal code
+ * will use a reasonable default.  The tbuffer_size should
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+                                               off_t         offset,
+                                               off_t         journal_size,
+                                               struct vnode *fsvp,
+                                               size_t        min_fs_block_size,
+                                               int32_t       flags,
+                                               int32_t       tbuffer_size,
+                                               void        (*flush)(void *arg),
+                                               void         *arg);
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal.  It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal  *journal_open(struct vnode *jvp,
+                                          off_t         offset,
+                                          off_t         journal_size,
+                                          struct vnode *fsvp,
+                                          size_t        min_fs_block_size,
+                                          int32_t       flags,
+                                          int32_t       tbuffer_size,
+                                          void        (*flush)(void *arg),
+                                          void         *arg);
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
+ */
+void      journal_close(journal *journal);
+
+/*
+ * flags for journal_create/open.  only can use 
+ * the low 16 bits for flags because internal 
+ * bits go in the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT   0x00000001
+#define JOURNAL_RESET             0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction().  Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end().  When
+ * you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you need to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block().  This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ */
+int   journal_start_transaction(journal *jnl);
+int   journal_modify_block_start(journal *jnl, struct buf *bp);
+int   journal_modify_block_abort(journal *jnl, struct buf *bp);
+int   journal_modify_block_end(journal *jnl, struct buf *bp);
+int   journal_kill_block(journal *jnl, struct buf *bp);
+int   journal_end_transaction(journal *jnl);
+
+int   journal_active(journal *jnl);
+int   journal_flush(journal *jnl);
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !_SYS_VFS_JOURNAL_H_ */
index c49f321c2ea8237b3da5a25788431376564d1fcf..ce79f9d4ddc573857d629d3fe8cb910d97ffa207 100644 (file)
@@ -677,12 +677,22 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
                if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
                        return (error);
                }
-               if (vp->v_dirtyblkhd.lh_first)
-                       panic("vinvalbuf: dirty bufs");
+
+               // XXXdbg - if there are dirty bufs, wait for 'em if they're busy
+               for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+                   nbp = bp->b_vnbufs.le_next;
+                   if (ISSET(bp->b_flags, B_BUSY)) {
+                       SET(bp->b_flags, B_WANTED);
+                       tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
+                       nbp = vp->v_dirtyblkhd.lh_first;
+                   } else {
+                       panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
+                   }
+               }
        }
 
        for (;;) {
-               if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+               if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
                        while (blist && blist->b_lblkno < 0)
                                blist = blist->b_vnbufs.le_next;
                if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
@@ -694,7 +704,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 
                for (bp = blist; bp; bp = nbp) {
                        nbp = bp->b_vnbufs.le_next;
-                       if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+                       if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
                                continue;
                        s = splbio();
                        if (ISSET(bp->b_flags, B_BUSY)) {
@@ -720,7 +730,13 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
                                (void) VOP_BWRITE(bp);
                                break;
                        }
-                       SET(bp->b_flags, B_INVAL);
+
+                       if (bp->b_flags & B_LOCKED) {
+                               panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
+                               break;
+                       } else {
+                               SET(bp->b_flags, B_INVAL);
+                       }
                        brelse(bp);
                }
        }
index 6862e89e9b5442ee02dd76f017e3617b42926a69..65fe7197d0014472bf9fa1d423a01dbb0845184b 100644 (file)
 #define kIOCommandPoolSizeKey         "IOCommandPoolSize"          // (OSNumber)
 
 // properties found in services that have transfer constraints
-#define kIOMaximumBlockCountReadKey    "IOMaximumBlockCountRead"    // (OSNumber)
-#define kIOMaximumBlockCountWriteKey   "IOMaximumBlockCountWrite"   // (OSNumber)
-#define kIOMaximumSegmentCountReadKey  "IOMaximumSegmentCountRead"  // (OSNumber)
-#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber)
+#define kIOMaximumBlockCountReadKey        "IOMaximumBlockCountRead"        // (OSNumber)
+#define kIOMaximumBlockCountWriteKey       "IOMaximumBlockCountWrite"       // (OSNumber)
+#define kIOMaximumByteCountReadKey         "IOMaximumByteCountRead"         // (OSNumber)
+#define kIOMaximumByteCountWriteKey        "IOMaximumByteCountWrite"        // (OSNumber)
+#define kIOMaximumSegmentCountReadKey      "IOMaximumSegmentCountRead"      // (OSNumber)
+#define kIOMaximumSegmentCountWriteKey     "IOMaximumSegmentCountWrite"     // (OSNumber)
+#define kIOMaximumSegmentByteCountReadKey  "IOMaximumSegmentByteCountRead"  // (OSNumber)
+#define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber)
 
 // properties found in services that wish to describe an icon
 //
index ff0b955c9f94c35e99e0c000b47b4e3ff26d06fc..1eedcc6df09a58f830ed64437028df6c5121b3f9 100644 (file)
  */
 const char * gIOKernelKmods =
 "{
-    'com.apple.kernel'                         = '6.1';
-    'com.apple.kernel.bsd'                     = '6.1';
-    'com.apple.kernel.iokit'                   = '6.1';
-    'com.apple.kernel.libkern'                 = '6.1';
-    'com.apple.kernel.mach'                    = '6.1';
-    'com.apple.iokit.IOADBFamily'              = '1.1';
-    'com.apple.iokit.IONVRAMFamily'            = '1.1';
-    'com.apple.iokit.IOSystemManagementFamily' = '1.1';
-    'com.apple.iokit.ApplePlatformFamily'      = '1.0';
-    'com.apple.driver.AppleNMI'                = '1.0';
+    'com.apple.kernel'                         = '6.2';
+    'com.apple.kernel.bsd'                     = '6.2';
+    'com.apple.kernel.iokit'                   = '6.2';
+    'com.apple.kernel.libkern'                 = '6.2';
+    'com.apple.kernel.mach'                    = '6.2';
+    'com.apple.iokit.IOADBFamily'              = '6.2';
+    'com.apple.iokit.IONVRAMFamily'            = '6.2';
+    'com.apple.iokit.IOSystemManagementFamily' = '6.2';
+    'com.apple.iokit.ApplePlatformFamily'      = '6.2';
+    'com.apple.driver.AppleNMI'                = '6.2';
 }";
 
 
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2
index e64faedb222a2f25a58726e9052e9a31991b9f75..b0cd27fffdea95a0cd901f63905aa54f6ab5336c 100644 (file)
         */
 
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges 
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+
+       }
+        bcopy_phys((char *)source, (char *)dest, size);        /* Do a physical copy */
+
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+
+       }
+}
+
+
 
 /*
  *              Copies data from a physical page to a virtual page.  This is used to
index 3cca411b23118571e2c2392738860b9bcd46535d..14873039354c17371ababc5df269d0eb3423e8af 100644 (file)
@@ -871,7 +871,7 @@ fsenable:   lwz             r8,savesrr1(r25)                                ; Get the msr of the interrupted guy
                        rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT   ; See if we are doing this for user state
                        stw             r8,savesrr1(r25)                                ; Set the msr of the interrupted guy
                        xor             r3,r25,r5                                               ; Get the real address of the savearea
-                       bne-    fsnuser                                                 ; We are not user state...
+                       beq-    fsnuser                                                 ; We are not user state...
                        stw             r10,ACT_MACT_SPF(r17)                   ; Set the activation copy
                        stw             r10,spcFlags(r26)                               ; Set per_proc copy
 
@@ -2297,7 +2297,7 @@ vrenable: lwz             r8,savesrr1(r25)                                ; Get the msr of the interrupted guy
                        rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT   ; See if we are doing this for user state
                        stw             r8,savesrr1(r25)                                ; Set the msr of the interrupted guy
                        xor             r3,r25,r5                                               ; Get the real address of the savearea
-                       bne-    vrnuser                                                 ; We are not user state...
+                       beq-    vrnuser                                                 ; We are not user state...
                        stw             r10,ACT_MACT_SPF(r17)                   ; Set the activation copy
                        stw             r10,spcFlags(r26)                               ; Set per_proc copy
 
index de3411de90d5027a1c6c3d74ee0b62949911756e..237e2bc12e55b1b51277592f0640ea9be24db6f8 100644 (file)
@@ -70,6 +70,7 @@
 #endif
 
 vm_map_t        mapping_map = VM_MAP_NULL;
+#define                MAPPING_MAP_SIZE        33554432        /* 32MB address space */
 
 unsigned int   incrVSID = 0;                                                                   /* VSID increment value */
 unsigned int   mappingdeb0 = 0;                                                
@@ -1548,7 +1549,7 @@ void mapping_free_prime(void) {                                                                   /* Primes the mapping block release list
        mappingblok     *mbn;
        vm_offset_t     mapping_min;
        
-       retr = kmem_suballoc(kernel_map, &mapping_min, mem_size / 16,
+       retr = kmem_suballoc(kernel_map, &mapping_min, MAPPING_MAP_SIZE,
                             FALSE, TRUE, &mapping_map);
 
        if (retr != KERN_SUCCESS)
@@ -1877,6 +1878,50 @@ kern_return_t copyp2v(vm_offset_t source, vm_offset_t sink, unsigned int size) {
 }
 
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges 
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+
+       }
+        bcopy_phys((char *)source, (char *)dest, size);        /* Do a physical copy */
+
+        switch(flush_action) {
+       case 1:
+               flush_dcache(source, size, 1);
+               break;
+       case 2:
+               flush_dcache(dest, size, 1);
+               break;
+       case 3:
+               flush_dcache(source, size, 1);
+               flush_dcache(dest, size, 1);
+               break;
+
+       }
+}
+
+
+
 #if DEBUG
 /*
  *             Dumps out the mapping stuff associated with a virtual address
index f6f6a8e34d732d53fd0a1d62b1952c45f629837f..85fabb668fb1b472e1c7b0d3645c50ce534f640d 100644 (file)
@@ -483,6 +483,9 @@ pmap_bootstrap(unsigned int mem_size, vm_offset_t *first_avail, vm_offset_t *fir
             hash_table_size *= 2)
                continue;
 
+       if (num > (sizeof(pte_t) * 524288))
+               hash_table_size = hash_table_size/2; /* reduce by half above 512MB */
+
        /* Scale to within any physical memory layout constraints */
        do {
                num = atop(mem_size);   /* num now holds mem_size in pages */
index 0a729a2a8750f37fcbad162596d52a964f9dc899..d066f2f588ad08c4101ee89b3a8e7324abd28942 100644 (file)
@@ -100,7 +100,11 @@ vm_mem_bootstrap(void)
        kmem_init(start, end);
        pmap_init();
        
-       zsize = mem_size >> 2;                  /* Get target zone size as 1/4 of physical memory */
+       if (PE_parse_boot_arg("zsize", &zsize))
+               zsize = zsize * 1024 * 1024;
+       else {
+               zsize = mem_size >> 2;                  /* Get target zone size as 1/4 of physical memory */
+       }
        if(zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN;  /* Clamp to min */
        if(zsize > ZONE_MAP_MAX) zsize = ZONE_MAP_MAX;  /* Clamp to max */
        zone_init(zsize);                                               /* Allocate address space for zones */
index ab065e048bedff8e104e6bbbf26ca05ce2ed4a9c..4d4ee31cb09e79d02445622f8f183333fb7c87f0 100644 (file)
@@ -85,9 +85,7 @@ vm_map_t      kernel_pageable_map;
 extern kern_return_t kmem_alloc_pages(
        register vm_object_t            object,
        register vm_object_offset_t     offset,
-       register vm_offset_t            start,
-       register vm_offset_t            end,
-       vm_prot_t                       protection);
+       register vm_size_t              size);
 
 extern void kmem_remap_pages(
        register vm_object_t            object,
@@ -254,8 +252,13 @@ kernel_memory_allocate(
 
        /*
         *      Since we have not given out this address yet,
-        *      it is safe to unlock the map.
+        *      it is safe to unlock the map. Except of course
+        *      we must make certain no one coalesces our address
+         *      or does a blind vm_deallocate and removes the object
+        *      an extra object reference will suffice to protect
+        *      against both contingencies.
         */
+       vm_object_reference(object);
        vm_map_unlock(map);
 
        vm_object_lock(object);
@@ -271,6 +274,7 @@ kernel_memory_allocate(
                                                offset + (vm_object_offset_t)i);
                                vm_object_unlock(object);
                                vm_map_remove(map, addr, addr + size, 0);
+                               vm_object_deallocate(object);
                                return KERN_RESOURCE_SHORTAGE;
                        }
                        vm_object_unlock(object);
@@ -289,8 +293,11 @@ kernel_memory_allocate(
                        vm_object_unlock(object);
                }
                vm_map_remove(map, addr, addr + size, 0);
+               vm_object_deallocate(object);
                return (kr);
        }
+       /* now that the page is wired, we no longer have to fear coalesce */
+       vm_object_deallocate(object);
        if (object == kernel_object)
                vm_map_simplify(map, addr);
 
@@ -338,31 +345,26 @@ kmem_realloc(
        vm_offset_t     *newaddrp,
        vm_size_t       newsize)
 {
-       vm_offset_t oldmin, oldmax;
-       vm_offset_t newaddr;
-       vm_object_t object;
-       vm_map_entry_t oldentry, newentry;
-       kern_return_t kr;
+       vm_offset_t     oldmin, oldmax;
+       vm_offset_t     newaddr;
+       vm_offset_t     offset;
+       vm_object_t     object;
+       vm_map_entry_t  oldentry, newentry;
+       vm_page_t       mem;
+       kern_return_t   kr;
 
        oldmin = trunc_page(oldaddr);
        oldmax = round_page(oldaddr + oldsize);
        oldsize = oldmax - oldmin;
        newsize = round_page(newsize);
 
-       /*
-        *      Find space for the new region.
-        */
-
-       kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
-                              &newentry);
-       if (kr != KERN_SUCCESS) {
-               return kr;
-       }
 
        /*
         *      Find the VM object backing the old region.
         */
 
+       vm_map_lock(map);
+
        if (!vm_map_lookup_entry(map, oldmin, &oldentry))
                panic("kmem_realloc");
        object = oldentry->object.vm_object;
@@ -373,36 +375,71 @@ kmem_realloc(
         */
 
        vm_object_reference(object);
+       /* by grabbing the object lock before unlocking the map */
+       /* we guarantee that we will panic if more than one     */
+       /* attempt is made to realloc a kmem_alloc'd area       */
        vm_object_lock(object);
+       vm_map_unlock(map);
        if (object->size != oldsize)
                panic("kmem_realloc");
        object->size = newsize;
        vm_object_unlock(object);
 
-       newentry->object.vm_object = object;
-       newentry->offset = 0;
-       assert (newentry->wired_count == 0);
-       newentry->wired_count = 1;
+       /* allocate the new pages while expanded portion of the */
+       /* object is still not mapped */
+       kmem_alloc_pages(object, oldsize, newsize-oldsize);
+
 
        /*
-        *      Since we have not given out this address yet,
-        *      it is safe to unlock the map.  We are trusting
-        *      that nobody will play with either region.
+        *      Find space for the new region.
         */
 
+       kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
+                              &newentry);
+       if (kr != KERN_SUCCESS) {
+               vm_object_lock(object);
+               for(offset = oldsize; 
+                               offset<newsize; offset+=PAGE_SIZE) {
+                       if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+                               vm_page_lock_queues();
+                               vm_page_free(mem);
+                               vm_page_unlock_queues();
+                       }
+               }
+               object->size = oldsize;
+               vm_object_unlock(object);
+               vm_object_deallocate(object);
+               return kr;
+       }
+       newentry->object.vm_object = object;
+       newentry->offset = 0;
+       assert (newentry->wired_count == 0);
+
+       
+       /* add an extra reference in case we have someone doing an */
+       /* unexpected deallocate */
+       vm_object_reference(object);
        vm_map_unlock(map);
 
-       /*
-        *      Remap the pages in the old region and
-        *      allocate more pages for the new region.
-        */
+       if ((kr = vm_map_wire(map, newaddr, newaddr + newsize, 
+                               VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
+               vm_map_remove(map, newaddr, newaddr + newsize, 0);
+               vm_object_lock(object);
+               for(offset = oldsize; 
+                               offset<newsize; offset+=PAGE_SIZE) {
+                       if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+                               vm_page_lock_queues();
+                               vm_page_free(mem);
+                               vm_page_unlock_queues();
+                       }
+               }
+               object->size = oldsize;
+               vm_object_unlock(object);
+               vm_object_deallocate(object);
+               return (kr);
+       }
+       vm_object_deallocate(object);
 
-       kmem_remap_pages(object, 0,
-                        newaddr, newaddr + oldsize,
-                        VM_PROT_DEFAULT);
-       kmem_alloc_pages(object, oldsize,
-                        newaddr + oldsize, newaddr + newsize,
-                        VM_PROT_DEFAULT);
 
        *newaddrp = newaddr;
        return KERN_SUCCESS;
@@ -500,28 +537,21 @@ kmem_free(
 }
 
 /*
- *     Allocate new wired pages in an object.
- *     The object is assumed to be mapped into the kernel map or
- *     a submap.
+ *     Allocate new pages in an object.
  */
 
 kern_return_t
 kmem_alloc_pages(
        register vm_object_t            object,
        register vm_object_offset_t     offset,
-       register vm_offset_t            start,
-       register vm_offset_t            end,
-       vm_prot_t                       protection)
+       register vm_size_t              size)
 {
-       /*
-        *      Mark the pmap region as not pageable.
-        */
-       pmap_pageable(kernel_pmap, start, end, FALSE);
 
-       while (start < end) {
+       size = round_page(size);
+        vm_object_lock(object);
+       while (size) {
            register vm_page_t  mem;
 
-           vm_object_lock(object);
 
            /*
             *  Allocate a page
@@ -533,27 +563,12 @@ kmem_alloc_pages(
                vm_object_lock(object);
            }
 
-           /*
-            *  Wire it down
-            */
-           vm_page_lock_queues();
-           vm_page_wire(mem);
-           vm_page_unlock_queues();
-           vm_object_unlock(object);
-
-           /*
-            *  Enter it in the kernel pmap
-            */
-           PMAP_ENTER(kernel_pmap, start, mem, protection, 
-                               VM_WIMG_USE_DEFAULT, TRUE);
-
-           vm_object_lock(object);
-           PAGE_WAKEUP_DONE(mem);
-           vm_object_unlock(object);
 
-           start += PAGE_SIZE;
-           offset += PAGE_SIZE_64;
+           offset += PAGE_SIZE;
+           size -= PAGE_SIZE;
+           mem->busy = FALSE;
        }
+       vm_object_unlock(object);
        return KERN_SUCCESS;
 }
 
index d00491fd7e5bb6fa28c517a0bb32b8b506539d4d..0cfbf08886fca9a91cb753ec8734c84fcbe52c9f 100644 (file)
@@ -1 +1 @@
-1
+2