]> git.saurik.com Git - apple/libc.git/blob - db/btree/bt_split.c
Libc-262.3.2.tar.gz
[apple/libc.git] / db / btree / bt_split.c
1 /*
2 * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * Copyright (c) 1990, 1993
27 * The Regents of the University of California. All rights reserved.
28 *
29 * This code is derived from software contributed to Berkeley by
30 * Mike Olson.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61
62 #include <sys/types.h>
63
64 #include <limits.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68
69 #include <db.h>
70 #include "btree.h"
71
72 static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
73 static PAGE *bt_page
74 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
75 static int bt_preserve __P((BTREE *, pgno_t));
76 static PAGE *bt_psplit
77 __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
78 static PAGE *bt_root
79 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
80 static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
81 static recno_t rec_total __P((PAGE *));
82
83 #ifdef STATISTICS
84 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
85 #endif
86
87 /*
88 * __BT_SPLIT -- Split the tree.
89 *
90 * Parameters:
91 * t: tree
92 * sp: page to split
93 * key: key to insert
94 * data: data to insert
95 * flags: BIGKEY/BIGDATA flags
96 * ilen: insert length
97 * skip: index to leave open
98 *
99 * Returns:
100 * RET_ERROR, RET_SUCCESS
101 */
102 int
103 __bt_split(t, sp, key, data, flags, ilen, skip)
104 BTREE *t;
105 PAGE *sp;
106 const DBT *key, *data;
107 int flags;
108 size_t ilen;
109 indx_t skip;
110 {
111 BINTERNAL *bi;
112 BLEAF *bl, *tbl;
113 DBT a, b;
114 EPGNO *parent;
115 PAGE *h, *l, *r, *lchild, *rchild;
116 indx_t nxtindex;
117 size_t n, nbytes, nksize;
118 int parentsplit;
119 char *dest;
120
121 /*
122 * Split the page into two pages, l and r. The split routines return
123 * a pointer to the page into which the key should be inserted and with
124 * skip set to the offset which should be used. Additionally, l and r
125 * are pinned.
126 */
127 h = sp->pgno == P_ROOT ?
128 bt_root(t, sp, &l, &r, &skip, ilen) :
129 bt_page(t, sp, &l, &r, &skip, ilen);
130 if (h == NULL)
131 return (RET_ERROR);
132
133 /*
134 * Insert the new key/data pair into the leaf page. (Key inserts
135 * always cause a leaf page to split first.)
136 */
137 h->linp[skip] = h->upper -= ilen;
138 dest = (char *)h + h->upper;
139 if (ISSET(t, R_RECNO))
140 WR_RLEAF(dest, data, flags)
141 else
142 WR_BLEAF(dest, key, data, flags)
143
144 /* If the root page was split, make it look right. */
145 if (sp->pgno == P_ROOT &&
146 (ISSET(t, R_RECNO) ?
147 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
148 goto err2;
149
150 /*
151 * Now we walk the parent page stack -- a LIFO stack of the pages that
152 * were traversed when we searched for the page that split. Each stack
153 * entry is a page number and a page index offset. The offset is for
154 * the page traversed on the search. We've just split a page, so we
155 * have to insert a new key into the parent page.
156 *
157 * If the insert into the parent page causes it to split, may have to
158 * continue splitting all the way up the tree. We stop if the root
159 * splits or the page inserted into didn't have to split to hold the
160 * new key. Some algorithms replace the key for the old page as well
161 * as the new page. We don't, as there's no reason to believe that the
162 * first key on the old page is any better than the key we have, and,
163 * in the case of a key being placed at index 0 causing the split, the
164 * key is unavailable.
165 *
166 * There are a maximum of 5 pages pinned at any time. We keep the left
167 * and right pages pinned while working on the parent. The 5 are the
168 * two children, left parent and right parent (when the parent splits)
169 * and the root page or the overflow key page when calling bt_preserve.
170 * This code must make sure that all pins are released other than the
171 * root page or overflow page which is unlocked elsewhere.
172 */
173 while ((parent = BT_POP(t)) != NULL) {
174 lchild = l;
175 rchild = r;
176
177 /* Get the parent page. */
178 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
179 goto err2;
180
181 /*
182 * The new key goes ONE AFTER the index, because the split
183 * was to the right.
184 */
185 skip = parent->index + 1;
186
187 /*
188 * Calculate the space needed on the parent page.
189 *
190 * Prefix trees: space hack when inserting into BINTERNAL
191 * pages. Retain only what's needed to distinguish between
192 * the new entry and the LAST entry on the page to its left.
193 * If the keys compare equal, retain the entire key. Note,
194 * we don't touch overflow keys, and the entire key must be
195 * retained for the next-to-left most key on the leftmost
196 * page of each level, or the search will fail. Applicable
197 * ONLY to internal pages that have leaf pages as children.
198 * Further reduction of the key between pairs of internal
199 * pages loses too much information.
200 */
201 switch (rchild->flags & P_TYPE) {
202 case P_BINTERNAL:
203 bi = GETBINTERNAL(rchild, 0);
204 nbytes = NBINTERNAL(bi->ksize);
205 break;
206 case P_BLEAF:
207 bl = GETBLEAF(rchild, 0);
208 nbytes = NBINTERNAL(bl->ksize);
209 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
210 (h->prevpg != P_INVALID || skip > 1)) {
211 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
212 a.size = tbl->ksize;
213 a.data = tbl->bytes;
214 b.size = bl->ksize;
215 b.data = bl->bytes;
216 nksize = t->bt_pfx(&a, &b);
217 n = NBINTERNAL(nksize);
218 if (n < nbytes) {
219 #ifdef STATISTICS
220 bt_pfxsaved += nbytes - n;
221 #endif
222 nbytes = n;
223 } else
224 nksize = 0;
225 } else
226 nksize = 0;
227 break;
228 case P_RINTERNAL:
229 case P_RLEAF:
230 nbytes = NRINTERNAL;
231 break;
232 default:
233 abort();
234 }
235
236 /* Split the parent page if necessary or shift the indices. */
237 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
238 sp = h;
239 h = h->pgno == P_ROOT ?
240 bt_root(t, h, &l, &r, &skip, nbytes) :
241 bt_page(t, h, &l, &r, &skip, nbytes);
242 if (h == NULL)
243 goto err1;
244 parentsplit = 1;
245 } else {
246 if (skip < (nxtindex = NEXTINDEX(h)))
247 memmove(h->linp + skip + 1, h->linp + skip,
248 (nxtindex - skip) * sizeof(indx_t));
249 h->lower += sizeof(indx_t);
250 parentsplit = 0;
251 }
252
253 /* Insert the key into the parent page. */
254 switch(rchild->flags & P_TYPE) {
255 case P_BINTERNAL:
256 h->linp[skip] = h->upper -= nbytes;
257 dest = (char *)h + h->linp[skip];
258 memmove(dest, bi, nbytes);
259 ((BINTERNAL *)dest)->pgno = rchild->pgno;
260 break;
261 case P_BLEAF:
262 h->linp[skip] = h->upper -= nbytes;
263 dest = (char *)h + h->linp[skip];
264 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
265 rchild->pgno, bl->flags & P_BIGKEY);
266 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
267 if (bl->flags & P_BIGKEY &&
268 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
269 goto err1;
270 break;
271 case P_RINTERNAL:
272 /*
273 * Update the left page count. If split
274 * added at index 0, fix the correct page.
275 */
276 if (skip > 0)
277 dest = (char *)h + h->linp[skip - 1];
278 else
279 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
280 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
281 ((RINTERNAL *)dest)->pgno = lchild->pgno;
282
283 /* Update the right page count. */
284 h->linp[skip] = h->upper -= nbytes;
285 dest = (char *)h + h->linp[skip];
286 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
287 ((RINTERNAL *)dest)->pgno = rchild->pgno;
288 break;
289 case P_RLEAF:
290 /*
291 * Update the left page count. If split
292 * added at index 0, fix the correct page.
293 */
294 if (skip > 0)
295 dest = (char *)h + h->linp[skip - 1];
296 else
297 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
298 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
299 ((RINTERNAL *)dest)->pgno = lchild->pgno;
300
301 /* Update the right page count. */
302 h->linp[skip] = h->upper -= nbytes;
303 dest = (char *)h + h->linp[skip];
304 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
305 ((RINTERNAL *)dest)->pgno = rchild->pgno;
306 break;
307 default:
308 abort();
309 }
310
311 /* Unpin the held pages. */
312 if (!parentsplit) {
313 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
314 break;
315 }
316
317 /* If the root page was split, make it look right. */
318 if (sp->pgno == P_ROOT &&
319 (ISSET(t, R_RECNO) ?
320 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
321 goto err1;
322
323 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
324 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
325 }
326
327 /* Unpin the held pages. */
328 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
329 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
330
331 /* Clear any pages left on the stack. */
332 return (RET_SUCCESS);
333
334 /*
335 * If something fails in the above loop we were already walking back
336 * up the tree and the tree is now inconsistent. Nothing much we can
337 * do about it but release any memory we're holding.
338 */
339 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
340 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
341
342 err2: mpool_put(t->bt_mp, l, 0);
343 mpool_put(t->bt_mp, r, 0);
344 __dbpanic(t->bt_dbp);
345 return (RET_ERROR);
346 }
347
348 /*
349 * BT_PAGE -- Split a non-root page of a btree.
350 *
351 * Parameters:
352 * t: tree
353 * h: root page
354 * lp: pointer to left page pointer
355 * rp: pointer to right page pointer
356 * skip: pointer to index to leave open
357 * ilen: insert length
358 *
359 * Returns:
360 * Pointer to page in which to insert or NULL on error.
361 */
362 static PAGE *
363 bt_page(t, h, lp, rp, skip, ilen)
364 BTREE *t;
365 PAGE *h, **lp, **rp;
366 indx_t *skip;
367 size_t ilen;
368 {
369 PAGE *l, *r, *tp;
370 pgno_t npg;
371
372 #ifdef STATISTICS
373 ++bt_split;
374 #endif
375 /* Put the new right page for the split into place. */
376 if ((r = __bt_new(t, &npg)) == NULL)
377 return (NULL);
378 r->pgno = npg;
379 r->lower = BTDATAOFF;
380 r->upper = t->bt_psize;
381 r->nextpg = h->nextpg;
382 r->prevpg = h->pgno;
383 r->flags = h->flags & P_TYPE;
384
385 /*
386 * If we're splitting the last page on a level because we're appending
387 * a key to it (skip is NEXTINDEX()), it's likely that the data is
388 * sorted. Adding an empty page on the side of the level is less work
389 * and can push the fill factor much higher than normal. If we're
390 * wrong it's no big deal, we'll just do the split the right way next
391 * time. It may look like it's equally easy to do a similar hack for
392 * reverse sorted data, that is, split the tree left, but it's not.
393 * Don't even try.
394 */
395 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
396 #ifdef STATISTICS
397 ++bt_sortsplit;
398 #endif
399 h->nextpg = r->pgno;
400 r->lower = BTDATAOFF + sizeof(indx_t);
401 *skip = 0;
402 *lp = h;
403 *rp = r;
404 return (r);
405 }
406
407 /* Put the new left page for the split into place. */
408 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
409 mpool_put(t->bt_mp, r, 0);
410 return (NULL);
411 }
412 l->pgno = h->pgno;
413 l->nextpg = r->pgno;
414 l->prevpg = h->prevpg;
415 l->lower = BTDATAOFF;
416 l->upper = t->bt_psize;
417 l->flags = h->flags & P_TYPE;
418
419 /* Fix up the previous pointer of the page after the split page. */
420 if (h->nextpg != P_INVALID) {
421 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
422 free(l);
423 /* XXX mpool_free(t->bt_mp, r->pgno); */
424 return (NULL);
425 }
426 tp->prevpg = r->pgno;
427 mpool_put(t->bt_mp, tp, 0);
428 }
429
430 /*
431 * Split right. The key/data pairs aren't sorted in the btree page so
432 * it's simpler to copy the data from the split page onto two new pages
433 * instead of copying half the data to the right page and compacting
434 * the left page in place. Since the left page can't change, we have
435 * to swap the original and the allocated left page after the split.
436 */
437 tp = bt_psplit(t, h, l, r, skip, ilen);
438
439 /* Move the new left page onto the old left page. */
440 memmove(h, l, t->bt_psize);
441 if (tp == l)
442 tp = h;
443 free(l);
444
445 *lp = h;
446 *rp = r;
447 return (tp);
448 }
449
450 /*
451 * BT_ROOT -- Split the root page of a btree.
452 *
453 * Parameters:
454 * t: tree
455 * h: root page
456 * lp: pointer to left page pointer
457 * rp: pointer to right page pointer
458 * skip: pointer to index to leave open
459 * ilen: insert length
460 *
461 * Returns:
462 * Pointer to page in which to insert or NULL on error.
463 */
464 static PAGE *
465 bt_root(t, h, lp, rp, skip, ilen)
466 BTREE *t;
467 PAGE *h, **lp, **rp;
468 indx_t *skip;
469 size_t ilen;
470 {
471 PAGE *l, *r, *tp;
472 pgno_t lnpg, rnpg;
473
474 #ifdef STATISTICS
475 ++bt_split;
476 ++bt_rootsplit;
477 #endif
478 /* Put the new left and right pages for the split into place. */
479 if ((l = __bt_new(t, &lnpg)) == NULL ||
480 (r = __bt_new(t, &rnpg)) == NULL)
481 return (NULL);
482 l->pgno = lnpg;
483 r->pgno = rnpg;
484 l->nextpg = r->pgno;
485 r->prevpg = l->pgno;
486 l->prevpg = r->nextpg = P_INVALID;
487 l->lower = r->lower = BTDATAOFF;
488 l->upper = r->upper = t->bt_psize;
489 l->flags = r->flags = h->flags & P_TYPE;
490
491 /* Split the root page. */
492 tp = bt_psplit(t, h, l, r, skip, ilen);
493
494 *lp = l;
495 *rp = r;
496 return (tp);
497 }
498
499 /*
500 * BT_RROOT -- Fix up the recno root page after it has been split.
501 *
502 * Parameters:
503 * t: tree
504 * h: root page
505 * l: left page
506 * r: right page
507 *
508 * Returns:
509 * RET_ERROR, RET_SUCCESS
510 */
511 static int
512 bt_rroot(t, h, l, r)
513 BTREE *t;
514 PAGE *h, *l, *r;
515 {
516 char *dest;
517
518 /* Insert the left and right keys, set the header information. */
519 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
520 dest = (char *)h + h->upper;
521 WR_RINTERNAL(dest,
522 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
523
524 h->linp[1] = h->upper -= NRINTERNAL;
525 dest = (char *)h + h->upper;
526 WR_RINTERNAL(dest,
527 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
528
529 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
530
531 /* Unpin the root page, set to recno internal page. */
532 h->flags &= ~P_TYPE;
533 h->flags |= P_RINTERNAL;
534 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
535
536 return (RET_SUCCESS);
537 }
538
539 /*
540 * BT_BROOT -- Fix up the btree root page after it has been split.
541 *
542 * Parameters:
543 * t: tree
544 * h: root page
545 * l: left page
546 * r: right page
547 *
548 * Returns:
549 * RET_ERROR, RET_SUCCESS
550 */
551 static int
552 bt_broot(t, h, l, r)
553 BTREE *t;
554 PAGE *h, *l, *r;
555 {
556 BINTERNAL *bi;
557 BLEAF *bl;
558 size_t nbytes;
559 char *dest;
560
561 /*
562 * If the root page was a leaf page, change it into an internal page.
563 * We copy the key we split on (but not the key's data, in the case of
564 * a leaf page) to the new root page.
565 *
566 * The btree comparison code guarantees that the left-most key on any
567 * level of the tree is never used, so it doesn't need to be filled in.
568 */
569 nbytes = NBINTERNAL(0);
570 h->linp[0] = h->upper = t->bt_psize - nbytes;
571 dest = (char *)h + h->upper;
572 WR_BINTERNAL(dest, 0, l->pgno, 0);
573
574 switch(h->flags & P_TYPE) {
575 case P_BLEAF:
576 bl = GETBLEAF(r, 0);
577 nbytes = NBINTERNAL(bl->ksize);
578 h->linp[1] = h->upper -= nbytes;
579 dest = (char *)h + h->upper;
580 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
581 memmove(dest, bl->bytes, bl->ksize);
582
583 /*
584 * If the key is on an overflow page, mark the overflow chain
585 * so it isn't deleted when the leaf copy of the key is deleted.
586 */
587 if (bl->flags & P_BIGKEY &&
588 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
589 return (RET_ERROR);
590 break;
591 case P_BINTERNAL:
592 bi = GETBINTERNAL(r, 0);
593 nbytes = NBINTERNAL(bi->ksize);
594 h->linp[1] = h->upper -= nbytes;
595 dest = (char *)h + h->upper;
596 memmove(dest, bi, nbytes);
597 ((BINTERNAL *)dest)->pgno = r->pgno;
598 break;
599 default:
600 abort();
601 }
602
603 /* There are two keys on the page. */
604 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
605
606 /* Unpin the root page, set to btree internal page. */
607 h->flags &= ~P_TYPE;
608 h->flags |= P_BINTERNAL;
609 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
610
611 return (RET_SUCCESS);
612 }
613
614 /*
615 * BT_PSPLIT -- Do the real work of splitting the page.
616 *
617 * Parameters:
618 * t: tree
619 * h: page to be split
620 * l: page to put lower half of data
621 * r: page to put upper half of data
622 * pskip: pointer to index to leave open
623 * ilen: insert length
624 *
625 * Returns:
626 * Pointer to page in which to insert.
627 */
628 static PAGE *
629 bt_psplit(t, h, l, r, pskip, ilen)
630 BTREE *t;
631 PAGE *h, *l, *r;
632 indx_t *pskip;
633 size_t ilen;
634 {
635 BINTERNAL *bi;
636 BLEAF *bl;
637 RLEAF *rl;
638 EPGNO *c;
639 PAGE *rval;
640 void *src;
641 indx_t full, half, nxt, off, skip, top, used;
642 size_t nbytes;
643 int bigkeycnt, isbigkey;
644
645 /*
646 * Split the data to the left and right pages. Leave the skip index
647 * open. Additionally, make some effort not to split on an overflow
648 * key. This makes internal page processing faster and can save
649 * space as overflow keys used by internal pages are never deleted.
650 */
651 bigkeycnt = 0;
652 skip = *pskip;
653 full = t->bt_psize - BTDATAOFF;
654 half = full / 2;
655 used = 0;
656 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
657 if (skip == off) {
658 nbytes = ilen;
659 isbigkey = 0; /* XXX: not really known. */
660 } else
661 switch (h->flags & P_TYPE) {
662 case P_BINTERNAL:
663 src = bi = GETBINTERNAL(h, nxt);
664 nbytes = NBINTERNAL(bi->ksize);
665 isbigkey = bi->flags & P_BIGKEY;
666 break;
667 case P_BLEAF:
668 src = bl = GETBLEAF(h, nxt);
669 nbytes = NBLEAF(bl);
670 isbigkey = bl->flags & P_BIGKEY;
671 break;
672 case P_RINTERNAL:
673 src = GETRINTERNAL(h, nxt);
674 nbytes = NRINTERNAL;
675 isbigkey = 0;
676 break;
677 case P_RLEAF:
678 src = rl = GETRLEAF(h, nxt);
679 nbytes = NRLEAF(rl);
680 isbigkey = 0;
681 break;
682 default:
683 abort();
684 }
685
686 /*
687 * If the key/data pairs are substantial fractions of the max
688 * possible size for the page, it's possible to get situations
689 * where we decide to try and copy too much onto the left page.
690 * Make sure that doesn't happen.
691 */
692 if (skip <= off && used + nbytes >= full) {
693 --off;
694 break;
695 }
696
697 /* Copy the key/data pair, if not the skipped index. */
698 if (skip != off) {
699 ++nxt;
700
701 l->linp[off] = l->upper -= nbytes;
702 memmove((char *)l + l->upper, src, nbytes);
703 }
704
705 used += nbytes;
706 if (used >= half) {
707 if (!isbigkey || bigkeycnt == 3)
708 break;
709 else
710 ++bigkeycnt;
711 }
712 }
713
714 /*
715 * Off is the last offset that's valid for the left page.
716 * Nxt is the first offset to be placed on the right page.
717 */
718 l->lower += (off + 1) * sizeof(indx_t);
719
720 /*
721 * If splitting the page that the cursor was on, the cursor has to be
722 * adjusted to point to the same record as before the split. If the
723 * cursor is at or past the skipped slot, the cursor is incremented by
724 * one. If the cursor is on the right page, it is decremented by the
725 * number of records split to the left page.
726 *
727 * Don't bother checking for the B_SEQINIT flag, the page number will
728 * be P_INVALID.
729 */
730 c = &t->bt_bcursor;
731 if (c->pgno == h->pgno) {
732 if (c->index >= skip)
733 ++c->index;
734 if (c->index < nxt) /* Left page. */
735 c->pgno = l->pgno;
736 else { /* Right page. */
737 c->pgno = r->pgno;
738 c->index -= nxt;
739 }
740 }
741
742 /*
743 * If the skipped index was on the left page, just return that page.
744 * Otherwise, adjust the skip index to reflect the new position on
745 * the right page.
746 */
747 if (skip <= off) {
748 skip = 0;
749 rval = l;
750 } else {
751 rval = r;
752 *pskip -= nxt;
753 }
754
755 for (off = 0; nxt < top; ++off) {
756 if (skip == nxt) {
757 ++off;
758 skip = 0;
759 }
760 switch (h->flags & P_TYPE) {
761 case P_BINTERNAL:
762 src = bi = GETBINTERNAL(h, nxt);
763 nbytes = NBINTERNAL(bi->ksize);
764 break;
765 case P_BLEAF:
766 src = bl = GETBLEAF(h, nxt);
767 nbytes = NBLEAF(bl);
768 break;
769 case P_RINTERNAL:
770 src = GETRINTERNAL(h, nxt);
771 nbytes = NRINTERNAL;
772 break;
773 case P_RLEAF:
774 src = rl = GETRLEAF(h, nxt);
775 nbytes = NRLEAF(rl);
776 break;
777 default:
778 abort();
779 }
780 ++nxt;
781 r->linp[off] = r->upper -= nbytes;
782 memmove((char *)r + r->upper, src, nbytes);
783 }
784 r->lower += off * sizeof(indx_t);
785
786 /* If the key is being appended to the page, adjust the index. */
787 if (skip == top)
788 r->lower += sizeof(indx_t);
789
790 return (rval);
791 }
792
793 /*
794 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
795 *
796 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
797 * record that references them gets deleted. Chains pointed to by internal
798 * pages never get deleted. This routine marks a chain as pointed to by an
799 * internal page.
800 *
801 * Parameters:
802 * t: tree
803 * pg: page number of first page in the chain.
804 *
805 * Returns:
806 * RET_SUCCESS, RET_ERROR.
807 */
808 static int
809 bt_preserve(t, pg)
810 BTREE *t;
811 pgno_t pg;
812 {
813 PAGE *h;
814
815 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
816 return (RET_ERROR);
817 h->flags |= P_PRESERVE;
818 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
819 return (RET_SUCCESS);
820 }
821
822 /*
823 * REC_TOTAL -- Return the number of recno entries below a page.
824 *
825 * Parameters:
826 * h: page
827 *
828 * Returns:
829 * The number of recno entries below a page.
830 *
831 * XXX
832 * These values could be set by the bt_psplit routine. The problem is that the
833 * entry has to be popped off of the stack etc. or the values have to be passed
834 * all the way back to bt_split/bt_rroot and it's not very clean.
835 */
836 static recno_t
837 rec_total(h)
838 PAGE *h;
839 {
840 recno_t recs;
841 indx_t nxt, top;
842
843 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
844 recs += GETRINTERNAL(h, nxt)->nrecs;
845 return (recs);
846 }