[POWERPC] Export copy_4K_page()
[wrt350n-kernel.git] / fs / ocfs2 / suballoc.c
blob9d91e66f51a9fa4606a6f864e2bc7531e10c1a75
1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
4 * suballoc.c
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
35 #include "ocfs2.h"
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
47 #include "buffer_head_io.h"
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
62 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
65 static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69 static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
74 struct ocfs2_alloc_context *ac,
75 u32 bits_wanted,
76 u32 min_bits,
77 u16 *bit_off,
78 unsigned int *num_bits,
79 u64 *bg_blkno);
80 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
81 int nr);
82 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
83 struct inode *alloc_inode,
84 struct ocfs2_group_desc *bg,
85 struct buffer_head *group_bh,
86 unsigned int bit_off,
87 unsigned int num_bits);
88 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
89 struct inode *alloc_inode,
90 struct ocfs2_group_desc *bg,
91 struct buffer_head *group_bh,
92 unsigned int bit_off,
93 unsigned int num_bits);
95 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
96 struct inode *alloc_inode,
97 struct buffer_head *fe_bh,
98 struct buffer_head *bg_bh,
99 struct buffer_head *prev_bg_bh,
100 u16 chain);
101 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
102 u32 wanted);
103 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
104 struct inode *alloc_inode,
105 struct buffer_head *alloc_bh,
106 unsigned int start_bit,
107 u64 bg_blkno,
108 unsigned int count);
109 static inline u64 ocfs2_which_suballoc_group(u64 block,
110 unsigned int bit);
111 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
112 u64 bg_blkno,
113 u16 bg_bit_off);
114 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
115 u32 cluster);
116 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
117 u64 data_blkno,
118 u64 *bg_blkno,
119 u16 *bg_bit_off);
121 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
123 if (ac->ac_inode)
124 iput(ac->ac_inode);
125 if (ac->ac_bh)
126 brelse(ac->ac_bh);
127 kfree(ac);
130 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
132 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
135 /* somewhat more expensive than our other checks, so use sparingly. */
136 static int ocfs2_check_group_descriptor(struct super_block *sb,
137 struct ocfs2_dinode *di,
138 struct ocfs2_group_desc *gd)
140 unsigned int max_bits;
142 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
143 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
144 return -EIO;
147 if (di->i_blkno != gd->bg_parent_dinode) {
148 ocfs2_error(sb, "Group descriptor # %llu has bad parent "
149 "pointer (%llu, expected %llu)",
150 (unsigned long long)le64_to_cpu(gd->bg_blkno),
151 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
152 (unsigned long long)le64_to_cpu(di->i_blkno));
153 return -EIO;
156 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
157 if (le16_to_cpu(gd->bg_bits) > max_bits) {
158 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
159 (unsigned long long)le64_to_cpu(gd->bg_blkno),
160 le16_to_cpu(gd->bg_bits));
161 return -EIO;
164 if (le16_to_cpu(gd->bg_chain) >=
165 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
166 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
167 (unsigned long long)le64_to_cpu(gd->bg_blkno),
168 le16_to_cpu(gd->bg_chain));
169 return -EIO;
172 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
173 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
174 "claims that %u are free",
175 (unsigned long long)le64_to_cpu(gd->bg_blkno),
176 le16_to_cpu(gd->bg_bits),
177 le16_to_cpu(gd->bg_free_bits_count));
178 return -EIO;
181 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
182 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
183 "max bitmap bits of %u",
184 (unsigned long long)le64_to_cpu(gd->bg_blkno),
185 le16_to_cpu(gd->bg_bits),
186 8 * le16_to_cpu(gd->bg_size));
187 return -EIO;
190 return 0;
193 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
194 struct inode *alloc_inode,
195 struct buffer_head *bg_bh,
196 u64 group_blkno,
197 u16 my_chain,
198 struct ocfs2_chain_list *cl)
200 int status = 0;
201 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
202 struct super_block * sb = alloc_inode->i_sb;
204 mlog_entry_void();
206 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
207 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
208 "b_blocknr (%llu)",
209 (unsigned long long)group_blkno,
210 (unsigned long long) bg_bh->b_blocknr);
211 status = -EIO;
212 goto bail;
215 status = ocfs2_journal_access(handle,
216 alloc_inode,
217 bg_bh,
218 OCFS2_JOURNAL_ACCESS_CREATE);
219 if (status < 0) {
220 mlog_errno(status);
221 goto bail;
224 memset(bg, 0, sb->s_blocksize);
225 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
226 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
227 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
228 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
229 bg->bg_chain = cpu_to_le16(my_chain);
230 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
231 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
232 bg->bg_blkno = cpu_to_le64(group_blkno);
233 /* set the 1st bit in the bitmap to account for the descriptor block */
234 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
235 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
237 status = ocfs2_journal_dirty(handle, bg_bh);
238 if (status < 0)
239 mlog_errno(status);
241 /* There is no need to zero out or otherwise initialize the
242 * other blocks in a group - All valid FS metadata in a block
243 * group stores the superblock fs_generation value at
244 * allocation time. */
246 bail:
247 mlog_exit(status);
248 return status;
251 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
253 u16 curr, best;
255 best = curr = 0;
256 while (curr < le16_to_cpu(cl->cl_count)) {
257 if (le32_to_cpu(cl->cl_recs[best].c_total) >
258 le32_to_cpu(cl->cl_recs[curr].c_total))
259 best = curr;
260 curr++;
262 return best;
266 * We expect the block group allocator to already be locked.
268 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
269 struct inode *alloc_inode,
270 struct buffer_head *bh)
272 int status, credits;
273 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
274 struct ocfs2_chain_list *cl;
275 struct ocfs2_alloc_context *ac = NULL;
276 struct ocfs2_journal_handle *handle = NULL;
277 u32 bit_off, num_bits;
278 u16 alloc_rec;
279 u64 bg_blkno;
280 struct buffer_head *bg_bh = NULL;
281 struct ocfs2_group_desc *bg;
283 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
285 mlog_entry_void();
287 handle = ocfs2_alloc_handle(osb);
288 if (!handle) {
289 status = -ENOMEM;
290 mlog_errno(status);
291 goto bail;
294 cl = &fe->id2.i_chain;
295 status = ocfs2_reserve_clusters(osb,
296 handle,
297 le16_to_cpu(cl->cl_cpg),
298 &ac);
299 if (status < 0) {
300 if (status != -ENOSPC)
301 mlog_errno(status);
302 goto bail;
305 credits = ocfs2_calc_group_alloc_credits(osb->sb,
306 le16_to_cpu(cl->cl_cpg));
307 handle = ocfs2_start_trans(osb, handle, credits);
308 if (IS_ERR(handle)) {
309 status = PTR_ERR(handle);
310 handle = NULL;
311 mlog_errno(status);
312 goto bail;
315 status = ocfs2_claim_clusters(osb,
316 handle,
318 le16_to_cpu(cl->cl_cpg),
319 &bit_off,
320 &num_bits);
321 if (status < 0) {
322 if (status != -ENOSPC)
323 mlog_errno(status);
324 goto bail;
327 alloc_rec = ocfs2_find_smallest_chain(cl);
329 /* setup the group */
330 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
331 mlog(0, "new descriptor, record %u, at block %llu\n",
332 alloc_rec, (unsigned long long)bg_blkno);
334 bg_bh = sb_getblk(osb->sb, bg_blkno);
335 if (!bg_bh) {
336 status = -EIO;
337 mlog_errno(status);
338 goto bail;
340 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
342 status = ocfs2_block_group_fill(handle,
343 alloc_inode,
344 bg_bh,
345 bg_blkno,
346 alloc_rec,
347 cl);
348 if (status < 0) {
349 mlog_errno(status);
350 goto bail;
353 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
355 status = ocfs2_journal_access(handle, alloc_inode,
356 bh, OCFS2_JOURNAL_ACCESS_WRITE);
357 if (status < 0) {
358 mlog_errno(status);
359 goto bail;
362 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
363 le16_to_cpu(bg->bg_free_bits_count));
364 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
365 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
366 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
367 le16_add_cpu(&cl->cl_next_free_rec, 1);
369 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
370 le16_to_cpu(bg->bg_free_bits_count));
371 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
372 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
374 status = ocfs2_journal_dirty(handle, bh);
375 if (status < 0) {
376 mlog_errno(status);
377 goto bail;
380 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
381 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
382 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
383 le32_to_cpu(fe->i_clusters)));
384 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
385 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
386 alloc_inode->i_blocks =
387 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
389 status = 0;
390 bail:
391 if (handle)
392 ocfs2_commit_trans(handle);
394 if (ac)
395 ocfs2_free_alloc_context(ac);
397 if (bg_bh)
398 brelse(bg_bh);
400 mlog_exit(status);
401 return status;
404 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
405 struct ocfs2_alloc_context *ac)
407 int status;
408 u32 bits_wanted = ac->ac_bits_wanted;
409 struct inode *alloc_inode = ac->ac_inode;
410 struct buffer_head *bh = NULL;
411 struct ocfs2_journal_handle *handle = ac->ac_handle;
412 struct ocfs2_dinode *fe;
413 u32 free_bits;
415 mlog_entry_void();
417 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
419 ocfs2_handle_add_inode(handle, alloc_inode);
420 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
421 if (status < 0) {
422 mlog_errno(status);
423 goto bail;
426 fe = (struct ocfs2_dinode *) bh->b_data;
427 if (!OCFS2_IS_VALID_DINODE(fe)) {
428 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
429 status = -EIO;
430 goto bail;
432 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
433 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
434 (unsigned long long)le64_to_cpu(fe->i_blkno));
435 status = -EIO;
436 goto bail;
439 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
440 le32_to_cpu(fe->id1.bitmap1.i_used);
442 if (bits_wanted > free_bits) {
443 /* cluster bitmap never grows */
444 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
445 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
446 bits_wanted, free_bits);
447 status = -ENOSPC;
448 goto bail;
451 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
452 if (status < 0) {
453 if (status != -ENOSPC)
454 mlog_errno(status);
455 goto bail;
457 atomic_inc(&osb->alloc_stats.bg_extends);
459 /* You should never ask for this much metadata */
460 BUG_ON(bits_wanted >
461 (le32_to_cpu(fe->id1.bitmap1.i_total)
462 - le32_to_cpu(fe->id1.bitmap1.i_used)));
465 get_bh(bh);
466 ac->ac_bh = bh;
467 bail:
468 if (bh)
469 brelse(bh);
471 mlog_exit(status);
472 return status;
475 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
476 struct ocfs2_journal_handle *handle,
477 struct ocfs2_dinode *fe,
478 struct ocfs2_alloc_context **ac)
480 int status;
481 struct inode *alloc_inode = NULL;
483 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
484 if (!(*ac)) {
485 status = -ENOMEM;
486 mlog_errno(status);
487 goto bail;
490 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
491 (*ac)->ac_handle = handle;
492 (*ac)->ac_which = OCFS2_AC_USE_META;
494 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
495 alloc_inode = ocfs2_get_system_file_inode(osb,
496 EXTENT_ALLOC_SYSTEM_INODE,
498 #else
499 alloc_inode = ocfs2_get_system_file_inode(osb,
500 EXTENT_ALLOC_SYSTEM_INODE,
501 osb->slot_num);
502 #endif
503 if (!alloc_inode) {
504 status = -ENOMEM;
505 mlog_errno(status);
506 goto bail;
509 (*ac)->ac_inode = igrab(alloc_inode);
510 (*ac)->ac_group_search = ocfs2_block_group_search;
512 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
513 if (status < 0) {
514 if (status != -ENOSPC)
515 mlog_errno(status);
516 goto bail;
519 status = 0;
520 bail:
521 if ((status < 0) && *ac) {
522 ocfs2_free_alloc_context(*ac);
523 *ac = NULL;
526 if (alloc_inode)
527 iput(alloc_inode);
529 mlog_exit(status);
530 return status;
533 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
534 struct ocfs2_journal_handle *handle,
535 struct ocfs2_alloc_context **ac)
537 int status;
538 struct inode *alloc_inode = NULL;
540 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
541 if (!(*ac)) {
542 status = -ENOMEM;
543 mlog_errno(status);
544 goto bail;
547 (*ac)->ac_bits_wanted = 1;
548 (*ac)->ac_handle = handle;
549 (*ac)->ac_which = OCFS2_AC_USE_INODE;
551 alloc_inode = ocfs2_get_system_file_inode(osb,
552 INODE_ALLOC_SYSTEM_INODE,
553 osb->slot_num);
554 if (!alloc_inode) {
555 status = -ENOMEM;
556 mlog_errno(status);
557 goto bail;
560 (*ac)->ac_inode = igrab(alloc_inode);
561 (*ac)->ac_group_search = ocfs2_block_group_search;
563 status = ocfs2_reserve_suballoc_bits(osb, *ac);
564 if (status < 0) {
565 if (status != -ENOSPC)
566 mlog_errno(status);
567 goto bail;
570 status = 0;
571 bail:
572 if ((status < 0) && *ac) {
573 ocfs2_free_alloc_context(*ac);
574 *ac = NULL;
577 if (alloc_inode)
578 iput(alloc_inode);
580 mlog_exit(status);
581 return status;
584 /* local alloc code has to do the same thing, so rather than do this
585 * twice.. */
586 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
587 struct ocfs2_alloc_context *ac)
589 int status;
591 ac->ac_inode = ocfs2_get_system_file_inode(osb,
592 GLOBAL_BITMAP_SYSTEM_INODE,
593 OCFS2_INVALID_SLOT);
594 if (!ac->ac_inode) {
595 status = -EINVAL;
596 mlog(ML_ERROR, "Could not get bitmap inode!\n");
597 goto bail;
599 ac->ac_which = OCFS2_AC_USE_MAIN;
600 ac->ac_group_search = ocfs2_cluster_group_search;
602 status = ocfs2_reserve_suballoc_bits(osb, ac);
603 if (status < 0 && status != -ENOSPC)
604 mlog_errno(status);
605 bail:
606 return status;
609 /* Callers don't need to care which bitmap (local alloc or main) to
610 * use so we figure it out for them, but unfortunately this clutters
611 * things a bit. */
612 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
613 struct ocfs2_journal_handle *handle,
614 u32 bits_wanted,
615 struct ocfs2_alloc_context **ac)
617 int status;
619 mlog_entry_void();
621 BUG_ON(!handle);
623 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
624 if (!(*ac)) {
625 status = -ENOMEM;
626 mlog_errno(status);
627 goto bail;
630 (*ac)->ac_bits_wanted = bits_wanted;
631 (*ac)->ac_handle = handle;
633 status = -ENOSPC;
634 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
635 status = ocfs2_reserve_local_alloc_bits(osb,
636 handle,
637 bits_wanted,
638 *ac);
639 if ((status < 0) && (status != -ENOSPC)) {
640 mlog_errno(status);
641 goto bail;
642 } else if (status == -ENOSPC) {
643 /* reserve_local_bits will return enospc with
644 * the local alloc inode still locked, so we
645 * can change this safely here. */
646 mlog(0, "Disabling local alloc\n");
647 /* We set to OCFS2_LA_DISABLED so that umount
648 * can clean up what's left of the local
649 * allocation */
650 osb->local_alloc_state = OCFS2_LA_DISABLED;
654 if (status == -ENOSPC) {
655 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
656 if (status < 0) {
657 if (status != -ENOSPC)
658 mlog_errno(status);
659 goto bail;
663 status = 0;
664 bail:
665 if ((status < 0) && *ac) {
666 ocfs2_free_alloc_context(*ac);
667 *ac = NULL;
670 mlog_exit(status);
671 return status;
675 * More or less lifted from ext3. I'll leave their description below:
677 * "For ext3 allocations, we must not reuse any blocks which are
678 * allocated in the bitmap buffer's "last committed data" copy. This
679 * prevents deletes from freeing up the page for reuse until we have
680 * committed the delete transaction.
682 * If we didn't do this, then deleting something and reallocating it as
683 * data would allow the old block to be overwritten before the
684 * transaction committed (because we force data to disk before commit).
685 * This would lead to corruption if we crashed between overwriting the
686 * data and committing the delete.
688 * @@@ We may want to make this allocation behaviour conditional on
689 * data-writes at some point, and disable it for metadata allocations or
690 * sync-data inodes."
692 * Note: OCFS2 already does this differently for metadata vs data
693 * allocations, as those bitmaps are seperate and undo access is never
694 * called on a metadata group descriptor.
696 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
697 int nr)
699 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
701 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
702 return 0;
703 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
704 return 1;
706 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
707 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
710 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
711 struct buffer_head *bg_bh,
712 unsigned int bits_wanted,
713 unsigned int total_bits,
714 u16 *bit_off,
715 u16 *bits_found)
717 void *bitmap;
718 u16 best_offset, best_size;
719 int offset, start, found, status = 0;
720 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
722 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
723 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
724 return -EIO;
727 found = start = best_offset = best_size = 0;
728 bitmap = bg->bg_bitmap;
730 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
731 if (offset == total_bits)
732 break;
734 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
735 /* We found a zero, but we can't use it as it
736 * hasn't been put to disk yet! */
737 found = 0;
738 start = offset + 1;
739 } else if (offset == start) {
740 /* we found a zero */
741 found++;
742 /* move start to the next bit to test */
743 start++;
744 } else {
745 /* got a zero after some ones */
746 found = 1;
747 start = offset + 1;
749 if (found > best_size) {
750 best_size = found;
751 best_offset = start - found;
753 /* we got everything we needed */
754 if (found == bits_wanted) {
755 /* mlog(0, "Found it all!\n"); */
756 break;
760 /* XXX: I think the first clause is equivalent to the second
761 * - jlbec */
762 if (found == bits_wanted) {
763 *bit_off = start - found;
764 *bits_found = found;
765 } else if (best_size) {
766 *bit_off = best_offset;
767 *bits_found = best_size;
768 } else {
769 status = -ENOSPC;
770 /* No error log here -- see the comment above
771 * ocfs2_test_bg_bit_allocatable */
774 return status;
777 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
778 struct inode *alloc_inode,
779 struct ocfs2_group_desc *bg,
780 struct buffer_head *group_bh,
781 unsigned int bit_off,
782 unsigned int num_bits)
784 int status;
785 void *bitmap = bg->bg_bitmap;
786 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
788 mlog_entry_void();
790 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
791 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
792 status = -EIO;
793 goto bail;
795 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
797 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
798 num_bits);
800 if (ocfs2_is_cluster_bitmap(alloc_inode))
801 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
803 status = ocfs2_journal_access(handle,
804 alloc_inode,
805 group_bh,
806 journal_type);
807 if (status < 0) {
808 mlog_errno(status);
809 goto bail;
812 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
814 while(num_bits--)
815 ocfs2_set_bit(bit_off++, bitmap);
817 status = ocfs2_journal_dirty(handle,
818 group_bh);
819 if (status < 0) {
820 mlog_errno(status);
821 goto bail;
824 bail:
825 mlog_exit(status);
826 return status;
829 /* find the one with the most empty bits */
830 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
832 u16 curr, best;
834 BUG_ON(!cl->cl_next_free_rec);
836 best = curr = 0;
837 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
838 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
839 le32_to_cpu(cl->cl_recs[best].c_free))
840 best = curr;
841 curr++;
844 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
845 return best;
848 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
849 struct inode *alloc_inode,
850 struct buffer_head *fe_bh,
851 struct buffer_head *bg_bh,
852 struct buffer_head *prev_bg_bh,
853 u16 chain)
855 int status;
856 /* there is a really tiny chance the journal calls could fail,
857 * but we wouldn't want inconsistent blocks in *any* case. */
858 u64 fe_ptr, bg_ptr, prev_bg_ptr;
859 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
860 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
861 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
863 if (!OCFS2_IS_VALID_DINODE(fe)) {
864 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
865 status = -EIO;
866 goto out;
868 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
869 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
870 status = -EIO;
871 goto out;
873 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
874 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
875 status = -EIO;
876 goto out;
879 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
880 (unsigned long long)fe->i_blkno, chain,
881 (unsigned long long)bg->bg_blkno,
882 (unsigned long long)prev_bg->bg_blkno);
884 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
885 bg_ptr = le64_to_cpu(bg->bg_next_group);
886 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
888 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
889 OCFS2_JOURNAL_ACCESS_WRITE);
890 if (status < 0) {
891 mlog_errno(status);
892 goto out_rollback;
895 prev_bg->bg_next_group = bg->bg_next_group;
897 status = ocfs2_journal_dirty(handle, prev_bg_bh);
898 if (status < 0) {
899 mlog_errno(status);
900 goto out_rollback;
903 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
904 OCFS2_JOURNAL_ACCESS_WRITE);
905 if (status < 0) {
906 mlog_errno(status);
907 goto out_rollback;
910 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
912 status = ocfs2_journal_dirty(handle, bg_bh);
913 if (status < 0) {
914 mlog_errno(status);
915 goto out_rollback;
918 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
919 OCFS2_JOURNAL_ACCESS_WRITE);
920 if (status < 0) {
921 mlog_errno(status);
922 goto out_rollback;
925 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
927 status = ocfs2_journal_dirty(handle, fe_bh);
928 if (status < 0) {
929 mlog_errno(status);
930 goto out_rollback;
933 status = 0;
934 out_rollback:
935 if (status < 0) {
936 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
937 bg->bg_next_group = cpu_to_le64(bg_ptr);
938 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
940 out:
941 mlog_exit(status);
942 return status;
945 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
946 u32 wanted)
948 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
951 /* return 0 on success, -ENOSPC to keep searching and any other < 0
952 * value on error. */
953 static int ocfs2_cluster_group_search(struct inode *inode,
954 struct buffer_head *group_bh,
955 u32 bits_wanted, u32 min_bits,
956 u16 *bit_off, u16 *bits_found)
958 int search = -ENOSPC;
959 int ret;
960 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
961 u16 tmp_off, tmp_found;
962 unsigned int max_bits, gd_cluster_off;
964 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
966 if (gd->bg_free_bits_count) {
967 max_bits = le16_to_cpu(gd->bg_bits);
969 /* Tail groups in cluster bitmaps which aren't cpg
970 * aligned are prone to partial extention by a failed
971 * fs resize. If the file system resize never got to
972 * update the dinode cluster count, then we don't want
973 * to trust any clusters past it, regardless of what
974 * the group descriptor says. */
975 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
976 le64_to_cpu(gd->bg_blkno));
977 if ((gd_cluster_off + max_bits) >
978 OCFS2_I(inode)->ip_clusters) {
979 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
980 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
981 (unsigned long long)le64_to_cpu(gd->bg_blkno),
982 le16_to_cpu(gd->bg_bits),
983 OCFS2_I(inode)->ip_clusters, max_bits);
986 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
987 group_bh, bits_wanted,
988 max_bits,
989 &tmp_off, &tmp_found);
990 if (ret)
991 return ret;
993 /* ocfs2_block_group_find_clear_bits() might
994 * return success, but we still want to return
995 * -ENOSPC unless it found the minimum number
996 * of bits. */
997 if (min_bits <= tmp_found) {
998 *bit_off = tmp_off;
999 *bits_found = tmp_found;
1000 search = 0; /* success */
1004 return search;
1007 static int ocfs2_block_group_search(struct inode *inode,
1008 struct buffer_head *group_bh,
1009 u32 bits_wanted, u32 min_bits,
1010 u16 *bit_off, u16 *bits_found)
1012 int ret = -ENOSPC;
1013 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1015 BUG_ON(min_bits != 1);
1016 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1018 if (bg->bg_free_bits_count)
1019 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1020 group_bh, bits_wanted,
1021 le16_to_cpu(bg->bg_bits),
1022 bit_off, bits_found);
1024 return ret;
1027 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1028 struct ocfs2_journal_handle *handle,
1029 struct buffer_head *di_bh,
1030 u32 num_bits,
1031 u16 chain)
1033 int ret;
1034 u32 tmp_used;
1035 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1036 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1038 ret = ocfs2_journal_access(handle, inode, di_bh,
1039 OCFS2_JOURNAL_ACCESS_WRITE);
1040 if (ret < 0) {
1041 mlog_errno(ret);
1042 goto out;
1045 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1046 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1047 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1049 ret = ocfs2_journal_dirty(handle, di_bh);
1050 if (ret < 0)
1051 mlog_errno(ret);
1053 out:
1054 return ret;
1057 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1058 u32 bits_wanted,
1059 u32 min_bits,
1060 u16 *bit_off,
1061 unsigned int *num_bits,
1062 u64 gd_blkno,
1063 u16 *bits_left)
1065 int ret;
1066 u16 found;
1067 struct buffer_head *group_bh = NULL;
1068 struct ocfs2_group_desc *gd;
1069 struct inode *alloc_inode = ac->ac_inode;
1070 struct ocfs2_journal_handle *handle = ac->ac_handle;
1072 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1073 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1074 if (ret < 0) {
1075 mlog_errno(ret);
1076 return ret;
1079 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1080 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1081 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1082 ret = -EIO;
1083 goto out;
1086 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1087 bit_off, &found);
1088 if (ret < 0) {
1089 if (ret != -ENOSPC)
1090 mlog_errno(ret);
1091 goto out;
1094 *num_bits = found;
1096 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1097 *num_bits,
1098 le16_to_cpu(gd->bg_chain));
1099 if (ret < 0) {
1100 mlog_errno(ret);
1101 goto out;
1104 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1105 *bit_off, *num_bits);
1106 if (ret < 0)
1107 mlog_errno(ret);
1109 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1111 out:
1112 brelse(group_bh);
1114 return ret;
1117 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1118 u32 bits_wanted,
1119 u32 min_bits,
1120 u16 *bit_off,
1121 unsigned int *num_bits,
1122 u64 *bg_blkno,
1123 u16 *bits_left)
1125 int status;
1126 u16 chain, tmp_bits;
1127 u32 tmp_used;
1128 u64 next_group;
1129 struct ocfs2_journal_handle *handle = ac->ac_handle;
1130 struct inode *alloc_inode = ac->ac_inode;
1131 struct buffer_head *group_bh = NULL;
1132 struct buffer_head *prev_group_bh = NULL;
1133 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1134 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1135 struct ocfs2_group_desc *bg;
1137 chain = ac->ac_chain;
1138 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1139 bits_wanted, chain,
1140 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1142 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1143 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1144 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1145 if (status < 0) {
1146 mlog_errno(status);
1147 goto bail;
1149 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1150 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1151 if (status) {
1152 mlog_errno(status);
1153 goto bail;
1156 status = -ENOSPC;
1157 /* for now, the chain search is a bit simplistic. We just use
1158 * the 1st group with any empty bits. */
1159 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1160 bits_wanted, min_bits, bit_off,
1161 &tmp_bits)) == -ENOSPC) {
1162 if (!bg->bg_next_group)
1163 break;
1165 if (prev_group_bh) {
1166 brelse(prev_group_bh);
1167 prev_group_bh = NULL;
1169 next_group = le64_to_cpu(bg->bg_next_group);
1170 prev_group_bh = group_bh;
1171 group_bh = NULL;
1172 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1173 next_group, &group_bh,
1174 OCFS2_BH_CACHED, alloc_inode);
1175 if (status < 0) {
1176 mlog_errno(status);
1177 goto bail;
1179 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1180 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1181 if (status) {
1182 mlog_errno(status);
1183 goto bail;
1186 if (status < 0) {
1187 if (status != -ENOSPC)
1188 mlog_errno(status);
1189 goto bail;
1192 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1193 tmp_bits, (unsigned long long)bg->bg_blkno);
1195 *num_bits = tmp_bits;
1197 BUG_ON(*num_bits == 0);
1200 * Keep track of previous block descriptor read. When
1201 * we find a target, if we have read more than X
1202 * number of descriptors, and the target is reasonably
1203 * empty, relink him to top of his chain.
1205 * We've read 0 extra blocks and only send one more to
1206 * the transaction, yet the next guy to search has a
1207 * much easier time.
1209 * Do this *after* figuring out how many bits we're taking out
1210 * of our target group.
1212 if (ac->ac_allow_chain_relink &&
1213 (prev_group_bh) &&
1214 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1215 status = ocfs2_relink_block_group(handle, alloc_inode,
1216 ac->ac_bh, group_bh,
1217 prev_group_bh, chain);
1218 if (status < 0) {
1219 mlog_errno(status);
1220 goto bail;
1224 /* Ok, claim our bits now: set the info on dinode, chainlist
1225 * and then the group */
1226 status = ocfs2_journal_access(handle,
1227 alloc_inode,
1228 ac->ac_bh,
1229 OCFS2_JOURNAL_ACCESS_WRITE);
1230 if (status < 0) {
1231 mlog_errno(status);
1232 goto bail;
1235 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1236 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1237 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1239 status = ocfs2_journal_dirty(handle,
1240 ac->ac_bh);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto bail;
1246 status = ocfs2_block_group_set_bits(handle,
1247 alloc_inode,
1249 group_bh,
1250 *bit_off,
1251 *num_bits);
1252 if (status < 0) {
1253 mlog_errno(status);
1254 goto bail;
1257 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1258 (unsigned long long)fe->i_blkno);
1260 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1261 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1262 bail:
1263 if (group_bh)
1264 brelse(group_bh);
1265 if (prev_group_bh)
1266 brelse(prev_group_bh);
1268 mlog_exit(status);
1269 return status;
1272 /* will give out up to bits_wanted contiguous bits. */
1273 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1274 struct ocfs2_alloc_context *ac,
1275 u32 bits_wanted,
1276 u32 min_bits,
1277 u16 *bit_off,
1278 unsigned int *num_bits,
1279 u64 *bg_blkno)
1281 int status;
1282 u16 victim, i;
1283 u16 bits_left = 0;
1284 u64 hint_blkno = ac->ac_last_group;
1285 struct ocfs2_chain_list *cl;
1286 struct ocfs2_dinode *fe;
1288 mlog_entry_void();
1290 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1291 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1292 BUG_ON(!ac->ac_bh);
1294 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1295 if (!OCFS2_IS_VALID_DINODE(fe)) {
1296 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1297 status = -EIO;
1298 goto bail;
1300 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1301 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1302 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1303 "bits but only %u total.",
1304 (unsigned long long)le64_to_cpu(fe->i_blkno),
1305 le32_to_cpu(fe->id1.bitmap1.i_used),
1306 le32_to_cpu(fe->id1.bitmap1.i_total));
1307 status = -EIO;
1308 goto bail;
1311 if (hint_blkno) {
1312 /* Attempt to short-circuit the usual search mechanism
1313 * by jumping straight to the most recently used
1314 * allocation group. This helps us mantain some
1315 * contiguousness across allocations. */
1316 status = ocfs2_search_one_group(ac, bits_wanted, min_bits,
1317 bit_off, num_bits,
1318 hint_blkno, &bits_left);
1319 if (!status) {
1320 /* Be careful to update *bg_blkno here as the
1321 * caller is expecting it to be filled in, and
1322 * ocfs2_search_one_group() won't do that for
1323 * us. */
1324 *bg_blkno = hint_blkno;
1325 goto set_hint;
1327 if (status < 0 && status != -ENOSPC) {
1328 mlog_errno(status);
1329 goto bail;
1333 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1335 victim = ocfs2_find_victim_chain(cl);
1336 ac->ac_chain = victim;
1337 ac->ac_allow_chain_relink = 1;
1339 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1340 num_bits, bg_blkno, &bits_left);
1341 if (!status)
1342 goto set_hint;
1343 if (status < 0 && status != -ENOSPC) {
1344 mlog_errno(status);
1345 goto bail;
1348 mlog(0, "Search of victim chain %u came up with nothing, "
1349 "trying all chains now.\n", victim);
1351 /* If we didn't pick a good victim, then just default to
1352 * searching each chain in order. Don't allow chain relinking
1353 * because we only calculate enough journal credits for one
1354 * relink per alloc. */
1355 ac->ac_allow_chain_relink = 0;
1356 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1357 if (i == victim)
1358 continue;
1359 if (!cl->cl_recs[i].c_free)
1360 continue;
1362 ac->ac_chain = i;
1363 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1364 bit_off, num_bits, bg_blkno,
1365 &bits_left);
1366 if (!status)
1367 break;
1368 if (status < 0 && status != -ENOSPC) {
1369 mlog_errno(status);
1370 goto bail;
1374 set_hint:
1375 if (status != -ENOSPC) {
1376 /* If the next search of this group is not likely to
1377 * yield a suitable extent, then we reset the last
1378 * group hint so as to not waste a disk read */
1379 if (bits_left < min_bits)
1380 ac->ac_last_group = 0;
1381 else
1382 ac->ac_last_group = *bg_blkno;
1385 bail:
1386 mlog_exit(status);
1387 return status;
1390 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1391 struct ocfs2_journal_handle *handle,
1392 struct ocfs2_alloc_context *ac,
1393 u32 bits_wanted,
1394 u16 *suballoc_bit_start,
1395 unsigned int *num_bits,
1396 u64 *blkno_start)
1398 int status;
1399 u64 bg_blkno;
1401 BUG_ON(!ac);
1402 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1403 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1404 BUG_ON(ac->ac_handle != handle);
1406 status = ocfs2_claim_suballoc_bits(osb,
1408 bits_wanted,
1410 suballoc_bit_start,
1411 num_bits,
1412 &bg_blkno);
1413 if (status < 0) {
1414 mlog_errno(status);
1415 goto bail;
1417 atomic_inc(&osb->alloc_stats.bg_allocs);
1419 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1420 ac->ac_bits_given += (*num_bits);
1421 status = 0;
1422 bail:
1423 mlog_exit(status);
1424 return status;
1427 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1428 struct ocfs2_journal_handle *handle,
1429 struct ocfs2_alloc_context *ac,
1430 u16 *suballoc_bit,
1431 u64 *fe_blkno)
1433 int status;
1434 unsigned int num_bits;
1435 u64 bg_blkno;
1437 mlog_entry_void();
1439 BUG_ON(!ac);
1440 BUG_ON(ac->ac_bits_given != 0);
1441 BUG_ON(ac->ac_bits_wanted != 1);
1442 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1443 BUG_ON(ac->ac_handle != handle);
1445 status = ocfs2_claim_suballoc_bits(osb,
1449 suballoc_bit,
1450 &num_bits,
1451 &bg_blkno);
1452 if (status < 0) {
1453 mlog_errno(status);
1454 goto bail;
1456 atomic_inc(&osb->alloc_stats.bg_allocs);
1458 BUG_ON(num_bits != 1);
1460 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1461 ac->ac_bits_given++;
1462 status = 0;
1463 bail:
1464 mlog_exit(status);
1465 return status;
1468 /* translate a group desc. blkno and it's bitmap offset into
1469 * disk cluster offset. */
1470 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1471 u64 bg_blkno,
1472 u16 bg_bit_off)
1474 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1475 u32 cluster = 0;
1477 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1479 if (bg_blkno != osb->first_cluster_group_blkno)
1480 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1481 cluster += (u32) bg_bit_off;
1482 return cluster;
1485 /* given a cluster offset, calculate which block group it belongs to
1486 * and return that block offset. */
1487 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1488 u32 cluster)
1490 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1491 u32 group_no;
1493 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1495 group_no = cluster / osb->bitmap_cpg;
1496 if (!group_no)
1497 return osb->first_cluster_group_blkno;
1498 return ocfs2_clusters_to_blocks(inode->i_sb,
1499 group_no * osb->bitmap_cpg);
1502 /* given the block number of a cluster start, calculate which cluster
1503 * group and descriptor bitmap offset that corresponds to. */
1504 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1505 u64 data_blkno,
1506 u64 *bg_blkno,
1507 u16 *bg_bit_off)
1509 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1510 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1512 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1514 *bg_blkno = ocfs2_which_cluster_group(inode,
1515 data_cluster);
1517 if (*bg_blkno == osb->first_cluster_group_blkno)
1518 *bg_bit_off = (u16) data_cluster;
1519 else
1520 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1521 data_blkno - *bg_blkno);
1525 * min_bits - minimum contiguous chunk from this total allocation we
1526 * can handle. set to what we asked for originally for a full
1527 * contig. allocation, set to '1' to indicate we can deal with extents
1528 * of any size.
1530 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1531 struct ocfs2_journal_handle *handle,
1532 struct ocfs2_alloc_context *ac,
1533 u32 min_clusters,
1534 u32 *cluster_start,
1535 u32 *num_clusters)
1537 int status;
1538 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1539 u64 bg_blkno = 0;
1540 u16 bg_bit_off;
1542 mlog_entry_void();
1544 BUG_ON(!ac);
1545 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1547 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1548 && ac->ac_which != OCFS2_AC_USE_MAIN);
1549 BUG_ON(ac->ac_handle != handle);
1551 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1552 status = ocfs2_claim_local_alloc_bits(osb,
1553 handle,
1555 bits_wanted,
1556 cluster_start,
1557 num_clusters);
1558 if (!status)
1559 atomic_inc(&osb->alloc_stats.local_data);
1560 } else {
1561 if (min_clusters > (osb->bitmap_cpg - 1)) {
1562 /* The only paths asking for contiguousness
1563 * should know about this already. */
1564 mlog(ML_ERROR, "minimum allocation requested exceeds "
1565 "group bitmap size!");
1566 status = -ENOSPC;
1567 goto bail;
1569 /* clamp the current request down to a realistic size. */
1570 if (bits_wanted > (osb->bitmap_cpg - 1))
1571 bits_wanted = osb->bitmap_cpg - 1;
1573 status = ocfs2_claim_suballoc_bits(osb,
1575 bits_wanted,
1576 min_clusters,
1577 &bg_bit_off,
1578 num_clusters,
1579 &bg_blkno);
1580 if (!status) {
1581 *cluster_start =
1582 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1583 bg_blkno,
1584 bg_bit_off);
1585 atomic_inc(&osb->alloc_stats.bitmap_data);
1588 if (status < 0) {
1589 if (status != -ENOSPC)
1590 mlog_errno(status);
1591 goto bail;
1594 ac->ac_bits_given += *num_clusters;
1596 bail:
1597 mlog_exit(status);
1598 return status;
1601 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1602 struct inode *alloc_inode,
1603 struct ocfs2_group_desc *bg,
1604 struct buffer_head *group_bh,
1605 unsigned int bit_off,
1606 unsigned int num_bits)
1608 int status;
1609 unsigned int tmp;
1610 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1611 struct ocfs2_group_desc *undo_bg = NULL;
1613 mlog_entry_void();
1615 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1616 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1617 status = -EIO;
1618 goto bail;
1621 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1623 if (ocfs2_is_cluster_bitmap(alloc_inode))
1624 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1626 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1627 journal_type);
1628 if (status < 0) {
1629 mlog_errno(status);
1630 goto bail;
1633 if (ocfs2_is_cluster_bitmap(alloc_inode))
1634 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1636 tmp = num_bits;
1637 while(tmp--) {
1638 ocfs2_clear_bit((bit_off + tmp),
1639 (unsigned long *) bg->bg_bitmap);
1640 if (ocfs2_is_cluster_bitmap(alloc_inode))
1641 ocfs2_set_bit(bit_off + tmp,
1642 (unsigned long *) undo_bg->bg_bitmap);
1644 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1646 status = ocfs2_journal_dirty(handle, group_bh);
1647 if (status < 0)
1648 mlog_errno(status);
1649 bail:
1650 return status;
1654 * expects the suballoc inode to already be locked.
1656 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1657 struct inode *alloc_inode,
1658 struct buffer_head *alloc_bh,
1659 unsigned int start_bit,
1660 u64 bg_blkno,
1661 unsigned int count)
1663 int status = 0;
1664 u32 tmp_used;
1665 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1666 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1667 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1668 struct buffer_head *group_bh = NULL;
1669 struct ocfs2_group_desc *group;
1671 mlog_entry_void();
1673 if (!OCFS2_IS_VALID_DINODE(fe)) {
1674 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1675 status = -EIO;
1676 goto bail;
1678 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1680 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1681 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1682 (unsigned long long)bg_blkno, start_bit);
1684 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1685 alloc_inode);
1686 if (status < 0) {
1687 mlog_errno(status);
1688 goto bail;
1691 group = (struct ocfs2_group_desc *) group_bh->b_data;
1692 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1693 if (status) {
1694 mlog_errno(status);
1695 goto bail;
1697 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1699 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1700 group, group_bh,
1701 start_bit, count);
1702 if (status < 0) {
1703 mlog_errno(status);
1704 goto bail;
1707 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1708 OCFS2_JOURNAL_ACCESS_WRITE);
1709 if (status < 0) {
1710 mlog_errno(status);
1711 goto bail;
1714 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1715 count);
1716 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1717 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1719 status = ocfs2_journal_dirty(handle, alloc_bh);
1720 if (status < 0) {
1721 mlog_errno(status);
1722 goto bail;
1725 bail:
1726 if (group_bh)
1727 brelse(group_bh);
1729 mlog_exit(status);
1730 return status;
1733 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1735 u64 group = block - (u64) bit;
1737 return group;
1740 int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1741 struct inode *inode_alloc_inode,
1742 struct buffer_head *inode_alloc_bh,
1743 struct ocfs2_dinode *di)
1745 u64 blk = le64_to_cpu(di->i_blkno);
1746 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1747 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1749 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1750 inode_alloc_bh, bit, bg_blkno, 1);
1753 int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1754 struct inode *eb_alloc_inode,
1755 struct buffer_head *eb_alloc_bh,
1756 struct ocfs2_extent_block *eb)
1758 u64 blk = le64_to_cpu(eb->h_blkno);
1759 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1760 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1762 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1763 bit, bg_blkno, 1);
1766 int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1767 struct inode *bitmap_inode,
1768 struct buffer_head *bitmap_bh,
1769 u64 start_blk,
1770 unsigned int num_clusters)
1772 int status;
1773 u16 bg_start_bit;
1774 u64 bg_blkno;
1775 struct ocfs2_dinode *fe;
1777 /* You can't ever have a contiguous set of clusters
1778 * bigger than a block group bitmap so we never have to worry
1779 * about looping on them. */
1781 mlog_entry_void();
1783 /* This is expensive. We can safely remove once this stuff has
1784 * gotten tested really well. */
1785 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1787 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1789 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1790 &bg_start_bit);
1792 mlog(0, "want to free %u clusters starting at block %llu\n",
1793 num_clusters, (unsigned long long)start_blk);
1794 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1795 (unsigned long long)bg_blkno, bg_start_bit);
1797 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1798 bg_start_bit, bg_blkno,
1799 num_clusters);
1800 if (status < 0)
1801 mlog_errno(status);
1803 mlog_exit(status);
1804 return status;
1807 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1809 printk("Block Group:\n");
1810 printk("bg_signature: %s\n", bg->bg_signature);
1811 printk("bg_size: %u\n", bg->bg_size);
1812 printk("bg_bits: %u\n", bg->bg_bits);
1813 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1814 printk("bg_chain: %u\n", bg->bg_chain);
1815 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1816 printk("bg_next_group: %llu\n",
1817 (unsigned long long)bg->bg_next_group);
1818 printk("bg_parent_dinode: %llu\n",
1819 (unsigned long long)bg->bg_parent_dinode);
1820 printk("bg_blkno: %llu\n",
1821 (unsigned long long)bg->bg_blkno);
1824 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1826 int i;
1828 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1829 printk("i_signature: %s\n", fe->i_signature);
1830 printk("i_size: %llu\n",
1831 (unsigned long long)fe->i_size);
1832 printk("i_clusters: %u\n", fe->i_clusters);
1833 printk("i_generation: %u\n",
1834 le32_to_cpu(fe->i_generation));
1835 printk("id1.bitmap1.i_used: %u\n",
1836 le32_to_cpu(fe->id1.bitmap1.i_used));
1837 printk("id1.bitmap1.i_total: %u\n",
1838 le32_to_cpu(fe->id1.bitmap1.i_total));
1839 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1840 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1841 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1842 printk("id2.i_chain.cl_next_free_rec: %u\n",
1843 fe->id2.i_chain.cl_next_free_rec);
1844 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1845 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1846 fe->id2.i_chain.cl_recs[i].c_free);
1847 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1848 fe->id2.i_chain.cl_recs[i].c_total);
1849 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1850 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);