4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
45 static int fsync_buffers_list(spinlock_t
*lock
, struct list_head
*list
);
/*
 * BH_ENTRY - map a list node back to its buffer_head.
 * @list: pointer to a b_assoc_buffers list_head embedded in a buffer_head.
 *
 * Expands to list_entry() on the b_assoc_buffers member, yielding the
 * struct buffer_head that contains @list.
 */
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49 void put_bh(struct buffer_head
*bh
)
51 WARN_ON(atomic_read(&bh
->b_count
) <= 0);
52 smp_mb__before_atomic_dec();
53 atomic_dec(&bh
->b_count
);
55 EXPORT_SYMBOL(put_bh
);
58 init_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
, void *private)
60 bh
->b_end_io
= handler
;
61 bh
->b_private
= private;
63 EXPORT_SYMBOL(init_buffer
);
65 static int sync_buffer(void *word
)
67 struct block_device
*bd
;
68 struct buffer_head
*bh
69 = container_of(word
, struct buffer_head
, b_state
);
74 blk_run_address_space(bd
->bd_inode
->i_mapping
);
79 void __lock_buffer(struct buffer_head
*bh
)
81 wait_on_bit_lock(&bh
->b_state
, BH_Lock
, sync_buffer
,
82 TASK_UNINTERRUPTIBLE
);
84 EXPORT_SYMBOL(__lock_buffer
);
86 void unlock_buffer(struct buffer_head
*bh
)
88 clear_bit_unlock(BH_Lock
, &bh
->b_state
);
89 smp_mb__after_clear_bit();
90 wake_up_bit(&bh
->b_state
, BH_Lock
);
92 EXPORT_SYMBOL(unlock_buffer
);
95 * Block until a buffer comes unlocked. This doesn't stop it
96 * from becoming locked again - you have to lock it yourself
97 * if you want to preserve its state.
99 void __wait_on_buffer(struct buffer_head
* bh
)
101 wait_on_bit(&bh
->b_state
, BH_Lock
, sync_buffer
, TASK_UNINTERRUPTIBLE
);
103 EXPORT_SYMBOL(__wait_on_buffer
);
106 __clear_page_buffers(struct page
*page
)
108 ClearPagePrivate(page
);
109 set_page_private(page
, 0);
110 page_cache_release(page
);
114 static int quiet_error(struct buffer_head
*bh
)
116 if (!test_bit(BH_Quiet
, &bh
->b_state
) && printk_ratelimit())
122 static void buffer_io_error(struct buffer_head
*bh
)
124 char b
[BDEVNAME_SIZE
];
125 printk(KERN_ERR
"Buffer I/O error on device %s, logical block %Lu\n",
126 bdevname(bh
->b_bdev
, b
),
127 (unsigned long long)bh
->b_blocknr
);
131 * End-of-IO handler helper function which does not touch the bh after
133 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
134 * a race there is benign: unlock_buffer() only uses the bh's address for
135 * hashing after unlocking the buffer, so it doesn't actually touch the bh
138 static void __end_buffer_read_notouch(struct buffer_head
*bh
, int uptodate
)
141 set_buffer_uptodate(bh
);
143 /* This happens, due to failed READA attempts. */
144 clear_buffer_uptodate(bh
);
150 * Default synchronous end-of-IO handler. Just mark it up-to-date and
151 * unlock the buffer. This is what ll_rw_block uses too.
153 void end_buffer_read_sync(struct buffer_head
*bh
, int uptodate
)
155 __end_buffer_read_notouch(bh
, uptodate
);
158 EXPORT_SYMBOL(end_buffer_read_sync
);
160 void end_buffer_write_sync(struct buffer_head
*bh
, int uptodate
)
162 char b
[BDEVNAME_SIZE
];
165 set_buffer_uptodate(bh
);
167 if (!buffer_eopnotsupp(bh
) && !quiet_error(bh
)) {
169 printk(KERN_WARNING
"lost page write due to "
171 bdevname(bh
->b_bdev
, b
));
173 set_buffer_write_io_error(bh
);
174 clear_buffer_uptodate(bh
);
179 EXPORT_SYMBOL(end_buffer_write_sync
);
182 * Various filesystems appear to want __find_get_block to be non-blocking.
183 * But it's the page lock which protects the buffers. To get around this,
184 * we get exclusion from try_to_free_buffers with the blockdev mapping's
187 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
188 * may be quite high. This code could TryLock the page, and if that
189 * succeeds, there is no need to take private_lock. (But if
190 * private_lock is contended then so is mapping->tree_lock).
192 static struct buffer_head
*
193 __find_get_block_slow(struct block_device
*bdev
, sector_t block
)
195 struct inode
*bd_inode
= bdev
->bd_inode
;
196 struct address_space
*bd_mapping
= bd_inode
->i_mapping
;
197 struct buffer_head
*ret
= NULL
;
199 struct buffer_head
*bh
;
200 struct buffer_head
*head
;
204 index
= block
>> (PAGE_CACHE_SHIFT
- bd_inode
->i_blkbits
);
205 page
= find_get_page(bd_mapping
, index
);
209 spin_lock(&bd_mapping
->private_lock
);
210 if (!page_has_buffers(page
))
212 head
= page_buffers(page
);
215 if (!buffer_mapped(bh
))
217 else if (bh
->b_blocknr
== block
) {
222 bh
= bh
->b_this_page
;
223 } while (bh
!= head
);
225 /* we might be here because some of the buffers on this page are
226 * not mapped. This is due to various races between
227 * file io on the block device and getblk. It gets dealt with
228 * elsewhere, don't buffer_error if we had some unmapped buffers
231 printk("__find_get_block_slow() failed. "
232 "block=%llu, b_blocknr=%llu\n",
233 (unsigned long long)block
,
234 (unsigned long long)bh
->b_blocknr
);
235 printk("b_state=0x%08lx, b_size=%zu\n",
236 bh
->b_state
, bh
->b_size
);
237 printk("device blocksize: %d\n", 1 << bd_inode
->i_blkbits
);
240 spin_unlock(&bd_mapping
->private_lock
);
241 page_cache_release(page
);
246 /* If invalidate_buffers() will trash dirty buffers, it means some kind
247 of fs corruption is going on. Trashing dirty data always implies losing
248 information that was supposed to be just stored on the physical layer
251 Thus invalidate_buffers in general usage is not allowed to trash
252 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
253 be preserved. These buffers are simply skipped.
255 We also skip buffers which are still in use. For example this can
256 happen if a userspace program is reading the block device.
258 NOTE: In the case where the user removed a removable-media-disk even if
259 there's still dirty data not synced on disk (due to a bug in the device driver
260 or due to an error of the user), by not destroying the dirty buffers we could
261 generate corruption also on the next media inserted, thus a parameter is
262 necessary to handle this case in the most safe way possible (trying
263 to not corrupt also the new disk inserted with the data belonging to
264 the old now corrupted disk). Also for the ramdisk the natural thing
265 to do in order to release the ramdisk memory is to destroy dirty buffers.
267 These are two special cases. Normal usage implies the device driver
268 to issue a sync on the device (without waiting I/O completion) and
269 then an invalidate_buffers call that doesn't trash dirty buffers.
271 For handling cache coherency with the blkdev pagecache the 'update' case
272 has been introduced. It is needed to re-read from disk any pinned
273 buffer. NOTE: re-reading from disk is destructive so we can do it only
274 when we assume nobody is changing the buffercache under our I/O and when
275 we think the disk contains more recent information than the buffercache.
276 The update == 1 pass marks the buffers we need to update, the update == 2
277 pass does the actual I/O. */
278 void invalidate_bdev(struct block_device
*bdev
)
280 struct address_space
*mapping
= bdev
->bd_inode
->i_mapping
;
282 if (mapping
->nrpages
== 0)
285 invalidate_bh_lrus();
286 invalidate_mapping_pages(mapping
, 0, -1);
288 EXPORT_SYMBOL(invalidate_bdev
);
291 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
293 static void free_more_memory(void)
298 wakeup_flusher_threads(1024);
301 for_each_online_node(nid
) {
302 (void)first_zones_zonelist(node_zonelist(nid
, GFP_NOFS
),
303 gfp_zone(GFP_NOFS
), NULL
,
306 try_to_free_pages(node_zonelist(nid
, GFP_NOFS
), 0,
312 * I/O completion handler for block_read_full_page() - pages
313 * which come unlocked at the end of I/O.
315 static void end_buffer_async_read(struct buffer_head
*bh
, int uptodate
)
318 struct buffer_head
*first
;
319 struct buffer_head
*tmp
;
321 int page_uptodate
= 1;
323 BUG_ON(!buffer_async_read(bh
));
327 set_buffer_uptodate(bh
);
329 clear_buffer_uptodate(bh
);
330 if (!quiet_error(bh
))
336 * Be _very_ careful from here on. Bad things can happen if
337 * two buffer heads end IO at almost the same time and both
338 * decide that the page is now completely done.
340 first
= page_buffers(page
);
341 local_irq_save(flags
);
342 bit_spin_lock(BH_Uptodate_Lock
, &first
->b_state
);
343 clear_buffer_async_read(bh
);
347 if (!buffer_uptodate(tmp
))
349 if (buffer_async_read(tmp
)) {
350 BUG_ON(!buffer_locked(tmp
));
353 tmp
= tmp
->b_this_page
;
355 bit_spin_unlock(BH_Uptodate_Lock
, &first
->b_state
);
356 local_irq_restore(flags
);
359 * If none of the buffers had errors and they are all
360 * uptodate then we can set the page uptodate.
362 if (page_uptodate
&& !PageError(page
))
363 SetPageUptodate(page
);
368 bit_spin_unlock(BH_Uptodate_Lock
, &first
->b_state
);
369 local_irq_restore(flags
);
374 * Completion handler for block_write_full_page() - pages which are unlocked
375 * during I/O, and which have PageWriteback cleared upon I/O completion.
377 void end_buffer_async_write(struct buffer_head
*bh
, int uptodate
)
379 char b
[BDEVNAME_SIZE
];
381 struct buffer_head
*first
;
382 struct buffer_head
*tmp
;
385 BUG_ON(!buffer_async_write(bh
));
389 set_buffer_uptodate(bh
);
391 if (!quiet_error(bh
)) {
393 printk(KERN_WARNING
"lost page write due to "
395 bdevname(bh
->b_bdev
, b
));
397 set_bit(AS_EIO
, &page
->mapping
->flags
);
398 set_buffer_write_io_error(bh
);
399 clear_buffer_uptodate(bh
);
403 first
= page_buffers(page
);
404 local_irq_save(flags
);
405 bit_spin_lock(BH_Uptodate_Lock
, &first
->b_state
);
407 clear_buffer_async_write(bh
);
409 tmp
= bh
->b_this_page
;
411 if (buffer_async_write(tmp
)) {
412 BUG_ON(!buffer_locked(tmp
));
415 tmp
= tmp
->b_this_page
;
417 bit_spin_unlock(BH_Uptodate_Lock
, &first
->b_state
);
418 local_irq_restore(flags
);
419 end_page_writeback(page
);
423 bit_spin_unlock(BH_Uptodate_Lock
, &first
->b_state
);
424 local_irq_restore(flags
);
427 EXPORT_SYMBOL(end_buffer_async_write
);
430 * If a page's buffers are under async readin (end_buffer_async_read
431 * completion) then there is a possibility that another thread of
432 * control could lock one of the buffers after it has completed
433 * but while some of the other buffers have not completed. This
434 * locked buffer would confuse end_buffer_async_read() into not unlocking
435 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
436 * that this buffer is not under async I/O.
438 * The page comes unlocked when it has no locked buffer_async buffers
441 * PageLocked prevents anyone starting new async I/O reads any of
444 * PageWriteback is used to prevent simultaneous writeout of the same
447 * PageLocked prevents anyone from starting writeback of a page which is
448 * under read I/O (PageWriteback is only ever set against a locked page).
450 static void mark_buffer_async_read(struct buffer_head
*bh
)
452 bh
->b_end_io
= end_buffer_async_read
;
453 set_buffer_async_read(bh
);
456 static void mark_buffer_async_write_endio(struct buffer_head
*bh
,
457 bh_end_io_t
*handler
)
459 bh
->b_end_io
= handler
;
460 set_buffer_async_write(bh
);
463 void mark_buffer_async_write(struct buffer_head
*bh
)
465 mark_buffer_async_write_endio(bh
, end_buffer_async_write
);
467 EXPORT_SYMBOL(mark_buffer_async_write
);
471 * fs/buffer.c contains helper functions for buffer-backed address space's
472 * fsync functions. A common requirement for buffer-based filesystems is
473 * that certain data from the backing blockdev needs to be written out for
474 * a successful fsync(). For example, ext2 indirect blocks need to be
475 * written back and waited upon before fsync() returns.
477 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
478 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
479 * management of a list of dependent buffers at ->i_mapping->private_list.
481 * Locking is a little subtle: try_to_free_buffers() will remove buffers
482 * from their controlling inode's queue when they are being freed. But
483 * try_to_free_buffers() will be operating against the *blockdev* mapping
484 * at the time, not against the S_ISREG file which depends on those buffers.
485 * So the locking for private_list is via the private_lock in the address_space
486 * which backs the buffers. Which is different from the address_space
487 * against which the buffers are listed. So for a particular address_space,
488 * mapping->private_lock does *not* protect mapping->private_list! In fact,
489 * mapping->private_list will always be protected by the backing blockdev's
492 * Which introduces a requirement: all buffers on an address_space's
493 * ->private_list must be from the same address_space: the blockdev's.
495 * address_spaces which do not place buffers at ->private_list via these
496 * utility functions are free to use private_lock and private_list for
497 * whatever they want. The only requirement is that list_empty(private_list)
498 * be true at clear_inode() time.
500 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
501 * filesystems should do that. invalidate_inode_buffers() should just go
502 * BUG_ON(!list_empty).
504 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
505 * take an address_space, not an inode. And it should be called
506 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
509 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
510 * list if it is already on a list. Because if the buffer is on a list,
511 * it *must* already be on the right one. If not, the filesystem is being
512 * silly. This will save a ton of locking. But first we have to ensure
513 * that buffers are taken *off* the old inode's list when they are freed
514 * (presumably in truncate). That requires careful auditing of all
515 * filesystems (do it inside bforget()). It could also be done by bringing
520 * The buffer's backing address_space's private_lock must be held
522 static void __remove_assoc_queue(struct buffer_head
*bh
)
524 list_del_init(&bh
->b_assoc_buffers
);
525 WARN_ON(!bh
->b_assoc_map
);
526 if (buffer_write_io_error(bh
))
527 set_bit(AS_EIO
, &bh
->b_assoc_map
->flags
);
528 bh
->b_assoc_map
= NULL
;
531 int inode_has_buffers(struct inode
*inode
)
533 return !list_empty(&inode
->i_data
.private_list
);
537 * osync is designed to support O_SYNC io. It waits synchronously for
538 * all already-submitted IO to complete, but does not queue any new
539 * writes to the disk.
541 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
542 * you dirty the buffers, and then use osync_inode_buffers to wait for
543 * completion. Any other dirty buffers which are not yet queued for
544 * write will not be flushed to disk by the osync.
546 static int osync_buffers_list(spinlock_t
*lock
, struct list_head
*list
)
548 struct buffer_head
*bh
;
554 list_for_each_prev(p
, list
) {
556 if (buffer_locked(bh
)) {
560 if (!buffer_uptodate(bh
))
571 static void do_thaw_all(struct work_struct
*work
)
573 struct super_block
*sb
;
574 char b
[BDEVNAME_SIZE
];
578 list_for_each_entry(sb
, &super_blocks
, s_list
) {
580 spin_unlock(&sb_lock
);
581 down_read(&sb
->s_umount
);
582 while (sb
->s_bdev
&& !thaw_bdev(sb
->s_bdev
, sb
))
583 printk(KERN_WARNING
"Emergency Thaw on %s\n",
584 bdevname(sb
->s_bdev
, b
));
585 up_read(&sb
->s_umount
);
587 if (__put_super_and_need_restart(sb
))
590 spin_unlock(&sb_lock
);
592 printk(KERN_WARNING
"Emergency Thaw complete\n");
596 * emergency_thaw_all -- forcibly thaw every frozen filesystem
598 * Used for emergency unfreeze of all filesystems via SysRq
600 void emergency_thaw_all(void)
602 struct work_struct
*work
;
604 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
606 INIT_WORK(work
, do_thaw_all
);
612 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
613 * @mapping: the mapping which wants those buffers written
615 * Starts I/O against the buffers at mapping->private_list, and waits upon
618 * Basically, this is a convenience function for fsync().
619 * @mapping is a file or directory which needs those buffers to be written for
620 * a successful fsync().
622 int sync_mapping_buffers(struct address_space
*mapping
)
624 struct address_space
*buffer_mapping
= mapping
->assoc_mapping
;
626 if (buffer_mapping
== NULL
|| list_empty(&mapping
->private_list
))
629 return fsync_buffers_list(&buffer_mapping
->private_lock
,
630 &mapping
->private_list
);
632 EXPORT_SYMBOL(sync_mapping_buffers
);
635 * Called when we've recently written block `bblock', and it is known that
636 * `bblock' was for a buffer_boundary() buffer. This means that the block at
637 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
638 * dirty, schedule it for IO. So that indirects merge nicely with their data.
640 void write_boundary_block(struct block_device
*bdev
,
641 sector_t bblock
, unsigned blocksize
)
643 struct buffer_head
*bh
= __find_get_block(bdev
, bblock
+ 1, blocksize
);
645 if (buffer_dirty(bh
))
646 ll_rw_block(WRITE
, 1, &bh
);
651 void mark_buffer_dirty_inode(struct buffer_head
*bh
, struct inode
*inode
)
653 struct address_space
*mapping
= inode
->i_mapping
;
654 struct address_space
*buffer_mapping
= bh
->b_page
->mapping
;
656 mark_buffer_dirty(bh
);
657 if (!mapping
->assoc_mapping
) {
658 mapping
->assoc_mapping
= buffer_mapping
;
660 BUG_ON(mapping
->assoc_mapping
!= buffer_mapping
);
662 if (!bh
->b_assoc_map
) {
663 spin_lock(&buffer_mapping
->private_lock
);
664 list_move_tail(&bh
->b_assoc_buffers
,
665 &mapping
->private_list
);
666 bh
->b_assoc_map
= mapping
;
667 spin_unlock(&buffer_mapping
->private_lock
);
670 EXPORT_SYMBOL(mark_buffer_dirty_inode
);
673 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
676 * If warn is true, then emit a warning if the page is not uptodate and has
677 * not been truncated.
679 static void __set_page_dirty(struct page
*page
,
680 struct address_space
*mapping
, int warn
)
682 spin_lock_irq(&mapping
->tree_lock
);
683 if (page
->mapping
) { /* Race with truncate? */
684 WARN_ON_ONCE(warn
&& !PageUptodate(page
));
685 account_page_dirtied(page
, mapping
);
686 radix_tree_tag_set(&mapping
->page_tree
,
687 page_index(page
), PAGECACHE_TAG_DIRTY
);
689 spin_unlock_irq(&mapping
->tree_lock
);
690 __mark_inode_dirty(mapping
->host
, I_DIRTY_PAGES
);
694 * Add a page to the dirty page list.
696 * It is a sad fact of life that this function is called from several places
697 * deeply under spinlocking. It may not sleep.
699 * If the page has buffers, the uptodate buffers are set dirty, to preserve
700 * dirty-state coherency between the page and the buffers. If the page does
701 * not have buffers then when they are later attached they will all be set
704 * The buffers are dirtied before the page is dirtied. There's a small race
705 * window in which a writepage caller may see the page cleanness but not the
706 * buffer dirtiness. That's fine. If this code were to set the page dirty
707 * before the buffers, a concurrent writepage caller could clear the page dirty
708 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
709 * page on the dirty page list.
711 * We use private_lock to lock against try_to_free_buffers while using the
712 * page's buffer list. Also use this to protect against clean buffers being
713 * added to the page after it was set dirty.
715 * FIXME: may need to call ->reservepage here as well. That's rather up to the
716 * address_space though.
718 int __set_page_dirty_buffers(struct page
*page
)
721 struct address_space
*mapping
= page_mapping(page
);
723 if (unlikely(!mapping
))
724 return !TestSetPageDirty(page
);
726 spin_lock(&mapping
->private_lock
);
727 if (page_has_buffers(page
)) {
728 struct buffer_head
*head
= page_buffers(page
);
729 struct buffer_head
*bh
= head
;
732 set_buffer_dirty(bh
);
733 bh
= bh
->b_this_page
;
734 } while (bh
!= head
);
736 newly_dirty
= !TestSetPageDirty(page
);
737 spin_unlock(&mapping
->private_lock
);
740 __set_page_dirty(page
, mapping
, 1);
743 EXPORT_SYMBOL(__set_page_dirty_buffers
);
746 * Write out and wait upon a list of buffers.
748 * We have conflicting pressures: we want to make sure that all
749 * initially dirty buffers get waited on, but that any subsequently
750 * dirtied buffers don't. After all, we don't want fsync to last
751 * forever if somebody is actively writing to the file.
753 * Do this in two main stages: first we copy dirty buffers to a
754 * temporary inode list, queueing the writes as we go. Then we clean
755 * up, waiting for those writes to complete.
757 * During this second stage, any subsequent updates to the file may end
758 * up refiling the buffer on the original inode's dirty list again, so
759 * there is a chance we will end up with a buffer queued for write but
760 * not yet completed on that list. So, as a final cleanup we go through
761 * the osync code to catch these locked, dirty buffers without requeuing
762 * any newly dirty buffers for write.
764 static int fsync_buffers_list(spinlock_t
*lock
, struct list_head
*list
)
766 struct buffer_head
*bh
;
767 struct list_head tmp
;
768 struct address_space
*mapping
, *prev_mapping
= NULL
;
771 INIT_LIST_HEAD(&tmp
);
774 while (!list_empty(list
)) {
775 bh
= BH_ENTRY(list
->next
);
776 mapping
= bh
->b_assoc_map
;
777 __remove_assoc_queue(bh
);
778 /* Avoid race with mark_buffer_dirty_inode() which does
779 * a lockless check and we rely on seeing the dirty bit */
781 if (buffer_dirty(bh
) || buffer_locked(bh
)) {
782 list_add(&bh
->b_assoc_buffers
, &tmp
);
783 bh
->b_assoc_map
= mapping
;
784 if (buffer_dirty(bh
)) {
788 * Ensure any pending I/O completes so that
789 * ll_rw_block() actually writes the current
790 * contents - it is a noop if I/O is still in
791 * flight on potentially older contents.
793 ll_rw_block(SWRITE_SYNC_PLUG
, 1, &bh
);
796 * Kick off IO for the previous mapping. Note
797 * that we will not run the very last mapping,
798 * wait_on_buffer() will do that for us
799 * through sync_buffer().
801 if (prev_mapping
&& prev_mapping
!= mapping
)
802 blk_run_address_space(prev_mapping
);
803 prev_mapping
= mapping
;
811 while (!list_empty(&tmp
)) {
812 bh
= BH_ENTRY(tmp
.prev
);
814 mapping
= bh
->b_assoc_map
;
815 __remove_assoc_queue(bh
);
816 /* Avoid race with mark_buffer_dirty_inode() which does
817 * a lockless check and we rely on seeing the dirty bit */
819 if (buffer_dirty(bh
)) {
820 list_add(&bh
->b_assoc_buffers
,
821 &mapping
->private_list
);
822 bh
->b_assoc_map
= mapping
;
826 if (!buffer_uptodate(bh
))
833 err2
= osync_buffers_list(lock
, list
);
841 * Invalidate any and all dirty buffers on a given inode. We are
842 * probably unmounting the fs, but that doesn't mean we have already
843 * done a sync(). Just drop the buffers from the inode list.
845 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
846 * assumes that all the buffers are against the blockdev. Not true
849 void invalidate_inode_buffers(struct inode
*inode
)
851 if (inode_has_buffers(inode
)) {
852 struct address_space
*mapping
= &inode
->i_data
;
853 struct list_head
*list
= &mapping
->private_list
;
854 struct address_space
*buffer_mapping
= mapping
->assoc_mapping
;
856 spin_lock(&buffer_mapping
->private_lock
);
857 while (!list_empty(list
))
858 __remove_assoc_queue(BH_ENTRY(list
->next
));
859 spin_unlock(&buffer_mapping
->private_lock
);
862 EXPORT_SYMBOL(invalidate_inode_buffers
);
865 * Remove any clean buffers from the inode's buffer list. This is called
866 * when we're trying to free the inode itself. Those buffers can pin it.
868 * Returns true if all buffers were removed.
870 int remove_inode_buffers(struct inode
*inode
)
874 if (inode_has_buffers(inode
)) {
875 struct address_space
*mapping
= &inode
->i_data
;
876 struct list_head
*list
= &mapping
->private_list
;
877 struct address_space
*buffer_mapping
= mapping
->assoc_mapping
;
879 spin_lock(&buffer_mapping
->private_lock
);
880 while (!list_empty(list
)) {
881 struct buffer_head
*bh
= BH_ENTRY(list
->next
);
882 if (buffer_dirty(bh
)) {
886 __remove_assoc_queue(bh
);
888 spin_unlock(&buffer_mapping
->private_lock
);
894 * Create the appropriate buffers when given a page for data area and
895 * the size of each buffer. Use the bh->b_this_page linked list to
896 * follow the buffers created. Return NULL if unable to create more
899 * The retry flag is used to differentiate async IO (paging, swapping)
900 * which may not fail from ordinary buffer allocations.
902 struct buffer_head
*alloc_page_buffers(struct page
*page
, unsigned long size
,
905 struct buffer_head
*bh
, *head
;
911 while ((offset
-= size
) >= 0) {
912 bh
= alloc_buffer_head(GFP_NOFS
);
917 bh
->b_this_page
= head
;
922 atomic_set(&bh
->b_count
, 0);
923 bh
->b_private
= NULL
;
926 /* Link the buffer to its page */
927 set_bh_page(bh
, page
, offset
);
929 init_buffer(bh
, NULL
, NULL
);
933 * In case anything failed, we just free everything we got.
939 head
= head
->b_this_page
;
940 free_buffer_head(bh
);
945 * Return failure for non-async IO requests. Async IO requests
946 * are not allowed to fail, so we have to wait until buffer heads
947 * become available. But we don't want tasks sleeping with
948 * partially complete buffers, so all were released above.
953 /* We're _really_ low on memory. Now we just
954 * wait for old buffer heads to become free due to
955 * finishing IO. Since this is an async request and
956 * the reserve list is empty, we're sure there are
957 * async buffer heads in use.
962 EXPORT_SYMBOL_GPL(alloc_page_buffers
);
965 link_dev_buffers(struct page
*page
, struct buffer_head
*head
)
967 struct buffer_head
*bh
, *tail
;
972 bh
= bh
->b_this_page
;
974 tail
->b_this_page
= head
;
975 attach_page_buffers(page
, head
);
979 * Initialise the state of a blockdev page's buffers.
982 init_page_buffers(struct page
*page
, struct block_device
*bdev
,
983 sector_t block
, int size
)
985 struct buffer_head
*head
= page_buffers(page
);
986 struct buffer_head
*bh
= head
;
987 int uptodate
= PageUptodate(page
);
990 if (!buffer_mapped(bh
)) {
991 init_buffer(bh
, NULL
, NULL
);
993 bh
->b_blocknr
= block
;
995 set_buffer_uptodate(bh
);
996 set_buffer_mapped(bh
);
999 bh
= bh
->b_this_page
;
1000 } while (bh
!= head
);
1004 * Create the page-cache page that contains the requested block.
1006 * This is used purely for blockdev mappings.
1008 static struct page
*
1009 grow_dev_page(struct block_device
*bdev
, sector_t block
,
1010 pgoff_t index
, int size
)
1012 struct inode
*inode
= bdev
->bd_inode
;
1014 struct buffer_head
*bh
;
1016 page
= find_or_create_page(inode
->i_mapping
, index
,
1017 (mapping_gfp_mask(inode
->i_mapping
) & ~__GFP_FS
)|__GFP_MOVABLE
);
1021 BUG_ON(!PageLocked(page
));
1023 if (page_has_buffers(page
)) {
1024 bh
= page_buffers(page
);
1025 if (bh
->b_size
== size
) {
1026 init_page_buffers(page
, bdev
, block
, size
);
1029 if (!try_to_free_buffers(page
))
1034 * Allocate some buffers for this page
1036 bh
= alloc_page_buffers(page
, size
, 0);
1041 * Link the page to the buffers and initialise them. Take the
1042 * lock to be atomic wrt __find_get_block(), which does not
1043 * run under the page lock.
1045 spin_lock(&inode
->i_mapping
->private_lock
);
1046 link_dev_buffers(page
, bh
);
1047 init_page_buffers(page
, bdev
, block
, size
);
1048 spin_unlock(&inode
->i_mapping
->private_lock
);
1054 page_cache_release(page
);
1059 * Create buffers for the specified block device block's page. If
1060 * that page was dirty, the buffers are set dirty also.
1063 grow_buffers(struct block_device
*bdev
, sector_t block
, int size
)
1072 } while ((size
<< sizebits
) < PAGE_SIZE
);
1074 index
= block
>> sizebits
;
1077 * Check for a block which wants to lie outside our maximum possible
1078 * pagecache index. (this comparison is done using sector_t types).
1080 if (unlikely(index
!= block
>> sizebits
)) {
1081 char b
[BDEVNAME_SIZE
];
1083 printk(KERN_ERR
"%s: requested out-of-range block %llu for "
1085 __func__
, (unsigned long long)block
,
1089 block
= index
<< sizebits
;
1090 /* Create a page with the proper size buffers.. */
1091 page
= grow_dev_page(bdev
, block
, index
, size
);
1095 page_cache_release(page
);
1099 static struct buffer_head
*
1100 __getblk_slow(struct block_device
*bdev
, sector_t block
, int size
)
1102 /* Size must be multiple of hard sectorsize */
1103 if (unlikely(size
& (bdev_logical_block_size(bdev
)-1) ||
1104 (size
< 512 || size
> PAGE_SIZE
))) {
1105 printk(KERN_ERR
"getblk(): invalid block size %d requested\n",
1107 printk(KERN_ERR
"logical block size: %d\n",
1108 bdev_logical_block_size(bdev
));
1114 #if (BITS_PER_LONG == 32) && defined(CONFIG_LBD)
1115 if ((block
>> (PAGE_CACHE_SHIFT
- bdev
->bd_inode
->i_blkbits
)) &
1116 0xffffffff00000000ULL
) {
1118 * We'll fail because the block is outside the range
1119 * which a 32-bit pagecache index can address
1121 printk(KERN_ERR
"getblk(): sector number too large for 32-bit"
1129 struct buffer_head
* bh
;
1132 bh
= __find_get_block(bdev
, block
, size
);
1136 ret
= grow_buffers(bdev
, block
, size
);
1145 * The relationship between dirty buffers and dirty pages:
1147 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1148 * the page is tagged dirty in its radix tree.
1150 * At all times, the dirtiness of the buffers represents the dirtiness of
1151 * subsections of the page. If the page has buffers, the page dirty bit is
1152 * merely a hint about the true dirty state.
1154 * When a page is set dirty in its entirety, all its buffers are marked dirty
1155 * (if the page has buffers).
1157 * When a buffer is marked dirty, its page is dirtied, but the page's other
1160 * Also. When blockdev buffers are explicitly read with bread(), they
1161 * individually become uptodate. But their backing page remains not
1162 * uptodate - even if all of its buffers are uptodate. A subsequent
1163 * block_read_full_page() against that page will discover all the uptodate
1164 * buffers, will set the page uptodate and will perform no I/O.
1168 * mark_buffer_dirty - mark a buffer_head as needing writeout
1169 * @bh: the buffer_head to mark dirty
1171 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1172 * backing page dirty, then tag the page as dirty in its address_space's radix
1173 * tree and then attach the address_space's inode to its superblock's dirty
1176 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1177 * mapping->tree_lock and the global inode_lock.
1179 void mark_buffer_dirty(struct buffer_head
*bh
)
1181 WARN_ON_ONCE(!buffer_uptodate(bh
));
1184 * Very *carefully* optimize the it-is-already-dirty case.
1186 * Don't let the final "is it dirty" escape to before we
1187 * perhaps modified the buffer.
1189 if (buffer_dirty(bh
)) {
1191 if (buffer_dirty(bh
))
1195 if (!test_set_buffer_dirty(bh
)) {
1196 struct page
*page
= bh
->b_page
;
1197 if (!TestSetPageDirty(page
)) {
1198 struct address_space
*mapping
= page_mapping(page
);
1200 __set_page_dirty(page
, mapping
, 0);
1204 EXPORT_SYMBOL(mark_buffer_dirty
);
1207 * Decrement a buffer_head's reference count. If all buffers against a page
1208 * have zero reference count, are clean and unlocked, and if the page is clean
1209 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1210 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1211 * a page but it ends up not being freed, and buffers may later be reattached).
1213 void __brelse(struct buffer_head
* buf
)
1215 if (atomic_read(&buf
->b_count
)) {
1219 WARN(1, KERN_ERR
"VFS: brelse: Trying to free free buffer\n");
1221 EXPORT_SYMBOL(__brelse
);
1224 * bforget() is like brelse(), except it discards any
1225 * potentially dirty data.
1227 void __bforget(struct buffer_head
*bh
)
1229 clear_buffer_dirty(bh
);
1230 if (bh
->b_assoc_map
) {
1231 struct address_space
*buffer_mapping
= bh
->b_page
->mapping
;
1233 spin_lock(&buffer_mapping
->private_lock
);
1234 list_del_init(&bh
->b_assoc_buffers
);
1235 bh
->b_assoc_map
= NULL
;
1236 spin_unlock(&buffer_mapping
->private_lock
);
1240 EXPORT_SYMBOL(__bforget
);
1242 static struct buffer_head
*__bread_slow(struct buffer_head
*bh
)
1245 if (buffer_uptodate(bh
)) {
1250 bh
->b_end_io
= end_buffer_read_sync
;
1251 submit_bh(READ
, bh
);
1253 if (buffer_uptodate(bh
))
1261 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1262 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1263 * refcount elevated by one when they're in an LRU. A buffer can only appear
1264 * once in a particular CPU's LRU. A single buffer can be present in multiple
1265 * CPUs' LRUs at the same time.
1267 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1268 * sb_find_get_block().
1270 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1271 * a local interrupt disable for that.
1274 #define BH_LRU_SIZE 8
1277 struct buffer_head
*bhs
[BH_LRU_SIZE
];
1280 static DEFINE_PER_CPU(struct bh_lru
, bh_lrus
) = {{ NULL
}};
1283 #define bh_lru_lock() local_irq_disable()
1284 #define bh_lru_unlock() local_irq_enable()
1286 #define bh_lru_lock() preempt_disable()
1287 #define bh_lru_unlock() preempt_enable()
1290 static inline void check_irqs_on(void)
1292 #ifdef irqs_disabled
1293 BUG_ON(irqs_disabled());
1298 * The LRU management algorithm is dopey-but-simple. Sorry.
1300 static void bh_lru_install(struct buffer_head
*bh
)
1302 struct buffer_head
*evictee
= NULL
;
1307 lru
= &__get_cpu_var(bh_lrus
);
1308 if (lru
->bhs
[0] != bh
) {
1309 struct buffer_head
*bhs
[BH_LRU_SIZE
];
1315 for (in
= 0; in
< BH_LRU_SIZE
; in
++) {
1316 struct buffer_head
*bh2
= lru
->bhs
[in
];
1321 if (out
>= BH_LRU_SIZE
) {
1322 BUG_ON(evictee
!= NULL
);
1329 while (out
< BH_LRU_SIZE
)
1331 memcpy(lru
->bhs
, bhs
, sizeof(bhs
));
1340 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1342 static struct buffer_head
*
1343 lookup_bh_lru(struct block_device
*bdev
, sector_t block
, unsigned size
)
1345 struct buffer_head
*ret
= NULL
;
1351 lru
= &__get_cpu_var(bh_lrus
);
1352 for (i
= 0; i
< BH_LRU_SIZE
; i
++) {
1353 struct buffer_head
*bh
= lru
->bhs
[i
];
1355 if (bh
&& bh
->b_bdev
== bdev
&&
1356 bh
->b_blocknr
== block
&& bh
->b_size
== size
) {
1359 lru
->bhs
[i
] = lru
->bhs
[i
- 1];
1374 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1375 * it in the LRU and mark it as accessed. If it is not present then return NULL.
1378 struct buffer_head
*
1379 __find_get_block(struct block_device
*bdev
, sector_t block
, unsigned size
)
1381 struct buffer_head
*bh
= lookup_bh_lru(bdev
, block
, size
);
1384 bh
= __find_get_block_slow(bdev
, block
);
1392 EXPORT_SYMBOL(__find_get_block
);
1395 * __getblk will locate (and, if necessary, create) the buffer_head
1396 * which corresponds to the passed block_device, block and size. The
1397 * returned buffer has its reference count incremented.
1399 * __getblk() cannot fail - it just keeps trying. If you pass it an
1400 * illegal block number, __getblk() will happily return a buffer_head
1401 * which represents the non-existent block. Very weird.
1403 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1404 * attempt is failing. FIXME, perhaps?
1406 struct buffer_head
*
1407 __getblk(struct block_device
*bdev
, sector_t block
, unsigned size
)
1409 struct buffer_head
*bh
= __find_get_block(bdev
, block
, size
);
1413 bh
= __getblk_slow(bdev
, block
, size
);
1416 EXPORT_SYMBOL(__getblk
);
1419 * Do async read-ahead on a buffer.
1421 void __breadahead(struct block_device
*bdev
, sector_t block
, unsigned size
)
1423 struct buffer_head
*bh
= __getblk(bdev
, block
, size
);
1425 ll_rw_block(READA
, 1, &bh
);
1429 EXPORT_SYMBOL(__breadahead
);
1432 * __bread() - reads a specified block and returns the bh
1433 * @bdev: the block_device to read from
1434 * @block: number of block
1435 * @size: size (in bytes) to read
1437 * Reads a specified block, and returns buffer head that contains it.
1438 * It returns NULL if the block was unreadable.
1440 struct buffer_head
*
1441 __bread(struct block_device
*bdev
, sector_t block
, unsigned size
)
1443 struct buffer_head
*bh
= __getblk(bdev
, block
, size
);
1445 if (likely(bh
) && !buffer_uptodate(bh
))
1446 bh
= __bread_slow(bh
);
1449 EXPORT_SYMBOL(__bread
);
1452 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1453 * This doesn't race because it runs in each cpu either in irq
1454 * or with preempt disabled.
1456 static void invalidate_bh_lru(void *arg
)
1458 struct bh_lru
*b
= &get_cpu_var(bh_lrus
);
1461 for (i
= 0; i
< BH_LRU_SIZE
; i
++) {
1465 put_cpu_var(bh_lrus
);
1468 void invalidate_bh_lrus(void)
1470 on_each_cpu(invalidate_bh_lru
, NULL
, 1);
1472 EXPORT_SYMBOL_GPL(invalidate_bh_lrus
);
1474 void set_bh_page(struct buffer_head
*bh
,
1475 struct page
*page
, unsigned long offset
)
1478 BUG_ON(offset
>= PAGE_SIZE
);
1479 if (PageHighMem(page
))
1481 * This catches illegal uses and preserves the offset:
1483 bh
->b_data
= (char *)(0 + offset
);
1485 bh
->b_data
= page_address(page
) + offset
;
1487 EXPORT_SYMBOL(set_bh_page
);
1490 * Called when truncating a buffer on a page completely.
1492 static void discard_buffer(struct buffer_head
* bh
)
1495 clear_buffer_dirty(bh
);
1497 clear_buffer_mapped(bh
);
1498 clear_buffer_req(bh
);
1499 clear_buffer_new(bh
);
1500 clear_buffer_delay(bh
);
1501 clear_buffer_unwritten(bh
);
1506 * block_invalidatepage - invalidate part or all of a buffer-backed page
1508 * @page: the page which is affected
1509 * @offset: the index of the truncation point
1511 * block_invalidatepage() is called when all or part of the page has become
1512 * invalidated by a truncate operation.
1514 * block_invalidatepage() does not have to release all buffers, but it must
1515 * ensure that no dirty buffer is left outside @offset and that no I/O
1516 * is underway against any of the blocks which are outside the truncation
1517 * point. Because the caller is about to free (and possibly reuse) those blocks on-disk.
1520 void block_invalidatepage(struct page
*page
, unsigned long offset
)
1522 struct buffer_head
*head
, *bh
, *next
;
1523 unsigned int curr_off
= 0;
1525 BUG_ON(!PageLocked(page
));
1526 if (!page_has_buffers(page
))
1529 head
= page_buffers(page
);
1532 unsigned int next_off
= curr_off
+ bh
->b_size
;
1533 next
= bh
->b_this_page
;
1536 * is this block fully invalidated?
1538 if (offset
<= curr_off
)
1540 curr_off
= next_off
;
1542 } while (bh
!= head
);
1545 * We release buffers only if the entire page is being invalidated.
1546 * The get_block cached value has been unconditionally invalidated,
1547 * so real IO is not possible anymore.
1550 try_to_release_page(page
, 0);
1554 EXPORT_SYMBOL(block_invalidatepage
);
1557 * We attach and possibly dirty the buffers atomically wrt
1558 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1559 * is already excluded via the page lock.
1561 void create_empty_buffers(struct page
*page
,
1562 unsigned long blocksize
, unsigned long b_state
)
1564 struct buffer_head
*bh
, *head
, *tail
;
1566 head
= alloc_page_buffers(page
, blocksize
, 1);
1569 bh
->b_state
|= b_state
;
1571 bh
= bh
->b_this_page
;
1573 tail
->b_this_page
= head
;
1575 spin_lock(&page
->mapping
->private_lock
);
1576 if (PageUptodate(page
) || PageDirty(page
)) {
1579 if (PageDirty(page
))
1580 set_buffer_dirty(bh
);
1581 if (PageUptodate(page
))
1582 set_buffer_uptodate(bh
);
1583 bh
= bh
->b_this_page
;
1584 } while (bh
!= head
);
1586 attach_page_buffers(page
, head
);
1587 spin_unlock(&page
->mapping
->private_lock
);
1589 EXPORT_SYMBOL(create_empty_buffers
);
1592 * We are taking a block for data and we don't want any output from any
1593 * buffer-cache aliases starting from return from that function and
1594 * until the moment when something will explicitly mark the buffer
1595 * dirty (hopefully that will not happen until we will free that block ;-)
1596 * We don't even need to mark it not-uptodate - nobody can expect
1597 * anything from a newly allocated buffer anyway. We used to use
1598 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1599 * don't want to mark the alias unmapped, for example - it would confuse
1600 * anyone who might pick it with bread() afterwards...
1602 * Also, note that bforget() doesn't lock the buffer. So there can
1603 * be writeout I/O going on against recently-freed buffers. We don't
1604 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1605 * only if we really need to. That happens here.
1607 void unmap_underlying_metadata(struct block_device
*bdev
, sector_t block
)
1609 struct buffer_head
*old_bh
;
1613 old_bh
= __find_get_block_slow(bdev
, block
);
1615 clear_buffer_dirty(old_bh
);
1616 wait_on_buffer(old_bh
);
1617 clear_buffer_req(old_bh
);
1621 EXPORT_SYMBOL(unmap_underlying_metadata
);
1624 * NOTE! All mapped/uptodate combinations are valid:
1626 * Mapped Uptodate Meaning
1628 * No No "unknown" - must do get_block()
1629 * No Yes "hole" - zero-filled
1630 * Yes No "allocated" - allocated on disk, not read in
1631 * Yes Yes "valid" - allocated and up-to-date in memory.
1633 * "Dirty" is valid only with the last case (mapped+uptodate).
1637 * While block_write_full_page is writing back the dirty buffers under
1638 * the page lock, whoever dirtied the buffers may decide to clean them
1639 * again at any time. We handle that by only looking at the buffer
1640 * state inside lock_buffer().
1642 * If block_write_full_page() is called for regular writeback
1643 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1644 * locked buffer. This only can happen if someone has written the buffer
1645 * directly, with submit_bh(). At the address_space level PageWriteback
1646 * prevents this contention from occurring.
1648 * If block_write_full_page() is called with wbc->sync_mode ==
1649 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1650 * causes the writes to be flagged as synchronous writes, but the
1651 * block device queue will NOT be unplugged, since usually many pages
1652 * will be pushed out before the higher-level caller actually
1653 * waits for the writes to be completed. The various wait functions,
1654 * such as wait_on_writeback_range() will ultimately call sync_page()
1655 * which will ultimately call blk_run_backing_dev(), which will end up
1656 * unplugging the device queue.
1658 static int __block_write_full_page(struct inode
*inode
, struct page
*page
,
1659 get_block_t
*get_block
, struct writeback_control
*wbc
,
1660 bh_end_io_t
*handler
)
1664 sector_t last_block
;
1665 struct buffer_head
*bh
, *head
;
1666 const unsigned blocksize
= 1 << inode
->i_blkbits
;
1667 int nr_underway
= 0;
1668 int write_op
= (wbc
->sync_mode
== WB_SYNC_ALL
?
1669 WRITE_SYNC_PLUG
: WRITE
);
1671 BUG_ON(!PageLocked(page
));
1673 last_block
= (i_size_read(inode
) - 1) >> inode
->i_blkbits
;
1675 if (!page_has_buffers(page
)) {
1676 create_empty_buffers(page
, blocksize
,
1677 (1 << BH_Dirty
)|(1 << BH_Uptodate
));
1681 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1682 * here, and the (potentially unmapped) buffers may become dirty at
1683 * any time. If a buffer becomes dirty here after we've inspected it
1684 * then we just miss that fact, and the page stays dirty.
1686 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1687 * handle that here by just cleaning them.
1690 block
= (sector_t
)page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_blkbits
);
1691 head
= page_buffers(page
);
1695 * Get all the dirty buffers mapped to disk addresses and
1696 * handle any aliases from the underlying blockdev's mapping.
1699 if (block
> last_block
) {
1701 * mapped buffers outside i_size will occur, because
1702 * this page can be outside i_size when there is a
1703 * truncate in progress.
1706 * The buffer was zeroed by block_write_full_page()
1708 clear_buffer_dirty(bh
);
1709 set_buffer_uptodate(bh
);
1710 } else if ((!buffer_mapped(bh
) || buffer_delay(bh
)) &&
1712 WARN_ON(bh
->b_size
!= blocksize
);
1713 err
= get_block(inode
, block
, bh
, 1);
1716 clear_buffer_delay(bh
);
1717 if (buffer_new(bh
)) {
1718 /* blockdev mappings never come here */
1719 clear_buffer_new(bh
);
1720 unmap_underlying_metadata(bh
->b_bdev
,
1724 bh
= bh
->b_this_page
;
1726 } while (bh
!= head
);
1729 if (!buffer_mapped(bh
))
1732 * If it's a fully non-blocking write attempt and we cannot
1733 * lock the buffer then redirty the page. Note that this can
1734 * potentially cause a busy-wait loop from writeback threads
1735 * and kswapd activity, but those code paths have their own
1736 * higher-level throttling.
1738 if (wbc
->sync_mode
!= WB_SYNC_NONE
|| !wbc
->nonblocking
) {
1740 } else if (!trylock_buffer(bh
)) {
1741 redirty_page_for_writepage(wbc
, page
);
1744 if (test_clear_buffer_dirty(bh
)) {
1745 mark_buffer_async_write_endio(bh
, handler
);
1749 } while ((bh
= bh
->b_this_page
) != head
);
1752 * The page and its buffers are protected by PageWriteback(), so we can
1753 * drop the bh refcounts early.
1755 BUG_ON(PageWriteback(page
));
1756 set_page_writeback(page
);
1759 struct buffer_head
*next
= bh
->b_this_page
;
1760 if (buffer_async_write(bh
)) {
1761 submit_bh(write_op
, bh
);
1765 } while (bh
!= head
);
1770 if (nr_underway
== 0) {
1772 * The page was marked dirty, but the buffers were
1773 * clean. Someone wrote them back by hand with
1774 * ll_rw_block/submit_bh. A rare case.
1776 end_page_writeback(page
);
1779 * The page and buffer_heads can be released at any time from here on.
1787 * ENOSPC, or some other error. We may already have added some
1788 * blocks to the file, so we need to write these out to avoid
1789 * exposing stale data.
1790 * The page is currently locked and not marked for writeback
1793 /* Recovery: lock and submit the mapped buffers */
1795 if (buffer_mapped(bh
) && buffer_dirty(bh
) &&
1796 !buffer_delay(bh
)) {
1798 mark_buffer_async_write_endio(bh
, handler
);
1801 * The buffer may have been set dirty during
1802 * attachment to a dirty page.
1804 clear_buffer_dirty(bh
);
1806 } while ((bh
= bh
->b_this_page
) != head
);
1808 BUG_ON(PageWriteback(page
));
1809 mapping_set_error(page
->mapping
, err
);
1810 set_page_writeback(page
);
1812 struct buffer_head
*next
= bh
->b_this_page
;
1813 if (buffer_async_write(bh
)) {
1814 clear_buffer_dirty(bh
);
1815 submit_bh(write_op
, bh
);
1819 } while (bh
!= head
);
1825 * If a page has any new buffers, zero them out here, and mark them uptodate
1826 * and dirty so they'll be written out (in order to prevent uninitialised
1827 * block data from leaking). And clear the new bit.
1829 void page_zero_new_buffers(struct page
*page
, unsigned from
, unsigned to
)
1831 unsigned int block_start
, block_end
;
1832 struct buffer_head
*head
, *bh
;
1834 BUG_ON(!PageLocked(page
));
1835 if (!page_has_buffers(page
))
1838 bh
= head
= page_buffers(page
);
1841 block_end
= block_start
+ bh
->b_size
;
1843 if (buffer_new(bh
)) {
1844 if (block_end
> from
&& block_start
< to
) {
1845 if (!PageUptodate(page
)) {
1846 unsigned start
, size
;
1848 start
= max(from
, block_start
);
1849 size
= min(to
, block_end
) - start
;
1851 zero_user(page
, start
, size
);
1852 set_buffer_uptodate(bh
);
1855 clear_buffer_new(bh
);
1856 mark_buffer_dirty(bh
);
1860 block_start
= block_end
;
1861 bh
= bh
->b_this_page
;
1862 } while (bh
!= head
);
1864 EXPORT_SYMBOL(page_zero_new_buffers
);
1866 static int __block_prepare_write(struct inode
*inode
, struct page
*page
,
1867 unsigned from
, unsigned to
, get_block_t
*get_block
)
1869 unsigned block_start
, block_end
;
1872 unsigned blocksize
, bbits
;
1873 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
;
1875 BUG_ON(!PageLocked(page
));
1876 BUG_ON(from
> PAGE_CACHE_SIZE
);
1877 BUG_ON(to
> PAGE_CACHE_SIZE
);
1880 blocksize
= 1 << inode
->i_blkbits
;
1881 if (!page_has_buffers(page
))
1882 create_empty_buffers(page
, blocksize
, 0);
1883 head
= page_buffers(page
);
1885 bbits
= inode
->i_blkbits
;
1886 block
= (sector_t
)page
->index
<< (PAGE_CACHE_SHIFT
- bbits
);
1888 for(bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1889 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) {
1890 block_end
= block_start
+ blocksize
;
1891 if (block_end
<= from
|| block_start
>= to
) {
1892 if (PageUptodate(page
)) {
1893 if (!buffer_uptodate(bh
))
1894 set_buffer_uptodate(bh
);
1899 clear_buffer_new(bh
);
1900 if (!buffer_mapped(bh
)) {
1901 WARN_ON(bh
->b_size
!= blocksize
);
1902 err
= get_block(inode
, block
, bh
, 1);
1905 if (buffer_new(bh
)) {
1906 unmap_underlying_metadata(bh
->b_bdev
,
1908 if (PageUptodate(page
)) {
1909 clear_buffer_new(bh
);
1910 set_buffer_uptodate(bh
);
1911 mark_buffer_dirty(bh
);
1914 if (block_end
> to
|| block_start
< from
)
1915 zero_user_segments(page
,
1921 if (PageUptodate(page
)) {
1922 if (!buffer_uptodate(bh
))
1923 set_buffer_uptodate(bh
);
1926 if (!buffer_uptodate(bh
) && !buffer_delay(bh
) &&
1927 !buffer_unwritten(bh
) &&
1928 (block_start
< from
|| block_end
> to
)) {
1929 ll_rw_block(READ
, 1, &bh
);
1934 * If we issued read requests - let them complete.
1936 while(wait_bh
> wait
) {
1937 wait_on_buffer(*--wait_bh
);
1938 if (!buffer_uptodate(*wait_bh
))
1942 page_zero_new_buffers(page
, from
, to
);
1946 static int __block_commit_write(struct inode
*inode
, struct page
*page
,
1947 unsigned from
, unsigned to
)
1949 unsigned block_start
, block_end
;
1952 struct buffer_head
*bh
, *head
;
1954 blocksize
= 1 << inode
->i_blkbits
;
1956 for(bh
= head
= page_buffers(page
), block_start
= 0;
1957 bh
!= head
|| !block_start
;
1958 block_start
=block_end
, bh
= bh
->b_this_page
) {
1959 block_end
= block_start
+ blocksize
;
1960 if (block_end
<= from
|| block_start
>= to
) {
1961 if (!buffer_uptodate(bh
))
1964 set_buffer_uptodate(bh
);
1965 mark_buffer_dirty(bh
);
1967 clear_buffer_new(bh
);
1971 * If this is a partial write which happened to make all buffers
1972 * uptodate then we can optimize away a bogus readpage() for
1973 * the next read(). Here we 'discover' whether the page went
1974 * uptodate as a result of this (potentially partial) write.
1977 SetPageUptodate(page
);
1982 * block_write_begin takes care of the basic task of block allocation and
1983 * bringing partial write blocks uptodate first.
1985 * If *pagep is not NULL, then block_write_begin uses the locked page
1986 * at *pagep rather than allocating its own. In this case, the page will
1987 * not be unlocked or deallocated on failure.
1989 int block_write_begin(struct file
*file
, struct address_space
*mapping
,
1990 loff_t pos
, unsigned len
, unsigned flags
,
1991 struct page
**pagep
, void **fsdata
,
1992 get_block_t
*get_block
)
1994 struct inode
*inode
= mapping
->host
;
1998 unsigned start
, end
;
2001 index
= pos
>> PAGE_CACHE_SHIFT
;
2002 start
= pos
& (PAGE_CACHE_SIZE
- 1);
2008 page
= grab_cache_page_write_begin(mapping
, index
, flags
);
2015 BUG_ON(!PageLocked(page
));
2017 status
= __block_prepare_write(inode
, page
, start
, end
, get_block
);
2018 if (unlikely(status
)) {
2019 ClearPageUptodate(page
);
2023 page_cache_release(page
);
2027 * prepare_write() may have instantiated a few blocks
2028 * outside i_size. Trim these off again. Don't need
2029 * i_size_read because we hold i_mutex.
2031 if (pos
+ len
> inode
->i_size
)
2032 vmtruncate(inode
, inode
->i_size
);
2039 EXPORT_SYMBOL(block_write_begin
);
2041 int block_write_end(struct file
*file
, struct address_space
*mapping
,
2042 loff_t pos
, unsigned len
, unsigned copied
,
2043 struct page
*page
, void *fsdata
)
2045 struct inode
*inode
= mapping
->host
;
2048 start
= pos
& (PAGE_CACHE_SIZE
- 1);
2050 if (unlikely(copied
< len
)) {
2052 * The buffers that were written will now be uptodate, so we
2053 * don't have to worry about a readpage reading them and
2054 * overwriting a partial write. However if we have encountered
2055 * a short write and only partially written into a buffer, it
2056 * will not be marked uptodate, so a readpage might come in and
2057 * destroy our partial write.
2059 * Do the simplest thing, and just treat any short write to a
2060 * non uptodate page as a zero-length write, and force the
2061 * caller to redo the whole thing.
2063 if (!PageUptodate(page
))
2066 page_zero_new_buffers(page
, start
+copied
, start
+len
);
2068 flush_dcache_page(page
);
2070 /* This could be a short (even 0-length) commit */
2071 __block_commit_write(inode
, page
, start
, start
+copied
);
2075 EXPORT_SYMBOL(block_write_end
);
2077 int generic_write_end(struct file
*file
, struct address_space
*mapping
,
2078 loff_t pos
, unsigned len
, unsigned copied
,
2079 struct page
*page
, void *fsdata
)
2081 struct inode
*inode
= mapping
->host
;
2082 int i_size_changed
= 0;
2084 copied
= block_write_end(file
, mapping
, pos
, len
, copied
, page
, fsdata
);
2087 * No need to use i_size_read() here, the i_size
2088 * cannot change under us because we hold i_mutex.
2090 * But it's important to update i_size while still holding page lock:
2091 * page writeout could otherwise come in and zero beyond i_size.
2093 if (pos
+copied
> inode
->i_size
) {
2094 i_size_write(inode
, pos
+copied
);
2099 page_cache_release(page
);
2102 * Don't mark the inode dirty under page lock. First, it unnecessarily
2103 * makes the holding time of page lock longer. Second, it forces lock
2104 * ordering of page lock and transaction start for journaling
2108 mark_inode_dirty(inode
);
2112 EXPORT_SYMBOL(generic_write_end
);
2115 * block_is_partially_uptodate checks whether buffers within a page are uptodate or not.
2118 * Returns true if all buffers which correspond to a file portion
2119 * we want to read are uptodate.
2121 int block_is_partially_uptodate(struct page
*page
, read_descriptor_t
*desc
,
2124 struct inode
*inode
= page
->mapping
->host
;
2125 unsigned block_start
, block_end
, blocksize
;
2127 struct buffer_head
*bh
, *head
;
2130 if (!page_has_buffers(page
))
2133 blocksize
= 1 << inode
->i_blkbits
;
2134 to
= min_t(unsigned, PAGE_CACHE_SIZE
- from
, desc
->count
);
2136 if (from
< blocksize
&& to
> PAGE_CACHE_SIZE
- blocksize
)
2139 head
= page_buffers(page
);
2143 block_end
= block_start
+ blocksize
;
2144 if (block_end
> from
&& block_start
< to
) {
2145 if (!buffer_uptodate(bh
)) {
2149 if (block_end
>= to
)
2152 block_start
= block_end
;
2153 bh
= bh
->b_this_page
;
2154 } while (bh
!= head
);
2158 EXPORT_SYMBOL(block_is_partially_uptodate
);
2161 * Generic "read page" function for block devices that have the normal
2162 * get_block functionality. This is most of the block device filesystems.
2163 * Reads the page asynchronously --- the unlock_buffer() and
2164 * set/clear_buffer_uptodate() functions propagate buffer state into the
2165 * page struct once IO has completed.
2167 int block_read_full_page(struct page
*page
, get_block_t
*get_block
)
2169 struct inode
*inode
= page
->mapping
->host
;
2170 sector_t iblock
, lblock
;
2171 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
2172 unsigned int blocksize
;
2174 int fully_mapped
= 1;
2176 BUG_ON(!PageLocked(page
));
2177 blocksize
= 1 << inode
->i_blkbits
;
2178 if (!page_has_buffers(page
))
2179 create_empty_buffers(page
, blocksize
, 0);
2180 head
= page_buffers(page
);
2182 iblock
= (sector_t
)page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_blkbits
);
2183 lblock
= (i_size_read(inode
)+blocksize
-1) >> inode
->i_blkbits
;
2189 if (buffer_uptodate(bh
))
2192 if (!buffer_mapped(bh
)) {
2196 if (iblock
< lblock
) {
2197 WARN_ON(bh
->b_size
!= blocksize
);
2198 err
= get_block(inode
, iblock
, bh
, 0);
2202 if (!buffer_mapped(bh
)) {
2203 zero_user(page
, i
* blocksize
, blocksize
);
2205 set_buffer_uptodate(bh
);
2209 * get_block() might have updated the buffer synchronously.
2212 if (buffer_uptodate(bh
))
2216 } while (i
++, iblock
++, (bh
= bh
->b_this_page
) != head
);
2219 SetPageMappedToDisk(page
);
2223 * All buffers are uptodate - we can set the page uptodate
2224 * as well. But not if get_block() returned an error.
2226 if (!PageError(page
))
2227 SetPageUptodate(page
);
2232 /* Stage two: lock the buffers */
2233 for (i
= 0; i
< nr
; i
++) {
2236 mark_buffer_async_read(bh
);
2240 * Stage 3: start the IO. Check for uptodateness
2241 * inside the buffer lock in case another process reading
2242 * the underlying blockdev brought it uptodate (the sct fix).
2244 for (i
= 0; i
< nr
; i
++) {
2246 if (buffer_uptodate(bh
))
2247 end_buffer_async_read(bh
, 1);
2249 submit_bh(READ
, bh
);
2253 EXPORT_SYMBOL(block_read_full_page
);
2255 /* utility function for filesystems that need to do work on expanding
2256 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2257 * deal with the hole.
2259 int generic_cont_expand_simple(struct inode
*inode
, loff_t size
)
2261 struct address_space
*mapping
= inode
->i_mapping
;
2266 err
= inode_newsize_ok(inode
, size
);
2270 err
= pagecache_write_begin(NULL
, mapping
, size
, 0,
2271 AOP_FLAG_UNINTERRUPTIBLE
|AOP_FLAG_CONT_EXPAND
,
2276 err
= pagecache_write_end(NULL
, mapping
, size
, 0, 0, page
, fsdata
);
2282 EXPORT_SYMBOL(generic_cont_expand_simple
);
2284 static int cont_expand_zero(struct file
*file
, struct address_space
*mapping
,
2285 loff_t pos
, loff_t
*bytes
)
2287 struct inode
*inode
= mapping
->host
;
2288 unsigned blocksize
= 1 << inode
->i_blkbits
;
2291 pgoff_t index
, curidx
;
2293 unsigned zerofrom
, offset
, len
;
2296 index
= pos
>> PAGE_CACHE_SHIFT
;
2297 offset
= pos
& ~PAGE_CACHE_MASK
;
2299 while (index
> (curidx
= (curpos
= *bytes
)>>PAGE_CACHE_SHIFT
)) {
2300 zerofrom
= curpos
& ~PAGE_CACHE_MASK
;
2301 if (zerofrom
& (blocksize
-1)) {
2302 *bytes
|= (blocksize
-1);
2305 len
= PAGE_CACHE_SIZE
- zerofrom
;
2307 err
= pagecache_write_begin(file
, mapping
, curpos
, len
,
2308 AOP_FLAG_UNINTERRUPTIBLE
,
2312 zero_user(page
, zerofrom
, len
);
2313 err
= pagecache_write_end(file
, mapping
, curpos
, len
, len
,
2320 balance_dirty_pages_ratelimited(mapping
);
2323 /* page covers the boundary, find the boundary offset */
2324 if (index
== curidx
) {
2325 zerofrom
= curpos
& ~PAGE_CACHE_MASK
;
2326 /* if we will expand the thing, the last block will be filled */
2327 if (offset
<= zerofrom
) {
2330 if (zerofrom
& (blocksize
-1)) {
2331 *bytes
|= (blocksize
-1);
2334 len
= offset
- zerofrom
;
2336 err
= pagecache_write_begin(file
, mapping
, curpos
, len
,
2337 AOP_FLAG_UNINTERRUPTIBLE
,
2341 zero_user(page
, zerofrom
, len
);
2342 err
= pagecache_write_end(file
, mapping
, curpos
, len
, len
,
2354 * For moronic filesystems that do not allow holes in file.
2355 * We may have to extend the file.
2357 int cont_write_begin(struct file
*file
, struct address_space
*mapping
,
2358 loff_t pos
, unsigned len
, unsigned flags
,
2359 struct page
**pagep
, void **fsdata
,
2360 get_block_t
*get_block
, loff_t
*bytes
)
2362 struct inode
*inode
= mapping
->host
;
2363 unsigned blocksize
= 1 << inode
->i_blkbits
;
2367 err
= cont_expand_zero(file
, mapping
, pos
, bytes
);
2371 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
2372 if (pos
+len
> *bytes
&& zerofrom
& (blocksize
-1)) {
2373 *bytes
|= (blocksize
-1);
2378 err
= block_write_begin(file
, mapping
, pos
, len
,
2379 flags
, pagep
, fsdata
, get_block
);
2383 EXPORT_SYMBOL(cont_write_begin
);
2385 int block_prepare_write(struct page
*page
, unsigned from
, unsigned to
,
2386 get_block_t
*get_block
)
2388 struct inode
*inode
= page
->mapping
->host
;
2389 int err
= __block_prepare_write(inode
, page
, from
, to
, get_block
);
2391 ClearPageUptodate(page
);
2394 EXPORT_SYMBOL(block_prepare_write
);
2396 int block_commit_write(struct page
*page
, unsigned from
, unsigned to
)
2398 struct inode
*inode
= page
->mapping
->host
;
2399 __block_commit_write(inode
,page
,from
,to
);
2402 EXPORT_SYMBOL(block_commit_write
);
2405 * block_page_mkwrite() is not allowed to change the file size as it gets
2406 * called from a page fault handler when a page is first dirtied. Hence we must
2407 * be careful to check for EOF conditions here. We set the page up correctly
2408 * for a written page which means we get ENOSPC checking when writing into
2409 * holes and correct delalloc and unwritten extent mapping on filesystems that
2410 * support these features.
2412 * We are not allowed to take the i_mutex here so we have to play games to
2413 * protect against truncate races as the page could now be beyond EOF. Because
2414 * vmtruncate() writes the inode size before removing pages, once we have the
2415 * page lock we can determine safely if the page is beyond EOF. If it is not
2416 * beyond EOF, then the page is guaranteed safe against truncation until we unlock the page.
2420 block_page_mkwrite(struct vm_area_struct
*vma
, struct vm_fault
*vmf
,
2421 get_block_t get_block
)
2423 struct page
*page
= vmf
->page
;
2424 struct inode
*inode
= vma
->vm_file
->f_path
.dentry
->d_inode
;
2427 int ret
= VM_FAULT_NOPAGE
; /* make the VM retry the fault */
2430 size
= i_size_read(inode
);
2431 if ((page
->mapping
!= inode
->i_mapping
) ||
2432 (page_offset(page
) > size
)) {
2433 /* page got truncated out from underneath us */
2438 /* page is wholly or partially inside EOF */
2439 if (((page
->index
+ 1) << PAGE_CACHE_SHIFT
) > size
)
2440 end
= size
& ~PAGE_CACHE_MASK
;
2442 end
= PAGE_CACHE_SIZE
;
2444 ret
= block_prepare_write(page
, 0, end
, get_block
);
2446 ret
= block_commit_write(page
, 0, end
);
2448 if (unlikely(ret
)) {
2452 else /* -ENOSPC, -EIO, etc */
2453 ret
= VM_FAULT_SIGBUS
;
2455 ret
= VM_FAULT_LOCKED
;
2460 EXPORT_SYMBOL(block_page_mkwrite
);
2463 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2464 * immediately, while under the page lock. So it needs a special end_io
2465 * handler which does not touch the bh after unlocking it.
2467 static void end_buffer_read_nobh(struct buffer_head
*bh
, int uptodate
)
2469 __end_buffer_read_notouch(bh
, uptodate
);
2473 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2474 * the page (converting it to a circular linked list and taking care of page dirty races).
2477 static void attach_nobh_buffers(struct page
*page
, struct buffer_head
*head
)
2479 struct buffer_head
*bh
;
2481 BUG_ON(!PageLocked(page
));
2483 spin_lock(&page
->mapping
->private_lock
);
2486 if (PageDirty(page
))
2487 set_buffer_dirty(bh
);
2488 if (!bh
->b_this_page
)
2489 bh
->b_this_page
= head
;
2490 bh
= bh
->b_this_page
;
2491 } while (bh
!= head
);
2492 attach_page_buffers(page
, head
);
2493 spin_unlock(&page
->mapping
->private_lock
);
2497 * On entry, the page is fully not uptodate.
2498 * On exit the page is fully uptodate in the areas outside (from,to)
2500 int nobh_write_begin(struct file
*file
, struct address_space
*mapping
,
2501 loff_t pos
, unsigned len
, unsigned flags
,
2502 struct page
**pagep
, void **fsdata
,
2503 get_block_t
*get_block
)
2505 struct inode
*inode
= mapping
->host
;
2506 const unsigned blkbits
= inode
->i_blkbits
;
2507 const unsigned blocksize
= 1 << blkbits
;
2508 struct buffer_head
*head
, *bh
;
2512 unsigned block_in_page
;
2513 unsigned block_start
, block_end
;
2514 sector_t block_in_file
;
2517 int is_mapped_to_disk
= 1;
2519 index
= pos
>> PAGE_CACHE_SHIFT
;
2520 from
= pos
& (PAGE_CACHE_SIZE
- 1);
2523 page
= grab_cache_page_write_begin(mapping
, index
, flags
);
2529 if (page_has_buffers(page
)) {
2531 page_cache_release(page
);
2533 return block_write_begin(file
, mapping
, pos
, len
, flags
, pagep
,
2537 if (PageMappedToDisk(page
))
2541 * Allocate buffers so that we can keep track of state, and potentially
2542 * attach them to the page if an error occurs. In the common case of
2543 * no error, they will just be freed again without ever being attached
2544 * to the page (which is all OK, because we're under the page lock).
2546 * Be careful: the buffer linked list is a NULL terminated one, rather
2547 * than the circular one we're used to.
2549 head
= alloc_page_buffers(page
, blocksize
, 0);
2555 block_in_file
= (sector_t
)page
->index
<< (PAGE_CACHE_SHIFT
- blkbits
);
2558 * We loop across all blocks in the page, whether or not they are
2559 * part of the affected region. This is so we can discover if the
2560 * page is fully mapped-to-disk.
2562 for (block_start
= 0, block_in_page
= 0, bh
= head
;
2563 block_start
< PAGE_CACHE_SIZE
;
2564 block_in_page
++, block_start
+= blocksize
, bh
= bh
->b_this_page
) {
2567 block_end
= block_start
+ blocksize
;
2570 if (block_start
>= to
)
2572 ret
= get_block(inode
, block_in_file
+ block_in_page
,
2576 if (!buffer_mapped(bh
))
2577 is_mapped_to_disk
= 0;
2579 unmap_underlying_metadata(bh
->b_bdev
, bh
->b_blocknr
);
2580 if (PageUptodate(page
)) {
2581 set_buffer_uptodate(bh
);
2584 if (buffer_new(bh
) || !buffer_mapped(bh
)) {
2585 zero_user_segments(page
, block_start
, from
,
2589 if (buffer_uptodate(bh
))
2590 continue; /* reiserfs does this */
2591 if (block_start
< from
|| block_end
> to
) {
2593 bh
->b_end_io
= end_buffer_read_nobh
;
2594 submit_bh(READ
, bh
);
2601 * The page is locked, so these buffers are protected from
2602 * any VM or truncate activity. Hence we don't need to care
2603 * for the buffer_head refcounts.
2605 for (bh
= head
; bh
; bh
= bh
->b_this_page
) {
2607 if (!buffer_uptodate(bh
))
2614 if (is_mapped_to_disk
)
2615 SetPageMappedToDisk(page
);
2617 *fsdata
= head
; /* to be released by nobh_write_end */
2624 * Error recovery is a bit difficult. We need to zero out blocks that
2625 * were newly allocated, and dirty them to ensure they get written out.
2626 * Buffers need to be attached to the page at this point, otherwise
2627 * the handling of potential IO errors during writeout would be hard
2628 * (could try doing synchronous writeout, but what if that fails too?)
2630 attach_nobh_buffers(page
, head
);
2631 page_zero_new_buffers(page
, from
, to
);
2635 page_cache_release(page
);
2638 if (pos
+ len
> inode
->i_size
)
2639 vmtruncate(inode
, inode
->i_size
);
2643 EXPORT_SYMBOL(nobh_write_begin
);
2645 int nobh_write_end(struct file
*file
, struct address_space
*mapping
,
2646 loff_t pos
, unsigned len
, unsigned copied
,
2647 struct page
*page
, void *fsdata
)
2649 struct inode
*inode
= page
->mapping
->host
;
2650 struct buffer_head
*head
= fsdata
;
2651 struct buffer_head
*bh
;
2652 BUG_ON(fsdata
!= NULL
&& page_has_buffers(page
));
2654 if (unlikely(copied
< len
) && head
)
2655 attach_nobh_buffers(page
, head
);
2656 if (page_has_buffers(page
))
2657 return generic_write_end(file
, mapping
, pos
, len
,
2658 copied
, page
, fsdata
);
2660 SetPageUptodate(page
);
2661 set_page_dirty(page
);
2662 if (pos
+copied
> inode
->i_size
) {
2663 i_size_write(inode
, pos
+copied
);
2664 mark_inode_dirty(inode
);
2668 page_cache_release(page
);
2672 head
= head
->b_this_page
;
2673 free_buffer_head(bh
);
2678 EXPORT_SYMBOL(nobh_write_end
);
2681 * nobh_writepage() - based on block_full_write_page() except
2682 * that it tries to operate without attaching bufferheads to
2685 int nobh_writepage(struct page
*page
, get_block_t
*get_block
,
2686 struct writeback_control
*wbc
)
2688 struct inode
* const inode
= page
->mapping
->host
;
2689 loff_t i_size
= i_size_read(inode
);
2690 const pgoff_t end_index
= i_size
>> PAGE_CACHE_SHIFT
;
2694 /* Is the page fully inside i_size? */
2695 if (page
->index
< end_index
)
2698 /* Is the page fully outside i_size? (truncate in progress) */
2699 offset
= i_size
& (PAGE_CACHE_SIZE
-1);
2700 if (page
->index
>= end_index
+1 || !offset
) {
2702 * The page may have dirty, unmapped buffers. For example,
2703 * they may have been added in ext3_writepage(). Make them
2704 * freeable here, so the page does not leak.
2707 /* Not really sure about this - do we need this ? */
2708 if (page
->mapping
->a_ops
->invalidatepage
)
2709 page
->mapping
->a_ops
->invalidatepage(page
, offset
);
2712 return 0; /* don't care */
2716 * The page straddles i_size. It must be zeroed out on each and every
2717 * writepage invocation because it may be mmapped. "A file is mapped
2718 * in multiples of the page size. For a file that is not a multiple of
2719 * the page size, the remaining memory is zeroed when mapped, and
2720 * writes to that region are not written out to the file."
2722 zero_user_segment(page
, offset
, PAGE_CACHE_SIZE
);
2724 ret
= mpage_writepage(page
, get_block
, wbc
);
2726 ret
= __block_write_full_page(inode
, page
, get_block
, wbc
,
2727 end_buffer_async_write
);
2730 EXPORT_SYMBOL(nobh_writepage
);
2732 int nobh_truncate_page(struct address_space
*mapping
,
2733 loff_t from
, get_block_t
*get_block
)
2735 pgoff_t index
= from
>> PAGE_CACHE_SHIFT
;
2736 unsigned offset
= from
& (PAGE_CACHE_SIZE
-1);
2739 unsigned length
, pos
;
2740 struct inode
*inode
= mapping
->host
;
2742 struct buffer_head map_bh
;
2745 blocksize
= 1 << inode
->i_blkbits
;
2746 length
= offset
& (blocksize
- 1);
2748 /* Block boundary? Nothing to do */
2752 length
= blocksize
- length
;
2753 iblock
= (sector_t
)index
<< (PAGE_CACHE_SHIFT
- inode
->i_blkbits
);
2755 page
= grab_cache_page(mapping
, index
);
2760 if (page_has_buffers(page
)) {
2763 page_cache_release(page
);
2764 return block_truncate_page(mapping
, from
, get_block
);
2767 /* Find the buffer that contains "offset" */
2769 while (offset
>= pos
) {
2774 map_bh
.b_size
= blocksize
;
2776 err
= get_block(inode
, iblock
, &map_bh
, 0);
2779 /* unmapped? It's a hole - nothing to do */
2780 if (!buffer_mapped(&map_bh
))
2783 /* Ok, it's mapped. Make sure it's up-to-date */
2784 if (!PageUptodate(page
)) {
2785 err
= mapping
->a_ops
->readpage(NULL
, page
);
2787 page_cache_release(page
);
2791 if (!PageUptodate(page
)) {
2795 if (page_has_buffers(page
))
2798 zero_user(page
, offset
, length
);
2799 set_page_dirty(page
);
2804 page_cache_release(page
);
2808 EXPORT_SYMBOL(nobh_truncate_page
);
2810 int block_truncate_page(struct address_space
*mapping
,
2811 loff_t from
, get_block_t
*get_block
)
2813 pgoff_t index
= from
>> PAGE_CACHE_SHIFT
;
2814 unsigned offset
= from
& (PAGE_CACHE_SIZE
-1);
2817 unsigned length
, pos
;
2818 struct inode
*inode
= mapping
->host
;
2820 struct buffer_head
*bh
;
2823 blocksize
= 1 << inode
->i_blkbits
;
2824 length
= offset
& (blocksize
- 1);
2826 /* Block boundary? Nothing to do */
2830 length
= blocksize
- length
;
2831 iblock
= (sector_t
)index
<< (PAGE_CACHE_SHIFT
- inode
->i_blkbits
);
2833 page
= grab_cache_page(mapping
, index
);
2838 if (!page_has_buffers(page
))
2839 create_empty_buffers(page
, blocksize
, 0);
2841 /* Find the buffer that contains "offset" */
2842 bh
= page_buffers(page
);
2844 while (offset
>= pos
) {
2845 bh
= bh
->b_this_page
;
2851 if (!buffer_mapped(bh
)) {
2852 WARN_ON(bh
->b_size
!= blocksize
);
2853 err
= get_block(inode
, iblock
, bh
, 0);
2856 /* unmapped? It's a hole - nothing to do */
2857 if (!buffer_mapped(bh
))
2861 /* Ok, it's mapped. Make sure it's up-to-date */
2862 if (PageUptodate(page
))
2863 set_buffer_uptodate(bh
);
2865 if (!buffer_uptodate(bh
) && !buffer_delay(bh
) && !buffer_unwritten(bh
)) {
2867 ll_rw_block(READ
, 1, &bh
);
2869 /* Uhhuh. Read error. Complain and punt. */
2870 if (!buffer_uptodate(bh
))
2874 zero_user(page
, offset
, length
);
2875 mark_buffer_dirty(bh
);
2880 page_cache_release(page
);
2884 EXPORT_SYMBOL(block_truncate_page
);
2887 * The generic ->writepage function for buffer-backed address_spaces
2888 * this form passes in the end_io handler used to finish the IO.
2890 int block_write_full_page_endio(struct page
*page
, get_block_t
*get_block
,
2891 struct writeback_control
*wbc
, bh_end_io_t
*handler
)
2893 struct inode
* const inode
= page
->mapping
->host
;
2894 loff_t i_size
= i_size_read(inode
);
2895 const pgoff_t end_index
= i_size
>> PAGE_CACHE_SHIFT
;
2898 /* Is the page fully inside i_size? */
2899 if (page
->index
< end_index
)
2900 return __block_write_full_page(inode
, page
, get_block
, wbc
,
2903 /* Is the page fully outside i_size? (truncate in progress) */
2904 offset
= i_size
& (PAGE_CACHE_SIZE
-1);
2905 if (page
->index
>= end_index
+1 || !offset
) {
2907 * The page may have dirty, unmapped buffers. For example,
2908 * they may have been added in ext3_writepage(). Make them
2909 * freeable here, so the page does not leak.
2911 do_invalidatepage(page
, 0);
2913 return 0; /* don't care */
2917 * The page straddles i_size. It must be zeroed out on each and every
2918 * writepage invokation because it may be mmapped. "A file is mapped
2919 * in multiples of the page size. For a file that is not a multiple of
2920 * the page size, the remaining memory is zeroed when mapped, and
2921 * writes to that region are not written out to the file."
2923 zero_user_segment(page
, offset
, PAGE_CACHE_SIZE
);
2924 return __block_write_full_page(inode
, page
, get_block
, wbc
, handler
);
2926 EXPORT_SYMBOL(block_write_full_page_endio
);
2929 * The generic ->writepage function for buffer-backed address_spaces
2931 int block_write_full_page(struct page
*page
, get_block_t
*get_block
,
2932 struct writeback_control
*wbc
)
2934 return block_write_full_page_endio(page
, get_block
, wbc
,
2935 end_buffer_async_write
);
2937 EXPORT_SYMBOL(block_write_full_page
);
2939 sector_t
generic_block_bmap(struct address_space
*mapping
, sector_t block
,
2940 get_block_t
*get_block
)
2942 struct buffer_head tmp
;
2943 struct inode
*inode
= mapping
->host
;
2946 tmp
.b_size
= 1 << inode
->i_blkbits
;
2947 get_block(inode
, block
, &tmp
, 0);
2948 return tmp
.b_blocknr
;
2950 EXPORT_SYMBOL(generic_block_bmap
);
2952 static void end_bio_bh_io_sync(struct bio
*bio
, int err
)
2954 struct buffer_head
*bh
= bio
->bi_private
;
2956 if (err
== -EOPNOTSUPP
) {
2957 set_bit(BIO_EOPNOTSUPP
, &bio
->bi_flags
);
2958 set_bit(BH_Eopnotsupp
, &bh
->b_state
);
2961 if (unlikely (test_bit(BIO_QUIET
,&bio
->bi_flags
)))
2962 set_bit(BH_Quiet
, &bh
->b_state
);
2964 bh
->b_end_io(bh
, test_bit(BIO_UPTODATE
, &bio
->bi_flags
));
2968 int submit_bh(int rw
, struct buffer_head
* bh
)
2973 BUG_ON(!buffer_locked(bh
));
2974 BUG_ON(!buffer_mapped(bh
));
2975 BUG_ON(!bh
->b_end_io
);
2976 BUG_ON(buffer_delay(bh
));
2977 BUG_ON(buffer_unwritten(bh
));
2980 * Mask in barrier bit for a write (could be either a WRITE or a
2983 if (buffer_ordered(bh
) && (rw
& WRITE
))
2984 rw
|= WRITE_BARRIER
;
2987 * Only clear out a write error when rewriting
2989 if (test_set_buffer_req(bh
) && (rw
& WRITE
))
2990 clear_buffer_write_io_error(bh
);
2993 * from here on down, it's all bio -- do the initial mapping,
2994 * submit_bio -> generic_make_request may further map this bio around
2996 bio
= bio_alloc(GFP_NOIO
, 1);
2998 bio
->bi_sector
= bh
->b_blocknr
* (bh
->b_size
>> 9);
2999 bio
->bi_bdev
= bh
->b_bdev
;
3000 bio
->bi_io_vec
[0].bv_page
= bh
->b_page
;
3001 bio
->bi_io_vec
[0].bv_len
= bh
->b_size
;
3002 bio
->bi_io_vec
[0].bv_offset
= bh_offset(bh
);
3006 bio
->bi_size
= bh
->b_size
;
3008 bio
->bi_end_io
= end_bio_bh_io_sync
;
3009 bio
->bi_private
= bh
;
3012 submit_bio(rw
, bio
);
3014 if (bio_flagged(bio
, BIO_EOPNOTSUPP
))
3020 EXPORT_SYMBOL(submit_bh
);
3023 * ll_rw_block: low-level access to block devices (DEPRECATED)
3024 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
3025 * @nr: number of &struct buffer_heads in the array
3026 * @bhs: array of pointers to &struct buffer_head
3028 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3029 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3030 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
3031 * are sent to disk. The fourth %READA option is described in the documentation
3032 * for generic_make_request() which ll_rw_block() calls.
3034 * This function drops any buffer that it cannot get a lock on (with the
3035 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3036 * clean when doing a write request, and any buffer that appears to be
3037 * up-to-date when doing read request. Further it marks as clean buffers that
3038 * are processed for writing (the buffer cache won't assume that they are
3039 * actually clean until the buffer gets unlocked).
3041 * ll_rw_block sets b_end_io to simple completion handler that marks
3042 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
3045 * All of the buffers must be for the same device, and must also be a
3046 * multiple of the current approved size for the device.
3048 void ll_rw_block(int rw
, int nr
, struct buffer_head
*bhs
[])
3052 for (i
= 0; i
< nr
; i
++) {
3053 struct buffer_head
*bh
= bhs
[i
];
3055 if (rw
== SWRITE
|| rw
== SWRITE_SYNC
|| rw
== SWRITE_SYNC_PLUG
)
3057 else if (!trylock_buffer(bh
))
3060 if (rw
== WRITE
|| rw
== SWRITE
|| rw
== SWRITE_SYNC
||
3061 rw
== SWRITE_SYNC_PLUG
) {
3062 if (test_clear_buffer_dirty(bh
)) {
3063 bh
->b_end_io
= end_buffer_write_sync
;
3065 if (rw
== SWRITE_SYNC
)
3066 submit_bh(WRITE_SYNC
, bh
);
3068 submit_bh(WRITE
, bh
);
3072 if (!buffer_uptodate(bh
)) {
3073 bh
->b_end_io
= end_buffer_read_sync
;
3082 EXPORT_SYMBOL(ll_rw_block
);
3085 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3086 * and then start new I/O and then wait upon it. The caller must have a ref on
3089 int sync_dirty_buffer(struct buffer_head
*bh
)
3093 WARN_ON(atomic_read(&bh
->b_count
) < 1);
3095 if (test_clear_buffer_dirty(bh
)) {
3097 bh
->b_end_io
= end_buffer_write_sync
;
3098 ret
= submit_bh(WRITE_SYNC
, bh
);
3100 if (buffer_eopnotsupp(bh
)) {
3101 clear_buffer_eopnotsupp(bh
);
3104 if (!ret
&& !buffer_uptodate(bh
))
3111 EXPORT_SYMBOL(sync_dirty_buffer
);
3114 * try_to_free_buffers() checks if all the buffers on this particular page
3115 * are unused, and releases them if so.
3117 * Exclusion against try_to_free_buffers may be obtained by either
3118 * locking the page or by holding its mapping's private_lock.
3120 * If the page is dirty but all the buffers are clean then we need to
3121 * be sure to mark the page clean as well. This is because the page
3122 * may be against a block device, and a later reattachment of buffers
3123 * to a dirty page will set *all* buffers dirty. Which would corrupt
3124 * filesystem data on the same device.
3126 * The same applies to regular filesystem pages: if all the buffers are
3127 * clean then we set the page clean and proceed. To do that, we require
3128 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3131 * try_to_free_buffers() is non-blocking.
3133 static inline int buffer_busy(struct buffer_head
*bh
)
3135 return atomic_read(&bh
->b_count
) |
3136 (bh
->b_state
& ((1 << BH_Dirty
) | (1 << BH_Lock
)));
3140 drop_buffers(struct page
*page
, struct buffer_head
**buffers_to_free
)
3142 struct buffer_head
*head
= page_buffers(page
);
3143 struct buffer_head
*bh
;
3147 if (buffer_write_io_error(bh
) && page
->mapping
)
3148 set_bit(AS_EIO
, &page
->mapping
->flags
);
3149 if (buffer_busy(bh
))
3151 bh
= bh
->b_this_page
;
3152 } while (bh
!= head
);
3155 struct buffer_head
*next
= bh
->b_this_page
;
3157 if (bh
->b_assoc_map
)
3158 __remove_assoc_queue(bh
);
3160 } while (bh
!= head
);
3161 *buffers_to_free
= head
;
3162 __clear_page_buffers(page
);
3168 int try_to_free_buffers(struct page
*page
)
3170 struct address_space
* const mapping
= page
->mapping
;
3171 struct buffer_head
*buffers_to_free
= NULL
;
3174 BUG_ON(!PageLocked(page
));
3175 if (PageWriteback(page
))
3178 if (mapping
== NULL
) { /* can this still happen? */
3179 ret
= drop_buffers(page
, &buffers_to_free
);
3183 spin_lock(&mapping
->private_lock
);
3184 ret
= drop_buffers(page
, &buffers_to_free
);
3187 * If the filesystem writes its buffers by hand (eg ext3)
3188 * then we can have clean buffers against a dirty page. We
3189 * clean the page here; otherwise the VM will never notice
3190 * that the filesystem did any IO at all.
3192 * Also, during truncate, discard_buffer will have marked all
3193 * the page's buffers clean. We discover that here and clean
3196 * private_lock must be held over this entire operation in order
3197 * to synchronise against __set_page_dirty_buffers and prevent the
3198 * dirty bit from being lost.
3201 cancel_dirty_page(page
, PAGE_CACHE_SIZE
);
3202 spin_unlock(&mapping
->private_lock
);
3204 if (buffers_to_free
) {
3205 struct buffer_head
*bh
= buffers_to_free
;
3208 struct buffer_head
*next
= bh
->b_this_page
;
3209 free_buffer_head(bh
);
3211 } while (bh
!= buffers_to_free
);
3215 EXPORT_SYMBOL(try_to_free_buffers
);
3217 void block_sync_page(struct page
*page
)
3219 struct address_space
*mapping
;
3222 mapping
= page_mapping(page
);
3224 blk_run_backing_dev(mapping
->backing_dev_info
, page
);
3226 EXPORT_SYMBOL(block_sync_page
);
3229 * There are no bdflush tunables left. But distributions are
3230 * still running obsolete flush daemons, so we terminate them here.
3232 * Use of bdflush() is deprecated and will be removed in a future kernel.
3233 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3235 SYSCALL_DEFINE2(bdflush
, int, func
, long, data
)
3237 static int msg_count
;
3239 if (!capable(CAP_SYS_ADMIN
))
3242 if (msg_count
< 5) {
3245 "warning: process `%s' used the obsolete bdflush"
3246 " system call\n", current
->comm
);
3247 printk(KERN_INFO
"Fix your initscripts?\n");
3256 * Buffer-head allocation
3258 static struct kmem_cache
*bh_cachep
;
3261 * Once the number of bh's in the machine exceeds this level, we start
3262 * stripping them in writeback.
3264 static int max_buffer_heads
;
3266 int buffer_heads_over_limit
;
3268 struct bh_accounting
{
3269 int nr
; /* Number of live bh's */
3270 int ratelimit
; /* Limit cacheline bouncing */
3273 static DEFINE_PER_CPU(struct bh_accounting
, bh_accounting
) = {0, 0};
3275 static void recalc_bh_state(void)
3280 if (__get_cpu_var(bh_accounting
).ratelimit
++ < 4096)
3282 __get_cpu_var(bh_accounting
).ratelimit
= 0;
3283 for_each_online_cpu(i
)
3284 tot
+= per_cpu(bh_accounting
, i
).nr
;
3285 buffer_heads_over_limit
= (tot
> max_buffer_heads
);
3288 struct buffer_head
*alloc_buffer_head(gfp_t gfp_flags
)
3290 struct buffer_head
*ret
= kmem_cache_alloc(bh_cachep
, gfp_flags
);
3292 INIT_LIST_HEAD(&ret
->b_assoc_buffers
);
3293 get_cpu_var(bh_accounting
).nr
++;
3295 put_cpu_var(bh_accounting
);
3299 EXPORT_SYMBOL(alloc_buffer_head
);
3301 void free_buffer_head(struct buffer_head
*bh
)
3303 BUG_ON(!list_empty(&bh
->b_assoc_buffers
));
3304 kmem_cache_free(bh_cachep
, bh
);
3305 get_cpu_var(bh_accounting
).nr
--;
3307 put_cpu_var(bh_accounting
);
3309 EXPORT_SYMBOL(free_buffer_head
);
3311 static void buffer_exit_cpu(int cpu
)
3314 struct bh_lru
*b
= &per_cpu(bh_lrus
, cpu
);
3316 for (i
= 0; i
< BH_LRU_SIZE
; i
++) {
3320 get_cpu_var(bh_accounting
).nr
+= per_cpu(bh_accounting
, cpu
).nr
;
3321 per_cpu(bh_accounting
, cpu
).nr
= 0;
3322 put_cpu_var(bh_accounting
);
3325 static int buffer_cpu_notify(struct notifier_block
*self
,
3326 unsigned long action
, void *hcpu
)
3328 if (action
== CPU_DEAD
|| action
== CPU_DEAD_FROZEN
)
3329 buffer_exit_cpu((unsigned long)hcpu
);
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Re-check under the lock; on a miss the lock stays held. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3353 * bh_submit_read - Submit a locked buffer for reading
3354 * @bh: struct buffer_head
3356 * Returns zero on success and -EIO on error.
3358 int bh_submit_read(struct buffer_head
*bh
)
3360 BUG_ON(!buffer_locked(bh
));
3362 if (buffer_uptodate(bh
)) {
3368 bh
->b_end_io
= end_buffer_read_sync
;
3369 submit_bh(READ
, bh
);
3371 if (buffer_uptodate(bh
))
3375 EXPORT_SYMBOL(bh_submit_read
);
3378 init_buffer_head(void *data
)
3380 struct buffer_head
*bh
= data
;
3382 memset(bh
, 0, sizeof(*bh
));
3383 INIT_LIST_HEAD(&bh
->b_assoc_buffers
);
3386 void __init
buffer_init(void)
3390 bh_cachep
= kmem_cache_create("buffer_head",
3391 sizeof(struct buffer_head
), 0,
3392 (SLAB_RECLAIM_ACCOUNT
|SLAB_PANIC
|
3397 * Limit the bh occupancy to 10% of ZONE_NORMAL
3399 nrpages
= (nr_free_buffer_pages() * 10) / 100;
3400 max_buffer_heads
= nrpages
* (PAGE_SIZE
/ sizeof(struct buffer_head
));
3401 hotcpu_notifier(buffer_cpu_notify
, 0);