1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * File open, close, extend, truncate
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
26 #include <linux/capability.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 #include <linux/pagemap.h>
32 #include <linux/uio.h>
33 #include <linux/sched.h>
34 #include <linux/pipe_fs_i.h>
35 #include <linux/mount.h>
37 #define MLOG_MASK_PREFIX ML_INODE
38 #include <cluster/masklog.h>
46 #include "extent_map.h"
56 #include "buffer_head_io.h"
58 static int ocfs2_sync_inode(struct inode
*inode
)
60 filemap_fdatawrite(inode
->i_mapping
);
61 return sync_mapping_buffers(inode
->i_mapping
);
/*
 * ->open() callback for ocfs2 files.
 *
 * Under oi->ip_lock, verify the inode has not been wiped from disk by
 * another cluster node (OCFS2_INODE_DELETED) before marking it open.
 * NOTE(review): several interior lines (error return, open-count
 * increment, exit path) are not visible in this view.
 */
64 static int ocfs2_file_open(struct inode
*inode
, struct file
*file
)
/* 'mode' actually holds the open flags (f_flags), not a mode_t. */
67 int mode
= file
->f_flags
;
68 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
70 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode
, file
,
71 file
->f_path
.dentry
->d_name
.len
, file
->f_path
.dentry
->d_name
.name
);
73 spin_lock(&oi
->ip_lock
);
75 /* Check that the inode hasn't been wiped from disk by another
76 * node. If it hasn't then we're safe as long as we hold the
77 * spin lock until our increment of open count. */
78 if (OCFS2_I(inode
)->ip_flags
& OCFS2_INODE_DELETED
) {
79 spin_unlock(&oi
->ip_lock
);
/* Record that this inode has a direct-I/O opener. */
86 oi
->ip_flags
|= OCFS2_INODE_OPEN_DIRECT
;
89 spin_unlock(&oi
->ip_lock
);
/*
 * ->release() callback for ocfs2 files.
 *
 * Drops one reference from ip_open_count under ip_lock; when the last
 * opener goes away, clear the OCFS2_INODE_OPEN_DIRECT flag set at
 * open time.
 */
96 static int ocfs2_file_release(struct inode
*inode
, struct file
*file
)
98 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
100 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode
, file
,
101 file
->f_path
.dentry
->d_name
.len
,
102 file
->f_path
.dentry
->d_name
.name
);
104 spin_lock(&oi
->ip_lock
);
/* Last close: no more direct-I/O openers on this inode. */
105 if (!--oi
->ip_open_count
)
106 oi
->ip_flags
&= ~OCFS2_INODE_OPEN_DIRECT
;
107 spin_unlock(&oi
->ip_lock
);
/*
 * ->fsync() callback: flush the inode's data (via ocfs2_sync_inode)
 * and then force a commit of the journal so the metadata is durable.
 *
 * Returns 0 on success, -EIO on any failure (the raw errno is
 * collapsed to -EIO at the end).
 */
114 static int ocfs2_sync_file(struct file
*file
,
115 struct dentry
*dentry
,
120 struct inode
*inode
= dentry
->d_inode
;
121 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
123 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file
, dentry
, datasync
,
124 dentry
->d_name
.len
, dentry
->d_name
.name
);
126 err
= ocfs2_sync_inode(dentry
->d_inode
);
/* Make the journalled metadata durable as well. */
130 journal
= osb
->journal
->j_journal
;
131 err
= journal_force_commit(journal
);
136 return (err
< 0) ? -EIO
: 0;
/*
 * Decide whether an atime update is warranted for @inode accessed via
 * @vfsmnt.  Mirrors the VFS noatime/nodiratime/relatime rules, plus an
 * ocfs2-specific "atime quantum" (s_atime_quantum) that suppresses
 * updates when the current atime is recent enough.  Never update on a
 * (hard or soft) read-only filesystem.
 * NOTE(review): the early returns for each test are elided from this
 * view.
 */
139 int ocfs2_should_update_atime(struct inode
*inode
,
140 struct vfsmount
*vfsmnt
)
143 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
145 if (ocfs2_is_hard_readonly(osb
) || ocfs2_is_soft_readonly(osb
))
148 if ((inode
->i_flags
& S_NOATIME
) ||
149 ((inode
->i_sb
->s_flags
& MS_NODIRATIME
) && S_ISDIR(inode
->i_mode
)))
152 if ((vfsmnt
->mnt_flags
& MNT_NOATIME
) ||
153 ((vfsmnt
->mnt_flags
& MNT_NODIRATIME
) && S_ISDIR(inode
->i_mode
)))
/* relatime: only update when atime is older than mtime/ctime. */
156 if (vfsmnt
->mnt_flags
& MNT_RELATIME
) {
157 if ((timespec_compare(&inode
->i_atime
, &inode
->i_mtime
) <= 0) ||
158 (timespec_compare(&inode
->i_atime
, &inode
->i_ctime
) <= 0))
/* Skip the update if atime changed within the configured quantum. */
165 if ((now
.tv_sec
- inode
->i_atime
.tv_sec
<= osb
->s_atime_quantum
))
/*
 * Set the inode's atime to the current time and mark the inode dirty
 * inside a single-credit journal transaction.
 *
 * NOTE(review): this checks ocfs2_start_trans() against NULL while
 * other call sites in this file check IS_ERR() (e.g. in
 * ocfs2_orphan_for_truncate and ocfs2_setattr) — one of the two
 * conventions is wrong; confirm the actual return contract of
 * ocfs2_start_trans() and make the checks consistent.
 */
171 int ocfs2_update_inode_atime(struct inode
*inode
,
172 struct buffer_head
*bh
)
175 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
180 handle
= ocfs2_start_trans(osb
, OCFS2_INODE_UPDATE_CREDITS
);
181 if (handle
== NULL
) {
187 inode
->i_atime
= CURRENT_TIME
;
188 ret
= ocfs2_mark_inode_dirty(handle
, inode
, bh
);
192 ocfs2_commit_trans(OCFS2_SB(inode
->i_sb
), handle
);
/*
 * Update the in-core inode size to @new_i_size, recompute i_blocks,
 * bump ctime/mtime, and journal the change via
 * ocfs2_mark_inode_dirty().  Caller supplies a running transaction
 * @handle and the inode's dinode buffer @fe_bh.
 */
198 int ocfs2_set_inode_size(handle_t
*handle
,
200 struct buffer_head
*fe_bh
,
206 i_size_write(inode
, new_i_size
);
207 inode
->i_blocks
= ocfs2_align_bytes_to_sectors(new_i_size
);
208 inode
->i_ctime
= inode
->i_mtime
= CURRENT_TIME
;
210 status
= ocfs2_mark_inode_dirty(handle
, inode
, fe_bh
);
/*
 * Convenience wrapper: start a one-credit transaction, call
 * ocfs2_set_inode_size(), and commit.  Used for size changes that
 * need no allocation work.
 *
 * NOTE(review): checks ocfs2_start_trans() against NULL whereas other
 * sites in this file use IS_ERR() — verify which is correct.
 */
221 static int ocfs2_simple_size_update(struct inode
*inode
,
222 struct buffer_head
*di_bh
,
226 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
227 handle_t
*handle
= NULL
;
229 handle
= ocfs2_start_trans(osb
, OCFS2_INODE_UPDATE_CREDITS
);
230 if (handle
== NULL
) {
236 ret
= ocfs2_set_inode_size(handle
, inode
, di_bh
,
241 ocfs2_commit_trans(osb
, handle
);
/*
 * Pre-truncate step: update the on-disk i_size ahead of the actual
 * allocation change so that recovery can finish the truncate if we
 * crash mid-way.  The TODO below notes that true orphan-dir insertion
 * is not implemented here yet.
 */
246 static int ocfs2_orphan_for_truncate(struct ocfs2_super
*osb
,
248 struct buffer_head
*fe_bh
,
256 /* TODO: This needs to actually orphan the inode in this
259 handle
= ocfs2_start_trans(osb
, OCFS2_INODE_UPDATE_CREDITS
);
260 if (IS_ERR(handle
)) {
261 status
= PTR_ERR(handle
);
266 status
= ocfs2_set_inode_size(handle
, inode
, fe_bh
, new_i_size
);
270 ocfs2_commit_trans(osb
, handle
);
/*
 * Truncate @inode down to @new_i_size.
 *
 * Fast path: when the cluster count is unchanged, only i_size needs
 * updating (ocfs2_simple_size_update).  Slow path: orphan the inode
 * for crash recovery, then prepare and commit a full truncate of the
 * allocation tree.  A cluster-wide data lock/unlock cycle forces other
 * nodes to sync and drop their cached pages first.
 * NOTE(review): error-handling/exit lines are elided from this view.
 */
276 static int ocfs2_truncate_file(struct inode
*inode
,
277 struct buffer_head
*di_bh
,
281 struct ocfs2_dinode
*fe
= NULL
;
282 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
283 struct ocfs2_truncate_context
*tc
= NULL
;
285 mlog_entry("(inode = %llu, new_i_size = %llu\n",
286 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
287 (unsigned long long)new_i_size
);
/* Drop page-cache pages beyond the new size before touching disk. */
289 truncate_inode_pages(inode
->i_mapping
, new_i_size
);
291 fe
= (struct ocfs2_dinode
*) di_bh
->b_data
;
292 if (!OCFS2_IS_VALID_DINODE(fe
)) {
293 OCFS2_RO_ON_INVALID_DINODE(inode
->i_sb
, fe
);
/* On-disk and in-core sizes must agree before we start. */
298 mlog_bug_on_msg(le64_to_cpu(fe
->i_size
) != i_size_read(inode
),
299 "Inode %llu, inode i_size = %lld != di "
300 "i_size = %llu, i_flags = 0x%x\n",
301 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
303 (unsigned long long)le64_to_cpu(fe
->i_size
),
304 le32_to_cpu(fe
->i_flags
));
/* Growing the file is not a truncate; reject it. */
306 if (new_i_size
> le64_to_cpu(fe
->i_size
)) {
307 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
308 (unsigned long long)le64_to_cpu(fe
->i_size
),
309 (unsigned long long)new_i_size
);
315 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
316 (unsigned long long)le64_to_cpu(fe
->i_blkno
),
317 (unsigned long long)le64_to_cpu(fe
->i_size
),
318 (unsigned long long)new_i_size
);
320 /* lets handle the simple truncate cases before doing any more
321 * cluster locking. */
322 if (new_i_size
== le64_to_cpu(fe
->i_size
))
325 /* This forces other nodes to sync and drop their pages. Do
326 * this even if we have a truncate without allocation change -
327 * ocfs2 cluster sizes can be much greater than page size, so
328 * we have to truncate them anyway. */
329 status
= ocfs2_data_lock(inode
, 1);
334 ocfs2_data_unlock(inode
, 1);
336 if (le32_to_cpu(fe
->i_clusters
) ==
337 ocfs2_clusters_for_bytes(osb
->sb
, new_i_size
)) {
338 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
340 /* No allocation change is required, so lets fast path
342 status
= ocfs2_simple_size_update(inode
, di_bh
, new_i_size
);
348 /* alright, we're going to need to do a full blown alloc size
349 * change. Orphan the inode so that recovery can complete the
350 * truncate if necessary. This does the task of marking
352 status
= ocfs2_orphan_for_truncate(osb
, inode
, di_bh
, new_i_size
);
358 status
= ocfs2_prepare_truncate(osb
, inode
, di_bh
, &tc
);
364 status
= ocfs2_commit_truncate(osb
, inode
, di_bh
, tc
);
370 /* TODO: orphan dir cleanup here. */
/*
 * Core allocation-extension step (documented below in the original
 * comment): claims up to one extent's worth of clusters, inserts it
 * into the inode's extent tree and updates i_clusters, all inside the
 * caller's transaction.  Sets *reason_ret so the caller knows whether
 * to restart with more metadata (RESTART_META) or a fresh/extended
 * transaction (RESTART_TRANS).
 */
378 * extend allocation only here.
379 * we'll update all the disk stuff, and oip->alloc_size
381 * expect stuff to be locked, a transaction started and enough data /
382 * metadata reservations in the contexts.
384 * Will return -EAGAIN, and a reason if a restart is needed.
385 * If passed in, *reason will always be set, even in error.
387 int ocfs2_do_extend_allocation(struct ocfs2_super
*osb
,
390 struct buffer_head
*fe_bh
,
392 struct ocfs2_alloc_context
*data_ac
,
393 struct ocfs2_alloc_context
*meta_ac
,
394 enum ocfs2_alloc_restarted
*reason_ret
)
398 struct ocfs2_dinode
*fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
399 enum ocfs2_alloc_restarted reason
= RESTART_NONE
;
400 u32 bit_off
, num_bits
;
/* Callers must never ask to extend by zero clusters. */
403 BUG_ON(!clusters_to_add
);
405 free_extents
= ocfs2_num_free_extents(osb
, inode
, fe
);
406 if (free_extents
< 0) {
407 status
= free_extents
;
412 /* there are two cases which could cause us to EAGAIN in the
413 * we-need-more-metadata case:
414 * 1) we haven't reserved *any*
415 * 2) we are so fragmented, we've needed to add metadata too
417 if (!free_extents
&& !meta_ac
) {
418 mlog(0, "we haven't reserved any metadata!\n");
420 reason
= RESTART_META
;
422 } else if ((!free_extents
)
423 && (ocfs2_alloc_context_bits_left(meta_ac
)
424 < ocfs2_extend_meta_needed(fe
))) {
425 mlog(0, "filesystem is really fragmented...\n");
427 reason
= RESTART_META
;
/* Grab one contiguous run of clusters from the data allocator. */
431 status
= ocfs2_claim_clusters(osb
, handle
, data_ac
, 1,
432 &bit_off
, &num_bits
);
434 if (status
!= -ENOSPC
)
439 BUG_ON(num_bits
> clusters_to_add
);
441 /* reserve our write early -- insert_extent may update the inode */
442 status
= ocfs2_journal_access(handle
, inode
, fe_bh
,
443 OCFS2_JOURNAL_ACCESS_WRITE
);
449 block
= ocfs2_clusters_to_blocks(osb
->sb
, bit_off
);
450 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
451 num_bits
, bit_off
, (unsigned long long)OCFS2_I(inode
)->ip_blkno
);
452 status
= ocfs2_insert_extent(osb
, handle
, inode
, fe_bh
, block
,
/* Keep the in-core cluster count in sync with the dinode, under
 * ip_lock. */
459 le32_add_cpu(&fe
->i_clusters
, num_bits
);
460 spin_lock(&OCFS2_I(inode
)->ip_lock
);
461 OCFS2_I(inode
)->ip_clusters
= le32_to_cpu(fe
->i_clusters
);
462 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
464 status
= ocfs2_journal_dirty(handle
, fe_bh
);
470 clusters_to_add
-= num_bits
;
/* Partial claim: ask the caller to restart the transaction and
 * come back for the remainder. */
472 if (clusters_to_add
) {
473 mlog(0, "need to alloc once more, clusters = %u, wanted = "
474 "%u\n", fe
->i_clusters
, clusters_to_add
);
476 reason
= RESTART_TRANS
;
482 *reason_ret
= reason
;
/*
 * Drive the full extend-allocation loop for @inode: read the dinode,
 * reserve metadata and data allocator bits, take ip_alloc_sem, start a
 * transaction, and repeatedly call ocfs2_do_extend_allocation() until
 * all requested clusters are allocated — restarting the transaction
 * (RESTART_TRANS) or the whole function (RESTART_META) as directed.
 * NOTE(review): exit labels, restart_func handling and several error
 * branches are elided from this view.
 */
486 static int ocfs2_extend_allocation(struct inode
*inode
,
490 int restart_func
= 0;
491 int drop_alloc_sem
= 0;
492 int credits
, num_free_extents
;
494 struct buffer_head
*bh
= NULL
;
495 struct ocfs2_dinode
*fe
= NULL
;
496 handle_t
*handle
= NULL
;
497 struct ocfs2_alloc_context
*data_ac
= NULL
;
498 struct ocfs2_alloc_context
*meta_ac
= NULL
;
499 enum ocfs2_alloc_restarted why
;
500 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
502 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add
);
504 status
= ocfs2_read_block(osb
, OCFS2_I(inode
)->ip_blkno
, &bh
,
505 OCFS2_BH_CACHED
, inode
);
511 fe
= (struct ocfs2_dinode
*) bh
->b_data
;
512 if (!OCFS2_IS_VALID_DINODE(fe
)) {
513 OCFS2_RO_ON_INVALID_DINODE(inode
->i_sb
, fe
);
519 BUG_ON(le32_to_cpu(fe
->i_clusters
) != OCFS2_I(inode
)->ip_clusters
);
521 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
522 "clusters_to_add = %u\n",
523 (unsigned long long)OCFS2_I(inode
)->ip_blkno
, i_size_read(inode
),
524 fe
->i_clusters
, clusters_to_add
);
526 num_free_extents
= ocfs2_num_free_extents(osb
,
529 if (num_free_extents
< 0) {
530 status
= num_free_extents
;
/* No free extent records left: reserve new metadata up front. */
535 if (!num_free_extents
) {
536 status
= ocfs2_reserve_new_metadata(osb
, fe
, &meta_ac
);
538 if (status
!= -ENOSPC
)
544 status
= ocfs2_reserve_clusters(osb
, clusters_to_add
, &data_ac
);
546 if (status
!= -ENOSPC
)
551 /* blocks peope in read/write from reading our allocation
552 * until we're done changing it. We depend on i_mutex to block
553 * other extend/truncate calls while we're here. Ordering wrt
554 * start_trans is important here -- always do it before! */
555 down_write(&OCFS2_I(inode
)->ip_alloc_sem
);
558 credits
= ocfs2_calc_extend_credits(osb
->sb
, fe
, clusters_to_add
);
559 handle
= ocfs2_start_trans(osb
, credits
);
560 if (IS_ERR(handle
)) {
561 status
= PTR_ERR(handle
);
567 restarted_transaction
:
568 /* reserve a write to the file entry early on - that we if we
569 * run out of credits in the allocation path, we can still
571 status
= ocfs2_journal_access(handle
, inode
, bh
,
572 OCFS2_JOURNAL_ACCESS_WRITE
);
578 prev_clusters
= OCFS2_I(inode
)->ip_clusters
;
580 status
= ocfs2_do_extend_allocation(osb
,
/* -EAGAIN means "restart needed", anything else < 0 is fatal. */
588 if ((status
< 0) && (status
!= -EAGAIN
)) {
589 if (status
!= -ENOSPC
)
594 status
= ocfs2_journal_dirty(handle
, bh
);
/* Account for whatever the last pass actually allocated. */
600 spin_lock(&OCFS2_I(inode
)->ip_lock
);
601 clusters_to_add
-= (OCFS2_I(inode
)->ip_clusters
- prev_clusters
);
602 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
604 if (why
!= RESTART_NONE
&& clusters_to_add
) {
605 if (why
== RESTART_META
) {
606 mlog(0, "restarting function.\n");
609 BUG_ON(why
!= RESTART_TRANS
);
611 mlog(0, "restarting transaction.\n");
612 /* TODO: This can be more intelligent. */
613 credits
= ocfs2_calc_extend_credits(osb
->sb
,
616 status
= ocfs2_extend_trans(handle
, credits
);
618 /* handle still has to be committed at
624 goto restarted_transaction
;
628 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
629 fe
->i_clusters
, (unsigned long long)fe
->i_size
);
630 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
631 OCFS2_I(inode
)->ip_clusters
, i_size_read(inode
));
634 if (drop_alloc_sem
) {
635 up_write(&OCFS2_I(inode
)->ip_alloc_sem
);
639 ocfs2_commit_trans(osb
, handle
);
643 ocfs2_free_alloc_context(data_ac
);
647 ocfs2_free_alloc_context(meta_ac
);
/* RESTART_META path: loop the whole function with fresh
 * reservations. */
650 if ((!status
) && restart_func
) {
663 /* Some parts of this taken from generic_cont_expand, which turned out
664 * to be too fragile to do exactly what we need without us having to
665 * worry about recursive locking in ->prepare_write() and
666 * ->commit_write(). */
/*
 * Zero the partial block at offset @size by grabbing its page,
 * running a zero-length prepare/commit_write cycle at that offset.
 * Skips the work entirely when @size is block-aligned (nothing to
 * zero).  Starts a page-walk transaction when the inode uses ordered
 * data mode.  Must not update i_size (see comment below).
 */
667 static int ocfs2_write_zero_page(struct inode
*inode
,
670 struct address_space
*mapping
= inode
->i_mapping
;
674 handle_t
*handle
= NULL
;
677 offset
= (size
& (PAGE_CACHE_SIZE
-1)); /* Within page */
678 /* ugh. in prepare/commit_write, if from==to==start of block, we
679 ** skip the prepare. make sure we never send an offset for the start
682 if ((offset
& (inode
->i_sb
->s_blocksize
- 1)) == 0) {
685 index
= size
>> PAGE_CACHE_SHIFT
;
687 page
= grab_cache_page(mapping
, index
);
/* from == to == offset: a zero-length "write" that zeroes the tail. */
694 ret
= ocfs2_prepare_write_nolock(inode
, page
, offset
, offset
);
700 if (ocfs2_should_order_data(inode
)) {
701 handle
= ocfs2_start_walk_page_trans(inode
, page
, offset
,
703 if (IS_ERR(handle
)) {
704 ret
= PTR_ERR(handle
);
710 /* must not update i_size! */
711 ret
= block_commit_write(page
, offset
, offset
);
718 ocfs2_commit_trans(OCFS2_SB(inode
->i_sb
), handle
);
721 page_cache_release(page
);
/*
 * Zero every block between the current (block-aligned) i_size and
 * @zero_to_size, one ocfs2_write_zero_page() call per block.  The
 * trailing comment notes that very large extends can hog the CPU;
 * presumably a reschedule point follows in the elided lines — confirm.
 */
726 static int ocfs2_zero_extend(struct inode
*inode
,
731 struct super_block
*sb
= inode
->i_sb
;
733 start_off
= ocfs2_align_bytes_to_blocks(sb
, i_size_read(inode
));
734 while (start_off
< zero_to_size
) {
735 ret
= ocfs2_write_zero_page(inode
, start_off
);
741 start_off
+= sb
->s_blocksize
;
744 * Very large extends have the potential to lock up
745 * the cpu for extended periods of time.
755 * A tail_to_skip value > 0 indicates that we're being called from
756 * ocfs2_file_aio_write(). This has the following implications:
758 * - we don't want to update i_size
759 * - di_bh will be NULL, which is fine because it's only used in the
760 * case where we want to update i_size.
761 * - ocfs2_zero_extend() will then only be filling the hole created
762 * between i_size and the start of the write.
/*
 * Grow @inode to @new_i_size: take the cluster-wide data lock, extend
 * the allocation by however many clusters are needed, zero the gap up
 * to (new_i_size - tail_to_skip), and — only on the setattr path
 * (tail_to_skip == 0) — update i_size via ocfs2_simple_size_update().
 */
764 static int ocfs2_extend_file(struct inode
*inode
,
765 struct buffer_head
*di_bh
,
/* di_bh may only be NULL on the aio_write path (tail_to_skip > 0). */
772 BUG_ON(!tail_to_skip
&& !di_bh
);
774 /* setattr sometimes calls us like this. */
778 if (i_size_read(inode
) == new_i_size
)
780 BUG_ON(new_i_size
< i_size_read(inode
));
782 clusters_to_add
= ocfs2_clusters_for_bytes(inode
->i_sb
, new_i_size
) -
783 OCFS2_I(inode
)->ip_clusters
;
786 * protect the pages that ocfs2_zero_extend is going to be
787 * pulling into the page cache.. we do this before the
788 * metadata extend so that we don't get into the situation
789 * where we've extended the metadata but can't get the data
792 ret
= ocfs2_data_lock(inode
, 1);
798 if (clusters_to_add
) {
799 ret
= ocfs2_extend_allocation(inode
, clusters_to_add
);
807 * Call this even if we don't add any clusters to the tree. We
808 * still need to zero the area between the old i_size and the
811 ret
= ocfs2_zero_extend(inode
, (u64
)new_i_size
- tail_to_skip
);
818 /* We're being called from ocfs2_setattr() which wants
819 * us to update i_size */
820 ret
= ocfs2_simple_size_update(inode
, di_bh
, new_i_size
);
826 ocfs2_data_unlock(inode
, 1);
/*
 * ->setattr() callback.  Validates the requested attribute set,
 * takes the rw and meta cluster locks (rw only for size changes),
 * dispatches size changes to ocfs2_truncate_file()/ocfs2_extend_file(),
 * then applies the remaining attributes and journals the dirty inode
 * in a one-credit transaction.  Unlock order on the way out: commit,
 * meta unlock, rw unlock.
 */
832 int ocfs2_setattr(struct dentry
*dentry
, struct iattr
*attr
)
834 int status
= 0, size_change
;
835 struct inode
*inode
= dentry
->d_inode
;
836 struct super_block
*sb
= inode
->i_sb
;
837 struct ocfs2_super
*osb
= OCFS2_SB(sb
);
838 struct buffer_head
*bh
= NULL
;
839 handle_t
*handle
= NULL
;
841 mlog_entry("(0x%p, '%.*s')\n", dentry
,
842 dentry
->d_name
.len
, dentry
->d_name
.name
);
844 if (attr
->ia_valid
& ATTR_MODE
)
845 mlog(0, "mode change: %d\n", attr
->ia_mode
);
846 if (attr
->ia_valid
& ATTR_UID
)
847 mlog(0, "uid change: %d\n", attr
->ia_uid
);
848 if (attr
->ia_valid
& ATTR_GID
)
849 mlog(0, "gid change: %d\n", attr
->ia_gid
);
850 if (attr
->ia_valid
& ATTR_SIZE
)
851 mlog(0, "size change...\n");
852 if (attr
->ia_valid
& (ATTR_ATIME
| ATTR_MTIME
| ATTR_CTIME
))
853 mlog(0, "time change...\n");
/* Anything outside this mask is silently unsupported; bail early. */
855 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
856 | ATTR_GID | ATTR_UID | ATTR_MODE)
857 if (!(attr
->ia_valid
& OCFS2_VALID_ATTRS
)) {
858 mlog(0, "can't handle attrs: 0x%x\n", attr
->ia_valid
);
862 status
= inode_change_ok(inode
, attr
);
/* Only regular files with ATTR_SIZE need the rw cluster lock. */
866 size_change
= S_ISREG(inode
->i_mode
) && attr
->ia_valid
& ATTR_SIZE
;
868 status
= ocfs2_rw_lock(inode
, 1);
875 status
= ocfs2_meta_lock(inode
, &bh
, 1);
877 if (status
!= -ENOENT
)
/* Shrink vs. grow: truncate or extend as appropriate. */
882 if (size_change
&& attr
->ia_size
!= i_size_read(inode
)) {
883 if (i_size_read(inode
) > attr
->ia_size
)
884 status
= ocfs2_truncate_file(inode
, bh
, attr
->ia_size
);
886 status
= ocfs2_extend_file(inode
, bh
, attr
->ia_size
, 0);
888 if (status
!= -ENOSPC
)
895 handle
= ocfs2_start_trans(osb
, OCFS2_INODE_UPDATE_CREDITS
);
896 if (IS_ERR(handle
)) {
897 status
= PTR_ERR(handle
);
902 status
= inode_setattr(inode
, attr
);
908 status
= ocfs2_mark_inode_dirty(handle
, inode
, bh
);
913 ocfs2_commit_trans(osb
, handle
);
915 ocfs2_meta_unlock(inode
, 1);
918 ocfs2_rw_unlock(inode
, 1);
/*
 * ->getattr() callback: revalidate the inode against the cluster,
 * fill @stat with generic_fillattr(), then override blksize with the
 * filesystem cluster size (better hint for userspace I/O sizing).
 */
927 int ocfs2_getattr(struct vfsmount
*mnt
,
928 struct dentry
*dentry
,
931 struct inode
*inode
= dentry
->d_inode
;
932 struct super_block
*sb
= dentry
->d_inode
->i_sb
;
933 struct ocfs2_super
*osb
= sb
->s_fs_info
;
938 err
= ocfs2_inode_revalidate(dentry
);
945 generic_fillattr(inode
, stat
);
947 /* We set the blksize from the cluster size for performance */
948 stat
->blksize
= osb
->s_clustersize
;
/*
 * ->permission() callback: take a read-level meta cluster lock so the
 * mode/ownership fields checked by generic_permission() are current
 * across the cluster, then delegate to generic_permission().
 */
956 int ocfs2_permission(struct inode
*inode
, int mask
, struct nameidata
*nd
)
962 ret
= ocfs2_meta_lock(inode
, NULL
, 0);
968 ret
= generic_permission(inode
, mask
, NULL
);
972 ocfs2_meta_unlock(inode
, 0);
/*
 * Clear the setuid bit (and setgid when group-execute is set) on
 * @inode and journal the mode change to the dinode.  Called from the
 * write path instead of the VFS remove_suid() to avoid recursive
 * cluster locking through ->setattr (see comment in
 * ocfs2_prepare_inode_for_write).
 *
 * NOTE(review): checks ocfs2_start_trans() against NULL; other sites
 * in this file use IS_ERR() — verify which convention is correct.
 */
978 static int ocfs2_write_remove_suid(struct inode
*inode
)
981 struct buffer_head
*bh
= NULL
;
982 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
984 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
985 struct ocfs2_dinode
*di
;
987 mlog_entry("(Inode %llu, mode 0%o)\n",
988 (unsigned long long)oi
->ip_blkno
, inode
->i_mode
);
990 handle
= ocfs2_start_trans(osb
, OCFS2_INODE_UPDATE_CREDITS
);
991 if (handle
== NULL
) {
997 ret
= ocfs2_read_block(osb
, oi
->ip_blkno
, &bh
, OCFS2_BH_CACHED
, inode
);
1003 ret
= ocfs2_journal_access(handle
, inode
, bh
,
1004 OCFS2_JOURNAL_ACCESS_WRITE
);
/* Drop suid always; drop sgid only when it means setgid-execute. */
1010 inode
->i_mode
&= ~S_ISUID
;
1011 if ((inode
->i_mode
& S_ISGID
) && (inode
->i_mode
& S_IXGRP
))
1012 inode
->i_mode
&= ~S_ISGID
;
/* Mirror the new mode into the on-disk dinode (little-endian). */
1014 di
= (struct ocfs2_dinode
*) bh
->b_data
;
1015 di
->i_mode
= cpu_to_le16(inode
->i_mode
);
1017 ret
= ocfs2_journal_dirty(handle
, bh
);
1023 ocfs2_commit_trans(osb
, handle
);
/*
 * Pre-write setup shared by aio_write and splice_write: under a meta
 * cluster lock, strip suid/sgid if needed, resolve the effective write
 * position (append handling works on a copy of *ppos), and extend the
 * file's allocation when the write goes past i_size.  Starts with a
 * read-level lock and upgrades to write level (meta_level 1) when it
 * discovers it must modify state, re-checking after each relock.
 * NOTE(review): the relock/retry loop structure is partially elided
 * from this view.
 */
1029 static int ocfs2_prepare_inode_for_write(struct dentry
*dentry
,
1034 int ret
= 0, meta_level
= appending
;
1035 struct inode
*inode
= dentry
->d_inode
;
1037 loff_t newsize
, saved_pos
;
1040 * We sample i_size under a read level meta lock to see if our write
1041 * is extending the file, if it is we back off and get a write level
1045 ret
= ocfs2_meta_lock(inode
, NULL
, meta_level
);
1052 /* Clear suid / sgid if necessary. We do this here
1053 * instead of later in the write path because
1054 * remove_suid() calls ->setattr without any hint that
1055 * we may have already done our cluster locking. Since
1056 * ocfs2_setattr() *must* take cluster locks to
1057 * proceeed, this will lead us to recursively lock the
1058 * inode. There's also the dinode i_size state which
1059 * can be lost via setattr during extending writes (we
1060 * set inode->i_size at the end of a write. */
1061 if (should_remove_suid(dentry
)) {
/* Need a write-level lock to change the mode: drop and retry. */
1062 if (meta_level
== 0) {
1063 ocfs2_meta_unlock(inode
, meta_level
);
1068 ret
= ocfs2_write_remove_suid(inode
);
1075 /* work on a copy of ppos until we're sure that we won't have
1076 * to recalculate it due to relocking. */
1078 saved_pos
= i_size_read(inode
);
1079 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos
);
1083 newsize
= count
+ saved_pos
;
1085 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1086 (long long) saved_pos
, (long long) newsize
,
1087 (long long) i_size_read(inode
));
1089 /* No need for a higher level metadata lock if we're
1090 * never going past i_size. */
1091 if (newsize
<= i_size_read(inode
))
1094 if (meta_level
== 0) {
1095 ocfs2_meta_unlock(inode
, meta_level
);
/* Compute how many new clusters this write would need. */
1100 spin_lock(&OCFS2_I(inode
)->ip_lock
);
1101 clusters
= ocfs2_clusters_for_bytes(inode
->i_sb
, newsize
) -
1102 OCFS2_I(inode
)->ip_clusters
;
1103 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
1105 mlog(0, "Writing at EOF, may need more allocation: "
1106 "i_size = %lld, newsize = %lld, need %u clusters\n",
1107 (long long) i_size_read(inode
), (long long) newsize
,
1110 /* We only want to continue the rest of this loop if
1111 * our extend will actually require more
/* tail_to_skip = count: don't update i_size, only fill the hole. */
1116 ret
= ocfs2_extend_file(inode
, NULL
, newsize
, count
);
1129 ocfs2_meta_unlock(inode
, meta_level
);
/*
 * ->aio_write() callback.  Lock ordering: i_mutex -> i_alloc_sem (for
 * O_DIRECT) -> rw cluster lock.  O_DIRECT writers take the rw lock at
 * read level (rw_level 0) so they can run concurrently; buffered
 * writers take it exclusively.  Allocation/suid work is delegated to
 * ocfs2_prepare_inode_for_write(), then the generic nolock write path
 * does the I/O.  Unlock responsibility for async O_DIRECT is handed to
 * ocfs2_dio_end_io() via the iocb rw-locked flag (see comment below).
 */
1135 static ssize_t
ocfs2_file_aio_write(struct kiocb
*iocb
,
1136 const struct iovec
*iov
,
1137 unsigned long nr_segs
,
1140 int ret
, rw_level
, have_alloc_sem
= 0;
1141 struct file
*filp
= iocb
->ki_filp
;
1142 struct inode
*inode
= filp
->f_path
.dentry
->d_inode
;
1143 int appending
= filp
->f_flags
& O_APPEND
? 1 : 0;
1145 mlog_entry("(0x%p, %u, '%.*s')\n", filp
,
1146 (unsigned int)nr_segs
,
1147 filp
->f_path
.dentry
->d_name
.len
,
1148 filp
->f_path
.dentry
->d_name
.name
);
1150 /* happy write of zero bytes */
1151 if (iocb
->ki_left
== 0)
1154 mutex_lock(&inode
->i_mutex
);
1155 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1156 if (filp
->f_flags
& O_DIRECT
) {
1158 down_read(&inode
->i_alloc_sem
);
1161 /* concurrent O_DIRECT writes are allowed */
1162 rw_level
= (filp
->f_flags
& O_DIRECT
) ? 0 : 1;
1163 ret
= ocfs2_rw_lock(inode
, rw_level
);
1170 ret
= ocfs2_prepare_inode_for_write(filp
->f_path
.dentry
, &iocb
->ki_pos
,
1171 iocb
->ki_left
, appending
);
1177 /* communicate with ocfs2_dio_end_io */
1178 ocfs2_iocb_set_rw_locked(iocb
);
1180 ret
= generic_file_aio_write_nolock(iocb
, iov
, nr_segs
, iocb
->ki_pos
);
1182 /* buffered aio wouldn't have proper lock coverage today */
1183 BUG_ON(ret
== -EIOCBQUEUED
&& !(filp
->f_flags
& O_DIRECT
));
1186 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1187 * function pointer which is called when o_direct io completes so that
1188 * it can unlock our rw lock. (it's the clustered equivalent of
1189 * i_alloc_sem; protects truncate from racing with pending ios).
1190 * Unfortunately there are error cases which call end_io and others
1191 * that don't. so we don't have to unlock the rw_lock if either an
1192 * async dio is going to do it in the future or an end_io after an
1193 * error has already done it.
1195 if (ret
== -EIOCBQUEUED
|| !ocfs2_iocb_is_rw_locked(iocb
)) {
1202 up_read(&inode
->i_alloc_sem
);
1204 ocfs2_rw_unlock(inode
, rw_level
);
1205 mutex_unlock(&inode
->i_mutex
);
/*
 * ->splice_write() callback: lock the inode and pipe together
 * (inode_double_lock), take the rw cluster lock exclusively, run the
 * shared pre-write setup, then hand off to the generic nolock splice
 * write.  Unlocks in reverse order.
 */
1211 static ssize_t
ocfs2_file_splice_write(struct pipe_inode_info
*pipe
,
1218 struct inode
*inode
= out
->f_path
.dentry
->d_inode
;
1220 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out
, pipe
,
1222 out
->f_path
.dentry
->d_name
.len
,
1223 out
->f_path
.dentry
->d_name
.name
);
1225 inode_double_lock(inode
, pipe
->inode
);
1227 ret
= ocfs2_rw_lock(inode
, 1);
/* appending = 0: splice targets an explicit *ppos. */
1233 ret
= ocfs2_prepare_inode_for_write(out
->f_path
.dentry
, ppos
, len
, 0);
1239 /* ok, we're done with i_size and alloc work */
1240 ret
= generic_file_splice_write_nolock(pipe
, out
, ppos
, len
, flags
);
1243 ocfs2_rw_unlock(inode
, 1);
1245 inode_double_unlock(inode
, pipe
->inode
);
/*
 * ->splice_read() callback: briefly take and drop a read-level meta
 * cluster lock to refresh inode fields such as i_size (same rationale
 * as ocfs2_file_aio_read), then delegate to generic_file_splice_read.
 */
1251 static ssize_t
ocfs2_file_splice_read(struct file
*in
,
1253 struct pipe_inode_info
*pipe
,
1258 struct inode
*inode
= in
->f_path
.dentry
->d_inode
;
1260 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in
, pipe
,
1262 in
->f_path
.dentry
->d_name
.len
,
1263 in
->f_path
.dentry
->d_name
.name
);
1266 * See the comment in ocfs2_file_aio_read()
1268 ret
= ocfs2_meta_lock(inode
, NULL
, 0);
1273 ocfs2_meta_unlock(inode
, 0);
1275 ret
= generic_file_splice_read(in
, ppos
, pipe
, len
, flags
);
/*
 * ->aio_read() callback.  Buffered reads protect themselves in
 * ->readpage(), so the rw cluster lock (read level) and i_alloc_sem
 * are taken only for O_DIRECT, to keep pending direct reads from
 * racing with truncate.  A meta lock take/drop (with atime handling)
 * refreshes i_size before the generic read.  As with aio_write, async
 * O_DIRECT completion may hand unlock duty to ocfs2_dio_end_io().
 */
1282 static ssize_t
ocfs2_file_aio_read(struct kiocb
*iocb
,
1283 const struct iovec
*iov
,
1284 unsigned long nr_segs
,
1287 int ret
= 0, rw_level
= -1, have_alloc_sem
= 0, lock_level
= 0;
1288 struct file
*filp
= iocb
->ki_filp
;
1289 struct inode
*inode
= filp
->f_path
.dentry
->d_inode
;
1291 mlog_entry("(0x%p, %u, '%.*s')\n", filp
,
1292 (unsigned int)nr_segs
,
1293 filp
->f_path
.dentry
->d_name
.len
,
1294 filp
->f_path
.dentry
->d_name
.name
);
1303 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1304 * need locks to protect pending reads from racing with truncate.
1306 if (filp
->f_flags
& O_DIRECT
) {
1307 down_read(&inode
->i_alloc_sem
);
1310 ret
= ocfs2_rw_lock(inode
, 0);
1316 /* communicate with ocfs2_dio_end_io */
1317 ocfs2_iocb_set_rw_locked(iocb
);
1321 * We're fine letting folks race truncates and extending
1322 * writes with read across the cluster, just like they can
1323 * locally. Hence no rw_lock during read.
1325 * Take and drop the meta data lock to update inode fields
1326 * like i_size. This allows the checks down below
1327 * generic_file_aio_read() a chance of actually working.
1329 ret
= ocfs2_meta_lock_atime(inode
, filp
->f_vfsmnt
, &lock_level
);
1334 ocfs2_meta_unlock(inode
, lock_level
);
1336 ret
= generic_file_aio_read(iocb
, iov
, nr_segs
, iocb
->ki_pos
);
1338 mlog(ML_ERROR
, "generic_file_aio_read returned -EINVAL\n");
1340 /* buffered aio wouldn't have proper lock coverage today */
1341 BUG_ON(ret
== -EIOCBQUEUED
&& !(filp
->f_flags
& O_DIRECT
));
1343 /* see ocfs2_file_aio_write */
1344 if (ret
== -EIOCBQUEUED
|| !ocfs2_iocb_is_rw_locked(iocb
)) {
1351 up_read(&inode
->i_alloc_sem
);
1353 ocfs2_rw_unlock(inode
, rw_level
);
/* Inode operations for ocfs2 regular files. */
1359 struct inode_operations ocfs2_file_iops
= {
1360 .setattr
= ocfs2_setattr
,
1361 .getattr
= ocfs2_getattr
,
1362 .permission
= ocfs2_permission
,
/* Inode operations for special files (devices, fifos, sockets);
 * identical set to ocfs2_file_iops — no data-path operations apply. */
1365 struct inode_operations ocfs2_special_file_iops
= {
1366 .setattr
= ocfs2_setattr
,
1367 .getattr
= ocfs2_getattr
,
1368 .permission
= ocfs2_permission
,
/* File operations for ocfs2 regular files.  Synchronous read/write go
 * through the do_sync_* wrappers over the aio entry points defined
 * above. */
1371 const struct file_operations ocfs2_fops
= {
1372 .read
= do_sync_read
,
1373 .write
= do_sync_write
,
1374 .sendfile
= generic_file_sendfile
,
1376 .fsync
= ocfs2_sync_file
,
1377 .release
= ocfs2_file_release
,
1378 .open
= ocfs2_file_open
,
1379 .aio_read
= ocfs2_file_aio_read
,
1380 .aio_write
= ocfs2_file_aio_write
,
1381 .ioctl
= ocfs2_ioctl
,
1382 .splice_read
= ocfs2_file_splice_read
,
1383 .splice_write
= ocfs2_file_splice_write
,
1386 const struct file_operations ocfs2_dops
= {
1387 .read
= generic_read_dir
,
1388 .readdir
= ocfs2_readdir
,
1389 .fsync
= ocfs2_sync_file
,
1390 .ioctl
= ocfs2_ioctl
,