2 * Copyright (C) 2011+ Evgeniy Polyakov <zbr@ioremap.net>
5 #include <linux/buffer_head.h>
6 #include <linux/cred.h>
7 #include <linux/fiemap.h>
8 #include <linux/mpage.h>
9 #include <linux/namei.h>
10 #include <linux/pagevec.h>
11 #include <linux/pagemap.h>
12 #include <linux/random.h>
13 #include <linux/scatterlist.h>
14 #include <linux/slab.h>
15 #include <linux/time.h>
16 #include <linux/writeback.h>
/*
 * pohmelfs_dump_id_len_raw() - hex-encode up to SHA512_DIGEST_SIZE bytes of
 * @id into @dst using "%02x" per byte (caller supplies a 2*len+1 buffer).
 * NOTE(review): fragmentary extraction - the leading numbers are original
 * line numbers; the braces, the declaration/loop header for `i` and the
 * final `return dst;` are among the missing lines. Confirm against the
 * full file. Code below left byte-identical.
 */
20 char *pohmelfs_dump_id_len_raw(const unsigned char *id
, unsigned int len
, char *dst
)
/* clamp the dump length to the digest size so @dst cannot overflow */
24 if (len
> SHA512_DIGEST_SIZE
)
25 len
= SHA512_DIGEST_SIZE
;
/* two hex characters per input byte */
28 sprintf(&dst
[2*i
], "%02x", id
[i
]);
/*
 * Per-CPU scratch buffer used by pohmelfs_dump_id() to hold a short
 * (pohmelfs_dump_len bytes -> 2 hex chars each, plus NUL) id string.
 * NOTE(review): the struct/typedef lines around `id_str` (original lines
 * 33/35) are missing from this extraction.
 */
32 #define pohmelfs_dump_len 6
34 char id_str
[pohmelfs_dump_len
* 2 + 1];
36 static DEFINE_PER_CPU(pohmelfs_dump_t
, pohmelfs_dump_per_cpu
);
/*
 * pohmelfs_dump_id() - hex-dump the first pohmelfs_dump_len bytes of @id
 * into the per-CPU scratch buffer and (presumably) return it; used for
 * logging. NOTE(review): the put_cpu_var() and return lines are missing
 * from this extraction - the get_cpu_var()/put_cpu_var() pairing and the
 * lifetime of the returned per-CPU string should be confirmed against the
 * full file.
 */
38 char *pohmelfs_dump_id(const unsigned char *id
)
42 ptr
= &get_cpu_var(pohmelfs_dump_per_cpu
);
43 pohmelfs_dump_id_len_raw(id
, pohmelfs_dump_len
, ptr
->id_str
);
/*
 * Per-CPU scratch structure hashed by pohmelfs_gen_id() to produce a
 * pseudo-random dnet_raw_id. NOTE(review): the struct body (original
 * lines 50-52, apparently containing `rand` and `ts` members used below)
 * is missing from this extraction.
 */
49 #define dnet_raw_id_scratch 6
53 } dnet_raw_id_scratch_t
;
54 static DEFINE_PER_CPU(dnet_raw_id_scratch_t
, dnet_raw_id_scratch_per_cpu
);
/*
 * pohmelfs_gen_id() - generate a fresh object id by hashing a per-CPU
 * scratch block seeded with random bytes and the current time.
 * NOTE(review): fragmentary - the declaration of `rand`/`err`, the
 * assignment of `rand` into the scratch, put_cpu_var() and the return are
 * among the missing lines; confirm against the full file.
 */
56 static int pohmelfs_gen_id(struct pohmelfs_sb
*psb
, struct dnet_raw_id
*id
)
58 dnet_raw_id_scratch_t
*sc
;
/* seed with kernel randomness sized to the scratch's rand member */
62 get_random_bytes(&rand
, sizeof(sc
->rand
));
64 sc
= &get_cpu_var(dnet_raw_id_scratch_per_cpu
);
/* mix in the current time so ids differ even with equal random bytes */
66 sc
->ts
= CURRENT_TIME
;
68 err
= pohmelfs_hash(psb
, sc
, sizeof(dnet_raw_id_scratch_t
), id
);
/*
 * pohmelfs_sb_inode_insert() - insert @pi into the superblock's rb-tree of
 * inodes keyed by dnet id, under psb->inode_lock.
 * NOTE(review): fragmentary - the rb-tree descent loop header, the
 * left-branch / duplicate-id cases and the error/return paths are among
 * the missing lines; only the right-branch assignment is visible below.
 */
74 static int pohmelfs_sb_inode_insert(struct pohmelfs_sb
*psb
, struct pohmelfs_inode
*pi
)
76 struct rb_node
**n
= &psb
->inode_root
.rb_node
, *parent
= NULL
;
77 struct pohmelfs_inode
*tmp
;
/* the whole walk + link runs under the inode rb-tree spinlock */
80 spin_lock(&psb
->inode_lock
);
84 tmp
= rb_entry(parent
, struct pohmelfs_inode
, node
);
86 cmp
= dnet_id_cmp_str(tmp
->id
.id
, pi
->id
.id
);
90 n
= &parent
->rb_right
;
97 rb_link_node(&pi
->node
, parent
, n
);
98 rb_insert_color(&pi
->node
, &psb
->inode_root
);
101 spin_unlock(&psb
->inode_lock
);
/*
 * pohmelfs_sb_inode_lookup() - find the pohmelfs inode with dnet id @id in
 * the superblock rb-tree; on a hit, takes an inode reference via igrab()
 * before dropping the lock. Returns the found inode or (presumably) NULL.
 * NOTE(review): fragmentary - the descent loop, the cmp branches, the
 * igrab-failure handling and the return are among the missing lines.
 */
106 struct pohmelfs_inode
*pohmelfs_sb_inode_lookup(struct pohmelfs_sb
*psb
, struct dnet_raw_id
*id
)
108 struct rb_node
*n
= psb
->inode_root
.rb_node
;
109 struct pohmelfs_inode
*pi
, *found
= NULL
;
112 spin_lock(&psb
->inode_lock
);
114 pi
= rb_entry(n
, struct pohmelfs_inode
, node
);
116 cmp
= dnet_id_cmp_str(pi
->id
.id
, id
->id
);
/* pin the VFS inode while still holding the tree lock */
127 if (!igrab(&found
->vfs_inode
))
130 spin_unlock(&psb
->inode_lock
);
/*
 * pohmelfs_alloc_inode() - super_operations.alloc_inode hook: allocate a
 * zeroed pohmelfs_inode from its kmem cache, init the embedded VFS inode
 * and rb node, and hand back the embedded struct inode.
 * NOTE(review): fragmentary - the NULL check on the allocation (and its
 * error return) is among the missing lines; confirm against the full file.
 */
135 struct inode
*pohmelfs_alloc_inode(struct super_block
*sb
)
137 struct pohmelfs_inode
*pi
;
/* GFP_NOIO: may be called from reclaim/writeback context */
139 pi
= kmem_cache_zalloc(pohmelfs_inode_cache
, GFP_NOIO
);
143 inode_init_once(&pi
->vfs_inode
);
147 rb_init_node(&pi
->node
);
149 return &pi
->vfs_inode
;
/*
 * pohmelfs_destroy_inode() - super_operations.destroy_inode hook: log the
 * inode's id/dirty state and free the containing pohmelfs_inode back to
 * its kmem cache.
 */
155 void pohmelfs_destroy_inode(struct inode
*inode
)
157 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
159 pr_debug("pohmelfs: %s: destroy: ino: %ld, dirty: %lx\n", pohmelfs_dump_id(pi
->id
.id
), inode
->i_ino
, inode
->i_state
& I_DIRTY
);
161 kmem_cache_free(pohmelfs_inode_cache
, pi
);
/*
 * pohmelfs_hash() - digest @size bytes at @data with the superblock's
 * crypto hash transform (psb->hash), writing the result into @id->id.
 * Uses a single-entry scatterlist and the legacy hash_desc API.
 * NOTE(review): original line 173 is missing from this extraction -
 * presumably the `desc.flags` initialization; confirm against the full
 * file before relying on the desc being fully set up.
 */
164 int pohmelfs_hash(struct pohmelfs_sb
*psb
, const void *data
, const size_t size
, struct dnet_raw_id
*id
)
166 struct scatterlist sg
;
167 struct hash_desc desc
;
169 sg_init_table(&sg
, 1);
170 sg_set_buf(&sg
, data
, size
);
172 desc
.tfm
= psb
->hash
;
175 return crypto_hash_digest(&desc
, &sg
, size
, id
->id
);
/*
 * pohmelfs_readpages_destroy() - transaction destroy callback: drop the
 * wait-object reference taken in pohmelfs_readpages_init().
 * NOTE(review): original lines 181-182 are missing from this extraction
 * (possibly a wakeup or debug statement); confirm against the full file.
 */
178 static void pohmelfs_readpages_destroy(struct pohmelfs_trans
*t
)
180 struct pohmelfs_wait
*wait
= t
->priv
;
183 pohmelfs_wait_put(wait
);
/*
 * pohmelfs_readpages_complete() - transaction completion callback: on the
 * final reply (no DNET_FLAGS_MORE), latch the command status into
 * wait->condition so the waiter in pohmelfs_readpages_group() can wake.
 * NOTE(review): fragmentary - the body after `if (!wait->condition)` on
 * original line 194 (presumably setting a success sentinel and waking the
 * waitqueue) and the return are missing; confirm against the full file.
 */
186 static int pohmelfs_readpages_complete(struct pohmelfs_trans
*t
, struct pohmelfs_state
*recv
)
188 struct pohmelfs_wait
*wait
= t
->priv
;
189 struct dnet_cmd
*cmd
= &recv
->cmd
;
/* only the last reply in the sequence carries the final status */
191 if (!(cmd
->flags
& DNET_FLAGS_MORE
)) {
/* do not overwrite a condition already set by an earlier error */
192 if (!wait
->condition
) {
193 wait
->condition
= cmd
->status
;
194 if (!wait
->condition
)
199 pr_debug("pohmelfs: %d:%s: pohmelfs_readpages_complete: read: %ld, wait: %d\n",
200 cmd
->id
.group_id
, pohmelfs_dump_id(wait
->pi
->id
.id
), atomic_long_read(&wait
->count
), wait
->condition
);
/*
 * pohmelfs_readpages_init() - transaction init callback: take an extra
 * reference on the wait object for the lifetime of the transaction
 * (released in pohmelfs_readpages_destroy()).
 * NOTE(review): the return statement (presumably `return 0;`) is missing
 * from this extraction.
 */
205 static int pohmelfs_readpages_init(struct pohmelfs_trans
*t
)
207 struct pohmelfs_wait
*wait
= t
->priv
;
209 pohmelfs_wait_get(wait
);
/*
 * pohmelfs_readpages_recv_reply() - streaming receive callback for a read
 * transaction. First consumes the dnet_attr + dnet_io_attr header (asize
 * bytes) into the transaction's own command buffer, then receives the
 * payload page-by-page: each PAGE_CACHE_SIZE-aligned slice is copied into
 * a pagecache page of @mapping located via find_or_create_page(), marked
 * uptodate once fully filled, and the running byte count is accumulated
 * in wait->count for the waiter.
 * NOTE(review): fragmentary - declarations of `page`/`offset`/`size`/`err`,
 * the kmap/copy step between pohmelfs_recv() and SetPageUptodate(), the
 * unlock of the page, error-exit paths and the final return are among the
 * missing lines; confirm the exact error handling against the full file.
 */
213 static int pohmelfs_readpages_recv_reply(struct pohmelfs_trans
*t
, struct pohmelfs_state
*recv
)
215 struct pohmelfs_wait
*wait
= t
->priv
;
216 struct pohmelfs_inode
*pi
= wait
->pi
;
217 struct address_space
*mapping
= wait
->ret
;
218 unsigned int asize
= sizeof(struct dnet_attr
) + sizeof(struct dnet_io_attr
);
219 void *data
= &t
->cmd
.attr
; /* overwrite send buffer used for attr/ioattr */
220 struct dnet_cmd
*cmd
= &recv
->cmd
;
/* phase 1: finish receiving the attr/io_attr header */
225 if (t
->recv_offset
< asize
) {
226 size
= asize
- t
->recv_offset
;
227 data
+= t
->recv_offset
;
228 err
= pohmelfs_recv(t
, recv
, data
, size
);
/* header complete: fix up endianness of the io attributes */
232 dnet_convert_io_attr(&t
->cmd
.p
.io
);
/* phase 2: stream the payload until cmd->size bytes have arrived */
235 while (t
->recv_offset
!= cmd
->size
) {
/* position within the current page */
236 offset
= (t
->recv_offset
- asize
) & (PAGE_CACHE_SIZE
- 1);
237 size
= PAGE_CACHE_SIZE
- offset
;
/* do not read past the end of the reply */
239 if (size
> cmd
->size
- t
->recv_offset
)
240 size
= cmd
->size
- t
->recv_offset
;
/* page index = (bytes received so far + io offset) / page size */
242 page
= find_or_create_page(mapping
, (t
->recv_offset
- asize
+ t
->cmd
.p
.io
.offset
) >> PAGE_CACHE_SHIFT
, GFP_NOIO
);
249 err
= pohmelfs_recv(t
, recv
, data
+ offset
, size
);
/* mark uptodate only when the page is completely filled
 * (or the reply ends inside it) */
252 if (err
> 0 && ((err
+ offset
== PAGE_CACHE_SIZE
) || (t
->recv_offset
== cmd
->size
))) {
253 SetPageUptodate(page
);
257 page_cache_release(page
);
/* account received bytes for the waiter in readpages_group() */
262 atomic_long_add(err
, &wait
->count
);
/* -ENOENT/-EAGAIN are expected (object missing / retry); stay quiet */
268 if ((err
< 0) && (err
!= -ENOENT
) && (err
!= -EAGAIN
))
269 pr_info("pohmelfs: %d:%s: pohmelfs_readpages_recv_data: offset: %lld, data size: %llu, err: %d\n",
270 cmd
->id
.group_id
, pohmelfs_dump_id(pi
->id
.id
), t
->recv_offset
- asize
+ t
->cmd
.p
.io
.offset
,
271 (unsigned long long)cmd
->size
- asize
, err
);
/*
 * pohmelfs_readpages_group() - issue a DNET_CMD_READ for @size bytes at
 * @offset of @mapping's inode against one replica group, then sleep until
 * the completion callback signals wait->condition (or the read timeout
 * expires). Returns the number of bytes received (wait->count) on
 * success, a negative error otherwise.
 * NOTE(review): fragmentary - allocation-failure branches, the io
 * offset/size/id setup (original lines ~293-303, 310-314), the timeout /
 * -ETIMEDOUT handling after wait_event, and the cleanup labels are among
 * the missing lines; confirm against the full file.
 */
276 static int pohmelfs_readpages_group(struct address_space
*mapping
, int group_id
, pgoff_t offset
, size_t size
)
278 struct inode
*inode
= mapping
->host
;
279 struct pohmelfs_sb
*psb
= pohmelfs_sb(inode
->i_sb
);
280 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
281 struct pohmelfs_wait
*wait
;
282 struct pohmelfs_io
*io
;
/* wait object pins the inode (wait->pi) for the whole request */
286 wait
= pohmelfs_wait_alloc(pi
);
292 io
= kmem_cache_zalloc(pohmelfs_io_cache
, GFP_NOIO
);
300 io
->cmd
= DNET_CMD_READ
;
301 io
->cflags
= DNET_FLAGS_NEED_ACK
;
/* optionally skip server-side checksumming of read data */
304 if (psb
->no_read_csum
)
305 io
->ioflags
= DNET_IO_FLAGS_NOCSUM
;
306 io
->cb
.init
= pohmelfs_readpages_init
;
307 io
->cb
.complete
= pohmelfs_readpages_complete
;
308 io
->cb
.destroy
= pohmelfs_readpages_destroy
;
309 io
->cb
.recv_reply
= pohmelfs_readpages_recv_reply
;
312 /* it is safe, since we hold a reference to corresponding inode in wait->pi */
315 err
= pohmelfs_send_io_group(io
, group_id
);
/* condition is set by pohmelfs_readpages_complete() on the last reply */
319 ret
= wait_event_interruptible_timeout(wait
->wq
, wait
->condition
!= 0, msecs_to_jiffies(psb
->read_wait_timeout
));
327 if (wait
->condition
< 0) {
328 err
= wait
->condition
;
/* success: report how many bytes the recv callback accumulated */
332 err
= atomic_long_read(&wait
->count
);
335 kmem_cache_free(pohmelfs_io_cache
, io
);
337 pohmelfs_wait_put(wait
);
/*
 * pohmelfs_readpages() - address_space_operations.readpages hook: move the
 * VFS-supplied pages into the pagecache, compute the lowest file offset
 * among them, then try each replica group in turn until one read succeeds.
 * NOTE(review): fragmentary - the declaration/initialization of `offset`,
 * the unlock/continue handling inside the page loop, the success-break in
 * the group loop and the final return are among the missing lines.
 */
342 static int pohmelfs_readpages(struct file
*filp
, struct address_space
*mapping
,
343 struct list_head
*pages
, unsigned nr_pages
)
345 struct inode
*inode
= mapping
->host
;
346 struct pohmelfs_sb
*psb
= pohmelfs_sb(inode
->i_sb
);
347 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
348 int i
, err
= -ENOENT
;
350 struct page
*tmp
, *page
;
352 list_for_each_entry_safe(page
, tmp
, pages
, lru
) {
353 list_del(&page
->lru
);
/* track the smallest byte offset across all requested pages */
355 if (page_offset(page
) < offset
)
356 offset
= page_offset(page
);
/* if insertion into the pagecache fails,
359 * we do not really care about these pages
360 * completion callback will try to find it in mapping
361 * and will allocate new pages if mapping is empty
 */
363 if (!add_to_page_cache_lru(page
, mapping
, page
->index
, GFP_KERNEL
))
365 page_cache_release(page
);
/* try each replica group until a read succeeds */
368 for (i
= 0; i
< psb
->group_num
; ++i
) {
369 err
= pohmelfs_readpages_group(mapping
, psb
->groups
[i
], offset
, nr_pages
* PAGE_CACHE_SIZE
);
377 pr_debug("pohmelfs: %s: readpages: ino: %lu, offset: %lu, pages: %u: %d\n",
378 pohmelfs_dump_id(pi
->id
.id
), inode
->i_ino
, offset
, nr_pages
, err
);
/*
 * pohmelfs_readpage() - address_space_operations.readpage hook: a page
 * beyond i_size is simply zero/uptodate; otherwise read one page from the
 * first replica group that answers.
 * NOTE(review): fragmentary - the zero-fill/unlock/return in the beyond-EOF
 * branch, the success-break in the group loop, and the final return are
 * among the missing lines. Also note `page->index << PAGE_CACHE_SHIFT`
 * on original line 389: if pgoff_t is 32-bit this shift may truncate for
 * large files - worth confirming against the full file / target arch.
 */
383 static int pohmelfs_readpage(struct file
*file
, struct page
*page
)
385 struct inode
*inode
= page
->mapping
->host
;
386 struct pohmelfs_sb
*psb
= pohmelfs_sb(inode
->i_sb
);
387 int i
, err
= -ENOENT
;
/* page entirely past EOF: nothing to fetch from the network */
389 if (inode
->i_size
<= page
->index
<< PAGE_CACHE_SHIFT
) {
390 SetPageUptodate(page
);
397 for (i
= 0; i
< psb
->group_num
; ++i
) {
398 err
= pohmelfs_readpages_group(page
->mapping
, psb
->groups
[i
], page_offset(page
), PAGE_CACHE_SIZE
);
/* -ENOENT just means the object does not exist remotely */
406 if ((err
< 0) && (err
!= -ENOENT
))
407 pr_err("pohmelfs: %s: readpage: ino: %lu, offset: %lu, uptodate: %d, err: %d\n",
408 pohmelfs_dump_id(pohmelfs_inode(inode
)->id
.id
), inode
->i_ino
, (long)page_offset(page
),
409 PageUptodate(page
), err
);
/*
 * pohmelfs_write_ctl_release() - kref release for a write control block.
 * A write is "bad" unless a majority of replica groups (group_num/2 + 1)
 * acknowledged it. Ends writeback on each page in the pagevec; on a bad
 * write the pages are (apparently) re-dirtied so writeback retries them.
 * Finally releases the pagevec's page references and frees the ctl.
 * NOTE(review): fragmentary - the `if (bad_write)` guard around the debug
 * block, declarations of `i`/`page`, and the branch structure between
 * end_page_writeback() and set_page_dirty() are among the missing lines;
 * the re-dirty-on-bad-write reading is an inference, confirm it.
 */
413 void pohmelfs_write_ctl_release(struct kref
*kref
)
415 struct pohmelfs_write_ctl
*ctl
= container_of(kref
, struct pohmelfs_write_ctl
, refcnt
);
/* quorum check: fewer than group_num/2+1 acks means the write failed */
416 int bad_write
= atomic_read(&ctl
->good_writes
) < ctl
->psb
->group_num
/ 2 + 1;
421 struct inode
*inode
= ctl
->pvec
.pages
[0]->mapping
->host
;
422 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
423 unsigned long long offset
= page_offset(ctl
->pvec
.pages
[0]);
425 pr_debug("pohmelfs: %s: bad write: ino: %lu, isize: %llu, offset: %llu: %d/%d\n",
426 pohmelfs_dump_id(pi
->id
.id
), inode
->i_ino
, inode
->i_size
, offset
,
427 atomic_read(&ctl
->good_writes
), ctl
->psb
->group_num
);
430 for (i
= 0; i
< pagevec_count(&ctl
->pvec
); ++i
) {
431 page
= ctl
->pvec
.pages
[i
];
433 if (PageLocked(page
)) {
434 end_page_writeback(page
);
438 set_page_dirty(page
);
444 pagevec_release(&ctl
->pvec
);
445 kmem_cache_free(pohmelfs_write_cache
, ctl
);
/*
 * pohmelfs_writepages_chunk() - send one pagevec worth of dirty pages.
 * Walks ctl->pvec, skips pages that have left the mapping or are no
 * longer dirty, marks the rest under writeback, clamps the total size to
 * i_size, and issues the write command; the ctl reference is dropped at
 * the end (pohmelfs_write_ctl_release finishes the pages).
 * NOTE(review): fragmentary - declarations of `i`/`err`/`mapping`, the
 * page locking and `continue_unlock:` label, and the size initialization
 * are among the missing lines. Also note clear_page_dirty_for_io() is
 * called on original line 474 AND again on 477 - with line 476 missing it
 * is unclear whether the second call is intentional or a duplicate;
 * confirm against the full file.
 */
448 static int pohmelfs_writepages_chunk(struct pohmelfs_inode
*pi
, struct pohmelfs_write_ctl
*ctl
, struct writeback_control
*wbc
)
450 struct inode
*inode
= &pi
->vfs_inode
;
451 uint64_t offset
, size
;
/* chunk starts at the file offset of the first page in the vector */
455 offset
= page_offset(ctl
->pvec
.pages
[0]);
458 /* we will lookup them again when doing actual send */
459 for (i
= 0; i
< pagevec_count(&ctl
->pvec
); ++i
) {
460 struct page
*page
= ctl
->pvec
.pages
[i
];
463 /* just write all pages even if they were truncated - this is handled by inode info metadata */
/* page was detached from this mapping concurrently: skip it */
465 if (unlikely(page
->mapping
!= mapping
)) {
471 if (!PageDirty(page
))
472 goto continue_unlock
;
474 if (!clear_page_dirty_for_io(page
))
475 goto continue_unlock
;
477 clear_page_dirty_for_io(page
);
480 set_page_writeback(page
);
482 size
+= PAGE_CACHE_SIZE
;
/* never write past the current end of file */
486 if (offset
+ size
> inode
->i_size
)
487 size
= inode
->i_size
- offset
;
489 err
= pohmelfs_write_command(pi
, ctl
, offset
, size
);
/* drop our ref; release callback ends writeback on the pages */
494 kref_put(&ctl
->refcnt
, pohmelfs_write_ctl_release
);
/*
 * pohmelfs_writepages() - address_space_operations.writepages hook: walk
 * the dirty-tagged pages of the wbc range in PAGEVEC_SIZE chunks, sending
 * each chunk via pohmelfs_writepages_chunk(), then (apparently) push the
 * inode metadata. An empty range or zero-size inode short-circuits.
 * NOTE(review): fragmentary - the declaration of `index`, the
 * allocation-failure branch after kmem_cache_zalloc(), the `nr_pages == 0`
 * break before the ctl free on original line 534, the error handling
 * after the chunk send, and the return are among the missing lines.
 */
498 static int pohmelfs_writepages(struct address_space
*mapping
, struct writeback_control
*wbc
)
500 struct inode
*inode
= mapping
->host
;
501 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
502 struct pohmelfs_write_ctl
*ctl
;
504 pgoff_t end
; /* Inclusive */
505 int nr_pages
, err
= 0;
507 index
= wbc
->range_start
>> PAGE_CACHE_SHIFT
;
508 end
= wbc
->range_end
>> PAGE_CACHE_SHIFT
;
510 pr_debug("pohmelfs: %s: writepages: ino: %ld, nr: %ld, index: %llu, end: %llu, total_size: %lu, sync: %d\n",
511 pohmelfs_dump_id(pohmelfs_inode(inode
)->id
.id
), inode
->i_ino
,
512 wbc
->nr_to_write
, wbc
->range_start
, wbc
->range_end
, (unsigned long)inode
->i_size
, wbc
->sync_mode
);
/* nothing to write: empty range or empty file */
514 if ((!wbc
->range_start
&& !wbc
->range_end
) || !inode
->i_size
) {
519 while (index
<= end
) {
520 ctl
= kmem_cache_zalloc(pohmelfs_write_cache
, GFP_NOIO
);
526 kref_init(&ctl
->refcnt
);
527 atomic_set(&ctl
->good_writes
, 0);
528 ctl
->psb
= pohmelfs_sb(inode
->i_sb
);
/* grab up to PAGEVEC_SIZE dirty pages, advancing `index` */
530 nr_pages
= pagevec_lookup_tag(&ctl
->pvec
, mapping
, &index
, PAGECACHE_TAG_DIRTY
,
531 min(end
- index
, (pgoff_t
)PAGEVEC_SIZE
-1) + 1);
534 kmem_cache_free(pohmelfs_write_cache
, ctl
);
538 err
= pohmelfs_writepages_chunk(pi
, ctl
, wbc
);
/* sync writeback also flushes the inode metadata synchronously */
543 err
= pohmelfs_metadata_inode(pi
, wbc
->sync_mode
!= WB_SYNC_NONE
);
/*
 * Address-space operations for pohmelfs files: reads/writes go through the
 * networked readpage(s)/writepages paths above; write_begin/write_end use
 * the generic simple_* helpers, and dirtying avoids buffer heads.
 */
551 static const struct address_space_operations pohmelfs_aops
= {
552 .write_begin
= simple_write_begin
,
553 .write_end
= simple_write_end
,
554 .writepages
= pohmelfs_writepages
,
555 .readpage
= pohmelfs_readpage
,
556 .readpages
= pohmelfs_readpages
,
557 .set_page_dirty
= __set_page_dirty_nobuffers
,
560 void pohmelfs_convert_inode_info(struct pohmelfs_inode_info
*info
)
562 info
->ino
= cpu_to_le64(info
->ino
);
563 info
->mode
= cpu_to_le64(info
->mode
);
564 info
->nlink
= cpu_to_le64(info
->nlink
);
565 info
->uid
= cpu_to_le32(info
->uid
);
566 info
->gid
= cpu_to_le32(info
->gid
);
567 info
->namelen
= cpu_to_le32(info
->namelen
);
568 info
->blocks
= cpu_to_le64(info
->blocks
);
569 info
->rdev
= cpu_to_le64(info
->rdev
);
570 info
->size
= cpu_to_le64(info
->size
);
571 info
->version
= cpu_to_le64(info
->version
);
572 info
->blocksize
= cpu_to_le64(info
->blocksize
);
573 info
->flags
= cpu_to_le64(info
->flags
);
575 dnet_convert_time(&info
->ctime
);
576 dnet_convert_time(&info
->mtime
);
577 dnet_convert_time(&info
->atime
);
/*
 * pohmelfs_fill_inode_info() - serialize the VFS inode's attributes into
 * the wire-format pohmelfs_inode_info: copies the dnet id, then mirrors
 * ino/mode/nlink/uid/gid/blocks/rdev/size/version/blocksize and the three
 * timestamps. Endianness conversion is done separately by
 * pohmelfs_convert_inode_info().
 * NOTE(review): original lines 605-608 are missing from this extraction -
 * presumably only the closing brace/blank lines, but confirm no trailing
 * assignments (e.g. namelen/flags) were dropped.
 */
580 void pohmelfs_fill_inode_info(struct inode
*inode
, struct pohmelfs_inode_info
*info
)
582 struct pohmelfs_inode
*pi
= pohmelfs_inode(inode
);
584 memcpy(info
->id
.id
, pi
->id
.id
, DNET_ID_SIZE
);
586 info
->ino
= inode
->i_ino
;
587 info
->mode
= inode
->i_mode
;
588 info
->nlink
= inode
->i_nlink
;
589 info
->uid
= inode
->i_uid
;
590 info
->gid
= inode
->i_gid
;
591 info
->blocks
= inode
->i_blocks
;
592 info
->rdev
= inode
->i_rdev
;
593 info
->size
= inode
->i_size
;
594 info
->version
= inode
->i_version
;
595 info
->blocksize
= 1 << inode
->i_blkbits
;
597 info
->ctime
.tsec
= inode
->i_ctime
.tv_sec
;
598 info
->ctime
.tnsec
= inode
->i_ctime
.tv_nsec
;
600 info
->mtime
.tsec
= inode
->i_mtime
.tv_sec
;
601 info
->mtime
.tnsec
= inode
->i_mtime
.tv_nsec
;
603 info
->atime
.tsec
= inode
->i_atime
.tv_sec
;
604 info
->atime
.tnsec
= inode
->i_atime
.tv_nsec
;
/*
 * pohmelfs_fill_inode() - apply remote metadata @info to the VFS @inode,
 * but only if the remote mtime is not older than the local one (the two
 * comparisons below appear to implement "skip stale info" - the bodies of
 * those branches are missing from this extraction, confirm they return).
 * Copies the dnet id and mirrors mode/nlink/uid/gid/blocks/rdev/size/
 * version/blkbits and the three timestamps from @info.
 */
609 void pohmelfs_fill_inode(struct inode
*inode
, struct pohmelfs_inode_info
*info
)
611 pr_debug("pohmelfs: %s: ino: %lu inode is regular: %d, dir: %d, link: %d, mode: %o, "
612 "namelen: %u, size: %llu, state: %lx, mtime: %llu.%llu/%lu.%lu\n",
613 pohmelfs_dump_id(info
->id
.id
), inode
->i_ino
,
614 S_ISREG(inode
->i_mode
), S_ISDIR(inode
->i_mode
),
615 S_ISLNK(inode
->i_mode
), inode
->i_mode
, info
->namelen
, inode
->i_size
, inode
->i_state
,
616 (unsigned long long)info
->mtime
.tsec
, (unsigned long long)info
->mtime
.tnsec
,
617 inode
->i_mtime
.tv_sec
, inode
->i_mtime
.tv_nsec
);
/* remote info strictly older than local mtime: presumably skipped */
619 if (info
->mtime
.tsec
< inode
->i_mtime
.tv_sec
)
621 if ((info
->mtime
.tsec
== inode
->i_mtime
.tv_sec
) &&
622 (info
->mtime
.tnsec
< inode
->i_mtime
.tv_nsec
))
625 pohmelfs_inode(inode
)->id
= info
->id
;
627 inode
->i_mode
= info
->mode
;
628 inode
->i_nlink
= info
->nlink
;
629 inode
->i_uid
= info
->uid
;
630 inode
->i_gid
= info
->gid
;
631 inode
->i_blocks
= info
->blocks
;
632 inode
->i_rdev
= info
->rdev
;
633 inode
->i_size
= info
->size
;
634 inode
->i_version
= info
->version
;
/* blocksize is a power of two, so ffs() recovers the shift */
635 inode
->i_blkbits
= ffs(info
->blocksize
);
637 inode
->i_mtime
= pohmelfs_date(&info
->mtime
);
638 inode
->i_atime
= pohmelfs_date(&info
->atime
);
639 inode
->i_ctime
= pohmelfs_date(&info
->ctime
);
/*
 * pohmelfs_inode_info_current() - populate a fresh inode_info with
 * current-process defaults: link count from the (caller-set) mode,
 * fsuid/fsgid, PAGE_SIZE blocksize, the current time, and a newly
 * generated dnet id.
 * NOTE(review): fragmentary - the lines storing `dtime` into info's
 * ctime/mtime/atime (original ~658-662) and any size/flags defaults are
 * missing from this extraction; the dtime assignments below are otherwise
 * dead, so confirm against the full file.
 */
642 static void pohmelfs_inode_info_current(struct pohmelfs_sb
*psb
, struct pohmelfs_inode_info
*info
)
644 struct timespec ts
= CURRENT_TIME
;
645 struct dnet_time dtime
;
/* directories start with "." and "..", hence 2 links */
647 info
->nlink
= S_ISDIR(info
->mode
) ? 2 : 1;
648 info
->uid
= current_fsuid();
649 info
->gid
= current_fsgid();
651 info
->blocksize
= PAGE_SIZE
;
656 dtime
.tsec
= ts
.tv_sec
;
657 dtime
.tnsec
= ts
.tv_nsec
;
663 pohmelfs_gen_id(psb
, &info
->id
);
/*
 * pohmelfs_existing_inode() - materialize a VFS inode for remote metadata
 * @info: allocate it with a fresh local inode number (psb->ino counter),
 * and if it is new, fill it from @info, wire up the aops and the
 * type-specific file/inode operations, insert it into the superblock's
 * rb-tree, and unlock it.
 * NOTE(review): fragmentary - the declarations of `inode`/`err`, the
 * iget_locked() failure check, the error path between the two
 * unlock_new_inode() calls, and the final return of `pi` are among the
 * missing lines; confirm against the full file.
 */
666 struct pohmelfs_inode
*pohmelfs_existing_inode(struct pohmelfs_sb
*psb
, struct pohmelfs_inode_info
*info
)
668 struct pohmelfs_inode
*pi
;
/* local ino is just a monotonically increasing per-sb counter */
672 inode
= iget_locked(psb
->sb
, atomic_long_inc_return(&psb
->ino
));
678 pi
= pohmelfs_inode(inode
);
680 if (inode
->i_state
& I_NEW
) {
681 pohmelfs_fill_inode(inode
, info
);
/*
683 * i_mapping is a pointer to i_data during inode initialization.
 */
685 inode
->i_data
.a_ops
= &pohmelfs_aops
;
687 if (S_ISREG(inode
->i_mode
)) {
688 inode
->i_fop
= &pohmelfs_file_ops
;
689 inode
->i_op
= &pohmelfs_file_inode_operations
;
690 } else if (S_ISDIR(inode
->i_mode
)) {
691 inode
->i_fop
= &pohmelfs_dir_fops
;
692 inode
->i_op
= &pohmelfs_dir_inode_operations
;
693 } else if (S_ISLNK(inode
->i_mode
)) {
694 inode
->i_op
= &pohmelfs_symlink_inode_operations
;
695 inode
->i_mapping
->a_ops
= &pohmelfs_aops
;
/* default for special files: read-only generic fops */
697 inode
->i_fop
= &generic_ro_fops
;
700 err
= pohmelfs_sb_inode_insert(psb
, pi
);
704 unlock_new_inode(inode
);
710 unlock_new_inode(inode
);
/*
 * pohmelfs_new_inode() - create a brand-new inode of type @mode: allocate
 * a wire-format info block, fill it with current defaults (uid/gid/time/
 * fresh id) and build the VFS inode from it. The info block is freed on
 * both the success and error paths visible below.
 * NOTE(review): fragmentary - the allocation failure check, the
 * mode/flags assignments into info (original ~723-729), the IS_ERR check
 * on `pi` and the returns are among the missing lines.
 */
716 struct pohmelfs_inode
*pohmelfs_new_inode(struct pohmelfs_sb
*psb
, int mode
)
718 struct pohmelfs_inode
*pi
;
719 struct pohmelfs_inode_info
*info
;
722 info
= kmem_cache_zalloc(pohmelfs_inode_info_cache
, GFP_NOIO
);
730 pohmelfs_inode_info_current(psb
, info
);
732 pi
= pohmelfs_existing_inode(psb
, info
);
/* info is only a template; the inode keeps its own copy */
738 kmem_cache_free(pohmelfs_inode_info_cache
, info
);
742 kmem_cache_free(pohmelfs_inode_info_cache
, info
);
/*
 * pohmelfs_wait_alloc() - allocate a wait object bound to @pi: pins the
 * inode via igrab() (released in pohmelfs_wait_free()), zeroes the byte
 * counter, and initializes the waitqueue and refcount.
 * NOTE(review): fragmentary - the allocation failure branch, the
 * assignment `wait->pi = pi`, the return of `wait` and the error labels
 * before the kmem_cache_free() below are among the missing lines.
 */
747 struct pohmelfs_wait
*pohmelfs_wait_alloc(struct pohmelfs_inode
*pi
)
749 struct pohmelfs_wait
*wait
;
751 wait
= kmem_cache_zalloc(pohmelfs_wait_cache
, GFP_NOIO
);
/* inode may already be on its way out; bail if we cannot pin it */
756 if (!igrab(&pi
->vfs_inode
))
761 atomic_long_set(&wait
->count
, 0);
762 init_waitqueue_head(&wait
->wq
);
763 kref_init(&wait
->refcnt
);
768 kmem_cache_free(pohmelfs_wait_cache
, wait
);
/*
 * pohmelfs_wait_free() - kref release callback for a wait object: frees
 * the object and (presumably, via the missing original lines ~777-778)
 * drops the inode reference taken by igrab() in pohmelfs_wait_alloc() -
 * confirm the iput() against the full file, otherwise `inode` below is
 * computed but unused.
 */
773 static void pohmelfs_wait_free(struct kref
*kref
)
775 struct pohmelfs_wait
*wait
= container_of(kref
, struct pohmelfs_wait
, refcnt
);
776 struct inode
*inode
= &wait
->pi
->vfs_inode
;
779 kmem_cache_free(pohmelfs_wait_cache
, wait
);
782 void pohmelfs_wait_put(struct pohmelfs_wait
*wait
)
784 kref_put(&wait
->refcnt
, pohmelfs_wait_free
);