fs/fuse/file.c

   1 /*
   2   FUSE: Filesystem in Userspace
   3   Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
   4
   5   This program can be distributed under the terms of the GNU GPL.
   6   See the file COPYING.
   7 */
   8
   9 #include "fuse_i.h"
  10
  11 #include <linux/pagemap.h>
  12 #include <linux/slab.h>
  13 #include <linux/kernel.h>
  14 #include <linux/sched.h>
  15 #include <linux/sched/signal.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/falloc.h>
  19 #include <linux/uio.h>
  20 #include <linux/fs.h>
  21 #include <linux/filelock.h>
  22 #include <linux/splice.h>
  23 #include <linux/task_io_accounting_ops.h>
  24
  25 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
  26                           unsigned int open_flags, int opcode,
  27                           struct fuse_open_out *outargp)
  28 {
  29         struct fuse_open_in inarg;
  30         FUSE_ARGS(args);
  31
  32         memset(&inarg, 0, sizeof(inarg));
  33         inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
  34         if (!fm->fc->atomic_o_trunc)
  35                 inarg.flags &= ~O_TRUNC;
  36
  37         if (fm->fc->handle_killpriv_v2 &&
  38             (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
  39                 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
  40         }
  41
  42         args.opcode = opcode;
  43         args.nodeid = nodeid;
  44         args.in_numargs = 1;
  45         args.in_args[0].size = sizeof(inarg);
  46         args.in_args[0].value = &inarg;
  47         args.out_numargs = 1;
  48         args.out_args[0].size = sizeof(*outargp);
  49         args.out_args[0].value = outargp;
  50
  51         return fuse_simple_request(fm, &args);
  52 }
  53
  54 struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
  55 {
  56         struct fuse_file *ff;
  57
  58         ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
  59         if (unlikely(!ff))
  60                 return NULL;
  61
  62         ff->fm = fm;
  63         if (release) {
  64                 ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
  65                 if (!ff->args) {
  66                         kfree(ff);
  67                         return NULL;
  68                 }
  69         }
  70
  71         INIT_LIST_HEAD(&ff->write_entry);
  72         refcount_set(&ff->count, 1);
  73         RB_CLEAR_NODE(&ff->polled_node);
  74         init_waitqueue_head(&ff->poll_wait);
  75
  76         ff->kh = atomic64_inc_return(&fm->fc->khctr);
  77
  78         return ff;
  79 }
  80
  81 void fuse_file_free(struct fuse_file *ff)
  82 {
  83         kfree(ff->args);
  84         kfree(ff);
  85 }
  86
  87 static struct fuse_file *fuse_file_get(struct fuse_file *ff)
  88 {
  89         refcount_inc(&ff->count);
  90         return ff;
  91 }
  92
  93 static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
  94                              int error)
  95 {
  96         struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
  97
  98         iput(ra->inode);
  99         kfree(ra);
 100 }
 101
 102 static void fuse_file_put(struct fuse_file *ff, bool sync)
 103 {
 104         if (refcount_dec_and_test(&ff->count)) {
 105                 struct fuse_release_args *ra = &ff->args->release_args;
 106                 struct fuse_args *args = (ra ? &ra->args : NULL);
 107
 108                 if (ra && ra->inode)
 109                         fuse_file_io_release(ff, ra->inode);
 110
 111                 if (!args) {
 112                         /* Do nothing when server does not implement 'open' */
 113                 } else if (sync) {
 114                         fuse_simple_request(ff->fm, args);
 115                         fuse_release_end(ff->fm, args, 0);
 116                 } else {
 117                         args->end = fuse_release_end;
 118                         if (fuse_simple_background(ff->fm, args,
 119                                                    GFP_KERNEL | __GFP_NOFAIL))
 120                                 fuse_release_end(ff->fm, args, -ENOTCONN);
 121                 }
 122                 kfree(ff);
 123         }
 124 }
 125
 126 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 127                                  unsigned int open_flags, bool isdir)
 128 {
 129         struct fuse_conn *fc = fm->fc;
 130         struct fuse_file *ff;
 131         int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 132         bool open = isdir ? !fc->no_opendir : !fc->no_open;
 133
 134         ff = fuse_file_alloc(fm, open);
 135         if (!ff)
 136                 return ERR_PTR(-ENOMEM);
 137
 138         ff->fh = 0;
 139         /* Default for no-open */
 140         ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
 141         if (open) {
 142                 /* Store outarg for fuse_finish_open() */
 143                 struct fuse_open_out *outargp = &ff->args->open_outarg;
 144                 int err;
 145
 146                 err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
 147                 if (!err) {
 148                         ff->fh = outargp->fh;
 149                         ff->open_flags = outargp->open_flags;
 150                 } else if (err != -ENOSYS) {
 151                         fuse_file_free(ff);
 152                         return ERR_PTR(err);
 153                 } else {
 154                         /* No release needed */
 155                         kfree(ff->args);
 156                         ff->args = NULL;
 157                         if (isdir)
 158                                 fc->no_opendir = 1;
 159                         else
 160                                 fc->no_open = 1;
 161                 }
 162         }
 163
 164         if (isdir)
 165                 ff->open_flags &= ~FOPEN_DIRECT_IO;
 166
 167         ff->nodeid = nodeid;
 168
 169         return ff;
 170 }
 171
 172 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
 173                  bool isdir)
 174 {
 175         struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
 176
 177         if (!IS_ERR(ff))
 178                 file->private_data = ff;
 179
 180         return PTR_ERR_OR_ZERO(ff);
 181 }
 182 EXPORT_SYMBOL_GPL(fuse_do_open);
 183
 184 static void fuse_link_write_file(struct file *file)
 185 {
 186         struct inode *inode = file_inode(file);
 187         struct fuse_inode *fi = get_fuse_inode(inode);
 188         struct fuse_file *ff = file->private_data;
 189         /*
 190          * file may be written through mmap, so chain it onto the
 191          * inodes's write_file list
 192          */
 193         spin_lock(&fi->lock);
 194         if (list_empty(&ff->write_entry))
 195                 list_add(&ff->write_entry, &fi->write_files);
 196         spin_unlock(&fi->lock);
 197 }
 198
 199 int fuse_finish_open(struct inode *inode, struct file *file)
 200 {
 201         struct fuse_file *ff = file->private_data;
 202         struct fuse_conn *fc = get_fuse_conn(inode);
 203         int err;
 204
 205         err = fuse_file_io_open(file, inode);
 206         if (err)
 207                 return err;
 208
 209         if (ff->open_flags & FOPEN_STREAM)
 210                 stream_open(inode, file);
 211         else if (ff->open_flags & FOPEN_NONSEEKABLE)
 212                 nonseekable_open(inode, file);
 213
 214         if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 215                 fuse_link_write_file(file);
 216
 217         return 0;
 218 }
 219
 220 static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
 221 {
 222         struct fuse_conn *fc = get_fuse_conn(inode);
 223         struct fuse_inode *fi = get_fuse_inode(inode);
 224
 225         spin_lock(&fi->lock);
 226         fi->attr_version = atomic64_inc_return(&fc->attr_version);
 227         i_size_write(inode, 0);
 228         spin_unlock(&fi->lock);
 229         file_update_time(file);
 230         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 231 }
 232
 233 static int fuse_open(struct inode *inode, struct file *file)
 234 {
 235         struct fuse_mount *fm = get_fuse_mount(inode);
 236         struct fuse_inode *fi = get_fuse_inode(inode);
 237         struct fuse_conn *fc = fm->fc;
 238         struct fuse_file *ff;
 239         int err;
 240         bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
 241         bool is_wb_truncate = is_truncate && fc->writeback_cache;
 242         bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
 243
 244         if (fuse_is_bad(inode))
 245                 return -EIO;
 246
 247         err = generic_file_open(inode, file);
 248         if (err)
 249                 return err;
 250
 251         if (is_wb_truncate || dax_truncate)
 252                 inode_lock(inode);
 253
 254         if (dax_truncate) {
 255                 filemap_invalidate_lock(inode->i_mapping);
 256                 err = fuse_dax_break_layouts(inode, 0, 0);
 257                 if (err)
 258                         goto out_inode_unlock;
 259         }
 260
 261         if (is_wb_truncate || dax_truncate)
 262                 fuse_set_nowrite(inode);
 263
 264         err = fuse_do_open(fm, get_node_id(inode), file, false);
 265         if (!err) {
 266                 ff = file->private_data;
 267                 err = fuse_finish_open(inode, file);
 268                 if (err)
 269                         fuse_sync_release(fi, ff, file->f_flags);
 270                 else if (is_truncate)
 271                         fuse_truncate_update_attr(inode, file);
 272         }
 273
 274         if (is_wb_truncate || dax_truncate)
 275                 fuse_release_nowrite(inode);
 276         if (!err) {
 277                 if (is_truncate)
 278                         truncate_pagecache(inode, 0);
 279                 else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 280                         invalidate_inode_pages2(inode->i_mapping);
 281         }
 282         if (dax_truncate)
 283                 filemap_invalidate_unlock(inode->i_mapping);
 284 out_inode_unlock:
 285         if (is_wb_truncate || dax_truncate)
 286                 inode_unlock(inode);
 287
 288         return err;
 289 }
 290
 291 static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 292                                  unsigned int flags, int opcode, bool sync)
 293 {
 294         struct fuse_conn *fc = ff->fm->fc;
 295         struct fuse_release_args *ra = &ff->args->release_args;
 296
 297         if (fuse_file_passthrough(ff))
 298                 fuse_passthrough_release(ff, fuse_inode_backing(fi));
 299
 300         /* Inode is NULL on error path of fuse_create_open() */
 301         if (likely(fi)) {
 302                 spin_lock(&fi->lock);
 303                 list_del(&ff->write_entry);
 304                 spin_unlock(&fi->lock);
 305         }
 306         spin_lock(&fc->lock);
 307         if (!RB_EMPTY_NODE(&ff->polled_node))
 308                 rb_erase(&ff->polled_node, &fc->polled_files);
 309         spin_unlock(&fc->lock);
 310
 311         wake_up_interruptible_all(&ff->poll_wait);
 312
 313         if (!ra)
 314                 return;
 315
 316         /* ff->args was used for open outarg */
 317         memset(ff->args, 0, sizeof(*ff->args));
 318         ra->inarg.fh = ff->fh;
 319         ra->inarg.flags = flags;
 320         ra->args.in_numargs = 1;
 321         ra->args.in_args[0].size = sizeof(struct fuse_release_in);
 322         ra->args.in_args[0].value = &ra->inarg;
 323         ra->args.opcode = opcode;
 324         ra->args.nodeid = ff->nodeid;
 325         ra->args.force = true;
 326         ra->args.nocreds = true;
 327
 328         /*
 329          * Hold inode until release is finished.
 330          * From fuse_sync_release() the refcount is 1 and everything's
 331          * synchronous, so we are fine with not doing igrab() here.
 332          */
 333         ra->inode = sync ? NULL : igrab(&fi->inode);
 334 }
 335
 336 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 337                        unsigned int open_flags, fl_owner_t id, bool isdir)
 338 {
 339         struct fuse_inode *fi = get_fuse_inode(inode);
 340         struct fuse_release_args *ra = &ff->args->release_args;
 341         int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 342
 343         fuse_prepare_release(fi, ff, open_flags, opcode, false);
 344
 345         if (ra && ff->flock) {
 346                 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
 347                 ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
 348         }
 349
 350         /*
 351          * Normally this will send the RELEASE request, however if
 352          * some asynchronous READ or WRITE requests are outstanding,
 353          * the sending will be delayed.
 354          *
 355          * Make the release synchronous if this is a fuseblk mount,
 356          * synchronous RELEASE is allowed (and desirable) in this case
 357          * because the server can be trusted not to screw up.
 358          */
 359         fuse_file_put(ff, ff->fm->fc->destroy);
 360 }
 361
 362 void fuse_release_common(struct file *file, bool isdir)
 363 {
 364         fuse_file_release(file_inode(file), file->private_data, file->f_flags,
 365                           (fl_owner_t) file, isdir);
 366 }
 367
 368 static int fuse_release(struct inode *inode, struct file *file)
 369 {
 370         struct fuse_conn *fc = get_fuse_conn(inode);
 371
 372         /*
 373          * Dirty pages might remain despite write_inode_now() call from
 374          * fuse_flush() due to writes racing with the close.
 375          */
 376         if (fc->writeback_cache)
 377                 write_inode_now(inode, 1);
 378
 379         fuse_release_common(file, false);
 380
 381         /* return value is ignored by VFS */
 382         return 0;
 383 }
 384
 385 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
 386                        unsigned int flags)
 387 {
 388         WARN_ON(refcount_read(&ff->count) > 1);
 389         fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
 390         fuse_file_put(ff, true);
 391 }
 392 EXPORT_SYMBOL_GPL(fuse_sync_release);
 393
 394 /*
 395  * Scramble the ID space with XTEA, so that the value of the files_struct
 396  * pointer is not exposed to userspace.
 397  */
 398 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 399 {
 400         u32 *k = fc->scramble_key;
 401         u64 v = (unsigned long) id;
 402         u32 v0 = v;
 403         u32 v1 = v >> 32;
 404         u32 sum = 0;
 405         int i;
 406
 407         for (i = 0; i < 32; i++) {
 408                 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 409                 sum += 0x9E3779B9;
 410                 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
 411         }
 412
 413         return (u64) v0 + ((u64) v1 << 32);
 414 }
 415
 416 struct fuse_writepage_args {
 417         struct fuse_io_args ia;
 418         struct rb_node writepages_entry;
 419         struct list_head queue_entry;
 420         struct fuse_writepage_args *next;
 421         struct inode *inode;
 422         struct fuse_sync_bucket *bucket;
 423 };
 424
 425 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 426                                             pgoff_t idx_from, pgoff_t idx_to)
 427 {
 428         struct rb_node *n;
 429
 430         n = fi->writepages.rb_node;
 431
 432         while (n) {
 433                 struct fuse_writepage_args *wpa;
 434                 pgoff_t curr_index;
 435
 436                 wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
 437                 WARN_ON(get_fuse_inode(wpa->inode) != fi);
 438                 curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
 439                 if (idx_from >= curr_index + wpa->ia.ap.num_folios)
 440                         n = n->rb_right;
 441                 else if (idx_to < curr_index)
 442                         n = n->rb_left;
 443                 else
 444                         return wpa;
 445         }
 446         return NULL;
 447 }
 448
 449 /*
 450  * Check if any page in a range is under writeback
 451  */
 452 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 453                                    pgoff_t idx_to)
 454 {
 455         struct fuse_inode *fi = get_fuse_inode(inode);
 456         bool found;
 457
 458         if (RB_EMPTY_ROOT(&fi->writepages))
 459                 return false;
 460
 461         spin_lock(&fi->lock);
 462         found = fuse_find_writeback(fi, idx_from, idx_to);
 463         spin_unlock(&fi->lock);
 464
 465         return found;
 466 }
 467
 468 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 469 {
 470         return fuse_range_is_writeback(inode, index, index);
 471 }
 472
 473 /*
 474  * Wait for page writeback to be completed.
 475  *
 476  * Since fuse doesn't rely on the VM writeback tracking, this has to
 477  * use some other means.
 478  */
 479 static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 480 {
 481         struct fuse_inode *fi = get_fuse_inode(inode);
 482
 483         wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
 484 }
 485
 486 static inline bool fuse_folio_is_writeback(struct inode *inode,
 487                                            struct folio *folio)
 488 {
 489         pgoff_t last = folio_next_index(folio) - 1;
 490         return fuse_range_is_writeback(inode, folio_index(folio), last);
 491 }
 492
 493 static void fuse_wait_on_folio_writeback(struct inode *inode,
 494                                          struct folio *folio)
 495 {
 496         struct fuse_inode *fi = get_fuse_inode(inode);
 497
 498         wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
 499 }
 500
 501 /*
 502  * Wait for all pending writepages on the inode to finish.
 503  *
 504  * This is currently done by blocking further writes with FUSE_NOWRITE
 505  * and waiting for all sent writes to complete.
 506  *
 507  * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 508  * could conflict with truncation.
 509  */
 510 static void fuse_sync_writes(struct inode *inode)
 511 {
 512         fuse_set_nowrite(inode);
 513         fuse_release_nowrite(inode);
 514 }
 515
 516 static int fuse_flush(struct file *file, fl_owner_t id)
 517 {
 518         struct inode *inode = file_inode(file);
 519         struct fuse_mount *fm = get_fuse_mount(inode);
 520         struct fuse_file *ff = file->private_data;
 521         struct fuse_flush_in inarg;
 522         FUSE_ARGS(args);
 523         int err;
 524
 525         if (fuse_is_bad(inode))
 526                 return -EIO;
 527
 528         if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
 529                 return 0;
 530
 531         err = write_inode_now(inode, 1);
 532         if (err)
 533                 return err;
 534
 535         inode_lock(inode);
 536         fuse_sync_writes(inode);
 537         inode_unlock(inode);
 538
 539         err = filemap_check_errors(file->f_mapping);
 540         if (err)
 541                 return err;
 542
 543         err = 0;
 544         if (fm->fc->no_flush)
 545                 goto inval_attr_out;
 546
 547         memset(&inarg, 0, sizeof(inarg));
 548         inarg.fh = ff->fh;
 549         inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
 550         args.opcode = FUSE_FLUSH;
 551         args.nodeid = get_node_id(inode);
 552         args.in_numargs = 1;
 553         args.in_args[0].size = sizeof(inarg);
 554         args.in_args[0].value = &inarg;
 555         args.force = true;
 556
 557         err = fuse_simple_request(fm, &args);
 558         if (err == -ENOSYS) {
 559                 fm->fc->no_flush = 1;
 560                 err = 0;
 561         }
 562
 563 inval_attr_out:
 564         /*
 565          * In memory i_blocks is not maintained by fuse, if writeback cache is
 566          * enabled, i_blocks from cached attr may not be accurate.
 567          */
 568         if (!err && fm->fc->writeback_cache)
 569                 fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
 570         return err;
 571 }
 572
 573 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 574                       int datasync, int opcode)
 575 {
 576         struct inode *inode = file->f_mapping->host;
 577         struct fuse_mount *fm = get_fuse_mount(inode);
 578         struct fuse_file *ff = file->private_data;
 579         FUSE_ARGS(args);
 580         struct fuse_fsync_in inarg;
 581
 582         memset(&inarg, 0, sizeof(inarg));
 583         inarg.fh = ff->fh;
 584         inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
 585         args.opcode = opcode;
 586         args.nodeid = get_node_id(inode);
 587         args.in_numargs = 1;
 588         args.in_args[0].size = sizeof(inarg);
 589         args.in_args[0].value = &inarg;
 590         return fuse_simple_request(fm, &args);
 591 }
 592
 593 static int fuse_fsync(struct file *file, loff_t start, loff_t end,
 594                       int datasync)
 595 {
 596         struct inode *inode = file->f_mapping->host;
 597         struct fuse_conn *fc = get_fuse_conn(inode);
 598         int err;
 599
 600         if (fuse_is_bad(inode))
 601                 return -EIO;
 602
 603         inode_lock(inode);
 604
 605         /*
 606          * Start writeback against all dirty pages of the inode, then
 607          * wait for all outstanding writes, before sending the FSYNC
 608          * request.
 609          */
 610         err = file_write_and_wait_range(file, start, end);
 611         if (err)
 612                 goto out;
 613
 614         fuse_sync_writes(inode);
 615
 616         /*
 617          * Due to implementation of fuse writeback
 618          * file_write_and_wait_range() does not catch errors.
 619          * We have to do this directly after fuse_sync_writes()
 620          */
 621         err = file_check_and_advance_wb_err(file);
 622         if (err)
 623                 goto out;
 624
 625         err = sync_inode_metadata(inode, 1);
 626         if (err)
 627                 goto out;
 628
 629         if (fc->no_fsync)
 630                 goto out;
 631
 632         err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
 633         if (err == -ENOSYS) {
 634                 fc->no_fsync = 1;
 635                 err = 0;
 636         }
 637 out:
 638         inode_unlock(inode);
 639
 640         return err;
 641 }
 642
 643 void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
 644                          size_t count, int opcode)
 645 {
 646         struct fuse_file *ff = file->private_data;
 647         struct fuse_args *args = &ia->ap.args;
 648
 649         ia->read.in.fh = ff->fh;
 650         ia->read.in.offset = pos;
 651         ia->read.in.size = count;
 652         ia->read.in.flags = file->f_flags;
 653         args->opcode = opcode;
 654         args->nodeid = ff->nodeid;
 655         args->in_numargs = 1;
 656         args->in_args[0].size = sizeof(ia->read.in);
 657         args->in_args[0].value = &ia->read.in;
 658         args->out_argvar = true;
 659         args->out_numargs = 1;
 660         args->out_args[0].size = count;
 661 }
 662
 663 static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres,
 664                                     bool should_dirty)
 665 {
 666         unsigned int i;
 667
 668         for (i = 0; i < ap->num_folios; i++) {
 669                 if (should_dirty)
 670                         folio_mark_dirty_lock(ap->folios[i]);
 671                 if (ap->args.is_pinned)
 672                         unpin_folio(ap->folios[i]);
 673         }
 674
 675         if (nres > 0 && ap->args.invalidate_vmap)
 676                 invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
 677 }
 678
 679 static void fuse_io_release(struct kref *kref)
 680 {
 681         kfree(container_of(kref, struct fuse_io_priv, refcnt));
 682 }
 683
 684 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
 685 {
 686         if (io->err)
 687                 return io->err;
 688
 689         if (io->bytes >= 0 && io->write)
 690                 return -EIO;
 691
 692         return io->bytes < 0 ? io->size : io->bytes;
 693 }
 694
 695 /*
 696  * In case of short read, the caller sets 'pos' to the position of
 697  * actual end of fuse request in IO request. Otherwise, if bytes_requested
 698  * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 699  *
 700  * An example:
 701  * User requested DIO read of 64K. It was split into two 32K fuse requests,
 702  * both submitted asynchronously. The first of them was ACKed by userspace as
 703  * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 704  * second request was ACKed as short, e.g. only 1K was read, resulting in
 705  * pos == 33K.
 706  *
 707  * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 708  * will be equal to the length of the longest contiguous fragment of
 709  * transferred data starting from the beginning of IO request.
 710  */
 711 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 712 {
 713         int left;
 714
 715         spin_lock(&io->lock);
 716         if (err)
 717                 io->err = io->err ? : err;
 718         else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
 719                 io->bytes = pos;
 720
 721         left = --io->reqs;
 722         if (!left && io->blocking)
 723                 complete(io->done);
 724         spin_unlock(&io->lock);
 725
 726         if (!left && !io->blocking) {
 727                 ssize_t res = fuse_get_res_by_io(io);
 728
 729                 if (res >= 0) {
 730                         struct inode *inode = file_inode(io->iocb->ki_filp);
 731                         struct fuse_conn *fc = get_fuse_conn(inode);
 732                         struct fuse_inode *fi = get_fuse_inode(inode);
 733
 734                         spin_lock(&fi->lock);
 735                         fi->attr_version = atomic64_inc_return(&fc->attr_version);
 736                         spin_unlock(&fi->lock);
 737                 }
 738
 739                 io->iocb->ki_complete(io->iocb, res);
 740         }
 741
 742         kref_put(&io->refcnt, fuse_io_release);
 743 }
 744
 745 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
 746                                                  unsigned int nfolios)
 747 {
 748         struct fuse_io_args *ia;
 749
 750         ia = kzalloc(sizeof(*ia), GFP_KERNEL);
 751         if (ia) {
 752                 ia->io = io;
 753                 ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL,
 754                                                   &ia->ap.descs);
 755                 if (!ia->ap.folios) {
 756                         kfree(ia);
 757                         ia = NULL;
 758                 }
 759         }
 760         return ia;
 761 }
 762
 763 static void fuse_io_free(struct fuse_io_args *ia)
 764 {
 765         kfree(ia->ap.folios);
 766         kfree(ia);
 767 }
 768
 769 static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
 770                                   int err)
 771 {
 772         struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 773         struct fuse_io_priv *io = ia->io;
 774         ssize_t pos = -1;
 775         size_t nres;
 776
 777         if (err) {
 778                 /* Nothing */
 779         } else if (io->write) {
 780                 if (ia->write.out.size > ia->write.in.size) {
 781                         err = -EIO;
 782                 } else {
 783                         nres = ia->write.out.size;
 784                         if (ia->write.in.size != ia->write.out.size)
 785                                 pos = ia->write.in.offset - io->offset +
 786                                       ia->write.out.size;
 787                 }
 788         } else {
 789                 u32 outsize = args->out_args[0].size;
 790
 791                 nres = outsize;
 792                 if (ia->read.in.size != outsize)
 793                         pos = ia->read.in.offset - io->offset + outsize;
 794         }
 795
 796         fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty);
 797
 798         fuse_aio_complete(io, err, pos);
 799         fuse_io_free(ia);
 800 }
 801
 802 static ssize_t fuse_async_req_send(struct fuse_mount *fm,
 803                                    struct fuse_io_args *ia, size_t num_bytes)
 804 {
 805         ssize_t err;
 806         struct fuse_io_priv *io = ia->io;
 807
 808         spin_lock(&io->lock);
 809         kref_get(&io->refcnt);
 810         io->size += num_bytes;
 811         io->reqs++;
 812         spin_unlock(&io->lock);
 813
 814         ia->ap.args.end = fuse_aio_complete_req;
 815         ia->ap.args.may_block = io->should_dirty;
 816         err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
 817         if (err)
 818                 fuse_aio_complete_req(fm, &ia->ap.args, err);
 819
 820         return num_bytes;
 821 }
 822
 823 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
 824                               fl_owner_t owner)
 825 {
 826         struct file *file = ia->io->iocb->ki_filp;
 827         struct fuse_file *ff = file->private_data;
 828         struct fuse_mount *fm = ff->fm;
 829
 830         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 831         if (owner != NULL) {
 832                 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
 833                 ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
 834         }
 835
 836         if (ia->io->async)
 837                 return fuse_async_req_send(fm, ia, count);
 838
 839         return fuse_simple_request(fm, &ia->ap.args);
 840 }
 841
 842 static void fuse_read_update_size(struct inode *inode, loff_t size,
 843                                   u64 attr_ver)
 844 {
 845         struct fuse_conn *fc = get_fuse_conn(inode);
 846         struct fuse_inode *fi = get_fuse_inode(inode);
 847
 848         spin_lock(&fi->lock);
 849         if (attr_ver >= fi->attr_version && size < inode->i_size &&
 850             !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 851                 fi->attr_version = atomic64_inc_return(&fc->attr_version);
 852                 i_size_write(inode, size);
 853         }
 854         spin_unlock(&fi->lock);
 855 }
 856
 857 static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
 858                             struct fuse_args_pages *ap)
 859 {
 860         struct fuse_conn *fc = get_fuse_conn(inode);
 861
 862         /*
 863          * If writeback_cache is enabled, a short read means there's a hole in
 864          * the file.  Some data after the hole is in page cache, but has not
 865          * reached the client fs yet.  So the hole is not present there.
 866          */
 867         if (!fc->writeback_cache) {
 868                 loff_t pos = folio_pos(ap->folios[0]) + num_read;
 869                 fuse_read_update_size(inode, pos, attr_ver);
 870         }
 871 }
 872
 873 static int fuse_do_readfolio(struct file *file, struct folio *folio)
 874 {
 875         struct inode *inode = folio->mapping->host;
 876         struct fuse_mount *fm = get_fuse_mount(inode);
 877         loff_t pos = folio_pos(folio);
 878         struct fuse_folio_desc desc = { .length = PAGE_SIZE };
 879         struct fuse_io_args ia = {
 880                 .ap.args.page_zeroing = true,
 881                 .ap.args.out_pages = true,
 882                 .ap.num_folios = 1,
 883                 .ap.folios = &folio,
 884                 .ap.descs = &desc,
 885         };
 886         ssize_t res;
 887         u64 attr_ver;
 888
 889         /*
 890          * With the temporary pages that are used to complete writeback, we can
 891          * have writeback that extends beyond the lifetime of the folio.  So
 892          * make sure we read a properly synced folio.
 893          */
 894         fuse_wait_on_folio_writeback(inode, folio);
 895
 896         attr_ver = fuse_get_attr_version(fm->fc);
 897
 898         /* Don't overflow end offset */
 899         if (pos + (desc.length - 1) == LLONG_MAX)
 900                 desc.length--;
 901
 902         fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
 903         res = fuse_simple_request(fm, &ia.ap.args);
 904         if (res < 0)
 905                 return res;
 906         /*
 907          * Short read means EOF.  If file size is larger, truncate it
 908          */
 909         if (res < desc.length)
 910                 fuse_short_read(inode, attr_ver, res, &ia.ap);
 911
 912         folio_mark_uptodate(folio);
 913
 914         return 0;
 915 }
 916
 917 static int fuse_read_folio(struct file *file, struct folio *folio)
 918 {
 919         struct inode *inode = folio->mapping->host;
 920         int err;
 921
 922         err = -EIO;
 923         if (fuse_is_bad(inode))
 924                 goto out;
 925
 926         err = fuse_do_readfolio(file, folio);
 927         fuse_invalidate_atime(inode);
 928  out:
 929         folio_unlock(folio);
 930         return err;
 931 }
 932
 933 static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
 934                                int err)
 935 {
 936         int i;
 937         struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 938         struct fuse_args_pages *ap = &ia->ap;
 939         size_t count = ia->read.in.size;
 940         size_t num_read = args->out_args[0].size;
 941         struct address_space *mapping = NULL;
 942
 943         for (i = 0; mapping == NULL && i < ap->num_folios; i++)
 944                 mapping = ap->folios[i]->mapping;
 945
 946         if (mapping) {
 947                 struct inode *inode = mapping->host;
 948
 949                 /*
 950                  * Short read means EOF. If file size is larger, truncate it
 951                  */
 952                 if (!err && num_read < count)
 953                         fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
 954
 955                 fuse_invalidate_atime(inode);
 956         }
 957
 958         for (i = 0; i < ap->num_folios; i++)
 959                 folio_end_read(ap->folios[i], !err);
 960         if (ia->ff)
 961                 fuse_file_put(ia->ff, false);
 962
 963         fuse_io_free(ia);
 964 }
 965
 966 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 967 {
 968         struct fuse_file *ff = file->private_data;
 969         struct fuse_mount *fm = ff->fm;
 970         struct fuse_args_pages *ap = &ia->ap;
 971         loff_t pos = folio_pos(ap->folios[0]);
 972         /* Currently, all folios in FUSE are one page */
 973         size_t count = ap->num_folios << PAGE_SHIFT;
 974         ssize_t res;
 975         int err;
 976
 977         ap->args.out_pages = true;
 978         ap->args.page_zeroing = true;
 979         ap->args.page_replace = true;
 980
 981         /* Don't overflow end offset */
 982         if (pos + (count - 1) == LLONG_MAX) {
 983                 count--;
 984                 ap->descs[ap->num_folios - 1].length--;
 985         }
 986         WARN_ON((loff_t) (pos + count) < 0);
 987
 988         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 989         ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 990         if (fm->fc->async_read) {
 991                 ia->ff = fuse_file_get(ff);
 992                 ap->args.end = fuse_readpages_end;
 993                 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
 994                 if (!err)
 995                         return;
 996         } else {
 997                 res = fuse_simple_request(fm, &ap->args);
 998                 err = res < 0 ? res : 0;
 999         }
1000         fuse_readpages_end(fm, &ap->args, err);
1001 }
1002
1003 static void fuse_readahead(struct readahead_control *rac)
1004 {
1005         struct inode *inode = rac->mapping->host;
1006         struct fuse_inode *fi = get_fuse_inode(inode);
1007         struct fuse_conn *fc = get_fuse_conn(inode);
1008         unsigned int max_pages, nr_pages;
1009         pgoff_t first = readahead_index(rac);
1010         pgoff_t last = first + readahead_count(rac) - 1;
1011
1012         if (fuse_is_bad(inode))
1013                 return;
1014
1015         wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
1016
1017         max_pages = min_t(unsigned int, fc->max_pages,
1018                         fc->max_read / PAGE_SIZE);
1019
1020         /*
1021          * This is only accurate the first time through, since readahead_folio()
1022          * doesn't update readahead_count() from the previous folio until the
1023          * next call.  Grab nr_pages here so we know how many pages we're going
1024          * to have to process.  This means that we will exit here with
1025          * readahead_count() == folio_nr_pages(last_folio), but we will have
1026          * consumed all of the folios, and read_pages() will call
1027          * readahead_folio() again which will clean up the rac.
1028          */
1029         nr_pages = readahead_count(rac);
1030
1031         while (nr_pages) {
1032                 struct fuse_io_args *ia;
1033                 struct fuse_args_pages *ap;
1034                 struct folio *folio;
1035                 unsigned cur_pages = min(max_pages, nr_pages);
1036
1037                 if (fc->num_background >= fc->congestion_threshold &&
1038                     rac->ra->async_size >= readahead_count(rac))
1039                         /*
1040                          * Congested and only async pages left, so skip the
1041                          * rest.
1042                          */
1043                         break;
1044
1045                 ia = fuse_io_alloc(NULL, cur_pages);
1046                 if (!ia)
1047                         return;
1048                 ap = &ia->ap;
1049
1050                 while (ap->num_folios < cur_pages) {
1051                         folio = readahead_folio(rac);
1052                         ap->folios[ap->num_folios] = folio;
1053                         ap->descs[ap->num_folios].length = folio_size(folio);
1054                         ap->num_folios++;
1055                 }
1056                 fuse_send_readpages(ia, rac->file);
1057                 nr_pages -= cur_pages;
1058         }
1059 }
1060
1061 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
1062 {
1063         struct inode *inode = iocb->ki_filp->f_mapping->host;
1064         struct fuse_conn *fc = get_fuse_conn(inode);
1065
1066         /*
1067          * In auto invalidate mode, always update attributes on read.
1068          * Otherwise, only update if we attempt to read past EOF (to ensure
1069          * i_size is up to date).
1070          */
1071         if (fc->auto_inval_data ||
1072             (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
1073                 int err;
1074                 err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
1075                 if (err)
1076                         return err;
1077         }
1078
1079         return generic_file_read_iter(iocb, to);
1080 }
1081
1082 static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
1083                                  loff_t pos, size_t count)
1084 {
1085         struct fuse_args *args = &ia->ap.args;
1086
1087         ia->write.in.fh = ff->fh;
1088         ia->write.in.offset = pos;
1089         ia->write.in.size = count;
1090         args->opcode = FUSE_WRITE;
1091         args->nodeid = ff->nodeid;
1092         args->in_numargs = 2;
1093         if (ff->fm->fc->minor < 9)
1094                 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
1095         else
1096                 args->in_args[0].size = sizeof(ia->write.in);
1097         args->in_args[0].value = &ia->write.in;
1098         args->in_args[1].size = count;
1099         args->out_numargs = 1;
1100         args->out_args[0].size = sizeof(ia->write.out);
1101         args->out_args[0].value = &ia->write.out;
1102 }
1103
1104 static unsigned int fuse_write_flags(struct kiocb *iocb)
1105 {
1106         unsigned int flags = iocb->ki_filp->f_flags;
1107
1108         if (iocb_is_dsync(iocb))
1109                 flags |= O_DSYNC;
1110         if (iocb->ki_flags & IOCB_SYNC)
1111                 flags |= O_SYNC;
1112
1113         return flags;
1114 }
1115
1116 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1117                                size_t count, fl_owner_t owner)
1118 {
1119         struct kiocb *iocb = ia->io->iocb;
1120         struct file *file = iocb->ki_filp;
1121         struct fuse_file *ff = file->private_data;
1122         struct fuse_mount *fm = ff->fm;
1123         struct fuse_write_in *inarg = &ia->write.in;
1124         ssize_t err;
1125
1126         fuse_write_args_fill(ia, ff, pos, count);
1127         inarg->flags = fuse_write_flags(iocb);
1128         if (owner != NULL) {
1129                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
1130                 inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
1131         }
1132
1133         if (ia->io->async)
1134                 return fuse_async_req_send(fm, ia, count);
1135
1136         err = fuse_simple_request(fm, &ia->ap.args);
1137         if (!err && ia->write.out.size > count)
1138                 err = -EIO;
1139
1140         return err ?: ia->write.out.size;
1141 }
1142
1143 bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
1144 {
1145         struct fuse_conn *fc = get_fuse_conn(inode);
1146         struct fuse_inode *fi = get_fuse_inode(inode);
1147         bool ret = false;
1148
1149         spin_lock(&fi->lock);
1150         fi->attr_version = atomic64_inc_return(&fc->attr_version);
1151         if (written > 0 && pos > inode->i_size) {
1152                 i_size_write(inode, pos);
1153                 ret = true;
1154         }
1155         spin_unlock(&fi->lock);
1156
1157         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
1158
1159         return ret;
1160 }
1161
1162 static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
1163                                      struct kiocb *iocb, struct inode *inode,
1164                                      loff_t pos, size_t count)
1165 {
1166         struct fuse_args_pages *ap = &ia->ap;
1167         struct file *file = iocb->ki_filp;
1168         struct fuse_file *ff = file->private_data;
1169         struct fuse_mount *fm = ff->fm;
1170         unsigned int offset, i;
1171         bool short_write;
1172         int err;
1173
1174         for (i = 0; i < ap->num_folios; i++)
1175                 fuse_wait_on_folio_writeback(inode, ap->folios[i]);
1176
1177         fuse_write_args_fill(ia, ff, pos, count);
1178         ia->write.in.flags = fuse_write_flags(iocb);
1179         if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
1180                 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
1181
1182         err = fuse_simple_request(fm, &ap->args);
1183         if (!err && ia->write.out.size > count)
1184                 err = -EIO;
1185
1186         short_write = ia->write.out.size < count;
1187         offset = ap->descs[0].offset;
1188         count = ia->write.out.size;
1189         for (i = 0; i < ap->num_folios; i++) {
1190                 struct folio *folio = ap->folios[i];
1191
1192                 if (err) {
1193                         folio_clear_uptodate(folio);
1194                 } else {
1195                         if (count >= folio_size(folio) - offset)
1196                                 count -= folio_size(folio) - offset;
1197                         else {
1198                                 if (short_write)
1199                                         folio_clear_uptodate(folio);
1200                                 count = 0;
1201                         }
1202                         offset = 0;
1203                 }
1204                 if (ia->write.folio_locked && (i == ap->num_folios - 1))
1205                         folio_unlock(folio);
1206                 folio_put(folio);
1207         }
1208
1209         return err;
1210 }
1211
/*
 * Copy data from @ii into page cache folios of @mapping, starting at file
 * offset @pos, and record each folio in @ia->ap for a subsequent WRITE.
 *
 * Copying stops when the iterator is empty, fc->max_write bytes or
 * @max_pages folios have been collected, a copy ends short of a page
 * boundary, a folio is not uptodate after the copy, or fc->big_writes is
 * not set (one folio only).
 *
 * Returns the number of bytes copied if that is non-zero, otherwise the
 * error of the step that failed.  When the loop stops on a folio that is
 * not uptodate, that folio is not unlocked here and ia->write.folio_locked
 * is set.
 */
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
                                     struct address_space *mapping,
                                     struct iov_iter *ii, loff_t pos,
                                     unsigned int max_pages)
{
        struct fuse_args_pages *ap = &ia->ap;
        struct fuse_conn *fc = get_fuse_conn(mapping->host);
        unsigned offset = pos & (PAGE_SIZE - 1);
        unsigned int nr_pages = 0;
        size_t count = 0;
        int err;

        ap->args.in_pages = true;
        /* Only the first folio may start at a non-zero in-page offset */
        ap->descs[0].offset = offset;

        do {
                size_t tmp;
                struct folio *folio;
                pgoff_t index = pos >> PAGE_SHIFT;
                /* Limit the copy to the rest of this page and of the iter */
                size_t bytes = min_t(size_t, PAGE_SIZE - offset,
                                     iov_iter_count(ii));

                /* ... and to what is left of the max_write budget */
                bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
                /* Fault the source in before taking the folio */
                err = -EFAULT;
                if (fault_in_iov_iter_readable(ii, bytes))
                        break;

                folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                            mapping_gfp_mask(mapping));
                if (IS_ERR(folio)) {
                        err = PTR_ERR(folio);
                        break;
                }

                if (mapping_writably_mapped(mapping))
                        flush_dcache_folio(folio);

                tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii);
                flush_dcache_folio(folio);

                /* Nothing copied: release the folio and retry from the fault-in */
                if (!tmp) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto again;
                }

                err = 0;
                ap->folios[ap->num_folios] = folio;
                ap->descs[ap->num_folios].length = tmp;
                ap->num_folios++;
                nr_pages++;

                count += tmp;
                pos += tmp;
                offset += tmp;
                if (offset == PAGE_SIZE)
                        offset = 0;

                /* If we copied full page, mark it uptodate */
                if (tmp == PAGE_SIZE)
                        folio_mark_uptodate(folio);

                if (folio_test_uptodate(folio)) {
                        folio_unlock(folio);
                } else {
                        /* Keep the folio locked and stop collecting */
                        ia->write.folio_locked = true;
                        break;
                }
                if (!fc->big_writes)
                        break;
        } while (iov_iter_count(ii) && count < fc->max_write &&
                 nr_pages < max_pages && offset == 0);

        return count > 0 ? count : err;
}
1289
1290 static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
1291                                      unsigned int max_pages)
1292 {
1293         return min_t(unsigned int,
1294                      ((pos + len - 1) >> PAGE_SHIFT) -
1295                      (pos >> PAGE_SHIFT) + 1,
1296                      max_pages);
1297 }
1298
/*
 * Write-through path: copy data from @ii into the page cache and send it
 * to the server in chunks, starting at iocb->ki_pos.
 *
 * Each loop iteration fills up to fc->max_pages folios with
 * fuse_fill_write_pages() and writes them with fuse_send_write_pages().
 * The loop ends on error, on a short write (reported as -EIO), or when the
 * iterator is exhausted.
 *
 * Returns the number of bytes written and advances iocb->ki_pos by that
 * amount; if nothing was written, returns the error (or 0).
 */
static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = mapping->host;
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
        loff_t pos = iocb->ki_pos;
        int err = 0;
        ssize_t res = 0;

        /* The write may extend the file: flag i_size as unstable meanwhile */
        if (inode->i_size < pos + iov_iter_count(ii))
                set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

        do {
                ssize_t count;
                struct fuse_io_args ia = {};
                struct fuse_args_pages *ap = &ia.ap;
                unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
                                                      fc->max_pages);

                ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs);
                if (!ap->folios) {
                        err = -ENOMEM;
                        break;
                }

                count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
                if (count <= 0) {
                        err = count;
                } else {
                        err = fuse_send_write_pages(&ia, iocb, inode,
                                                    pos, count);
                        if (!err) {
                                size_t num_written = ia.write.out.size;

                                res += num_written;
                                pos += num_written;

                                /* break out of the loop on short write */
                                if (num_written != count)
                                        err = -EIO;
                        }
                }
                kfree(ap->folios);
        } while (!err && iov_iter_count(ii));

        /* Publish the new size/attr version, then lift the unstable flag */
        fuse_write_update_attr(inode, pos, res);
        clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

        /* Partial progress wins over a later error */
        if (!res)
                return err;
        iocb->ki_pos += res;
        return res;
}
1353
1354 static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
1355 {
1356         struct inode *inode = file_inode(iocb->ki_filp);
1357
1358         return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
1359 }
1360
1361 /*
1362  * @return true if an exclusive lock for direct IO writes is needed
1363  */
1364 static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
1365 {
1366         struct file *file = iocb->ki_filp;
1367         struct fuse_file *ff = file->private_data;
1368         struct inode *inode = file_inode(iocb->ki_filp);
1369         struct fuse_inode *fi = get_fuse_inode(inode);
1370
1371         /* Server side has to advise that it supports parallel dio writes. */
1372         if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
1373                 return true;
1374
1375         /*
1376          * Append will need to know the eventual EOF - always needs an
1377          * exclusive lock.
1378          */
1379         if (iocb->ki_flags & IOCB_APPEND)
1380                 return true;
1381
1382         /* shared locks are not allowed with parallel page cache IO */
1383         if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
1384                 return true;
1385
1386         /* Parallel dio beyond EOF is not supported, at least for now. */
1387         if (fuse_io_past_eof(iocb, from))
1388                 return true;
1389
1390         return false;
1391 }
1392
1393 static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
1394                           bool *exclusive)
1395 {
1396         struct inode *inode = file_inode(iocb->ki_filp);
1397         struct fuse_inode *fi = get_fuse_inode(inode);
1398
1399         *exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
1400         if (*exclusive) {
1401                 inode_lock(inode);
1402         } else {
1403                 inode_lock_shared(inode);
1404                 /*
1405                  * New parallal dio allowed only if inode is not in caching
1406                  * mode and denies new opens in caching mode. This check
1407                  * should be performed only after taking shared inode lock.
1408                  * Previous past eof check was without inode lock and might
1409                  * have raced, so check it again.
1410                  */
1411                 if (fuse_io_past_eof(iocb, from) ||
1412                     fuse_inode_uncached_io_start(fi, NULL) != 0) {
1413                         inode_unlock_shared(inode);
1414                         inode_lock(inode);
1415                         *exclusive = true;
1416                 }
1417         }
1418 }
1419
1420 static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
1421 {
1422         struct inode *inode = file_inode(iocb->ki_filp);
1423         struct fuse_inode *fi = get_fuse_inode(inode);
1424
1425         if (exclusive) {
1426                 inode_unlock(inode);
1427         } else {
1428                 /* Allow opens in caching mode after last parallel dio end */
1429                 fuse_inode_uncached_io_end(fi);
1430                 inode_unlock_shared(inode);
1431         }
1432 }
1433
/*
 * Cached write entry point.
 *
 * With writeback_cache enabled the write normally goes through
 * generic_file_write_iter() after refreshing size and mode; it falls back
 * to the write-through path when handle_killpriv_v2 is set and
 * setattr_should_drop_suidgid() reports that suid/sgid would be dropped.
 *
 * The write-through path runs under the inode lock: it validates the
 * write, accounts it, updates timestamps/privileges via kiocb_modified()
 * and then writes with fuse_perform_write(), after a direct write attempt
 * for IOCB_DIRECT.
 *
 * Returns the number of bytes written (after generic_write_sync()), or a
 * negative error.
 */
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct address_space *mapping = file->f_mapping;
        ssize_t written = 0;
        struct inode *inode = mapping->host;
        ssize_t err, count;
        struct fuse_conn *fc = get_fuse_conn(inode);

        if (fc->writeback_cache) {
                /* Update size (EOF optimization) and mode (SUID clearing) */
                err = fuse_update_attributes(mapping->host, file,
                                             STATX_SIZE | STATX_MODE);
                if (err)
                        return err;

                if (fc->handle_killpriv_v2 &&
                    setattr_should_drop_suidgid(idmap,
                                                file_inode(file))) {
                        goto writethrough;
                }

                return generic_file_write_iter(iocb, from);
        }

writethrough:
        inode_lock(inode);

        /* err doubles as the result when nothing ends up being written */
        err = count = generic_write_checks(iocb, from);
        if (err <= 0)
                goto out;

        task_io_account_write(count);

        err = kiocb_modified(iocb);
        if (err)
                goto out;

        if (iocb->ki_flags & IOCB_DIRECT) {
                written = generic_file_direct_write(iocb, from);
                if (written < 0 || !iov_iter_count(from))
                        goto out;
                /* Write whatever the direct write left over through the cache */
                written = direct_write_fallback(iocb, from, written,
                                fuse_perform_write(iocb, from));
        } else {
                written = fuse_perform_write(iocb, from);
        }
out:
        inode_unlock(inode);
        if (written > 0)
                written = generic_write_sync(iocb, written);

        return written ? written : err;
}
1489
1490 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1491 {
1492         return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
1493 }
1494
1495 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1496                                         size_t max_size)
1497 {
1498         return min(iov_iter_single_seg_count(ii), max_size);
1499 }
1500
1501 static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
1502                                size_t *nbytesp, int write,
1503                                unsigned int max_pages,
1504                                bool use_pages_for_kvec_io)
1505 {
1506         bool flush_or_invalidate = false;
1507         unsigned int nr_pages = 0;
1508         size_t nbytes = 0;  /* # bytes already packed in req */
1509         ssize_t ret = 0;
1510
1511         /* Special case for kernel I/O: can copy directly into the buffer.
1512          * However if the implementation of fuse_conn requires pages instead of
1513          * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
1514          */
1515         if (iov_iter_is_kvec(ii)) {
1516                 void *user_addr = (void *)fuse_get_user_addr(ii);
1517
1518                 if (!use_pages_for_kvec_io) {
1519                         size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1520
1521                         if (write)
1522                                 ap->args.in_args[1].value = user_addr;
1523                         else
1524                                 ap->args.out_args[0].value = user_addr;
1525
1526                         iov_iter_advance(ii, frag_size);
1527                         *nbytesp = frag_size;
1528                         return 0;
1529                 }
1530
1531                 if (is_vmalloc_addr(user_addr)) {
1532                         ap->args.vmap_base = user_addr;
1533                         flush_or_invalidate = true;
1534                 }
1535         }
1536
1537         /*
1538          * Until there is support for iov_iter_extract_folios(), we have to
1539          * manually extract pages using iov_iter_extract_pages() and then
1540          * copy that to a folios array.
1541          */
1542         struct page **pages = kzalloc(max_pages * sizeof(struct page *),
1543                                       GFP_KERNEL);
1544         if (!pages)
1545                 return -ENOMEM;
1546
1547         while (nbytes < *nbytesp && nr_pages < max_pages) {
1548                 unsigned nfolios, i;
1549                 size_t start;
1550
1551                 ret = iov_iter_extract_pages(ii, &pages,
1552                                              *nbytesp - nbytes,
1553                                              max_pages - nr_pages,
1554                                              0, &start);
1555                 if (ret < 0)
1556                         break;
1557
1558                 nbytes += ret;
1559
1560                 ret += start;
1561                 /* Currently, all folios in FUSE are one page */
1562                 nfolios = DIV_ROUND_UP(ret, PAGE_SIZE);
1563
1564                 ap->descs[ap->num_folios].offset = start;
1565                 fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios);
1566                 for (i = 0; i < nfolios; i++)
1567                         ap->folios[i + ap->num_folios] = page_folio(pages[i]);
1568
1569                 ap->num_folios += nfolios;
1570                 ap->descs[ap->num_folios - 1].length -=
1571                         (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
1572                 nr_pages += nfolios;
1573         }
1574         kfree(pages);
1575
1576         if (write && flush_or_invalidate)
1577                 flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
1578
1579         ap->args.invalidate_vmap = !write && flush_or_invalidate;
1580         ap->args.is_pinned = iov_iter_extract_will_pin(ii);
1581         ap->args.user_pages = true;
1582         if (write)
1583                 ap->args.in_pages = true;
1584         else
1585                 ap->args.out_pages = true;
1586
1587         *nbytesp = nbytes;
1588
1589         return ret < 0 ? ret : 0;
1590 }
1591
/*
 * Perform direct (uncached) I/O between @iter and the server.
 *
 * @io:    I/O context; io->iocb identifies the file, io->async selects
 *         asynchronous submission
 * @iter:  data source (write) or destination (read)
 * @ppos:  file position; updated when any bytes were transferred
 * @flags: FUSE_DIO_WRITE for a write, FUSE_DIO_CUSE for CUSE files
 *
 * The transfer is split into requests of at most fc->max_write (write) or
 * fc->max_read (read) bytes.  A failed or short request ends the loop and
 * the unconsumed part of @iter is reverted.
 *
 * Returns the number of bytes transferred if positive, otherwise the error
 * (or 0).
 */
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
                       loff_t *ppos, int flags)
{
        int write = flags & FUSE_DIO_WRITE;
        int cuse = flags & FUSE_DIO_CUSE;
        struct file *file = io->iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct fuse_file *ff = file->private_data;
        struct fuse_conn *fc = ff->fm->fc;
        size_t nmax = write ? fc->max_write : fc->max_read;
        loff_t pos = *ppos;
        size_t count = iov_iter_count(iter);
        pgoff_t idx_from = pos >> PAGE_SHIFT;
        pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
        ssize_t res = 0;
        int err = 0;
        struct fuse_io_args *ia;
        unsigned int max_pages;
        bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;

        max_pages = iov_iter_npages(iter, fc->max_pages);
        ia = fuse_io_alloc(io, max_pages);
        if (!ia)
                return -ENOMEM;

        /* Write back dirty page cache in the range before going direct */
        if (fopen_direct_io && fc->direct_io_allow_mmap) {
                res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
                if (res) {
                        fuse_io_free(ia);
                        return res;
                }
        }
        /*
         * Wait for writes still in flight over the range.  The inode lock
         * is taken here only for reads.
         * NOTE(review): presumably write callers already hold it - confirm.
         */
        if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
                if (!write)
                        inode_lock(inode);
                fuse_sync_writes(inode);
                if (!write)
                        inode_unlock(inode);
        }

        /* Drop cached pages that the direct write is about to overwrite */
        if (fopen_direct_io && write) {
                res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
                if (res) {
                        fuse_io_free(ia);
                        return res;
                }
        }

        io->should_dirty = !write && user_backed_iter(iter);
        while (count) {
                ssize_t nres;
                fl_owner_t owner = current->files;
                size_t nbytes = min(count, nmax);

                /* nbytes is updated to what was actually attached */
                err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
                                          max_pages, fc->use_pages_for_kvec_io);
                if (err && !nbytes)
                        break;

                if (write) {
                        if (!capable(CAP_FSETID))
                                ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

                        nres = fuse_send_write(ia, pos, nbytes, owner);
                } else {
                        nres = fuse_send_read(ia, pos, nbytes, owner);
                }

                /*
                 * Sync requests and failed sends are released here.  A
                 * successfully submitted async request is not freed here.
                 * NOTE(review): presumably released by the async completion
                 * path - confirm.
                 */
                if (!io->async || nres < 0) {
                        fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
                        fuse_io_free(ia);
                }
                ia = NULL;
                if (nres < 0) {
                        iov_iter_revert(iter, nbytes);
                        err = nres;
                        break;
                }
                WARN_ON(nres > nbytes);

                count -= nres;
                res += nres;
                pos += nres;
                /* Short transfer: give back the unconsumed bytes and stop */
                if (nres != nbytes) {
                        iov_iter_revert(iter, nbytes - nres);
                        break;
                }
                if (count) {
                        max_pages = iov_iter_npages(iter, fc->max_pages);
                        ia = fuse_io_alloc(io, max_pages);
                        if (!ia)
                                break;
                }
        }
        /* Set only if the loop was left before the request was sent */
        if (ia)
                fuse_io_free(ia);
        if (res > 0)
                *ppos = pos;

        return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
1695
1696 static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1697                                   struct iov_iter *iter,
1698                                   loff_t *ppos)
1699 {
1700         ssize_t res;
1701         struct inode *inode = file_inode(io->iocb->ki_filp);
1702
1703         res = fuse_direct_io(io, iter, ppos, 0);
1704
1705         fuse_invalidate_atime(inode);
1706
1707         return res;
1708 }
1709
1710 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
1711
1712 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1713 {
1714         ssize_t res;
1715
1716         if (!is_sync_kiocb(iocb)) {
1717                 res = fuse_direct_IO(iocb, to);
1718         } else {
1719                 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1720
1721                 res = __fuse_direct_read(&io, to, &iocb->ki_pos);
1722         }
1723
1724         return res;
1725 }
1726
1727 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1728 {
1729         struct inode *inode = file_inode(iocb->ki_filp);
1730         ssize_t res;
1731         bool exclusive;
1732
1733         fuse_dio_lock(iocb, from, &exclusive);
1734         res = generic_write_checks(iocb, from);
1735         if (res > 0) {
1736                 task_io_account_write(res);
1737                 if (!is_sync_kiocb(iocb)) {
1738                         res = fuse_direct_IO(iocb, from);
1739                 } else {
1740                         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1741
1742                         res = fuse_direct_io(&io, from, &iocb->ki_pos,
1743                                              FUSE_DIO_WRITE);
1744                         fuse_write_update_attr(inode, iocb->ki_pos, res);
1745                 }
1746         }
1747         fuse_dio_unlock(iocb, exclusive);
1748
1749         return res;
1750 }
1751
1752 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1753 {
1754         struct file *file = iocb->ki_filp;
1755         struct fuse_file *ff = file->private_data;
1756         struct inode *inode = file_inode(file);
1757
1758         if (fuse_is_bad(inode))
1759                 return -EIO;
1760
1761         if (FUSE_IS_DAX(inode))
1762                 return fuse_dax_read_iter(iocb, to);
1763
1764         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1765         if (ff->open_flags & FOPEN_DIRECT_IO)
1766                 return fuse_direct_read_iter(iocb, to);
1767         else if (fuse_file_passthrough(ff))
1768                 return fuse_passthrough_read_iter(iocb, to);
1769         else
1770                 return fuse_cache_read_iter(iocb, to);
1771 }
1772
1773 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1774 {
1775         struct file *file = iocb->ki_filp;
1776         struct fuse_file *ff = file->private_data;
1777         struct inode *inode = file_inode(file);
1778
1779         if (fuse_is_bad(inode))
1780                 return -EIO;
1781
1782         if (FUSE_IS_DAX(inode))
1783                 return fuse_dax_write_iter(iocb, from);
1784
1785         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1786         if (ff->open_flags & FOPEN_DIRECT_IO)
1787                 return fuse_direct_write_iter(iocb, from);
1788         else if (fuse_file_passthrough(ff))
1789                 return fuse_passthrough_write_iter(iocb, from);
1790         else
1791                 return fuse_cache_write_iter(iocb, from);
1792 }
1793
1794 static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
1795                                 struct pipe_inode_info *pipe, size_t len,
1796                                 unsigned int flags)
1797 {
1798         struct fuse_file *ff = in->private_data;
1799
1800         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1801         if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1802                 return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
1803         else
1804                 return filemap_splice_read(in, ppos, pipe, len, flags);
1805 }
1806
1807 static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
1808                                  loff_t *ppos, size_t len, unsigned int flags)
1809 {
1810         struct fuse_file *ff = out->private_data;
1811
1812         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1813         if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1814                 return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
1815         else
1816                 return iter_file_splice_write(pipe, out, ppos, len, flags);
1817 }
1818
1819 static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1820 {
1821         struct fuse_args_pages *ap = &wpa->ia.ap;
1822         int i;
1823
1824         if (wpa->bucket)
1825                 fuse_sync_bucket_dec(wpa->bucket);
1826
1827         for (i = 0; i < ap->num_folios; i++)
1828                 folio_put(ap->folios[i]);
1829
1830         fuse_file_put(wpa->ia.ff, false);
1831
1832         kfree(ap->folios);
1833         kfree(wpa);
1834 }
1835
1836 static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio)
1837 {
1838         struct backing_dev_info *bdi = inode_to_bdi(inode);
1839
1840         dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1841         node_stat_sub_folio(folio, NR_WRITEBACK_TEMP);
1842         wb_writeout_inc(&bdi->wb);
1843 }
1844
1845 static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
1846 {
1847         struct fuse_args_pages *ap = &wpa->ia.ap;
1848         struct inode *inode = wpa->inode;
1849         struct fuse_inode *fi = get_fuse_inode(inode);
1850         int i;
1851
1852         for (i = 0; i < ap->num_folios; i++)
1853                 fuse_writepage_finish_stat(inode, ap->folios[i]);
1854
1855         wake_up(&fi->page_waitq);
1856 }
1857
1858 /* Called under fi->lock, may release and reacquire it */
1859 static void fuse_send_writepage(struct fuse_mount *fm,
1860                                 struct fuse_writepage_args *wpa, loff_t size)
1861 __releases(fi->lock)
1862 __acquires(fi->lock)
1863 {
1864         struct fuse_writepage_args *aux, *next;
1865         struct fuse_inode *fi = get_fuse_inode(wpa->inode);
1866         struct fuse_write_in *inarg = &wpa->ia.write.in;
1867         struct fuse_args *args = &wpa->ia.ap.args;
1868         /* Currently, all folios in FUSE are one page */
1869         __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE;
1870         int err;
1871
1872         fi->writectr++;
1873         if (inarg->offset + data_size <= size) {
1874                 inarg->size = data_size;
1875         } else if (inarg->offset < size) {
1876                 inarg->size = size - inarg->offset;
1877         } else {
1878                 /* Got truncated off completely */
1879                 goto out_free;
1880         }
1881
1882         args->in_args[1].size = inarg->size;
1883         args->force = true;
1884         args->nocreds = true;
1885
1886         err = fuse_simple_background(fm, args, GFP_ATOMIC);
1887         if (err == -ENOMEM) {
1888                 spin_unlock(&fi->lock);
1889                 err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
1890                 spin_lock(&fi->lock);
1891         }
1892
1893         /* Fails on broken connection only */
1894         if (unlikely(err))
1895                 goto out_free;
1896
1897         return;
1898
1899  out_free:
1900         fi->writectr--;
1901         rb_erase(&wpa->writepages_entry, &fi->writepages);
1902         fuse_writepage_finish(wpa);
1903         spin_unlock(&fi->lock);
1904
1905         /* After rb_erase() aux request list is private */
1906         for (aux = wpa->next; aux; aux = next) {
1907                 next = aux->next;
1908                 aux->next = NULL;
1909                 fuse_writepage_finish_stat(aux->inode,
1910                                            aux->ia.ap.folios[0]);
1911                 fuse_writepage_free(aux);
1912         }
1913
1914         fuse_writepage_free(wpa);
1915         spin_lock(&fi->lock);
1916 }
1917
1918 /*
1919  * If fi->writectr is positive (no truncate or fsync going on) send
1920  * all queued writepage requests.
1921  *
1922  * Called with fi->lock
1923  */
1924 void fuse_flush_writepages(struct inode *inode)
1925 __releases(fi->lock)
1926 __acquires(fi->lock)
1927 {
1928         struct fuse_mount *fm = get_fuse_mount(inode);
1929         struct fuse_inode *fi = get_fuse_inode(inode);
1930         loff_t crop = i_size_read(inode);
1931         struct fuse_writepage_args *wpa;
1932
1933         while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1934                 wpa = list_entry(fi->queued_writes.next,
1935                                  struct fuse_writepage_args, queue_entry);
1936                 list_del_init(&wpa->queue_entry);
1937                 fuse_send_writepage(fm, wpa, crop);
1938         }
1939 }
1940
1941 static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
1942                                                 struct fuse_writepage_args *wpa)
1943 {
1944         pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
1945         pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1;
1946         struct rb_node **p = &root->rb_node;
1947         struct rb_node  *parent = NULL;
1948
1949         WARN_ON(!wpa->ia.ap.num_folios);
1950         while (*p) {
1951                 struct fuse_writepage_args *curr;
1952                 pgoff_t curr_index;
1953
1954                 parent = *p;
1955                 curr = rb_entry(parent, struct fuse_writepage_args,
1956                                 writepages_entry);
1957                 WARN_ON(curr->inode != wpa->inode);
1958                 curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
1959
1960                 if (idx_from >= curr_index + curr->ia.ap.num_folios)
1961                         p = &(*p)->rb_right;
1962                 else if (idx_to < curr_index)
1963                         p = &(*p)->rb_left;
1964                 else
1965                         return curr;
1966         }
1967
1968         rb_link_node(&wpa->writepages_entry, parent, p);
1969         rb_insert_color(&wpa->writepages_entry, root);
1970         return NULL;
1971 }
1972
/*
 * Insert @wpa into @root where no overlapping request is expected; warn if
 * fuse_insert_writeback() reports one.
 */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
	struct fuse_writepage_args *overlap = fuse_insert_writeback(root, wpa);

	WARN_ON(overlap);
}
1977
1978 static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
1979                                int error)
1980 {
1981         struct fuse_writepage_args *wpa =
1982                 container_of(args, typeof(*wpa), ia.ap.args);
1983         struct inode *inode = wpa->inode;
1984         struct fuse_inode *fi = get_fuse_inode(inode);
1985         struct fuse_conn *fc = get_fuse_conn(inode);
1986
1987         mapping_set_error(inode->i_mapping, error);
1988         /*
1989          * A writeback finished and this might have updated mtime/ctime on
1990          * server making local mtime/ctime stale.  Hence invalidate attrs.
1991          * Do this only if writeback_cache is not enabled.  If writeback_cache
1992          * is enabled, we trust local ctime/mtime.
1993          */
1994         if (!fc->writeback_cache)
1995                 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
1996         spin_lock(&fi->lock);
1997         rb_erase(&wpa->writepages_entry, &fi->writepages);
1998         while (wpa->next) {
1999                 struct fuse_mount *fm = get_fuse_mount(inode);
2000                 struct fuse_write_in *inarg = &wpa->ia.write.in;
2001                 struct fuse_writepage_args *next = wpa->next;
2002
2003                 wpa->next = next->next;
2004                 next->next = NULL;
2005                 tree_insert(&fi->writepages, next);
2006
2007                 /*
2008                  * Skip fuse_flush_writepages() to make it easy to crop requests
2009                  * based on primary request size.
2010                  *
2011                  * 1st case (trivial): there are no concurrent activities using
2012                  * fuse_set/release_nowrite.  Then we're on safe side because
2013                  * fuse_flush_writepages() would call fuse_send_writepage()
2014                  * anyway.
2015                  *
2016                  * 2nd case: someone called fuse_set_nowrite and it is waiting
2017                  * now for completion of all in-flight requests.  This happens
2018                  * rarely and no more than once per page, so this should be
2019                  * okay.
2020                  *
2021                  * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
2022                  * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
2023                  * that fuse_set_nowrite returned implies that all in-flight
2024                  * requests were completed along with all of their secondary
2025                  * requests.  Further primary requests are blocked by negative
2026                  * writectr.  Hence there cannot be any in-flight requests and
2027                  * no invocations of fuse_writepage_end() while we're in
2028                  * fuse_set_nowrite..fuse_release_nowrite section.
2029                  */
2030                 fuse_send_writepage(fm, next, inarg->offset + inarg->size);
2031         }
2032         fi->writectr--;
2033         fuse_writepage_finish(wpa);
2034         spin_unlock(&fi->lock);
2035         fuse_writepage_free(wpa);
2036 }
2037
2038 static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
2039 {
2040         struct fuse_file *ff;
2041
2042         spin_lock(&fi->lock);
2043         ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
2044                                       write_entry);
2045         if (ff)
2046                 fuse_file_get(ff);
2047         spin_unlock(&fi->lock);
2048
2049         return ff;
2050 }
2051
/*
 * Like __fuse_write_file_get(), but warns when no file is available.
 * The (possibly NULL) result is still returned to the caller.
 */
static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
	struct fuse_file *ff;

	ff = __fuse_write_file_get(fi);
	WARN_ON(!ff);

	return ff;
}
2058
2059 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
2060 {
2061         struct fuse_inode *fi = get_fuse_inode(inode);
2062         struct fuse_file *ff;
2063         int err;
2064
2065         /*
2066          * Inode is always written before the last reference is dropped and
2067          * hence this should not be reached from reclaim.
2068          *
2069          * Writing back the inode from reclaim can deadlock if the request
2070          * processing itself needs an allocation.  Allocations triggering
2071          * reclaim while serving a request can't be prevented, because it can
2072          * involve any number of unrelated userspace processes.
2073          */
2074         WARN_ON(wbc->for_reclaim);
2075
2076         ff = __fuse_write_file_get(fi);
2077         err = fuse_flush_times(inode, ff);
2078         if (ff)
2079                 fuse_file_put(ff, false);
2080
2081         return err;
2082 }
2083
2084 static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
2085 {
2086         struct fuse_writepage_args *wpa;
2087         struct fuse_args_pages *ap;
2088
2089         wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
2090         if (wpa) {
2091                 ap = &wpa->ia.ap;
2092                 ap->num_folios = 0;
2093                 ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs);
2094                 if (!ap->folios) {
2095                         kfree(wpa);
2096                         wpa = NULL;
2097                 }
2098         }
2099         return wpa;
2100
2101 }
2102
2103 static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
2104                                          struct fuse_writepage_args *wpa)
2105 {
2106         if (!fc->sync_fs)
2107                 return;
2108
2109         rcu_read_lock();
2110         /* Prevent resurrection of dead bucket in unlikely race with syncfs */
2111         do {
2112                 wpa->bucket = rcu_dereference(fc->curr_bucket);
2113         } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
2114         rcu_read_unlock();
2115 }
2116
2117 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
2118                                           struct folio *tmp_folio, uint32_t folio_index)
2119 {
2120         struct inode *inode = folio->mapping->host;
2121         struct fuse_args_pages *ap = &wpa->ia.ap;
2122
2123         folio_copy(tmp_folio, folio);
2124
2125         ap->folios[folio_index] = tmp_folio;
2126         ap->descs[folio_index].offset = 0;
2127         ap->descs[folio_index].length = PAGE_SIZE;
2128
2129         inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2130         node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
2131 }
2132
2133 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
2134                                                              struct fuse_file *ff)
2135 {
2136         struct inode *inode = folio->mapping->host;
2137         struct fuse_conn *fc = get_fuse_conn(inode);
2138         struct fuse_writepage_args *wpa;
2139         struct fuse_args_pages *ap;
2140
2141         wpa = fuse_writepage_args_alloc();
2142         if (!wpa)
2143                 return NULL;
2144
2145         fuse_writepage_add_to_bucket(fc, wpa);
2146         fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
2147         wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2148         wpa->inode = inode;
2149         wpa->ia.ff = ff;
2150
2151         ap = &wpa->ia.ap;
2152         ap->args.in_pages = true;
2153         ap->args.end = fuse_writepage_end;
2154
2155         return wpa;
2156 }
2157
2158 static int fuse_writepage_locked(struct folio *folio)
2159 {
2160         struct address_space *mapping = folio->mapping;
2161         struct inode *inode = mapping->host;
2162         struct fuse_inode *fi = get_fuse_inode(inode);
2163         struct fuse_writepage_args *wpa;
2164         struct fuse_args_pages *ap;
2165         struct folio *tmp_folio;
2166         struct fuse_file *ff;
2167         int error = -ENOMEM;
2168
2169         tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
2170         if (!tmp_folio)
2171                 goto err;
2172
2173         error = -EIO;
2174         ff = fuse_write_file_get(fi);
2175         if (!ff)
2176                 goto err_nofile;
2177
2178         wpa = fuse_writepage_args_setup(folio, ff);
2179         error = -ENOMEM;
2180         if (!wpa)
2181                 goto err_writepage_args;
2182
2183         ap = &wpa->ia.ap;
2184         ap->num_folios = 1;
2185
2186         folio_start_writeback(folio);
2187         fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
2188
2189         spin_lock(&fi->lock);
2190         tree_insert(&fi->writepages, wpa);
2191         list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2192         fuse_flush_writepages(inode);
2193         spin_unlock(&fi->lock);
2194
2195         folio_end_writeback(folio);
2196
2197         return 0;
2198
2199 err_writepage_args:
2200         fuse_file_put(ff, false);
2201 err_nofile:
2202         folio_put(tmp_folio);
2203 err:
2204         mapping_set_error(folio->mapping, error);
2205         return error;
2206 }
2207
/*
 * State carried across fuse_writepages_fill() calls during one
 * fuse_writepages() pass.
 */
struct fuse_fill_wb_data {
	/* Request currently being assembled; NULL when there is none */
	struct fuse_writepage_args *wpa;
	/* File used for the writes; looked up on first use, put at the end */
	struct fuse_file *ff;
	/* Inode whose mapping is being written back */
	struct inode *inode;
	/*
	 * Page cache folios matching the temporary folios in wpa; writeback
	 * is ended on them once wpa has been sent.  Allocated with
	 * fc->max_pages entries.
	 */
	struct folio **orig_folios;
	/* Current capacity of the folio array in wpa */
	unsigned int max_folios;
};
2215
2216 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
2217 {
2218         struct fuse_args_pages *ap = &data->wpa->ia.ap;
2219         struct fuse_conn *fc = get_fuse_conn(data->inode);
2220         struct folio **folios;
2221         struct fuse_folio_desc *descs;
2222         unsigned int nfolios = min_t(unsigned int,
2223                                      max_t(unsigned int, data->max_folios * 2,
2224                                            FUSE_DEFAULT_MAX_PAGES_PER_REQ),
2225                                     fc->max_pages);
2226         WARN_ON(nfolios <= data->max_folios);
2227
2228         folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
2229         if (!folios)
2230                 return false;
2231
2232         memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios);
2233         memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios);
2234         kfree(ap->folios);
2235         ap->folios = folios;
2236         ap->descs = descs;
2237         data->max_folios = nfolios;
2238
2239         return true;
2240 }
2241
2242 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
2243 {
2244         struct fuse_writepage_args *wpa = data->wpa;
2245         struct inode *inode = data->inode;
2246         struct fuse_inode *fi = get_fuse_inode(inode);
2247         int num_folios = wpa->ia.ap.num_folios;
2248         int i;
2249
2250         spin_lock(&fi->lock);
2251         list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2252         fuse_flush_writepages(inode);
2253         spin_unlock(&fi->lock);
2254
2255         for (i = 0; i < num_folios; i++)
2256                 folio_end_writeback(data->orig_folios[i]);
2257 }
2258
2259 /*
2260  * Check under fi->lock if the page is under writeback, and insert it onto the
2261  * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
2262  * one already added for a page at this offset.  If there's none, then insert
2263  * this new request onto the auxiliary list, otherwise reuse the existing one by
2264  * swapping the new temp page with the old one.
2265  */
2266 static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
2267                                struct folio *folio)
2268 {
2269         struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
2270         struct fuse_writepage_args *tmp;
2271         struct fuse_writepage_args *old_wpa;
2272         struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2273
2274         WARN_ON(new_ap->num_folios != 0);
2275         new_ap->num_folios = 1;
2276
2277         spin_lock(&fi->lock);
2278         old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2279         if (!old_wpa) {
2280                 spin_unlock(&fi->lock);
2281                 return true;
2282         }
2283
2284         for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2285                 pgoff_t curr_index;
2286
2287                 WARN_ON(tmp->inode != new_wpa->inode);
2288                 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2289                 if (curr_index == folio->index) {
2290                         WARN_ON(tmp->ia.ap.num_folios != 1);
2291                         swap(tmp->ia.ap.folios[0], new_ap->folios[0]);
2292                         break;
2293                 }
2294         }
2295
2296         if (!tmp) {
2297                 new_wpa->next = old_wpa->next;
2298                 old_wpa->next = new_wpa;
2299         }
2300
2301         spin_unlock(&fi->lock);
2302
2303         if (tmp) {
2304                 fuse_writepage_finish_stat(new_wpa->inode,
2305                                            folio);
2306                 fuse_writepage_free(new_wpa);
2307         }
2308
2309         return false;
2310 }
2311
2312 static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
2313                                      struct fuse_args_pages *ap,
2314                                      struct fuse_fill_wb_data *data)
2315 {
2316         WARN_ON(!ap->num_folios);
2317
2318         /*
2319          * Being under writeback is unlikely but possible.  For example direct
2320          * read to an mmaped fuse file will set the page dirty twice; once when
2321          * the pages are faulted with get_user_pages(), and then after the read
2322          * completed.
2323          */
2324         if (fuse_folio_is_writeback(data->inode, folio))
2325                 return true;
2326
2327         /* Reached max pages */
2328         if (ap->num_folios == fc->max_pages)
2329                 return true;
2330
2331         /* Reached max write bytes */
2332         if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write)
2333                 return true;
2334
2335         /* Discontinuity */
2336         if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio))
2337                 return true;
2338
2339         /* Need to grow the pages array?  If so, did the expansion fail? */
2340         if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
2341                 return true;
2342
2343         return false;
2344 }
2345
2346 static int fuse_writepages_fill(struct folio *folio,
2347                 struct writeback_control *wbc, void *_data)
2348 {
2349         struct fuse_fill_wb_data *data = _data;
2350         struct fuse_writepage_args *wpa = data->wpa;
2351         struct fuse_args_pages *ap = &wpa->ia.ap;
2352         struct inode *inode = data->inode;
2353         struct fuse_inode *fi = get_fuse_inode(inode);
2354         struct fuse_conn *fc = get_fuse_conn(inode);
2355         struct folio *tmp_folio;
2356         int err;
2357
2358         if (!data->ff) {
2359                 err = -EIO;
2360                 data->ff = fuse_write_file_get(fi);
2361                 if (!data->ff)
2362                         goto out_unlock;
2363         }
2364
2365         if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
2366                 fuse_writepages_send(data);
2367                 data->wpa = NULL;
2368         }
2369
2370         err = -ENOMEM;
2371         tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
2372         if (!tmp_folio)
2373                 goto out_unlock;
2374
2375         /*
2376          * The page must not be redirtied until the writeout is completed
2377          * (i.e. userspace has sent a reply to the write request).  Otherwise
2378          * there could be more than one temporary page instance for each real
2379          * page.
2380          *
2381          * This is ensured by holding the page lock in page_mkwrite() while
2382          * checking fuse_page_is_writeback().  We already hold the page lock
2383          * since clear_page_dirty_for_io() and keep it held until we add the
2384          * request to the fi->writepages list and increment ap->num_folios.
2385          * After this fuse_page_is_writeback() will indicate that the page is
2386          * under writeback, so we can release the page lock.
2387          */
2388         if (data->wpa == NULL) {
2389                 err = -ENOMEM;
2390                 wpa = fuse_writepage_args_setup(folio, data->ff);
2391                 if (!wpa) {
2392                         folio_put(tmp_folio);
2393                         goto out_unlock;
2394                 }
2395                 fuse_file_get(wpa->ia.ff);
2396                 data->max_folios = 1;
2397                 ap = &wpa->ia.ap;
2398         }
2399         folio_start_writeback(folio);
2400
2401         fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios);
2402         data->orig_folios[ap->num_folios] = folio;
2403
2404         err = 0;
2405         if (data->wpa) {
2406                 /*
2407                  * Protected by fi->lock against concurrent access by
2408                  * fuse_page_is_writeback().
2409                  */
2410                 spin_lock(&fi->lock);
2411                 ap->num_folios++;
2412                 spin_unlock(&fi->lock);
2413         } else if (fuse_writepage_add(wpa, folio)) {
2414                 data->wpa = wpa;
2415         } else {
2416                 folio_end_writeback(folio);
2417         }
2418 out_unlock:
2419         folio_unlock(folio);
2420
2421         return err;
2422 }
2423
2424 static int fuse_writepages(struct address_space *mapping,
2425                            struct writeback_control *wbc)
2426 {
2427         struct inode *inode = mapping->host;
2428         struct fuse_conn *fc = get_fuse_conn(inode);
2429         struct fuse_fill_wb_data data;
2430         int err;
2431
2432         err = -EIO;
2433         if (fuse_is_bad(inode))
2434                 goto out;
2435
2436         if (wbc->sync_mode == WB_SYNC_NONE &&
2437             fc->num_background >= fc->congestion_threshold)
2438                 return 0;
2439
2440         data.inode = inode;
2441         data.wpa = NULL;
2442         data.ff = NULL;
2443
2444         err = -ENOMEM;
2445         data.orig_folios = kcalloc(fc->max_pages,
2446                                    sizeof(struct folio *),
2447                                    GFP_NOFS);
2448         if (!data.orig_folios)
2449                 goto out;
2450
2451         err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2452         if (data.wpa) {
2453                 WARN_ON(!data.wpa->ia.ap.num_folios);
2454                 fuse_writepages_send(&data);
2455         }
2456         if (data.ff)
2457                 fuse_file_put(data.ff, false);
2458
2459         kfree(data.orig_folios);
2460 out:
2461         return err;
2462 }
2463
2464 /*
2465  * It's worthy to make sure that space is reserved on disk for the write,
2466  * but how to implement it without killing performance need more thinking.
2467  */
2468 static int fuse_write_begin(struct file *file, struct address_space *mapping,
2469                 loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
2470 {
2471         pgoff_t index = pos >> PAGE_SHIFT;
2472         struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2473         struct folio *folio;
2474         loff_t fsize;
2475         int err = -ENOMEM;
2476
2477         WARN_ON(!fc->writeback_cache);
2478
2479         folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2480                         mapping_gfp_mask(mapping));
2481         if (IS_ERR(folio))
2482                 goto error;
2483
2484         fuse_wait_on_page_writeback(mapping->host, folio->index);
2485
2486         if (folio_test_uptodate(folio) || len >= folio_size(folio))
2487                 goto success;
2488         /*
2489          * Check if the start of this folio comes after the end of file,
2490          * in which case the readpage can be optimized away.
2491          */
2492         fsize = i_size_read(mapping->host);
2493         if (fsize <= folio_pos(folio)) {
2494                 size_t off = offset_in_folio(folio, pos);
2495                 if (off)
2496                         folio_zero_segment(folio, 0, off);
2497                 goto success;
2498         }
2499         err = fuse_do_readfolio(file, folio);
2500         if (err)
2501                 goto cleanup;
2502 success:
2503         *foliop = folio;
2504         return 0;
2505
2506 cleanup:
2507         folio_unlock(folio);
2508         folio_put(folio);
2509 error:
2510         return err;
2511 }
2512
2513 static int fuse_write_end(struct file *file, struct address_space *mapping,
2514                 loff_t pos, unsigned len, unsigned copied,
2515                 struct folio *folio, void *fsdata)
2516 {
2517         struct inode *inode = folio->mapping->host;
2518
2519         /* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2520         if (!copied)
2521                 goto unlock;
2522
2523         pos += copied;
2524         if (!folio_test_uptodate(folio)) {
2525                 /* Zero any unwritten bytes at the end of the page */
2526                 size_t endoff = pos & ~PAGE_MASK;
2527                 if (endoff)
2528                         folio_zero_segment(folio, endoff, PAGE_SIZE);
2529                 folio_mark_uptodate(folio);
2530         }
2531
2532         if (pos > inode->i_size)
2533                 i_size_write(inode, pos);
2534
2535         folio_mark_dirty(folio);
2536
2537 unlock:
2538         folio_unlock(folio);
2539         folio_put(folio);
2540
2541         return copied;
2542 }
2543
2544 static int fuse_launder_folio(struct folio *folio)
2545 {
2546         int err = 0;
2547         if (folio_clear_dirty_for_io(folio)) {
2548                 struct inode *inode = folio->mapping->host;
2549
2550                 /* Serialize with pending writeback for the same page */
2551                 fuse_wait_on_page_writeback(inode, folio->index);
2552                 err = fuse_writepage_locked(folio);
2553                 if (!err)
2554                         fuse_wait_on_page_writeback(inode, folio->index);
2555         }
2556         return err;
2557 }
2558
2559 /*
2560  * Write back dirty data/metadata now (there may not be any suitable
2561  * open files later for data)
2562  */
2563 static void fuse_vma_close(struct vm_area_struct *vma)
2564 {
2565         int err;
2566
2567         err = write_inode_now(vma->vm_file->f_mapping->host, 1);
2568         mapping_set_error(vma->vm_file->f_mapping, err);
2569 }
2570
2571 /*
2572  * Wait for writeback against this page to complete before allowing it
2573  * to be marked dirty again, and hence written back again, possibly
2574  * before the previous writepage completed.
2575  *
2576  * Block here, instead of in ->writepage(), so that the userspace fs
2577  * can only block processes actually operating on the filesystem.
2578  *
2579  * Otherwise unprivileged userspace fs would be able to block
2580  * unrelated:
2581  *
2582  * - page migration
2583  * - sync(2)
2584  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2585  */
2586 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2587 {
2588         struct folio *folio = page_folio(vmf->page);
2589         struct inode *inode = file_inode(vmf->vma->vm_file);
2590
2591         file_update_time(vmf->vma->vm_file);
2592         folio_lock(folio);
2593         if (folio->mapping != inode->i_mapping) {
2594                 folio_unlock(folio);
2595                 return VM_FAULT_NOPAGE;
2596         }
2597
2598         fuse_wait_on_folio_writeback(inode, folio);
2599         return VM_FAULT_LOCKED;
2600 }
2601
2602 static const struct vm_operations_struct fuse_file_vm_ops = {
2603         .close          = fuse_vma_close,
2604         .fault          = filemap_fault,
2605         .map_pages      = filemap_map_pages,
2606         .page_mkwrite   = fuse_page_mkwrite,
2607 };
2608
2609 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2610 {
2611         struct fuse_file *ff = file->private_data;
2612         struct fuse_conn *fc = ff->fm->fc;
2613         struct inode *inode = file_inode(file);
2614         int rc;
2615
2616         /* DAX mmap is superior to direct_io mmap */
2617         if (FUSE_IS_DAX(inode))
2618                 return fuse_dax_mmap(file, vma);
2619
2620         /*
2621          * If inode is in passthrough io mode, because it has some file open
2622          * in passthrough mode, either mmap to backing file or fail mmap,
2623          * because mixing cached mmap and passthrough io mode is not allowed.
2624          */
2625         if (fuse_file_passthrough(ff))
2626                 return fuse_passthrough_mmap(file, vma);
2627         else if (fuse_inode_backing(get_fuse_inode(inode)))
2628                 return -ENODEV;
2629
2630         /*
2631          * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
2632          * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
2633          */
2634         if (ff->open_flags & FOPEN_DIRECT_IO) {
2635                 /*
2636                  * Can't provide the coherency needed for MAP_SHARED
2637                  * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
2638                  */
2639                 if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
2640                         return -ENODEV;
2641
2642                 invalidate_inode_pages2(file->f_mapping);
2643
2644                 if (!(vma->vm_flags & VM_MAYSHARE)) {
2645                         /* MAP_PRIVATE */
2646                         return generic_file_mmap(file, vma);
2647                 }
2648
2649                 /*
2650                  * First mmap of direct_io file enters caching inode io mode.
2651                  * Also waits for parallel dio writers to go into serial mode
2652                  * (exclusive instead of shared lock).
2653                  * After first mmap, the inode stays in caching io mode until
2654                  * the direct_io file release.
2655                  */
2656                 rc = fuse_file_cached_io_open(inode, ff);
2657                 if (rc)
2658                         return rc;
2659         }
2660
2661         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2662                 fuse_link_write_file(file);
2663
2664         file_accessed(file);
2665         vma->vm_ops = &fuse_file_vm_ops;
2666         return 0;
2667 }
2668
2669 static int convert_fuse_file_lock(struct fuse_conn *fc,
2670                                   const struct fuse_file_lock *ffl,
2671                                   struct file_lock *fl)
2672 {
2673         switch (ffl->type) {
2674         case F_UNLCK:
2675                 break;
2676
2677         case F_RDLCK:
2678         case F_WRLCK:
2679                 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2680                     ffl->end < ffl->start)
2681                         return -EIO;
2682
2683                 fl->fl_start = ffl->start;
2684                 fl->fl_end = ffl->end;
2685
2686                 /*
2687                  * Convert pid into init's pid namespace.  The locks API will
2688                  * translate it into the caller's pid namespace.
2689                  */
2690                 rcu_read_lock();
2691                 fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2692                 rcu_read_unlock();
2693                 break;
2694
2695         default:
2696                 return -EIO;
2697         }
2698         fl->c.flc_type = ffl->type;
2699         return 0;
2700 }
2701
2702 static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2703                          const struct file_lock *fl, int opcode, pid_t pid,
2704                          int flock, struct fuse_lk_in *inarg)
2705 {
2706         struct inode *inode = file_inode(file);
2707         struct fuse_conn *fc = get_fuse_conn(inode);
2708         struct fuse_file *ff = file->private_data;
2709
2710         memset(inarg, 0, sizeof(*inarg));
2711         inarg->fh = ff->fh;
2712         inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
2713         inarg->lk.start = fl->fl_start;
2714         inarg->lk.end = fl->fl_end;
2715         inarg->lk.type = fl->c.flc_type;
2716         inarg->lk.pid = pid;
2717         if (flock)
2718                 inarg->lk_flags |= FUSE_LK_FLOCK;
2719         args->opcode = opcode;
2720         args->nodeid = get_node_id(inode);
2721         args->in_numargs = 1;
2722         args->in_args[0].size = sizeof(*inarg);
2723         args->in_args[0].value = inarg;
2724 }
2725
2726 static int fuse_getlk(struct file *file, struct file_lock *fl)
2727 {
2728         struct inode *inode = file_inode(file);
2729         struct fuse_mount *fm = get_fuse_mount(inode);
2730         FUSE_ARGS(args);
2731         struct fuse_lk_in inarg;
2732         struct fuse_lk_out outarg;
2733         int err;
2734
2735         fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2736         args.out_numargs = 1;
2737         args.out_args[0].size = sizeof(outarg);
2738         args.out_args[0].value = &outarg;
2739         err = fuse_simple_request(fm, &args);
2740         if (!err)
2741                 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2742
2743         return err;
2744 }
2745
2746 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2747 {
2748         struct inode *inode = file_inode(file);
2749         struct fuse_mount *fm = get_fuse_mount(inode);
2750         FUSE_ARGS(args);
2751         struct fuse_lk_in inarg;
2752         int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2753         struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
2754         pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2755         int err;
2756
2757         if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2758                 /* NLM needs asynchronous locks, which we don't support yet */
2759                 return -ENOLCK;
2760         }
2761
2762         fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2763         err = fuse_simple_request(fm, &args);
2764
2765         /* locking is restartable */
2766         if (err == -EINTR)
2767                 err = -ERESTARTSYS;
2768
2769         return err;
2770 }
2771
2772 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2773 {
2774         struct inode *inode = file_inode(file);
2775         struct fuse_conn *fc = get_fuse_conn(inode);
2776         int err;
2777
2778         if (cmd == F_CANCELLK) {
2779                 err = 0;
2780         } else if (cmd == F_GETLK) {
2781                 if (fc->no_lock) {
2782                         posix_test_lock(file, fl);
2783                         err = 0;
2784                 } else
2785                         err = fuse_getlk(file, fl);
2786         } else {
2787                 if (fc->no_lock)
2788                         err = posix_lock_file(file, fl, NULL);
2789                 else
2790                         err = fuse_setlk(file, fl, 0);
2791         }
2792         return err;
2793 }
2794
2795 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2796 {
2797         struct inode *inode = file_inode(file);
2798         struct fuse_conn *fc = get_fuse_conn(inode);
2799         int err;
2800
2801         if (fc->no_flock) {
2802                 err = locks_lock_file_wait(file, fl);
2803         } else {
2804                 struct fuse_file *ff = file->private_data;
2805
2806                 /* emulate flock with POSIX locks */
2807                 ff->flock = true;
2808                 err = fuse_setlk(file, fl, 1);
2809         }
2810
2811         return err;
2812 }
2813
2814 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2815 {
2816         struct inode *inode = mapping->host;
2817         struct fuse_mount *fm = get_fuse_mount(inode);
2818         FUSE_ARGS(args);
2819         struct fuse_bmap_in inarg;
2820         struct fuse_bmap_out outarg;
2821         int err;
2822
2823         if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
2824                 return 0;
2825
2826         memset(&inarg, 0, sizeof(inarg));
2827         inarg.block = block;
2828         inarg.blocksize = inode->i_sb->s_blocksize;
2829         args.opcode = FUSE_BMAP;
2830         args.nodeid = get_node_id(inode);
2831         args.in_numargs = 1;
2832         args.in_args[0].size = sizeof(inarg);
2833         args.in_args[0].value = &inarg;
2834         args.out_numargs = 1;
2835         args.out_args[0].size = sizeof(outarg);
2836         args.out_args[0].value = &outarg;
2837         err = fuse_simple_request(fm, &args);
2838         if (err == -ENOSYS)
2839                 fm->fc->no_bmap = 1;
2840
2841         return err ? 0 : outarg.block;
2842 }
2843
2844 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2845 {
2846         struct inode *inode = file->f_mapping->host;
2847         struct fuse_mount *fm = get_fuse_mount(inode);
2848         struct fuse_file *ff = file->private_data;
2849         FUSE_ARGS(args);
2850         struct fuse_lseek_in inarg = {
2851                 .fh = ff->fh,
2852                 .offset = offset,
2853                 .whence = whence
2854         };
2855         struct fuse_lseek_out outarg;
2856         int err;
2857
2858         if (fm->fc->no_lseek)
2859                 goto fallback;
2860
2861         args.opcode = FUSE_LSEEK;
2862         args.nodeid = ff->nodeid;
2863         args.in_numargs = 1;
2864         args.in_args[0].size = sizeof(inarg);
2865         args.in_args[0].value = &inarg;
2866         args.out_numargs = 1;
2867         args.out_args[0].size = sizeof(outarg);
2868         args.out_args[0].value = &outarg;
2869         err = fuse_simple_request(fm, &args);
2870         if (err) {
2871                 if (err == -ENOSYS) {
2872                         fm->fc->no_lseek = 1;
2873                         goto fallback;
2874                 }
2875                 return err;
2876         }
2877
2878         return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2879
2880 fallback:
2881         err = fuse_update_attributes(inode, file, STATX_SIZE);
2882         if (!err)
2883                 return generic_file_llseek(file, offset, whence);
2884         else
2885                 return err;
2886 }
2887
2888 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2889 {
2890         loff_t retval;
2891         struct inode *inode = file_inode(file);
2892
2893         switch (whence) {
2894         case SEEK_SET:
2895         case SEEK_CUR:
2896                  /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2897                 retval = generic_file_llseek(file, offset, whence);
2898                 break;
2899         case SEEK_END:
2900                 inode_lock(inode);
2901                 retval = fuse_update_attributes(inode, file, STATX_SIZE);
2902                 if (!retval)
2903                         retval = generic_file_llseek(file, offset, whence);
2904                 inode_unlock(inode);
2905                 break;
2906         case SEEK_HOLE:
2907         case SEEK_DATA:
2908                 inode_lock(inode);
2909                 retval = fuse_lseek(file, offset, whence);
2910                 inode_unlock(inode);
2911                 break;
2912         default:
2913                 retval = -EINVAL;
2914         }
2915
2916         return retval;
2917 }
2918
2919 /*
2920  * All files which have been polled are linked to RB tree
2921  * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
2922  * find the matching one.
2923  */
2924 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2925                                               struct rb_node **parent_out)
2926 {
2927         struct rb_node **link = &fc->polled_files.rb_node;
2928         struct rb_node *last = NULL;
2929
2930         while (*link) {
2931                 struct fuse_file *ff;
2932
2933                 last = *link;
2934                 ff = rb_entry(last, struct fuse_file, polled_node);
2935
2936                 if (kh < ff->kh)
2937                         link = &last->rb_left;
2938                 else if (kh > ff->kh)
2939                         link = &last->rb_right;
2940                 else
2941                         return link;
2942         }
2943
2944         if (parent_out)
2945                 *parent_out = last;
2946         return link;
2947 }
2948
2949 /*
2950  * The file is about to be polled.  Make sure it's on the polled_files
2951  * RB tree.  Note that files once added to the polled_files tree are
2952  * not removed before the file is released.  This is because a file
2953  * polled once is likely to be polled again.
2954  */
2955 static void fuse_register_polled_file(struct fuse_conn *fc,
2956                                       struct fuse_file *ff)
2957 {
2958         spin_lock(&fc->lock);
2959         if (RB_EMPTY_NODE(&ff->polled_node)) {
2960                 struct rb_node **link, *parent;
2961
2962                 link = fuse_find_polled_node(fc, ff->kh, &parent);
2963                 BUG_ON(*link);
2964                 rb_link_node(&ff->polled_node, parent, link);
2965                 rb_insert_color(&ff->polled_node, &fc->polled_files);
2966         }
2967         spin_unlock(&fc->lock);
2968 }
2969
2970 __poll_t fuse_file_poll(struct file *file, poll_table *wait)
2971 {
2972         struct fuse_file *ff = file->private_data;
2973         struct fuse_mount *fm = ff->fm;
2974         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2975         struct fuse_poll_out outarg;
2976         FUSE_ARGS(args);
2977         int err;
2978
2979         if (fm->fc->no_poll)
2980                 return DEFAULT_POLLMASK;
2981
2982         poll_wait(file, &ff->poll_wait, wait);
2983         inarg.events = mangle_poll(poll_requested_events(wait));
2984
2985         /*
2986          * Ask for notification iff there's someone waiting for it.
2987          * The client may ignore the flag and always notify.
2988          */
2989         if (waitqueue_active(&ff->poll_wait)) {
2990                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2991                 fuse_register_polled_file(fm->fc, ff);
2992         }
2993
2994         args.opcode = FUSE_POLL;
2995         args.nodeid = ff->nodeid;
2996         args.in_numargs = 1;
2997         args.in_args[0].size = sizeof(inarg);
2998         args.in_args[0].value = &inarg;
2999         args.out_numargs = 1;
3000         args.out_args[0].size = sizeof(outarg);
3001         args.out_args[0].value = &outarg;
3002         err = fuse_simple_request(fm, &args);
3003
3004         if (!err)
3005                 return demangle_poll(outarg.revents);
3006         if (err == -ENOSYS) {
3007                 fm->fc->no_poll = 1;
3008                 return DEFAULT_POLLMASK;
3009         }
3010         return EPOLLERR;
3011 }
3012 EXPORT_SYMBOL_GPL(fuse_file_poll);
3013
3014 /*
3015  * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
3016  * wakes up the poll waiters.
3017  */
3018 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
3019                             struct fuse_notify_poll_wakeup_out *outarg)
3020 {
3021         u64 kh = outarg->kh;
3022         struct rb_node **link;
3023
3024         spin_lock(&fc->lock);
3025
3026         link = fuse_find_polled_node(fc, kh, NULL);
3027         if (*link) {
3028                 struct fuse_file *ff;
3029
3030                 ff = rb_entry(*link, struct fuse_file, polled_node);
3031                 wake_up_interruptible_sync(&ff->poll_wait);
3032         }
3033
3034         spin_unlock(&fc->lock);
3035         return 0;
3036 }
3037
3038 static void fuse_do_truncate(struct file *file)
3039 {
3040         struct inode *inode = file->f_mapping->host;
3041         struct iattr attr;
3042
3043         attr.ia_valid = ATTR_SIZE;
3044         attr.ia_size = i_size_read(inode);
3045
3046         attr.ia_file = file;
3047         attr.ia_valid |= ATTR_FILE;
3048
3049         fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file);
3050 }
3051
3052 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
3053 {
3054         return round_up(off, fc->max_pages << PAGE_SHIFT);
3055 }
3056
3057 static ssize_t
3058 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3059 {
3060         DECLARE_COMPLETION_ONSTACK(wait);
3061         ssize_t ret = 0;
3062         struct file *file = iocb->ki_filp;
3063         struct fuse_file *ff = file->private_data;
3064         loff_t pos = 0;
3065         struct inode *inode;
3066         loff_t i_size;
3067         size_t count = iov_iter_count(iter), shortened = 0;
3068         loff_t offset = iocb->ki_pos;
3069         struct fuse_io_priv *io;
3070
3071         pos = offset;
3072         inode = file->f_mapping->host;
3073         i_size = i_size_read(inode);
3074
3075         if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
3076                 return 0;
3077
3078         io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
3079         if (!io)
3080                 return -ENOMEM;
3081         spin_lock_init(&io->lock);
3082         kref_init(&io->refcnt);
3083         io->reqs = 1;
3084         io->bytes = -1;
3085         io->size = 0;
3086         io->offset = offset;
3087         io->write = (iov_iter_rw(iter) == WRITE);
3088         io->err = 0;
3089         /*
3090          * By default, we want to optimize all I/Os with async request
3091          * submission to the client filesystem if supported.
3092          */
3093         io->async = ff->fm->fc->async_dio;
3094         io->iocb = iocb;
3095         io->blocking = is_sync_kiocb(iocb);
3096
3097         /* optimization for short read */
3098         if (io->async && !io->write && offset + count > i_size) {
3099                 iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
3100                 shortened = count - iov_iter_count(iter);
3101                 count -= shortened;
3102         }
3103
3104         /*
3105          * We cannot asynchronously extend the size of a file.
3106          * In such case the aio will behave exactly like sync io.
3107          */
3108         if ((offset + count > i_size) && io->write)
3109                 io->blocking = true;
3110
3111         if (io->async && io->blocking) {
3112                 /*
3113                  * Additional reference to keep io around after
3114                  * calling fuse_aio_complete()
3115                  */
3116                 kref_get(&io->refcnt);
3117                 io->done = &wait;
3118         }
3119
3120         if (iov_iter_rw(iter) == WRITE) {
3121                 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
3122                 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3123         } else {
3124                 ret = __fuse_direct_read(io, iter, &pos);
3125         }
3126         iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
3127
3128         if (io->async) {
3129                 bool blocking = io->blocking;
3130
3131                 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
3132
3133                 /* we have a non-extending, async request, so return */
3134                 if (!blocking)
3135                         return -EIOCBQUEUED;
3136
3137                 wait_for_completion(&wait);
3138                 ret = fuse_get_res_by_io(io);
3139         }
3140
3141         kref_put(&io->refcnt, fuse_io_release);
3142
3143         if (iov_iter_rw(iter) == WRITE) {
3144                 fuse_write_update_attr(inode, pos, ret);
3145                 /* For extending writes we already hold exclusive lock */
3146                 if (ret < 0 && offset + count > i_size)
3147                         fuse_do_truncate(file);
3148         }
3149
3150         return ret;
3151 }
3152
3153 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
3154 {
3155         int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
3156
3157         if (!err)
3158                 fuse_sync_writes(inode);
3159
3160         return err;
3161 }
3162
3163 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3164                                 loff_t length)
3165 {
3166         struct fuse_file *ff = file->private_data;
3167         struct inode *inode = file_inode(file);
3168         struct fuse_inode *fi = get_fuse_inode(inode);
3169         struct fuse_mount *fm = ff->fm;
3170         FUSE_ARGS(args);
3171         struct fuse_fallocate_in inarg = {
3172                 .fh = ff->fh,
3173                 .offset = offset,
3174                 .length = length,
3175                 .mode = mode
3176         };
3177         int err;
3178         bool block_faults = FUSE_IS_DAX(inode) &&
3179                 (!(mode & FALLOC_FL_KEEP_SIZE) ||
3180                  (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
3181
3182         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3183                      FALLOC_FL_ZERO_RANGE))
3184                 return -EOPNOTSUPP;
3185
3186         if (fm->fc->no_fallocate)
3187                 return -EOPNOTSUPP;
3188
3189         inode_lock(inode);
3190         if (block_faults) {
3191                 filemap_invalidate_lock(inode->i_mapping);
3192                 err = fuse_dax_break_layouts(inode, 0, 0);
3193                 if (err)
3194                         goto out;
3195         }
3196
3197         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
3198                 loff_t endbyte = offset + length - 1;
3199
3200                 err = fuse_writeback_range(inode, offset, endbyte);
3201                 if (err)
3202                         goto out;
3203         }
3204
3205         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
3206             offset + length > i_size_read(inode)) {
3207                 err = inode_newsize_ok(inode, offset + length);
3208                 if (err)
3209                         goto out;
3210         }
3211
3212         err = file_modified(file);
3213         if (err)
3214                 goto out;
3215
3216         if (!(mode & FALLOC_FL_KEEP_SIZE))
3217                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3218
3219         args.opcode = FUSE_FALLOCATE;
3220         args.nodeid = ff->nodeid;
3221         args.in_numargs = 1;
3222         args.in_args[0].size = sizeof(inarg);
3223         args.in_args[0].value = &inarg;
3224         err = fuse_simple_request(fm, &args);
3225         if (err == -ENOSYS) {
3226                 fm->fc->no_fallocate = 1;
3227                 err = -EOPNOTSUPP;
3228         }
3229         if (err)
3230                 goto out;
3231
3232         /* we could have extended the file */
3233         if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3234                 if (fuse_write_update_attr(inode, offset + length, length))
3235                         file_update_time(file);
3236         }
3237
3238         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
3239                 truncate_pagecache_range(inode, offset, offset + length - 1);
3240
3241         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3242
3243 out:
3244         if (!(mode & FALLOC_FL_KEEP_SIZE))
3245                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3246
3247         if (block_faults)
3248                 filemap_invalidate_unlock(inode->i_mapping);
3249
3250         inode_unlock(inode);
3251
3252         fuse_flush_time_update(inode);
3253
3254         return err;
3255 }
3256
3257 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3258                                       struct file *file_out, loff_t pos_out,
3259                                       size_t len, unsigned int flags)
3260 {
3261         struct fuse_file *ff_in = file_in->private_data;
3262         struct fuse_file *ff_out = file_out->private_data;
3263         struct inode *inode_in = file_inode(file_in);
3264         struct inode *inode_out = file_inode(file_out);
3265         struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3266         struct fuse_mount *fm = ff_in->fm;
3267         struct fuse_conn *fc = fm->fc;
3268         FUSE_ARGS(args);
3269         struct fuse_copy_file_range_in inarg = {
3270                 .fh_in = ff_in->fh,
3271                 .off_in = pos_in,
3272                 .nodeid_out = ff_out->nodeid,
3273                 .fh_out = ff_out->fh,
3274                 .off_out = pos_out,
3275                 .len = len,
3276                 .flags = flags
3277         };
3278         struct fuse_write_out outarg;
3279         ssize_t err;
3280         /* mark unstable when write-back is not used, and file_out gets
3281          * extended */
3282         bool is_unstable = (!fc->writeback_cache) &&
3283                            ((pos_out + len) > inode_out->i_size);
3284
3285         if (fc->no_copy_file_range)
3286                 return -EOPNOTSUPP;
3287
3288         if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3289                 return -EXDEV;
3290
3291         inode_lock(inode_in);
3292         err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
3293         inode_unlock(inode_in);
3294         if (err)
3295                 return err;
3296
3297         inode_lock(inode_out);
3298
3299         err = file_modified(file_out);
3300         if (err)
3301                 goto out;
3302
3303         /*
3304          * Write out dirty pages in the destination file before sending the COPY
3305          * request to userspace.  After the request is completed, truncate off
3306          * pages (including partial ones) from the cache that have been copied,
3307          * since these contain stale data at that point.
3308          *
3309          * This should be mostly correct, but if the COPY writes to partial
3310          * pages (at the start or end) and the parts not covered by the COPY are
3311          * written through a memory map after calling fuse_writeback_range(),
3312          * then these partial page modifications will be lost on truncation.
3313          *
3314          * It is unlikely that someone would rely on such mixed style
3315          * modifications.  Yet this does give less guarantees than if the
3316          * copying was performed with write(2).
3317          *
3318          * To fix this a mapping->invalidate_lock could be used to prevent new
3319          * faults while the copy is ongoing.
3320          */
3321         err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
3322         if (err)
3323                 goto out;
3324
3325         if (is_unstable)
3326                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3327
3328         args.opcode = FUSE_COPY_FILE_RANGE;
3329         args.nodeid = ff_in->nodeid;
3330         args.in_numargs = 1;
3331         args.in_args[0].size = sizeof(inarg);
3332         args.in_args[0].value = &inarg;
3333         args.out_numargs = 1;
3334         args.out_args[0].size = sizeof(outarg);
3335         args.out_args[0].value = &outarg;
3336         err = fuse_simple_request(fm, &args);
3337         if (err == -ENOSYS) {
3338                 fc->no_copy_file_range = 1;
3339                 err = -EOPNOTSUPP;
3340         }
3341         if (err)
3342                 goto out;
3343
3344         truncate_inode_pages_range(inode_out->i_mapping,
3345                                    ALIGN_DOWN(pos_out, PAGE_SIZE),
3346                                    ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
3347
3348         file_update_time(file_out);
3349         fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
3350
3351         err = outarg.size;
3352 out:
3353         if (is_unstable)
3354                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3355
3356         inode_unlock(inode_out);
3357         file_accessed(file_in);
3358
3359         fuse_flush_time_update(inode_out);
3360
3361         return err;
3362 }
3363
3364 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3365                                     struct file *dst_file, loff_t dst_off,
3366                                     size_t len, unsigned int flags)
3367 {
3368         ssize_t ret;
3369
3370         ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3371                                      len, flags);
3372
3373         if (ret == -EOPNOTSUPP || ret == -EXDEV)
3374                 ret = splice_copy_file_range(src_file, src_off, dst_file,
3375                                              dst_off, len);
3376         return ret;
3377 }
3378
3379 static const struct file_operations fuse_file_operations = {
3380         .llseek         = fuse_file_llseek,
3381         .read_iter      = fuse_file_read_iter,
3382         .write_iter     = fuse_file_write_iter,
3383         .mmap           = fuse_file_mmap,
3384         .open           = fuse_open,
3385         .flush          = fuse_flush,
3386         .release        = fuse_release,
3387         .fsync          = fuse_fsync,
3388         .lock           = fuse_file_lock,
3389         .get_unmapped_area = thp_get_unmapped_area,
3390         .flock          = fuse_file_flock,
3391         .splice_read    = fuse_splice_read,
3392         .splice_write   = fuse_splice_write,
3393         .unlocked_ioctl = fuse_file_ioctl,
3394         .compat_ioctl   = fuse_file_compat_ioctl,
3395         .poll           = fuse_file_poll,
3396         .fallocate      = fuse_file_fallocate,
3397         .copy_file_range = fuse_copy_file_range,
3398 };
3399
3400 static const struct address_space_operations fuse_file_aops  = {
3401         .read_folio     = fuse_read_folio,
3402         .readahead      = fuse_readahead,
3403         .writepages     = fuse_writepages,
3404         .launder_folio  = fuse_launder_folio,
3405         .dirty_folio    = filemap_dirty_folio,
3406         .migrate_folio  = filemap_migrate_folio,
3407         .bmap           = fuse_bmap,
3408         .direct_IO      = fuse_direct_IO,
3409         .write_begin    = fuse_write_begin,
3410         .write_end      = fuse_write_end,
3411 };
3412
3413 void fuse_init_file_inode(struct inode *inode, unsigned int flags)
3414 {
3415         struct fuse_inode *fi = get_fuse_inode(inode);
3416
3417         inode->i_fop = &fuse_file_operations;
3418         inode->i_data.a_ops = &fuse_file_aops;
3419
3420         INIT_LIST_HEAD(&fi->write_files);
3421         INIT_LIST_HEAD(&fi->queued_writes);
3422         fi->writectr = 0;
3423         fi->iocachectr = 0;
3424         init_waitqueue_head(&fi->page_waitq);
3425         init_waitqueue_head(&fi->direct_io_waitq);
3426         fi->writepages = RB_ROOT;
3427
3428         if (IS_ENABLED(CONFIG_FUSE_DAX))
3429                 fuse_dax_inode_init(inode, flags);
3430 }