fs/fuse/file.c

   1 /*
   2   FUSE: Filesystem in Userspace
   3   Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
   4
   5   This program can be distributed under the terms of the GNU GPL.
   6   See the file COPYING.
   7 */
   8
   9 #include "fuse_i.h"
  10
  11 #include <linux/pagemap.h>
  12 #include <linux/slab.h>
  13 #include <linux/kernel.h>
  14 #include <linux/sched.h>
  15 #include <linux/sched/signal.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/falloc.h>
  19 #include <linux/uio.h>
  20 #include <linux/fs.h>
  21 #include <linux/filelock.h>
  22 #include <linux/splice.h>
  23 #include <linux/task_io_accounting_ops.h>
  24
  25 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
  26                           unsigned int open_flags, int opcode,
  27                           struct fuse_open_out *outargp)
  28 {
  29         struct fuse_open_in inarg;
  30         FUSE_ARGS(args);
  31
  32         memset(&inarg, 0, sizeof(inarg));
  33         inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
  34         if (!fm->fc->atomic_o_trunc)
  35                 inarg.flags &= ~O_TRUNC;
  36
  37         if (fm->fc->handle_killpriv_v2 &&
  38             (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
  39                 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
  40         }
  41
  42         args.opcode = opcode;
  43         args.nodeid = nodeid;
  44         args.in_numargs = 1;
  45         args.in_args[0].size = sizeof(inarg);
  46         args.in_args[0].value = &inarg;
  47         args.out_numargs = 1;
  48         args.out_args[0].size = sizeof(*outargp);
  49         args.out_args[0].value = outargp;
  50
  51         return fuse_simple_request(fm, &args);
  52 }
  53
  54 struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
  55 {
  56         struct fuse_file *ff;
  57
  58         ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
  59         if (unlikely(!ff))
  60                 return NULL;
  61
  62         ff->fm = fm;
  63         if (release) {
  64                 ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
  65                 if (!ff->args) {
  66                         kfree(ff);
  67                         return NULL;
  68                 }
  69         }
  70
  71         INIT_LIST_HEAD(&ff->write_entry);
  72         refcount_set(&ff->count, 1);
  73         RB_CLEAR_NODE(&ff->polled_node);
  74         init_waitqueue_head(&ff->poll_wait);
  75
  76         ff->kh = atomic64_inc_return(&fm->fc->khctr);
  77
  78         return ff;
  79 }
  80
  81 void fuse_file_free(struct fuse_file *ff)
  82 {
  83         kfree(ff->args);
  84         kfree(ff);
  85 }
  86
  87 static struct fuse_file *fuse_file_get(struct fuse_file *ff)
  88 {
  89         refcount_inc(&ff->count);
  90         return ff;
  91 }
  92
  93 static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
  94                              int error)
  95 {
  96         struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
  97
  98         iput(ra->inode);
  99         kfree(ra);
 100 }
 101
 102 static void fuse_file_put(struct fuse_file *ff, bool sync)
 103 {
 104         if (refcount_dec_and_test(&ff->count)) {
 105                 struct fuse_release_args *ra = &ff->args->release_args;
 106                 struct fuse_args *args = (ra ? &ra->args : NULL);
 107
 108                 if (ra && ra->inode)
 109                         fuse_file_io_release(ff, ra->inode);
 110
 111                 if (!args) {
 112                         /* Do nothing when server does not implement 'open' */
 113                 } else if (sync) {
 114                         fuse_simple_request(ff->fm, args);
 115                         fuse_release_end(ff->fm, args, 0);
 116                 } else {
 117                         args->end = fuse_release_end;
 118                         if (fuse_simple_background(ff->fm, args,
 119                                                    GFP_KERNEL | __GFP_NOFAIL))
 120                                 fuse_release_end(ff->fm, args, -ENOTCONN);
 121                 }
 122                 kfree(ff);
 123         }
 124 }
 125
 126 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 127                                  unsigned int open_flags, bool isdir)
 128 {
 129         struct fuse_conn *fc = fm->fc;
 130         struct fuse_file *ff;
 131         int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 132         bool open = isdir ? !fc->no_opendir : !fc->no_open;
 133
 134         ff = fuse_file_alloc(fm, open);
 135         if (!ff)
 136                 return ERR_PTR(-ENOMEM);
 137
 138         ff->fh = 0;
 139         /* Default for no-open */
 140         ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
 141         if (open) {
 142                 /* Store outarg for fuse_finish_open() */
 143                 struct fuse_open_out *outargp = &ff->args->open_outarg;
 144                 int err;
 145
 146                 err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
 147                 if (!err) {
 148                         ff->fh = outargp->fh;
 149                         ff->open_flags = outargp->open_flags;
 150                 } else if (err != -ENOSYS) {
 151                         fuse_file_free(ff);
 152                         return ERR_PTR(err);
 153                 } else {
 154                         /* No release needed */
 155                         kfree(ff->args);
 156                         ff->args = NULL;
 157                         if (isdir)
 158                                 fc->no_opendir = 1;
 159                         else
 160                                 fc->no_open = 1;
 161                 }
 162         }
 163
 164         if (isdir)
 165                 ff->open_flags &= ~FOPEN_DIRECT_IO;
 166
 167         ff->nodeid = nodeid;
 168
 169         return ff;
 170 }
 171
 172 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
 173                  bool isdir)
 174 {
 175         struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
 176
 177         if (!IS_ERR(ff))
 178                 file->private_data = ff;
 179
 180         return PTR_ERR_OR_ZERO(ff);
 181 }
 182 EXPORT_SYMBOL_GPL(fuse_do_open);
 183
 184 static void fuse_link_write_file(struct file *file)
 185 {
 186         struct inode *inode = file_inode(file);
 187         struct fuse_inode *fi = get_fuse_inode(inode);
 188         struct fuse_file *ff = file->private_data;
 189         /*
 190          * file may be written through mmap, so chain it onto the
 191          * inodes's write_file list
 192          */
 193         spin_lock(&fi->lock);
 194         if (list_empty(&ff->write_entry))
 195                 list_add(&ff->write_entry, &fi->write_files);
 196         spin_unlock(&fi->lock);
 197 }
 198
 199 int fuse_finish_open(struct inode *inode, struct file *file)
 200 {
 201         struct fuse_file *ff = file->private_data;
 202         struct fuse_conn *fc = get_fuse_conn(inode);
 203         int err;
 204
 205         err = fuse_file_io_open(file, inode);
 206         if (err)
 207                 return err;
 208
 209         if (ff->open_flags & FOPEN_STREAM)
 210                 stream_open(inode, file);
 211         else if (ff->open_flags & FOPEN_NONSEEKABLE)
 212                 nonseekable_open(inode, file);
 213
 214         if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 215                 fuse_link_write_file(file);
 216
 217         return 0;
 218 }
 219
 220 static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
 221 {
 222         struct fuse_conn *fc = get_fuse_conn(inode);
 223         struct fuse_inode *fi = get_fuse_inode(inode);
 224
 225         spin_lock(&fi->lock);
 226         fi->attr_version = atomic64_inc_return(&fc->attr_version);
 227         i_size_write(inode, 0);
 228         spin_unlock(&fi->lock);
 229         file_update_time(file);
 230         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
 231 }
 232
 233 static int fuse_open(struct inode *inode, struct file *file)
 234 {
 235         struct fuse_mount *fm = get_fuse_mount(inode);
 236         struct fuse_inode *fi = get_fuse_inode(inode);
 237         struct fuse_conn *fc = fm->fc;
 238         struct fuse_file *ff;
 239         int err;
 240         bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
 241         bool is_wb_truncate = is_truncate && fc->writeback_cache;
 242         bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
 243
 244         if (fuse_is_bad(inode))
 245                 return -EIO;
 246
 247         err = generic_file_open(inode, file);
 248         if (err)
 249                 return err;
 250
 251         if (is_wb_truncate || dax_truncate)
 252                 inode_lock(inode);
 253
 254         if (dax_truncate) {
 255                 filemap_invalidate_lock(inode->i_mapping);
 256                 err = fuse_dax_break_layouts(inode, 0, 0);
 257                 if (err)
 258                         goto out_inode_unlock;
 259         }
 260
 261         if (is_wb_truncate || dax_truncate)
 262                 fuse_set_nowrite(inode);
 263
 264         err = fuse_do_open(fm, get_node_id(inode), file, false);
 265         if (!err) {
 266                 ff = file->private_data;
 267                 err = fuse_finish_open(inode, file);
 268                 if (err)
 269                         fuse_sync_release(fi, ff, file->f_flags);
 270                 else if (is_truncate)
 271                         fuse_truncate_update_attr(inode, file);
 272         }
 273
 274         if (is_wb_truncate || dax_truncate)
 275                 fuse_release_nowrite(inode);
 276         if (!err) {
 277                 if (is_truncate)
 278                         truncate_pagecache(inode, 0);
 279                 else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 280                         invalidate_inode_pages2(inode->i_mapping);
 281         }
 282         if (dax_truncate)
 283                 filemap_invalidate_unlock(inode->i_mapping);
 284 out_inode_unlock:
 285         if (is_wb_truncate || dax_truncate)
 286                 inode_unlock(inode);
 287
 288         return err;
 289 }
 290
 291 static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 292                                  unsigned int flags, int opcode, bool sync)
 293 {
 294         struct fuse_conn *fc = ff->fm->fc;
 295         struct fuse_release_args *ra = &ff->args->release_args;
 296
 297         if (fuse_file_passthrough(ff))
 298                 fuse_passthrough_release(ff, fuse_inode_backing(fi));
 299
 300         /* Inode is NULL on error path of fuse_create_open() */
 301         if (likely(fi)) {
 302                 spin_lock(&fi->lock);
 303                 list_del(&ff->write_entry);
 304                 spin_unlock(&fi->lock);
 305         }
 306         spin_lock(&fc->lock);
 307         if (!RB_EMPTY_NODE(&ff->polled_node))
 308                 rb_erase(&ff->polled_node, &fc->polled_files);
 309         spin_unlock(&fc->lock);
 310
 311         wake_up_interruptible_all(&ff->poll_wait);
 312
 313         if (!ra)
 314                 return;
 315
 316         /* ff->args was used for open outarg */
 317         memset(ff->args, 0, sizeof(*ff->args));
 318         ra->inarg.fh = ff->fh;
 319         ra->inarg.flags = flags;
 320         ra->args.in_numargs = 1;
 321         ra->args.in_args[0].size = sizeof(struct fuse_release_in);
 322         ra->args.in_args[0].value = &ra->inarg;
 323         ra->args.opcode = opcode;
 324         ra->args.nodeid = ff->nodeid;
 325         ra->args.force = true;
 326         ra->args.nocreds = true;
 327
 328         /*
 329          * Hold inode until release is finished.
 330          * From fuse_sync_release() the refcount is 1 and everything's
 331          * synchronous, so we are fine with not doing igrab() here.
 332          */
 333         ra->inode = sync ? NULL : igrab(&fi->inode);
 334 }
 335
 336 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 337                        unsigned int open_flags, fl_owner_t id, bool isdir)
 338 {
 339         struct fuse_inode *fi = get_fuse_inode(inode);
 340         struct fuse_release_args *ra = &ff->args->release_args;
 341         int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 342
 343         fuse_prepare_release(fi, ff, open_flags, opcode, false);
 344
 345         if (ra && ff->flock) {
 346                 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
 347                 ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
 348         }
 349
 350         /*
 351          * Normally this will send the RELEASE request, however if
 352          * some asynchronous READ or WRITE requests are outstanding,
 353          * the sending will be delayed.
 354          *
 355          * Make the release synchronous if this is a fuseblk mount,
 356          * synchronous RELEASE is allowed (and desirable) in this case
 357          * because the server can be trusted not to screw up.
 358          */
 359         fuse_file_put(ff, ff->fm->fc->destroy);
 360 }
 361
 362 void fuse_release_common(struct file *file, bool isdir)
 363 {
 364         fuse_file_release(file_inode(file), file->private_data, file->f_flags,
 365                           (fl_owner_t) file, isdir);
 366 }
 367
 368 static int fuse_release(struct inode *inode, struct file *file)
 369 {
 370         struct fuse_conn *fc = get_fuse_conn(inode);
 371
 372         /*
 373          * Dirty pages might remain despite write_inode_now() call from
 374          * fuse_flush() due to writes racing with the close.
 375          */
 376         if (fc->writeback_cache)
 377                 write_inode_now(inode, 1);
 378
 379         fuse_release_common(file, false);
 380
 381         /* return value is ignored by VFS */
 382         return 0;
 383 }
 384
 385 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
 386                        unsigned int flags)
 387 {
 388         WARN_ON(refcount_read(&ff->count) > 1);
 389         fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
 390         fuse_file_put(ff, true);
 391 }
 392 EXPORT_SYMBOL_GPL(fuse_sync_release);
 393
 394 /*
 395  * Scramble the ID space with XTEA, so that the value of the files_struct
 396  * pointer is not exposed to userspace.
 397  */
 398 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 399 {
 400         u32 *k = fc->scramble_key;
 401         u64 v = (unsigned long) id;
 402         u32 v0 = v;
 403         u32 v1 = v >> 32;
 404         u32 sum = 0;
 405         int i;
 406
 407         for (i = 0; i < 32; i++) {
 408                 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 409                 sum += 0x9E3779B9;
 410                 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
 411         }
 412
 413         return (u64) v0 + ((u64) v1 << 32);
 414 }
 415
 416 struct fuse_writepage_args {
 417         struct fuse_io_args ia;
 418         struct rb_node writepages_entry;
 419         struct list_head queue_entry;
 420         struct fuse_writepage_args *next;
 421         struct inode *inode;
 422         struct fuse_sync_bucket *bucket;
 423 };
 424
 425 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 426                                             pgoff_t idx_from, pgoff_t idx_to)
 427 {
 428         struct rb_node *n;
 429
 430         n = fi->writepages.rb_node;
 431
 432         while (n) {
 433                 struct fuse_writepage_args *wpa;
 434                 pgoff_t curr_index;
 435
 436                 wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
 437                 WARN_ON(get_fuse_inode(wpa->inode) != fi);
 438                 curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
 439                 if (idx_from >= curr_index + wpa->ia.ap.num_folios)
 440                         n = n->rb_right;
 441                 else if (idx_to < curr_index)
 442                         n = n->rb_left;
 443                 else
 444                         return wpa;
 445         }
 446         return NULL;
 447 }
 448
 449 /*
 450  * Check if any page in a range is under writeback
 451  */
 452 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 453                                    pgoff_t idx_to)
 454 {
 455         struct fuse_inode *fi = get_fuse_inode(inode);
 456         bool found;
 457
 458         if (RB_EMPTY_ROOT(&fi->writepages))
 459                 return false;
 460
 461         spin_lock(&fi->lock);
 462         found = fuse_find_writeback(fi, idx_from, idx_to);
 463         spin_unlock(&fi->lock);
 464
 465         return found;
 466 }
 467
 468 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 469 {
 470         return fuse_range_is_writeback(inode, index, index);
 471 }
 472
 473 /*
 474  * Wait for page writeback to be completed.
 475  *
 476  * Since fuse doesn't rely on the VM writeback tracking, this has to
 477  * use some other means.
 478  */
 479 static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 480 {
 481         struct fuse_inode *fi = get_fuse_inode(inode);
 482
 483         wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
 484 }
 485
 486 static inline bool fuse_folio_is_writeback(struct inode *inode,
 487                                            struct folio *folio)
 488 {
 489         pgoff_t last = folio_next_index(folio) - 1;
 490         return fuse_range_is_writeback(inode, folio_index(folio), last);
 491 }
 492
 493 static void fuse_wait_on_folio_writeback(struct inode *inode,
 494                                          struct folio *folio)
 495 {
 496         struct fuse_inode *fi = get_fuse_inode(inode);
 497
 498         wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
 499 }
 500
 501 /*
 502  * Wait for all pending writepages on the inode to finish.
 503  *
 504  * This is currently done by blocking further writes with FUSE_NOWRITE
 505  * and waiting for all sent writes to complete.
 506  *
 507  * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 508  * could conflict with truncation.
 509  */
 510 static void fuse_sync_writes(struct inode *inode)
 511 {
 512         fuse_set_nowrite(inode);
 513         fuse_release_nowrite(inode);
 514 }
 515
 516 static int fuse_flush(struct file *file, fl_owner_t id)
 517 {
 518         struct inode *inode = file_inode(file);
 519         struct fuse_mount *fm = get_fuse_mount(inode);
 520         struct fuse_file *ff = file->private_data;
 521         struct fuse_flush_in inarg;
 522         FUSE_ARGS(args);
 523         int err;
 524
 525         if (fuse_is_bad(inode))
 526                 return -EIO;
 527
 528         if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
 529                 return 0;
 530
 531         err = write_inode_now(inode, 1);
 532         if (err)
 533                 return err;
 534
 535         inode_lock(inode);
 536         fuse_sync_writes(inode);
 537         inode_unlock(inode);
 538
 539         err = filemap_check_errors(file->f_mapping);
 540         if (err)
 541                 return err;
 542
 543         err = 0;
 544         if (fm->fc->no_flush)
 545                 goto inval_attr_out;
 546
 547         memset(&inarg, 0, sizeof(inarg));
 548         inarg.fh = ff->fh;
 549         inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
 550         args.opcode = FUSE_FLUSH;
 551         args.nodeid = get_node_id(inode);
 552         args.in_numargs = 1;
 553         args.in_args[0].size = sizeof(inarg);
 554         args.in_args[0].value = &inarg;
 555         args.force = true;
 556
 557         err = fuse_simple_request(fm, &args);
 558         if (err == -ENOSYS) {
 559                 fm->fc->no_flush = 1;
 560                 err = 0;
 561         }
 562
 563 inval_attr_out:
 564         /*
 565          * In memory i_blocks is not maintained by fuse, if writeback cache is
 566          * enabled, i_blocks from cached attr may not be accurate.
 567          */
 568         if (!err && fm->fc->writeback_cache)
 569                 fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
 570         return err;
 571 }
 572
 573 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 574                       int datasync, int opcode)
 575 {
 576         struct inode *inode = file->f_mapping->host;
 577         struct fuse_mount *fm = get_fuse_mount(inode);
 578         struct fuse_file *ff = file->private_data;
 579         FUSE_ARGS(args);
 580         struct fuse_fsync_in inarg;
 581
 582         memset(&inarg, 0, sizeof(inarg));
 583         inarg.fh = ff->fh;
 584         inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
 585         args.opcode = opcode;
 586         args.nodeid = get_node_id(inode);
 587         args.in_numargs = 1;
 588         args.in_args[0].size = sizeof(inarg);
 589         args.in_args[0].value = &inarg;
 590         return fuse_simple_request(fm, &args);
 591 }
 592
 593 static int fuse_fsync(struct file *file, loff_t start, loff_t end,
 594                       int datasync)
 595 {
 596         struct inode *inode = file->f_mapping->host;
 597         struct fuse_conn *fc = get_fuse_conn(inode);
 598         int err;
 599
 600         if (fuse_is_bad(inode))
 601                 return -EIO;
 602
 603         inode_lock(inode);
 604
 605         /*
 606          * Start writeback against all dirty pages of the inode, then
 607          * wait for all outstanding writes, before sending the FSYNC
 608          * request.
 609          */
 610         err = file_write_and_wait_range(file, start, end);
 611         if (err)
 612                 goto out;
 613
 614         fuse_sync_writes(inode);
 615
 616         /*
 617          * Due to implementation of fuse writeback
 618          * file_write_and_wait_range() does not catch errors.
 619          * We have to do this directly after fuse_sync_writes()
 620          */
 621         err = file_check_and_advance_wb_err(file);
 622         if (err)
 623                 goto out;
 624
 625         err = sync_inode_metadata(inode, 1);
 626         if (err)
 627                 goto out;
 628
 629         if (fc->no_fsync)
 630                 goto out;
 631
 632         err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
 633         if (err == -ENOSYS) {
 634                 fc->no_fsync = 1;
 635                 err = 0;
 636         }
 637 out:
 638         inode_unlock(inode);
 639
 640         return err;
 641 }
 642
 643 void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
 644                          size_t count, int opcode)
 645 {
 646         struct fuse_file *ff = file->private_data;
 647         struct fuse_args *args = &ia->ap.args;
 648
 649         ia->read.in.fh = ff->fh;
 650         ia->read.in.offset = pos;
 651         ia->read.in.size = count;
 652         ia->read.in.flags = file->f_flags;
 653         args->opcode = opcode;
 654         args->nodeid = ff->nodeid;
 655         args->in_numargs = 1;
 656         args->in_args[0].size = sizeof(ia->read.in);
 657         args->in_args[0].value = &ia->read.in;
 658         args->out_argvar = true;
 659         args->out_numargs = 1;
 660         args->out_args[0].size = count;
 661 }
 662
 663 static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres,
 664                                     bool should_dirty)
 665 {
 666         unsigned int i;
 667
 668         for (i = 0; i < ap->num_folios; i++) {
 669                 if (should_dirty)
 670                         folio_mark_dirty_lock(ap->folios[i]);
 671                 if (ap->args.is_pinned)
 672                         unpin_folio(ap->folios[i]);
 673         }
 674
 675         if (nres > 0 && ap->args.invalidate_vmap)
 676                 invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
 677 }
 678
 679 static void fuse_io_release(struct kref *kref)
 680 {
 681         kfree(container_of(kref, struct fuse_io_priv, refcnt));
 682 }
 683
 684 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
 685 {
 686         if (io->err)
 687                 return io->err;
 688
 689         if (io->bytes >= 0 && io->write)
 690                 return -EIO;
 691
 692         return io->bytes < 0 ? io->size : io->bytes;
 693 }
 694
 695 /*
 696  * In case of short read, the caller sets 'pos' to the position of
 697  * actual end of fuse request in IO request. Otherwise, if bytes_requested
 698  * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 699  *
 700  * An example:
 701  * User requested DIO read of 64K. It was split into two 32K fuse requests,
 702  * both submitted asynchronously. The first of them was ACKed by userspace as
 703  * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 704  * second request was ACKed as short, e.g. only 1K was read, resulting in
 705  * pos == 33K.
 706  *
 707  * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 708  * will be equal to the length of the longest contiguous fragment of
 709  * transferred data starting from the beginning of IO request.
 710  */
 711 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 712 {
 713         int left;
 714
 715         spin_lock(&io->lock);
 716         if (err)
 717                 io->err = io->err ? : err;
 718         else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
 719                 io->bytes = pos;
 720
 721         left = --io->reqs;
 722         if (!left && io->blocking)
 723                 complete(io->done);
 724         spin_unlock(&io->lock);
 725
 726         if (!left && !io->blocking) {
 727                 ssize_t res = fuse_get_res_by_io(io);
 728
 729                 if (res >= 0) {
 730                         struct inode *inode = file_inode(io->iocb->ki_filp);
 731                         struct fuse_conn *fc = get_fuse_conn(inode);
 732                         struct fuse_inode *fi = get_fuse_inode(inode);
 733
 734                         spin_lock(&fi->lock);
 735                         fi->attr_version = atomic64_inc_return(&fc->attr_version);
 736                         spin_unlock(&fi->lock);
 737                 }
 738
 739                 io->iocb->ki_complete(io->iocb, res);
 740         }
 741
 742         kref_put(&io->refcnt, fuse_io_release);
 743 }
 744
 745 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
 746                                                  unsigned int nfolios)
 747 {
 748         struct fuse_io_args *ia;
 749
 750         ia = kzalloc(sizeof(*ia), GFP_KERNEL);
 751         if (ia) {
 752                 ia->io = io;
 753                 ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL,
 754                                                   &ia->ap.descs);
 755                 if (!ia->ap.folios) {
 756                         kfree(ia);
 757                         ia = NULL;
 758                 }
 759         }
 760         return ia;
 761 }
 762
 763 static void fuse_io_free(struct fuse_io_args *ia)
 764 {
 765         kfree(ia->ap.folios);
 766         kfree(ia);
 767 }
 768
 769 static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
 770                                   int err)
 771 {
 772         struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 773         struct fuse_io_priv *io = ia->io;
 774         ssize_t pos = -1;
 775         size_t nres;
 776
 777         if (err) {
 778                 /* Nothing */
 779         } else if (io->write) {
 780                 if (ia->write.out.size > ia->write.in.size) {
 781                         err = -EIO;
 782                 } else {
 783                         nres = ia->write.out.size;
 784                         if (ia->write.in.size != ia->write.out.size)
 785                                 pos = ia->write.in.offset - io->offset +
 786                                       ia->write.out.size;
 787                 }
 788         } else {
 789                 u32 outsize = args->out_args[0].size;
 790
 791                 nres = outsize;
 792                 if (ia->read.in.size != outsize)
 793                         pos = ia->read.in.offset - io->offset + outsize;
 794         }
 795
 796         fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty);
 797
 798         fuse_aio_complete(io, err, pos);
 799         fuse_io_free(ia);
 800 }
 801
 802 static ssize_t fuse_async_req_send(struct fuse_mount *fm,
 803                                    struct fuse_io_args *ia, size_t num_bytes)
 804 {
 805         ssize_t err;
 806         struct fuse_io_priv *io = ia->io;
 807
 808         spin_lock(&io->lock);
 809         kref_get(&io->refcnt);
 810         io->size += num_bytes;
 811         io->reqs++;
 812         spin_unlock(&io->lock);
 813
 814         ia->ap.args.end = fuse_aio_complete_req;
 815         ia->ap.args.may_block = io->should_dirty;
 816         err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
 817         if (err)
 818                 fuse_aio_complete_req(fm, &ia->ap.args, err);
 819
 820         return num_bytes;
 821 }
 822
 823 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
 824                               fl_owner_t owner)
 825 {
 826         struct file *file = ia->io->iocb->ki_filp;
 827         struct fuse_file *ff = file->private_data;
 828         struct fuse_mount *fm = ff->fm;
 829
 830         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 831         if (owner != NULL) {
 832                 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
 833                 ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
 834         }
 835
 836         if (ia->io->async)
 837                 return fuse_async_req_send(fm, ia, count);
 838
 839         return fuse_simple_request(fm, &ia->ap.args);
 840 }
 841
 842 static void fuse_read_update_size(struct inode *inode, loff_t size,
 843                                   u64 attr_ver)
 844 {
 845         struct fuse_conn *fc = get_fuse_conn(inode);
 846         struct fuse_inode *fi = get_fuse_inode(inode);
 847
 848         spin_lock(&fi->lock);
 849         if (attr_ver >= fi->attr_version && size < inode->i_size &&
 850             !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 851                 fi->attr_version = atomic64_inc_return(&fc->attr_version);
 852                 i_size_write(inode, size);
 853         }
 854         spin_unlock(&fi->lock);
 855 }
 856
 857 static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
 858                             struct fuse_args_pages *ap)
 859 {
 860         struct fuse_conn *fc = get_fuse_conn(inode);
 861
 862         /*
 863          * If writeback_cache is enabled, a short read means there's a hole in
 864          * the file.  Some data after the hole is in page cache, but has not
 865          * reached the client fs yet.  So the hole is not present there.
 866          */
 867         if (!fc->writeback_cache) {
 868                 loff_t pos = folio_pos(ap->folios[0]) + num_read;
 869                 fuse_read_update_size(inode, pos, attr_ver);
 870         }
 871 }
 872
 873 static int fuse_do_readfolio(struct file *file, struct folio *folio)
 874 {
 875         struct inode *inode = folio->mapping->host;
 876         struct fuse_mount *fm = get_fuse_mount(inode);
 877         loff_t pos = folio_pos(folio);
 878         struct fuse_folio_desc desc = { .length = PAGE_SIZE };
 879         struct fuse_io_args ia = {
 880                 .ap.args.page_zeroing = true,
 881                 .ap.args.out_pages = true,
 882                 .ap.num_folios = 1,
 883                 .ap.folios = &folio,
 884                 .ap.descs = &desc,
 885         };
 886         ssize_t res;
 887         u64 attr_ver;
 888
 889         /*
 890          * With the temporary pages that are used to complete writeback, we can
 891          * have writeback that extends beyond the lifetime of the folio.  So
 892          * make sure we read a properly synced folio.
 893          */
 894         fuse_wait_on_folio_writeback(inode, folio);
 895
 896         attr_ver = fuse_get_attr_version(fm->fc);
 897
 898         /* Don't overflow end offset */
 899         if (pos + (desc.length - 1) == LLONG_MAX)
 900                 desc.length--;
 901
 902         fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
 903         res = fuse_simple_request(fm, &ia.ap.args);
 904         if (res < 0)
 905                 return res;
 906         /*
 907          * Short read means EOF.  If file size is larger, truncate it
 908          */
 909         if (res < desc.length)
 910                 fuse_short_read(inode, attr_ver, res, &ia.ap);
 911
 912         folio_mark_uptodate(folio);
 913
 914         return 0;
 915 }
 916
 917 static int fuse_read_folio(struct file *file, struct folio *folio)
 918 {
 919         struct inode *inode = folio->mapping->host;
 920         int err;
 921
 922         err = -EIO;
 923         if (fuse_is_bad(inode))
 924                 goto out;
 925
 926         err = fuse_do_readfolio(file, folio);
 927         fuse_invalidate_atime(inode);
 928  out:
 929         folio_unlock(folio);
 930         return err;
 931 }
 932
 933 static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
 934                                int err)
 935 {
 936         int i;
 937         struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 938         struct fuse_args_pages *ap = &ia->ap;
 939         size_t count = ia->read.in.size;
 940         size_t num_read = args->out_args[0].size;
 941         struct address_space *mapping = NULL;
 942
 943         for (i = 0; mapping == NULL && i < ap->num_folios; i++)
 944                 mapping = ap->folios[i]->mapping;
 945
 946         if (mapping) {
 947                 struct inode *inode = mapping->host;
 948
 949                 /*
 950                  * Short read means EOF. If file size is larger, truncate it
 951                  */
 952                 if (!err && num_read < count)
 953                         fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
 954
 955                 fuse_invalidate_atime(inode);
 956         }
 957
 958         for (i = 0; i < ap->num_folios; i++)
 959                 folio_end_read(ap->folios[i], !err);
 960         if (ia->ff)
 961                 fuse_file_put(ia->ff, false);
 962
 963         fuse_io_free(ia);
 964 }
 965
 966 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 967 {
 968         struct fuse_file *ff = file->private_data;
 969         struct fuse_mount *fm = ff->fm;
 970         struct fuse_args_pages *ap = &ia->ap;
 971         loff_t pos = folio_pos(ap->folios[0]);
 972         /* Currently, all folios in FUSE are one page */
 973         size_t count = ap->num_folios << PAGE_SHIFT;
 974         ssize_t res;
 975         int err;
 976
 977         ap->args.out_pages = true;
 978         ap->args.page_zeroing = true;
 979         ap->args.page_replace = true;
 980
 981         /* Don't overflow end offset */
 982         if (pos + (count - 1) == LLONG_MAX) {
 983                 count--;
 984                 ap->descs[ap->num_folios - 1].length--;
 985         }
 986         WARN_ON((loff_t) (pos + count) < 0);
 987
 988         fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 989         ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 990         if (fm->fc->async_read) {
 991                 ia->ff = fuse_file_get(ff);
 992                 ap->args.end = fuse_readpages_end;
 993                 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
 994                 if (!err)
 995                         return;
 996         } else {
 997                 res = fuse_simple_request(fm, &ap->args);
 998                 err = res < 0 ? res : 0;
 999         }
1000         fuse_readpages_end(fm, &ap->args, err);
1001 }
1002
1003 static void fuse_readahead(struct readahead_control *rac)
1004 {
1005         struct inode *inode = rac->mapping->host;
1006         struct fuse_inode *fi = get_fuse_inode(inode);
1007         struct fuse_conn *fc = get_fuse_conn(inode);
1008         unsigned int max_pages, nr_pages;
1009         pgoff_t first = readahead_index(rac);
1010         pgoff_t last = first + readahead_count(rac) - 1;
1011
1012         if (fuse_is_bad(inode))
1013                 return;
1014
1015         wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
1016
1017         max_pages = min_t(unsigned int, fc->max_pages,
1018                         fc->max_read / PAGE_SIZE);
1019
1020         /*
1021          * This is only accurate the first time through, since readahead_folio()
1022          * doesn't update readahead_count() from the previous folio until the
1023          * next call.  Grab nr_pages here so we know how many pages we're going
1024          * to have to process.  This means that we will exit here with
1025          * readahead_count() == folio_nr_pages(last_folio), but we will have
1026          * consumed all of the folios, and read_pages() will call
1027          * readahead_folio() again which will clean up the rac.
1028          */
1029         nr_pages = readahead_count(rac);
1030
1031         while (nr_pages) {
1032                 struct fuse_io_args *ia;
1033                 struct fuse_args_pages *ap;
1034                 struct folio *folio;
1035                 unsigned cur_pages = min(max_pages, nr_pages);
1036
1037                 if (fc->num_background >= fc->congestion_threshold &&
1038                     rac->ra->async_size >= readahead_count(rac))
1039                         /*
1040                          * Congested and only async pages left, so skip the
1041                          * rest.
1042                          */
1043                         break;
1044
1045                 ia = fuse_io_alloc(NULL, cur_pages);
1046                 if (!ia)
1047                         return;
1048                 ap = &ia->ap;
1049
1050                 while (ap->num_folios < cur_pages) {
1051                         folio = readahead_folio(rac);
1052                         ap->folios[ap->num_folios] = folio;
1053                         ap->descs[ap->num_folios].length = folio_size(folio);
1054                         ap->num_folios++;
1055                 }
1056                 fuse_send_readpages(ia, rac->file);
1057                 nr_pages -= cur_pages;
1058         }
1059 }
1060
1061 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
1062 {
1063         struct inode *inode = iocb->ki_filp->f_mapping->host;
1064         struct fuse_conn *fc = get_fuse_conn(inode);
1065
1066         /*
1067          * In auto invalidate mode, always update attributes on read.
1068          * Otherwise, only update if we attempt to read past EOF (to ensure
1069          * i_size is up to date).
1070          */
1071         if (fc->auto_inval_data ||
1072             (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
1073                 int err;
1074                 err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
1075                 if (err)
1076                         return err;
1077         }
1078
1079         return generic_file_read_iter(iocb, to);
1080 }
1081
1082 static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
1083                                  loff_t pos, size_t count)
1084 {
1085         struct fuse_args *args = &ia->ap.args;
1086
1087         ia->write.in.fh = ff->fh;
1088         ia->write.in.offset = pos;
1089         ia->write.in.size = count;
1090         args->opcode = FUSE_WRITE;
1091         args->nodeid = ff->nodeid;
1092         args->in_numargs = 2;
1093         if (ff->fm->fc->minor < 9)
1094                 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
1095         else
1096                 args->in_args[0].size = sizeof(ia->write.in);
1097         args->in_args[0].value = &ia->write.in;
1098         args->in_args[1].size = count;
1099         args->out_numargs = 1;
1100         args->out_args[0].size = sizeof(ia->write.out);
1101         args->out_args[0].value = &ia->write.out;
1102 }
1103
1104 static unsigned int fuse_write_flags(struct kiocb *iocb)
1105 {
1106         unsigned int flags = iocb->ki_filp->f_flags;
1107
1108         if (iocb_is_dsync(iocb))
1109                 flags |= O_DSYNC;
1110         if (iocb->ki_flags & IOCB_SYNC)
1111                 flags |= O_SYNC;
1112
1113         return flags;
1114 }
1115
1116 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1117                                size_t count, fl_owner_t owner)
1118 {
1119         struct kiocb *iocb = ia->io->iocb;
1120         struct file *file = iocb->ki_filp;
1121         struct fuse_file *ff = file->private_data;
1122         struct fuse_mount *fm = ff->fm;
1123         struct fuse_write_in *inarg = &ia->write.in;
1124         ssize_t err;
1125
1126         fuse_write_args_fill(ia, ff, pos, count);
1127         inarg->flags = fuse_write_flags(iocb);
1128         if (owner != NULL) {
1129                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
1130                 inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
1131         }
1132
1133         if (ia->io->async)
1134                 return fuse_async_req_send(fm, ia, count);
1135
1136         err = fuse_simple_request(fm, &ia->ap.args);
1137         if (!err && ia->write.out.size > count)
1138                 err = -EIO;
1139
1140         return err ?: ia->write.out.size;
1141 }
1142
1143 bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
1144 {
1145         struct fuse_conn *fc = get_fuse_conn(inode);
1146         struct fuse_inode *fi = get_fuse_inode(inode);
1147         bool ret = false;
1148
1149         spin_lock(&fi->lock);
1150         fi->attr_version = atomic64_inc_return(&fc->attr_version);
1151         if (written > 0 && pos > inode->i_size) {
1152                 i_size_write(inode, pos);
1153                 ret = true;
1154         }
1155         spin_unlock(&fi->lock);
1156
1157         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
1158
1159         return ret;
1160 }
1161
1162 static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
1163                                      struct kiocb *iocb, struct inode *inode,
1164                                      loff_t pos, size_t count)
1165 {
1166         struct fuse_args_pages *ap = &ia->ap;
1167         struct file *file = iocb->ki_filp;
1168         struct fuse_file *ff = file->private_data;
1169         struct fuse_mount *fm = ff->fm;
1170         unsigned int offset, i;
1171         bool short_write;
1172         int err;
1173
1174         for (i = 0; i < ap->num_folios; i++)
1175                 fuse_wait_on_folio_writeback(inode, ap->folios[i]);
1176
1177         fuse_write_args_fill(ia, ff, pos, count);
1178         ia->write.in.flags = fuse_write_flags(iocb);
1179         if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
1180                 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
1181
1182         err = fuse_simple_request(fm, &ap->args);
1183         if (!err && ia->write.out.size > count)
1184                 err = -EIO;
1185
1186         short_write = ia->write.out.size < count;
1187         offset = ap->descs[0].offset;
1188         count = ia->write.out.size;
1189         for (i = 0; i < ap->num_folios; i++) {
1190                 struct folio *folio = ap->folios[i];
1191
1192                 if (err) {
1193                         folio_clear_uptodate(folio);
1194                 } else {
1195                         if (count >= folio_size(folio) - offset)
1196                                 count -= folio_size(folio) - offset;
1197                         else {
1198                                 if (short_write)
1199                                         folio_clear_uptodate(folio);
1200                                 count = 0;
1201                         }
1202                         offset = 0;
1203                 }
1204                 if (ia->write.folio_locked && (i == ap->num_folios - 1))
1205                         folio_unlock(folio);
1206                 folio_put(folio);
1207         }
1208
1209         return err;
1210 }
1211
     /*
      * Copy data for one write request from @ii into page cache folios.
      *
      * Starting at file offset @pos, looks up the folio for each page with
      * FGP_WRITEBEGIN, copies data from @ii into it and records the folio and
      * the number of bytes copied in ia->ap.  The loop stops when the iterator
      * is drained, fc->max_write bytes or @max_pages folios have been gathered,
      * a copy ended short of a page boundary, a folio is left not uptodate, or
      * fc->big_writes is not set.
      *
      * A folio that is not uptodate after the copy is left locked and
      * ia->write.folio_locked is set; every other recorded folio is unlocked
      * here.  No reference is dropped on recorded folios in this function.
      *
      * Returns the number of bytes copied if that is non-zero, otherwise the
      * error of the failed step (-EFAULT or the __filemap_get_folio() error).
      */
1212 static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
1213                                      struct address_space *mapping,
1214                                      struct iov_iter *ii, loff_t pos,
1215                                      unsigned int max_pages)
1216 {
1217         struct fuse_args_pages *ap = &ia->ap;
1218         struct fuse_conn *fc = get_fuse_conn(mapping->host);
1219         unsigned offset = pos & (PAGE_SIZE - 1);
1220         unsigned int nr_pages = 0;
1221         size_t count = 0;
1222         int err;
1223
1224         ap->args.in_pages = true;
             /* Only the first folio can start at a non-zero in-page offset */
1225         ap->descs[0].offset = offset;
1226
1227         do {
1228                 size_t tmp;
1229                 struct folio *folio;
1230                 pgoff_t index = pos >> PAGE_SHIFT;
1231                 size_t bytes = min_t(size_t, PAGE_SIZE - offset,
1232                                      iov_iter_count(ii));
1233
                     /* Never gather more than max_write bytes in total */
1234                 bytes = min_t(size_t, bytes, fc->max_write - count);
1235
1236  again:
                     /* Fault the source in first; the copy below is the _atomic variant */
1237                 err = -EFAULT;
1238                 if (fault_in_iov_iter_readable(ii, bytes))
1239                         break;
1240
1241                 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
1242                                             mapping_gfp_mask(mapping));
1243                 if (IS_ERR(folio)) {
1244                         err = PTR_ERR(folio);
1245                         break;
1246                 }
1247
1248                 if (mapping_writably_mapped(mapping))
1249                         flush_dcache_folio(folio);
1250
1251                 tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii);
1252                 flush_dcache_folio(folio);
1253
                     /* Nothing was copied: release the folio and retry from the fault-in */
1254                 if (!tmp) {
1255                         folio_unlock(folio);
1256                         folio_put(folio);
1257                         goto again;
1258                 }
1259
1260                 err = 0;
1261                 ap->folios[ap->num_folios] = folio;
1262                 ap->descs[ap->num_folios].length = tmp;
1263                 ap->num_folios++;
1264                 nr_pages++;
1265
1266                 count += tmp;
1267                 pos += tmp;
1268                 offset += tmp;
1269                 if (offset == PAGE_SIZE)
1270                         offset = 0;
1271
1272                 /* If we copied full page, mark it uptodate */
1273                 if (tmp == PAGE_SIZE)
1274                         folio_mark_uptodate(folio);
1275
                     /*
                      * A folio that is not uptodate stays locked and ends the
                      * request; ia->write.folio_locked records that it still
                      * has to be unlocked.
                      */
1276                 if (folio_test_uptodate(folio)) {
1277                         folio_unlock(folio);
1278                 } else {
1279                         ia->write.folio_locked = true;
1280                         break;
1281                 }
                     /* Without big_writes each request carries a single folio */
1282                 if (!fc->big_writes)
1283                         break;
1284         } while (iov_iter_count(ii) && count < fc->max_write &&
1285                  nr_pages < max_pages && offset == 0);
1286
1287         return count > 0 ? count : err;
1288 }
1289
1290 static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
1291                                      unsigned int max_pages)
1292 {
1293         return min_t(unsigned int,
1294                      ((pos + len - 1) >> PAGE_SHIFT) -
1295                      (pos >> PAGE_SHIFT) + 1,
1296                      max_pages);
1297 }
1298
1299 static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
1300 {
1301         struct address_space *mapping = iocb->ki_filp->f_mapping;
1302         struct inode *inode = mapping->host;
1303         struct fuse_conn *fc = get_fuse_conn(inode);
1304         struct fuse_inode *fi = get_fuse_inode(inode);
1305         loff_t pos = iocb->ki_pos;
1306         int err = 0;
1307         ssize_t res = 0;
1308
1309         if (inode->i_size < pos + iov_iter_count(ii))
1310                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1311
1312         do {
1313                 ssize_t count;
1314                 struct fuse_io_args ia = {};
1315                 struct fuse_args_pages *ap = &ia.ap;
1316                 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
1317                                                       fc->max_pages);
1318
1319                 ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs);
1320                 if (!ap->folios) {
1321                         err = -ENOMEM;
1322                         break;
1323                 }
1324
1325                 count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
1326                 if (count <= 0) {
1327                         err = count;
1328                 } else {
1329                         err = fuse_send_write_pages(&ia, iocb, inode,
1330                                                     pos, count);
1331                         if (!err) {
1332                                 size_t num_written = ia.write.out.size;
1333
1334                                 res += num_written;
1335                                 pos += num_written;
1336
1337                                 /* break out of the loop on short write */
1338                                 if (num_written != count)
1339                                         err = -EIO;
1340                         }
1341                 }
1342                 kfree(ap->folios);
1343         } while (!err && iov_iter_count(ii));
1344
1345         fuse_write_update_attr(inode, pos, res);
1346         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1347
1348         if (!res)
1349                 return err;
1350         iocb->ki_pos += res;
1351         return res;
1352 }
1353
1354 static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
1355 {
1356         struct inode *inode = file_inode(iocb->ki_filp);
1357
1358         return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
1359 }
1360
1361 /*
1362  * @return true if an exclusive lock for direct IO writes is needed
1363  */
1364 static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
1365 {
1366         struct file *file = iocb->ki_filp;
1367         struct fuse_file *ff = file->private_data;
1368         struct inode *inode = file_inode(iocb->ki_filp);
1369         struct fuse_inode *fi = get_fuse_inode(inode);
1370
1371         /* Server side has to advise that it supports parallel dio writes. */
1372         if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
1373                 return true;
1374
1375         /*
1376          * Append will need to know the eventual EOF - always needs an
1377          * exclusive lock.
1378          */
1379         if (iocb->ki_flags & IOCB_APPEND)
1380                 return true;
1381
1382         /* shared locks are not allowed with parallel page cache IO */
1383         if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
1384                 return true;
1385
1386         /* Parallel dio beyond EOF is not supported, at least for now. */
1387         if (fuse_io_past_eof(iocb, from))
1388                 return true;
1389
1390         return false;
1391 }
1392
1393 static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
1394                           bool *exclusive)
1395 {
1396         struct inode *inode = file_inode(iocb->ki_filp);
1397         struct fuse_inode *fi = get_fuse_inode(inode);
1398
1399         *exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
1400         if (*exclusive) {
1401                 inode_lock(inode);
1402         } else {
1403                 inode_lock_shared(inode);
1404                 /*
1405                  * New parallal dio allowed only if inode is not in caching
1406                  * mode and denies new opens in caching mode. This check
1407                  * should be performed only after taking shared inode lock.
1408                  * Previous past eof check was without inode lock and might
1409                  * have raced, so check it again.
1410                  */
1411                 if (fuse_io_past_eof(iocb, from) ||
1412                     fuse_inode_uncached_io_start(fi, NULL) != 0) {
1413                         inode_unlock_shared(inode);
1414                         inode_lock(inode);
1415                         *exclusive = true;
1416                 }
1417         }
1418 }
1419
1420 static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
1421 {
1422         struct inode *inode = file_inode(iocb->ki_filp);
1423         struct fuse_inode *fi = get_fuse_inode(inode);
1424
1425         if (exclusive) {
1426                 inode_unlock(inode);
1427         } else {
1428                 /* Allow opens in caching mode after last parallel dio end */
1429                 fuse_inode_uncached_io_end(fi);
1430                 inode_unlock_shared(inode);
1431         }
1432 }
1433
1434 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
1435 {
1436         struct file *file = iocb->ki_filp;
1437         struct mnt_idmap *idmap = file_mnt_idmap(file);
1438         struct address_space *mapping = file->f_mapping;
1439         ssize_t written = 0;
1440         struct inode *inode = mapping->host;
1441         ssize_t err, count;
1442         struct fuse_conn *fc = get_fuse_conn(inode);
1443
1444         if (fc->writeback_cache) {
1445                 /* Update size (EOF optimization) and mode (SUID clearing) */
1446                 err = fuse_update_attributes(mapping->host, file,
1447                                              STATX_SIZE | STATX_MODE);
1448                 if (err)
1449                         return err;
1450
1451                 if (fc->handle_killpriv_v2 &&
1452                     setattr_should_drop_suidgid(idmap,
1453                                                 file_inode(file))) {
1454                         goto writethrough;
1455                 }
1456
1457                 return generic_file_write_iter(iocb, from);
1458         }
1459
1460 writethrough:
1461         inode_lock(inode);
1462
1463         err = count = generic_write_checks(iocb, from);
1464         if (err <= 0)
1465                 goto out;
1466
1467         task_io_account_write(count);
1468
1469         err = kiocb_modified(iocb);
1470         if (err)
1471                 goto out;
1472
1473         if (iocb->ki_flags & IOCB_DIRECT) {
1474                 written = generic_file_direct_write(iocb, from);
1475                 if (written < 0 || !iov_iter_count(from))
1476                         goto out;
1477                 written = direct_write_fallback(iocb, from, written,
1478                                 fuse_perform_write(iocb, from));
1479         } else {
1480                 written = fuse_perform_write(iocb, from);
1481         }
1482 out:
1483         inode_unlock(inode);
1484         if (written > 0)
1485                 written = generic_write_sync(iocb, written);
1486
1487         return written ? written : err;
1488 }
1489
1490 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1491 {
1492         return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
1493 }
1494
1495 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1496                                         size_t max_size)
1497 {
1498         return min(iov_iter_single_seg_count(ii), max_size);
1499 }
1500
1501 static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
1502                                size_t *nbytesp, int write,
1503                                unsigned int max_pages,
1504                                bool use_pages_for_kvec_io)
1505 {
1506         bool flush_or_invalidate = false;
1507         unsigned int nr_pages = 0;
1508         size_t nbytes = 0;  /* # bytes already packed in req */
1509         ssize_t ret = 0;
1510
1511         /* Special case for kernel I/O: can copy directly into the buffer.
1512          * However if the implementation of fuse_conn requires pages instead of
1513          * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
1514          */
1515         if (iov_iter_is_kvec(ii)) {
1516                 void *user_addr = (void *)fuse_get_user_addr(ii);
1517
1518                 if (!use_pages_for_kvec_io) {
1519                         size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1520
1521                         if (write)
1522                                 ap->args.in_args[1].value = user_addr;
1523                         else
1524                                 ap->args.out_args[0].value = user_addr;
1525
1526                         iov_iter_advance(ii, frag_size);
1527                         *nbytesp = frag_size;
1528                         return 0;
1529                 }
1530
1531                 if (is_vmalloc_addr(user_addr)) {
1532                         ap->args.vmap_base = user_addr;
1533                         flush_or_invalidate = true;
1534                 }
1535         }
1536
1537         /*
1538          * Until there is support for iov_iter_extract_folios(), we have to
1539          * manually extract pages using iov_iter_extract_pages() and then
1540          * copy that to a folios array.
1541          */
1542         struct page **pages = kzalloc(max_pages * sizeof(struct page *),
1543                                       GFP_KERNEL);
1544         if (!pages) {
1545                 ret = -ENOMEM;
1546                 goto out;
1547         }
1548
1549         while (nbytes < *nbytesp && nr_pages < max_pages) {
1550                 unsigned nfolios, i;
1551                 size_t start;
1552
1553                 ret = iov_iter_extract_pages(ii, &pages,
1554                                              *nbytesp - nbytes,
1555                                              max_pages - nr_pages,
1556                                              0, &start);
1557                 if (ret < 0)
1558                         break;
1559
1560                 nbytes += ret;
1561
1562                 nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE);
1563
1564                 for (i = 0; i < nfolios; i++) {
1565                         struct folio *folio = page_folio(pages[i]);
1566                         unsigned int offset = start +
1567                                 (folio_page_idx(folio, pages[i]) << PAGE_SHIFT);
1568                         unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start);
1569
1570                         ap->descs[ap->num_folios].offset = offset;
1571                         ap->descs[ap->num_folios].length = len;
1572                         ap->folios[ap->num_folios] = folio;
1573                         start = 0;
1574                         ret -= len;
1575                         ap->num_folios++;
1576                 }
1577
1578                 nr_pages += nfolios;
1579         }
1580         kfree(pages);
1581
1582         if (write && flush_or_invalidate)
1583                 flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
1584
1585         ap->args.invalidate_vmap = !write && flush_or_invalidate;
1586         ap->args.is_pinned = iov_iter_extract_will_pin(ii);
1587         ap->args.user_pages = true;
1588         if (write)
1589                 ap->args.in_pages = true;
1590         else
1591                 ap->args.out_pages = true;
1592
1593 out:
1594         *nbytesp = nbytes;
1595
1596         return ret < 0 ? ret : 0;
1597 }
1598
     /*
      * Transfer the data described by @iter directly between the caller's
      * buffer and the server, one request at a time.
      *
      * @io:    I/O state; io->iocb supplies the file and io->async selects
      *         asynchronous submission
      * @iter:  source (write) or destination (read) buffer
      * @ppos:  file position; updated only if at least one byte was transferred
      * @flags: FUSE_DIO_WRITE selects a write; FUSE_DIO_CUSE skips the
      *         writeback check on the range
      *
      * Each request carries at most fc->max_write (write) or fc->max_read (read)
      * bytes.  The loop ends on the first error or short transfer, and the part
      * of @iter that was not transferred is handed back with iov_iter_revert().
      *
      * Returns the number of bytes transferred if positive, otherwise the error
      * (or 0).
      */
1599 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1600                        loff_t *ppos, int flags)
1601 {
1602         int write = flags & FUSE_DIO_WRITE;
1603         int cuse = flags & FUSE_DIO_CUSE;
1604         struct file *file = io->iocb->ki_filp;
1605         struct address_space *mapping = file->f_mapping;
1606         struct inode *inode = mapping->host;
1607         struct fuse_file *ff = file->private_data;
1608         struct fuse_conn *fc = ff->fm->fc;
1609         size_t nmax = write ? fc->max_write : fc->max_read;
1610         loff_t pos = *ppos;
1611         size_t count = iov_iter_count(iter);
1612         pgoff_t idx_from = pos >> PAGE_SHIFT;
1613         pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
1614         ssize_t res = 0;
1615         int err = 0;
1616         struct fuse_io_args *ia;
1617         unsigned int max_pages;
1618         bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;
1619
1620         max_pages = iov_iter_npages(iter, fc->max_pages);
1621         ia = fuse_io_alloc(io, max_pages);
1622         if (!ia)
1623                 return -ENOMEM;
1624
             /* FOPEN_DIRECT_IO with mmap allowed: write the range back first */
1625         if (fopen_direct_io && fc->direct_io_allow_mmap) {
1626                 res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
1627                 if (res) {
1628                         fuse_io_free(ia);
1629                         return res;
1630                 }
1631         }
             /*
              * Writeback in flight on the range: sync it.  Only the read path
              * takes the inode lock here.
              * NOTE(review): presumably writers already hold it - confirm.
              */
1632         if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1633                 if (!write)
1634                         inode_lock(inode);
1635                 fuse_sync_writes(inode);
1636                 if (!write)
1637                         inode_unlock(inode);
1638         }
1639
             /* Direct write under FOPEN_DIRECT_IO: invalidate cached pages in the range */
1640         if (fopen_direct_io && write) {
1641                 res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
1642                 if (res) {
1643                         fuse_io_free(ia);
1644                         return res;
1645                 }
1646         }
1647
1648         io->should_dirty = !write && user_backed_iter(iter);
1649         while (count) {
1650                 ssize_t nres;
1651                 fl_owner_t owner = current->files;
1652                 size_t nbytes = min(count, nmax);
1653
1654                 err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
1655                                           max_pages, fc->use_pages_for_kvec_io);
                     /* Give up only if nothing was packed; a partial pack is still sent */
1656                 if (err && !nbytes)
1657                         break;
1658
1659                 if (write) {
1660                         if (!capable(CAP_FSETID))
1661                                 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
1662
1663                         nres = fuse_send_write(ia, pos, nbytes, owner);
1664                 } else {
1665                         nres = fuse_send_read(ia, pos, nbytes, owner);
1666                 }
1667
                     /*
                      * Synchronous or failed requests are released here.
                      * NOTE(review): a successfully submitted async request is
                      * presumably released by its completion path - confirm.
                      */
1668                 if (!io->async || nres < 0) {
1669                         fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
1670                         fuse_io_free(ia);
1671                 }
                     /* Either way this ia must not be touched or freed again below */
1672                 ia = NULL;
1673                 if (nres < 0) {
1674                         iov_iter_revert(iter, nbytes);
1675                         err = nres;
1676                         break;
1677                 }
1678                 WARN_ON(nres > nbytes);
1679
1680                 count -= nres;
1681                 res += nres;
1682                 pos += nres;
                     /* Short transfer: hand back the unused part of the iterator and stop */
1683                 if (nres != nbytes) {
1684                         iov_iter_revert(iter, nbytes - nres);
1685                         break;
1686                 }
                     /* More to do: allocate a fresh request for the next chunk */
1687                 if (count) {
1688                         max_pages = iov_iter_npages(iter, fc->max_pages);
1689                         ia = fuse_io_alloc(io, max_pages);
1690                         if (!ia)
1691                                 break;
1692                 }
1693         }
             /* Free a request that was allocated but never sent */
1694         if (ia)
1695                 fuse_io_free(ia);
1696         if (res > 0)
1697                 *ppos = pos;
1698
1699         return res > 0 ? res : err;
1700 }
1701 EXPORT_SYMBOL_GPL(fuse_direct_io);
1702
1703 static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1704                                   struct iov_iter *iter,
1705                                   loff_t *ppos)
1706 {
1707         ssize_t res;
1708         struct inode *inode = file_inode(io->iocb->ki_filp);
1709
1710         res = fuse_direct_io(io, iter, ppos, 0);
1711
1712         fuse_invalidate_atime(inode);
1713
1714         return res;
1715 }
1716
1717 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
1718
1719 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1720 {
1721         ssize_t res;
1722
1723         if (!is_sync_kiocb(iocb)) {
1724                 res = fuse_direct_IO(iocb, to);
1725         } else {
1726                 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1727
1728                 res = __fuse_direct_read(&io, to, &iocb->ki_pos);
1729         }
1730
1731         return res;
1732 }
1733
1734 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1735 {
1736         struct inode *inode = file_inode(iocb->ki_filp);
1737         ssize_t res;
1738         bool exclusive;
1739
1740         fuse_dio_lock(iocb, from, &exclusive);
1741         res = generic_write_checks(iocb, from);
1742         if (res > 0) {
1743                 task_io_account_write(res);
1744                 if (!is_sync_kiocb(iocb)) {
1745                         res = fuse_direct_IO(iocb, from);
1746                 } else {
1747                         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1748
1749                         res = fuse_direct_io(&io, from, &iocb->ki_pos,
1750                                              FUSE_DIO_WRITE);
1751                         fuse_write_update_attr(inode, iocb->ki_pos, res);
1752                 }
1753         }
1754         fuse_dio_unlock(iocb, exclusive);
1755
1756         return res;
1757 }
1758
1759 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1760 {
1761         struct file *file = iocb->ki_filp;
1762         struct fuse_file *ff = file->private_data;
1763         struct inode *inode = file_inode(file);
1764
1765         if (fuse_is_bad(inode))
1766                 return -EIO;
1767
1768         if (FUSE_IS_DAX(inode))
1769                 return fuse_dax_read_iter(iocb, to);
1770
1771         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1772         if (ff->open_flags & FOPEN_DIRECT_IO)
1773                 return fuse_direct_read_iter(iocb, to);
1774         else if (fuse_file_passthrough(ff))
1775                 return fuse_passthrough_read_iter(iocb, to);
1776         else
1777                 return fuse_cache_read_iter(iocb, to);
1778 }
1779
1780 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1781 {
1782         struct file *file = iocb->ki_filp;
1783         struct fuse_file *ff = file->private_data;
1784         struct inode *inode = file_inode(file);
1785
1786         if (fuse_is_bad(inode))
1787                 return -EIO;
1788
1789         if (FUSE_IS_DAX(inode))
1790                 return fuse_dax_write_iter(iocb, from);
1791
1792         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1793         if (ff->open_flags & FOPEN_DIRECT_IO)
1794                 return fuse_direct_write_iter(iocb, from);
1795         else if (fuse_file_passthrough(ff))
1796                 return fuse_passthrough_write_iter(iocb, from);
1797         else
1798                 return fuse_cache_write_iter(iocb, from);
1799 }
1800
1801 static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
1802                                 struct pipe_inode_info *pipe, size_t len,
1803                                 unsigned int flags)
1804 {
1805         struct fuse_file *ff = in->private_data;
1806
1807         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1808         if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1809                 return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
1810         else
1811                 return filemap_splice_read(in, ppos, pipe, len, flags);
1812 }
1813
1814 static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
1815                                  loff_t *ppos, size_t len, unsigned int flags)
1816 {
1817         struct fuse_file *ff = out->private_data;
1818
1819         /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1820         if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1821                 return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
1822         else
1823                 return iter_file_splice_write(pipe, out, ppos, len, flags);
1824 }
1825
1826 static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1827 {
1828         struct fuse_args_pages *ap = &wpa->ia.ap;
1829         int i;
1830
1831         if (wpa->bucket)
1832                 fuse_sync_bucket_dec(wpa->bucket);
1833
1834         for (i = 0; i < ap->num_folios; i++)
1835                 folio_put(ap->folios[i]);
1836
1837         fuse_file_put(wpa->ia.ff, false);
1838
1839         kfree(ap->folios);
1840         kfree(wpa);
1841 }
1842
1843 static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio)
1844 {
1845         struct backing_dev_info *bdi = inode_to_bdi(inode);
1846
1847         dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1848         node_stat_sub_folio(folio, NR_WRITEBACK_TEMP);
1849         wb_writeout_inc(&bdi->wb);
1850 }
1851
1852 static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
1853 {
1854         struct fuse_args_pages *ap = &wpa->ia.ap;
1855         struct inode *inode = wpa->inode;
1856         struct fuse_inode *fi = get_fuse_inode(inode);
1857         int i;
1858
1859         for (i = 0; i < ap->num_folios; i++)
1860                 fuse_writepage_finish_stat(inode, ap->folios[i]);
1861
1862         wake_up(&fi->page_waitq);
1863 }
1864
/*
 * Submit one writepage request as a background request, cropping its
 * length against @size.
 *
 * @fm:   mount the request is submitted on
 * @wpa:  the request to send
 * @size: byte offset the write is cropped to
 *
 * Called under fi->lock, may release and reacquire it.
 *
 * fi->writectr is incremented for the request.  If the request lies
 * entirely at or beyond @size, or if submission fails, the increment is
 * undone, @wpa is erased from fi->writepages, and @wpa together with all
 * requests chained on wpa->next is finished and freed.
 */
static void fuse_send_writepage(struct fuse_mount *fm,
                                struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
        struct fuse_writepage_args *aux, *next;
        struct fuse_inode *fi = get_fuse_inode(wpa->inode);
        struct fuse_write_in *inarg = &wpa->ia.write.in;
        struct fuse_args *args = &wpa->ia.ap.args;
        /* Currently, all folios in FUSE are one page */
        __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE;
        int err;

        fi->writectr++;
        if (inarg->offset + data_size <= size) {
                /* Entirely below the crop point: send everything */
                inarg->size = data_size;
        } else if (inarg->offset < size) {
                /* Straddles the crop point: send only the part below it */
                inarg->size = size - inarg->offset;
        } else {
                /* Got truncated off completely */
                goto out_free;
        }

        args->in_args[1].size = inarg->size;
        args->force = true;
        args->nocreds = true;

        /* First attempt is made with fi->lock held, hence GFP_ATOMIC */
        err = fuse_simple_background(fm, args, GFP_ATOMIC);
        if (err == -ENOMEM) {
                /* Retry with the lock dropped and an allocation that cannot fail */
                spin_unlock(&fi->lock);
                err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
                spin_lock(&fi->lock);
        }

        /* Fails on broken connection only */
        if (unlikely(err))
                goto out_free;

        return;

 out_free:
        fi->writectr--;
        rb_erase(&wpa->writepages_entry, &fi->writepages);
        fuse_writepage_finish(wpa);
        spin_unlock(&fi->lock);

        /* After rb_erase() aux request list is private */
        for (aux = wpa->next; aux; aux = next) {
                next = aux->next;
                aux->next = NULL;
                fuse_writepage_finish_stat(aux->inode,
                                           aux->ia.ap.folios[0]);
                fuse_writepage_free(aux);
        }

        fuse_writepage_free(wpa);
        /* Return with fi->lock held again, as the caller expects */
        spin_lock(&fi->lock);
}
1924
1925 /*
1926  * If fi->writectr is positive (no truncate or fsync going on) send
1927  * all queued writepage requests.
1928  *
1929  * Called with fi->lock
1930  */
1931 void fuse_flush_writepages(struct inode *inode)
1932 __releases(fi->lock)
1933 __acquires(fi->lock)
1934 {
1935         struct fuse_mount *fm = get_fuse_mount(inode);
1936         struct fuse_inode *fi = get_fuse_inode(inode);
1937         loff_t crop = i_size_read(inode);
1938         struct fuse_writepage_args *wpa;
1939
1940         while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1941                 wpa = list_entry(fi->queued_writes.next,
1942                                  struct fuse_writepage_args, queue_entry);
1943                 list_del_init(&wpa->queue_entry);
1944                 fuse_send_writepage(fm, wpa, crop);
1945         }
1946 }
1947
1948 static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
1949                                                 struct fuse_writepage_args *wpa)
1950 {
1951         pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
1952         pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1;
1953         struct rb_node **p = &root->rb_node;
1954         struct rb_node  *parent = NULL;
1955
1956         WARN_ON(!wpa->ia.ap.num_folios);
1957         while (*p) {
1958                 struct fuse_writepage_args *curr;
1959                 pgoff_t curr_index;
1960
1961                 parent = *p;
1962                 curr = rb_entry(parent, struct fuse_writepage_args,
1963                                 writepages_entry);
1964                 WARN_ON(curr->inode != wpa->inode);
1965                 curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
1966
1967                 if (idx_from >= curr_index + curr->ia.ap.num_folios)
1968                         p = &(*p)->rb_right;
1969                 else if (idx_to < curr_index)
1970                         p = &(*p)->rb_left;
1971                 else
1972                         return curr;
1973         }
1974
1975         rb_link_node(&wpa->writepages_entry, parent, p);
1976         rb_insert_color(&wpa->writepages_entry, root);
1977         return NULL;
1978 }
1979
/*
 * Insert @wpa into @root for callers that expect no overlapping request
 * to exist; an overlap triggers a warning.
 */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
        struct fuse_writepage_args *overlap = fuse_insert_writeback(root, wpa);

        WARN_ON(overlap);
}
1984
/*
 * Completion callback of a writepage request (installed as ap->args.end
 * by fuse_writepage_args_setup()).
 *
 * @fm:    mount the request was sent on
 * @args:  the fuse_args embedded in the completed fuse_writepage_args
 * @error: completion status, recorded on the inode's mapping
 *
 * Removes the request from fi->writepages, sends every request chained on
 * wpa->next in turn, then finishes and frees the completed request.
 */
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
                               int error)
{
        struct fuse_writepage_args *wpa =
                container_of(args, typeof(*wpa), ia.ap.args);
        struct inode *inode = wpa->inode;
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);

        mapping_set_error(inode->i_mapping, error);
        /*
         * A writeback finished and this might have updated mtime/ctime on
         * server making local mtime/ctime stale.  Hence invalidate attrs.
         * Do this only if writeback_cache is not enabled.  If writeback_cache
         * is enabled, we trust local ctime/mtime.
         */
        if (!fc->writeback_cache)
                fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
        spin_lock(&fi->lock);
        rb_erase(&wpa->writepages_entry, &fi->writepages);
        while (wpa->next) {
                /* NOTE(review): this local shadows the fm parameter */
                struct fuse_mount *fm = get_fuse_mount(inode);
                struct fuse_write_in *inarg = &wpa->ia.write.in;
                struct fuse_writepage_args *next = wpa->next;

                /* Unchain the secondary request and make it a tree member */
                wpa->next = next->next;
                next->next = NULL;
                tree_insert(&fi->writepages, next);

                /*
                 * Skip fuse_flush_writepages() to make it easy to crop requests
                 * based on primary request size.
                 *
                 * 1st case (trivial): there are no concurrent activities using
                 * fuse_set/release_nowrite.  Then we're on safe side because
                 * fuse_flush_writepages() would call fuse_send_writepage()
                 * anyway.
                 *
                 * 2nd case: someone called fuse_set_nowrite and it is waiting
                 * now for completion of all in-flight requests.  This happens
                 * rarely and no more than once per page, so this should be
                 * okay.
                 *
                 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
                 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
                 * that fuse_set_nowrite returned implies that all in-flight
                 * requests were completed along with all of their secondary
                 * requests.  Further primary requests are blocked by negative
                 * writectr.  Hence there cannot be any in-flight requests and
                 * no invocations of fuse_writepage_end() while we're in
                 * fuse_set_nowrite..fuse_release_nowrite section.
                 */
                fuse_send_writepage(fm, next, inarg->offset + inarg->size);
        }
        /* Balances the increment done in fuse_send_writepage() */
        fi->writectr--;
        fuse_writepage_finish(wpa);
        spin_unlock(&fi->lock);
        fuse_writepage_free(wpa);
}
2044
2045 static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
2046 {
2047         struct fuse_file *ff;
2048
2049         spin_lock(&fi->lock);
2050         ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
2051                                       write_entry);
2052         if (ff)
2053                 fuse_file_get(ff);
2054         spin_unlock(&fi->lock);
2055
2056         return ff;
2057 }
2058
2059 static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
2060 {
2061         struct fuse_file *ff = __fuse_write_file_get(fi);
2062         WARN_ON(!ff);
2063         return ff;
2064 }
2065
2066 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
2067 {
2068         struct fuse_inode *fi = get_fuse_inode(inode);
2069         struct fuse_file *ff;
2070         int err;
2071
2072         /*
2073          * Inode is always written before the last reference is dropped and
2074          * hence this should not be reached from reclaim.
2075          *
2076          * Writing back the inode from reclaim can deadlock if the request
2077          * processing itself needs an allocation.  Allocations triggering
2078          * reclaim while serving a request can't be prevented, because it can
2079          * involve any number of unrelated userspace processes.
2080          */
2081         WARN_ON(wbc->for_reclaim);
2082
2083         ff = __fuse_write_file_get(fi);
2084         err = fuse_flush_times(inode, ff);
2085         if (ff)
2086                 fuse_file_put(ff, false);
2087
2088         return err;
2089 }
2090
2091 static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
2092 {
2093         struct fuse_writepage_args *wpa;
2094         struct fuse_args_pages *ap;
2095
2096         wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
2097         if (wpa) {
2098                 ap = &wpa->ia.ap;
2099                 ap->num_folios = 0;
2100                 ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs);
2101                 if (!ap->folios) {
2102                         kfree(wpa);
2103                         wpa = NULL;
2104                 }
2105         }
2106         return wpa;
2107
2108 }
2109
2110 static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
2111                                          struct fuse_writepage_args *wpa)
2112 {
2113         if (!fc->sync_fs)
2114                 return;
2115
2116         rcu_read_lock();
2117         /* Prevent resurrection of dead bucket in unlikely race with syncfs */
2118         do {
2119                 wpa->bucket = rcu_dereference(fc->curr_bucket);
2120         } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
2121         rcu_read_unlock();
2122 }
2123
2124 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
2125                                           struct folio *tmp_folio, uint32_t folio_index)
2126 {
2127         struct inode *inode = folio->mapping->host;
2128         struct fuse_args_pages *ap = &wpa->ia.ap;
2129
2130         folio_copy(tmp_folio, folio);
2131
2132         ap->folios[folio_index] = tmp_folio;
2133         ap->descs[folio_index].offset = 0;
2134         ap->descs[folio_index].length = PAGE_SIZE;
2135
2136         inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2137         node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
2138 }
2139
2140 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
2141                                                              struct fuse_file *ff)
2142 {
2143         struct inode *inode = folio->mapping->host;
2144         struct fuse_conn *fc = get_fuse_conn(inode);
2145         struct fuse_writepage_args *wpa;
2146         struct fuse_args_pages *ap;
2147
2148         wpa = fuse_writepage_args_alloc();
2149         if (!wpa)
2150                 return NULL;
2151
2152         fuse_writepage_add_to_bucket(fc, wpa);
2153         fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
2154         wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2155         wpa->inode = inode;
2156         wpa->ia.ff = ff;
2157
2158         ap = &wpa->ia.ap;
2159         ap->args.in_pages = true;
2160         ap->args.end = fuse_writepage_end;
2161
2162         return wpa;
2163 }
2164
2165 static int fuse_writepage_locked(struct folio *folio)
2166 {
2167         struct address_space *mapping = folio->mapping;
2168         struct inode *inode = mapping->host;
2169         struct fuse_inode *fi = get_fuse_inode(inode);
2170         struct fuse_writepage_args *wpa;
2171         struct fuse_args_pages *ap;
2172         struct folio *tmp_folio;
2173         struct fuse_file *ff;
2174         int error = -ENOMEM;
2175
2176         tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
2177         if (!tmp_folio)
2178                 goto err;
2179
2180         error = -EIO;
2181         ff = fuse_write_file_get(fi);
2182         if (!ff)
2183                 goto err_nofile;
2184
2185         wpa = fuse_writepage_args_setup(folio, ff);
2186         error = -ENOMEM;
2187         if (!wpa)
2188                 goto err_writepage_args;
2189
2190         ap = &wpa->ia.ap;
2191         ap->num_folios = 1;
2192
2193         folio_start_writeback(folio);
2194         fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
2195
2196         spin_lock(&fi->lock);
2197         tree_insert(&fi->writepages, wpa);
2198         list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2199         fuse_flush_writepages(inode);
2200         spin_unlock(&fi->lock);
2201
2202         folio_end_writeback(folio);
2203
2204         return 0;
2205
2206 err_writepage_args:
2207         fuse_file_put(ff, false);
2208 err_nofile:
2209         folio_put(tmp_folio);
2210 err:
2211         mapping_set_error(folio->mapping, error);
2212         return error;
2213 }
2214
/*
 * State carried across fuse_writepages_fill() invocations during one
 * fuse_writepages() run.
 */
struct fuse_fill_wb_data {
        /* Request currently being assembled, or NULL if none */
        struct fuse_writepage_args *wpa;
        /* File used for the writes; obtained on the first filled folio */
        struct fuse_file *ff;
        /* Inode being written back */
        struct inode *inode;
        /* Page-cache folios added to wpa, in the same slots as its temp folios */
        struct folio **orig_folios;
        /* Current capacity of wpa's folios array */
        unsigned int max_folios;
};
2222
2223 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
2224 {
2225         struct fuse_args_pages *ap = &data->wpa->ia.ap;
2226         struct fuse_conn *fc = get_fuse_conn(data->inode);
2227         struct folio **folios;
2228         struct fuse_folio_desc *descs;
2229         unsigned int nfolios = min_t(unsigned int,
2230                                      max_t(unsigned int, data->max_folios * 2,
2231                                            FUSE_DEFAULT_MAX_PAGES_PER_REQ),
2232                                     fc->max_pages);
2233         WARN_ON(nfolios <= data->max_folios);
2234
2235         folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
2236         if (!folios)
2237                 return false;
2238
2239         memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios);
2240         memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios);
2241         kfree(ap->folios);
2242         ap->folios = folios;
2243         ap->descs = descs;
2244         data->max_folios = nfolios;
2245
2246         return true;
2247 }
2248
2249 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
2250 {
2251         struct fuse_writepage_args *wpa = data->wpa;
2252         struct inode *inode = data->inode;
2253         struct fuse_inode *fi = get_fuse_inode(inode);
2254         int num_folios = wpa->ia.ap.num_folios;
2255         int i;
2256
2257         spin_lock(&fi->lock);
2258         list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2259         fuse_flush_writepages(inode);
2260         spin_unlock(&fi->lock);
2261
2262         for (i = 0; i < num_folios; i++)
2263                 folio_end_writeback(data->orig_folios[i]);
2264 }
2265
2266 /*
2267  * Check under fi->lock if the page is under writeback, and insert it onto the
2268  * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
2269  * one already added for a page at this offset.  If there's none, then insert
2270  * this new request onto the auxiliary list, otherwise reuse the existing one by
2271  * swapping the new temp page with the old one.
2272  */
2273 static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
2274                                struct folio *folio)
2275 {
2276         struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
2277         struct fuse_writepage_args *tmp;
2278         struct fuse_writepage_args *old_wpa;
2279         struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2280
2281         WARN_ON(new_ap->num_folios != 0);
2282         new_ap->num_folios = 1;
2283
2284         spin_lock(&fi->lock);
2285         old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2286         if (!old_wpa) {
2287                 spin_unlock(&fi->lock);
2288                 return true;
2289         }
2290
2291         for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2292                 pgoff_t curr_index;
2293
2294                 WARN_ON(tmp->inode != new_wpa->inode);
2295                 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2296                 if (curr_index == folio->index) {
2297                         WARN_ON(tmp->ia.ap.num_folios != 1);
2298                         swap(tmp->ia.ap.folios[0], new_ap->folios[0]);
2299                         break;
2300                 }
2301         }
2302
2303         if (!tmp) {
2304                 new_wpa->next = old_wpa->next;
2305                 old_wpa->next = new_wpa;
2306         }
2307
2308         spin_unlock(&fi->lock);
2309
2310         if (tmp) {
2311                 fuse_writepage_finish_stat(new_wpa->inode,
2312                                            folio);
2313                 fuse_writepage_free(new_wpa);
2314         }
2315
2316         return false;
2317 }
2318
2319 static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
2320                                      struct fuse_args_pages *ap,
2321                                      struct fuse_fill_wb_data *data)
2322 {
2323         WARN_ON(!ap->num_folios);
2324
2325         /*
2326          * Being under writeback is unlikely but possible.  For example direct
2327          * read to an mmaped fuse file will set the page dirty twice; once when
2328          * the pages are faulted with get_user_pages(), and then after the read
2329          * completed.
2330          */
2331         if (fuse_folio_is_writeback(data->inode, folio))
2332                 return true;
2333
2334         /* Reached max pages */
2335         if (ap->num_folios == fc->max_pages)
2336                 return true;
2337
2338         /* Reached max write bytes */
2339         if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write)
2340                 return true;
2341
2342         /* Discontinuity */
2343         if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio))
2344                 return true;
2345
2346         /* Need to grow the pages array?  If so, did the expansion fail? */
2347         if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
2348                 return true;
2349
2350         return false;
2351 }
2352
/*
 * Per-folio callback handed to write_cache_pages() by fuse_writepages().
 *
 * @folio: dirty folio to write back
 * @wbc:   writeback control (not used here)
 * @_data: the struct fuse_fill_wb_data of this writeback run
 *
 * Copies @folio into a temp folio and adds it to the request under
 * construction, first sending that request off and starting a new one
 * when fuse_writepage_need_send() says so.  @folio is unlocked on every
 * path.
 *
 * Returns 0 on success, -EIO if no file is available for writing, or
 * -ENOMEM on allocation failure.
 */
static int fuse_writepages_fill(struct folio *folio,
                struct writeback_control *wbc, void *_data)
{
        struct fuse_fill_wb_data *data = _data;
        struct fuse_writepage_args *wpa = data->wpa;
        struct fuse_args_pages *ap = &wpa->ia.ap;
        struct inode *inode = data->inode;
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct folio *tmp_folio;
        int err;

        /* Look up the file to write through once per writeback run */
        if (!data->ff) {
                err = -EIO;
                data->ff = fuse_write_file_get(fi);
                if (!data->ff)
                        goto out_unlock;
        }

        if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
                fuse_writepages_send(data);
                data->wpa = NULL;
        }

        err = -ENOMEM;
        tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
        if (!tmp_folio)
                goto out_unlock;

        /*
         * The page must not be redirtied until the writeout is completed
         * (i.e. userspace has sent a reply to the write request).  Otherwise
         * there could be more than one temporary page instance for each real
         * page.
         *
         * This is ensured by holding the page lock in page_mkwrite() while
         * checking fuse_folio_is_writeback().  We already hold the page lock
         * since clear_page_dirty_for_io() and keep it held until we add the
         * request to the fi->writepages list and increment ap->num_folios.
         * After this fuse_folio_is_writeback() will indicate that the page is
         * under writeback, so we can release the page lock.
         */
        if (data->wpa == NULL) {
                err = -ENOMEM;
                wpa = fuse_writepage_args_setup(folio, data->ff);
                if (!wpa) {
                        folio_put(tmp_folio);
                        goto out_unlock;
                }
                /* The request holds its own reference on the file */
                fuse_file_get(wpa->ia.ff);
                data->max_folios = 1;
                ap = &wpa->ia.ap;
        }
        folio_start_writeback(folio);

        fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios);
        data->orig_folios[ap->num_folios] = folio;

        err = 0;
        if (data->wpa) {
                /*
                 * Protected by fi->lock against concurrent access by
                 * fuse_folio_is_writeback().
                 */
                spin_lock(&fi->lock);
                ap->num_folios++;
                spin_unlock(&fi->lock);
        } else if (fuse_writepage_add(wpa, folio)) {
                /* New request went into the tree; keep appending to it */
                data->wpa = wpa;
        } else {
                /* Overlapped an existing request; nothing further to track */
                folio_end_writeback(folio);
        }
out_unlock:
        folio_unlock(folio);

        return err;
}
2430
2431 static int fuse_writepages(struct address_space *mapping,
2432                            struct writeback_control *wbc)
2433 {
2434         struct inode *inode = mapping->host;
2435         struct fuse_conn *fc = get_fuse_conn(inode);
2436         struct fuse_fill_wb_data data;
2437         int err;
2438
2439         err = -EIO;
2440         if (fuse_is_bad(inode))
2441                 goto out;
2442
2443         if (wbc->sync_mode == WB_SYNC_NONE &&
2444             fc->num_background >= fc->congestion_threshold)
2445                 return 0;
2446
2447         data.inode = inode;
2448         data.wpa = NULL;
2449         data.ff = NULL;
2450
2451         err = -ENOMEM;
2452         data.orig_folios = kcalloc(fc->max_pages,
2453                                    sizeof(struct folio *),
2454                                    GFP_NOFS);
2455         if (!data.orig_folios)
2456                 goto out;
2457
2458         err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2459         if (data.wpa) {
2460                 WARN_ON(!data.wpa->ia.ap.num_folios);
2461                 fuse_writepages_send(&data);
2462         }
2463         if (data.ff)
2464                 fuse_file_put(data.ff, false);
2465
2466         kfree(data.orig_folios);
2467 out:
2468         return err;
2469 }
2470
2471 /*
2472  * It's worthy to make sure that space is reserved on disk for the write,
2473  * but how to implement it without killing performance need more thinking.
2474  */
2475 static int fuse_write_begin(struct file *file, struct address_space *mapping,
2476                 loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
2477 {
2478         pgoff_t index = pos >> PAGE_SHIFT;
2479         struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2480         struct folio *folio;
2481         loff_t fsize;
2482         int err = -ENOMEM;
2483
2484         WARN_ON(!fc->writeback_cache);
2485
2486         folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2487                         mapping_gfp_mask(mapping));
2488         if (IS_ERR(folio))
2489                 goto error;
2490
2491         fuse_wait_on_page_writeback(mapping->host, folio->index);
2492
2493         if (folio_test_uptodate(folio) || len >= folio_size(folio))
2494                 goto success;
2495         /*
2496          * Check if the start of this folio comes after the end of file,
2497          * in which case the readpage can be optimized away.
2498          */
2499         fsize = i_size_read(mapping->host);
2500         if (fsize <= folio_pos(folio)) {
2501                 size_t off = offset_in_folio(folio, pos);
2502                 if (off)
2503                         folio_zero_segment(folio, 0, off);
2504                 goto success;
2505         }
2506         err = fuse_do_readfolio(file, folio);
2507         if (err)
2508                 goto cleanup;
2509 success:
2510         *foliop = folio;
2511         return 0;
2512
2513 cleanup:
2514         folio_unlock(folio);
2515         folio_put(folio);
2516 error:
2517         return err;
2518 }
2519
2520 static int fuse_write_end(struct file *file, struct address_space *mapping,
2521                 loff_t pos, unsigned len, unsigned copied,
2522                 struct folio *folio, void *fsdata)
2523 {
2524         struct inode *inode = folio->mapping->host;
2525
2526         /* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2527         if (!copied)
2528                 goto unlock;
2529
2530         pos += copied;
2531         if (!folio_test_uptodate(folio)) {
2532                 /* Zero any unwritten bytes at the end of the page */
2533                 size_t endoff = pos & ~PAGE_MASK;
2534                 if (endoff)
2535                         folio_zero_segment(folio, endoff, PAGE_SIZE);
2536                 folio_mark_uptodate(folio);
2537         }
2538
2539         if (pos > inode->i_size)
2540                 i_size_write(inode, pos);
2541
2542         folio_mark_dirty(folio);
2543
2544 unlock:
2545         folio_unlock(folio);
2546         folio_put(folio);
2547
2548         return copied;
2549 }
2550
2551 static int fuse_launder_folio(struct folio *folio)
2552 {
2553         int err = 0;
2554         if (folio_clear_dirty_for_io(folio)) {
2555                 struct inode *inode = folio->mapping->host;
2556
2557                 /* Serialize with pending writeback for the same page */
2558                 fuse_wait_on_page_writeback(inode, folio->index);
2559                 err = fuse_writepage_locked(folio);
2560                 if (!err)
2561                         fuse_wait_on_page_writeback(inode, folio->index);
2562         }
2563         return err;
2564 }
2565
2566 /*
2567  * Write back dirty data/metadata now (there may not be any suitable
2568  * open files later for data)
2569  */
2570 static void fuse_vma_close(struct vm_area_struct *vma)
2571 {
2572         int err;
2573
2574         err = write_inode_now(vma->vm_file->f_mapping->host, 1);
2575         mapping_set_error(vma->vm_file->f_mapping, err);
2576 }
2577
2578 /*
2579  * Wait for writeback against this page to complete before allowing it
2580  * to be marked dirty again, and hence written back again, possibly
2581  * before the previous writepage completed.
2582  *
2583  * Block here, instead of in ->writepage(), so that the userspace fs
2584  * can only block processes actually operating on the filesystem.
2585  *
2586  * Otherwise unprivileged userspace fs would be able to block
2587  * unrelated:
2588  *
2589  * - page migration
2590  * - sync(2)
2591  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2592  */
2593 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2594 {
2595         struct folio *folio = page_folio(vmf->page);
2596         struct inode *inode = file_inode(vmf->vma->vm_file);
2597
2598         file_update_time(vmf->vma->vm_file);
2599         folio_lock(folio);
2600         if (folio->mapping != inode->i_mapping) {
2601                 folio_unlock(folio);
2602                 return VM_FAULT_NOPAGE;
2603         }
2604
2605         fuse_wait_on_folio_writeback(inode, folio);
2606         return VM_FAULT_LOCKED;
2607 }
2608
2609 static const struct vm_operations_struct fuse_file_vm_ops = {
2610         .close          = fuse_vma_close,
2611         .fault          = filemap_fault,
2612         .map_pages      = filemap_map_pages,
2613         .page_mkwrite   = fuse_page_mkwrite,
2614 };
2615
2616 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2617 {
2618         struct fuse_file *ff = file->private_data;
2619         struct fuse_conn *fc = ff->fm->fc;
2620         struct inode *inode = file_inode(file);
2621         int rc;
2622
2623         /* DAX mmap is superior to direct_io mmap */
2624         if (FUSE_IS_DAX(inode))
2625                 return fuse_dax_mmap(file, vma);
2626
2627         /*
2628          * If inode is in passthrough io mode, because it has some file open
2629          * in passthrough mode, either mmap to backing file or fail mmap,
2630          * because mixing cached mmap and passthrough io mode is not allowed.
2631          */
2632         if (fuse_file_passthrough(ff))
2633                 return fuse_passthrough_mmap(file, vma);
2634         else if (fuse_inode_backing(get_fuse_inode(inode)))
2635                 return -ENODEV;
2636
2637         /*
2638          * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
2639          * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
2640          */
2641         if (ff->open_flags & FOPEN_DIRECT_IO) {
2642                 /*
2643                  * Can't provide the coherency needed for MAP_SHARED
2644                  * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
2645                  */
2646                 if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
2647                         return -ENODEV;
2648
2649                 invalidate_inode_pages2(file->f_mapping);
2650
2651                 if (!(vma->vm_flags & VM_MAYSHARE)) {
2652                         /* MAP_PRIVATE */
2653                         return generic_file_mmap(file, vma);
2654                 }
2655
2656                 /*
2657                  * First mmap of direct_io file enters caching inode io mode.
2658                  * Also waits for parallel dio writers to go into serial mode
2659                  * (exclusive instead of shared lock).
2660                  * After first mmap, the inode stays in caching io mode until
2661                  * the direct_io file release.
2662                  */
2663                 rc = fuse_file_cached_io_open(inode, ff);
2664                 if (rc)
2665                         return rc;
2666         }
2667
2668         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2669                 fuse_link_write_file(file);
2670
2671         file_accessed(file);
2672         vma->vm_ops = &fuse_file_vm_ops;
2673         return 0;
2674 }
2675
2676 static int convert_fuse_file_lock(struct fuse_conn *fc,
2677                                   const struct fuse_file_lock *ffl,
2678                                   struct file_lock *fl)
2679 {
2680         switch (ffl->type) {
2681         case F_UNLCK:
2682                 break;
2683
2684         case F_RDLCK:
2685         case F_WRLCK:
2686                 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2687                     ffl->end < ffl->start)
2688                         return -EIO;
2689
2690                 fl->fl_start = ffl->start;
2691                 fl->fl_end = ffl->end;
2692
2693                 /*
2694                  * Convert pid into init's pid namespace.  The locks API will
2695                  * translate it into the caller's pid namespace.
2696                  */
2697                 rcu_read_lock();
2698                 fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2699                 rcu_read_unlock();
2700                 break;
2701
2702         default:
2703                 return -EIO;
2704         }
2705         fl->c.flc_type = ffl->type;
2706         return 0;
2707 }
2708
2709 static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2710                          const struct file_lock *fl, int opcode, pid_t pid,
2711                          int flock, struct fuse_lk_in *inarg)
2712 {
2713         struct inode *inode = file_inode(file);
2714         struct fuse_conn *fc = get_fuse_conn(inode);
2715         struct fuse_file *ff = file->private_data;
2716
2717         memset(inarg, 0, sizeof(*inarg));
2718         inarg->fh = ff->fh;
2719         inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
2720         inarg->lk.start = fl->fl_start;
2721         inarg->lk.end = fl->fl_end;
2722         inarg->lk.type = fl->c.flc_type;
2723         inarg->lk.pid = pid;
2724         if (flock)
2725                 inarg->lk_flags |= FUSE_LK_FLOCK;
2726         args->opcode = opcode;
2727         args->nodeid = get_node_id(inode);
2728         args->in_numargs = 1;
2729         args->in_args[0].size = sizeof(*inarg);
2730         args->in_args[0].value = inarg;
2731 }
2732
2733 static int fuse_getlk(struct file *file, struct file_lock *fl)
2734 {
2735         struct inode *inode = file_inode(file);
2736         struct fuse_mount *fm = get_fuse_mount(inode);
2737         FUSE_ARGS(args);
2738         struct fuse_lk_in inarg;
2739         struct fuse_lk_out outarg;
2740         int err;
2741
2742         fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2743         args.out_numargs = 1;
2744         args.out_args[0].size = sizeof(outarg);
2745         args.out_args[0].value = &outarg;
2746         err = fuse_simple_request(fm, &args);
2747         if (!err)
2748                 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2749
2750         return err;
2751 }
2752
2753 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2754 {
2755         struct inode *inode = file_inode(file);
2756         struct fuse_mount *fm = get_fuse_mount(inode);
2757         FUSE_ARGS(args);
2758         struct fuse_lk_in inarg;
2759         int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2760         struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
2761         pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2762         int err;
2763
2764         if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2765                 /* NLM needs asynchronous locks, which we don't support yet */
2766                 return -ENOLCK;
2767         }
2768
2769         fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2770         err = fuse_simple_request(fm, &args);
2771
2772         /* locking is restartable */
2773         if (err == -EINTR)
2774                 err = -ERESTARTSYS;
2775
2776         return err;
2777 }
2778
2779 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2780 {
2781         struct inode *inode = file_inode(file);
2782         struct fuse_conn *fc = get_fuse_conn(inode);
2783         int err;
2784
2785         if (cmd == F_CANCELLK) {
2786                 err = 0;
2787         } else if (cmd == F_GETLK) {
2788                 if (fc->no_lock) {
2789                         posix_test_lock(file, fl);
2790                         err = 0;
2791                 } else
2792                         err = fuse_getlk(file, fl);
2793         } else {
2794                 if (fc->no_lock)
2795                         err = posix_lock_file(file, fl, NULL);
2796                 else
2797                         err = fuse_setlk(file, fl, 0);
2798         }
2799         return err;
2800 }
2801
2802 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2803 {
2804         struct inode *inode = file_inode(file);
2805         struct fuse_conn *fc = get_fuse_conn(inode);
2806         int err;
2807
2808         if (fc->no_flock) {
2809                 err = locks_lock_file_wait(file, fl);
2810         } else {
2811                 struct fuse_file *ff = file->private_data;
2812
2813                 /* emulate flock with POSIX locks */
2814                 ff->flock = true;
2815                 err = fuse_setlk(file, fl, 1);
2816         }
2817
2818         return err;
2819 }
2820
2821 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2822 {
2823         struct inode *inode = mapping->host;
2824         struct fuse_mount *fm = get_fuse_mount(inode);
2825         FUSE_ARGS(args);
2826         struct fuse_bmap_in inarg;
2827         struct fuse_bmap_out outarg;
2828         int err;
2829
2830         if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
2831                 return 0;
2832
2833         memset(&inarg, 0, sizeof(inarg));
2834         inarg.block = block;
2835         inarg.blocksize = inode->i_sb->s_blocksize;
2836         args.opcode = FUSE_BMAP;
2837         args.nodeid = get_node_id(inode);
2838         args.in_numargs = 1;
2839         args.in_args[0].size = sizeof(inarg);
2840         args.in_args[0].value = &inarg;
2841         args.out_numargs = 1;
2842         args.out_args[0].size = sizeof(outarg);
2843         args.out_args[0].value = &outarg;
2844         err = fuse_simple_request(fm, &args);
2845         if (err == -ENOSYS)
2846                 fm->fc->no_bmap = 1;
2847
2848         return err ? 0 : outarg.block;
2849 }
2850
2851 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2852 {
2853         struct inode *inode = file->f_mapping->host;
2854         struct fuse_mount *fm = get_fuse_mount(inode);
2855         struct fuse_file *ff = file->private_data;
2856         FUSE_ARGS(args);
2857         struct fuse_lseek_in inarg = {
2858                 .fh = ff->fh,
2859                 .offset = offset,
2860                 .whence = whence
2861         };
2862         struct fuse_lseek_out outarg;
2863         int err;
2864
2865         if (fm->fc->no_lseek)
2866                 goto fallback;
2867
2868         args.opcode = FUSE_LSEEK;
2869         args.nodeid = ff->nodeid;
2870         args.in_numargs = 1;
2871         args.in_args[0].size = sizeof(inarg);
2872         args.in_args[0].value = &inarg;
2873         args.out_numargs = 1;
2874         args.out_args[0].size = sizeof(outarg);
2875         args.out_args[0].value = &outarg;
2876         err = fuse_simple_request(fm, &args);
2877         if (err) {
2878                 if (err == -ENOSYS) {
2879                         fm->fc->no_lseek = 1;
2880                         goto fallback;
2881                 }
2882                 return err;
2883         }
2884
2885         return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2886
2887 fallback:
2888         err = fuse_update_attributes(inode, file, STATX_SIZE);
2889         if (!err)
2890                 return generic_file_llseek(file, offset, whence);
2891         else
2892                 return err;
2893 }
2894
2895 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2896 {
2897         loff_t retval;
2898         struct inode *inode = file_inode(file);
2899
2900         switch (whence) {
2901         case SEEK_SET:
2902         case SEEK_CUR:
2903                  /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2904                 retval = generic_file_llseek(file, offset, whence);
2905                 break;
2906         case SEEK_END:
2907                 inode_lock(inode);
2908                 retval = fuse_update_attributes(inode, file, STATX_SIZE);
2909                 if (!retval)
2910                         retval = generic_file_llseek(file, offset, whence);
2911                 inode_unlock(inode);
2912                 break;
2913         case SEEK_HOLE:
2914         case SEEK_DATA:
2915                 inode_lock(inode);
2916                 retval = fuse_lseek(file, offset, whence);
2917                 inode_unlock(inode);
2918                 break;
2919         default:
2920                 retval = -EINVAL;
2921         }
2922
2923         return retval;
2924 }
2925
2926 /*
2927  * All files which have been polled are linked to RB tree
2928  * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
2929  * find the matching one.
2930  */
2931 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2932                                               struct rb_node **parent_out)
2933 {
2934         struct rb_node **link = &fc->polled_files.rb_node;
2935         struct rb_node *last = NULL;
2936
2937         while (*link) {
2938                 struct fuse_file *ff;
2939
2940                 last = *link;
2941                 ff = rb_entry(last, struct fuse_file, polled_node);
2942
2943                 if (kh < ff->kh)
2944                         link = &last->rb_left;
2945                 else if (kh > ff->kh)
2946                         link = &last->rb_right;
2947                 else
2948                         return link;
2949         }
2950
2951         if (parent_out)
2952                 *parent_out = last;
2953         return link;
2954 }
2955
2956 /*
2957  * The file is about to be polled.  Make sure it's on the polled_files
2958  * RB tree.  Note that files once added to the polled_files tree are
2959  * not removed before the file is released.  This is because a file
2960  * polled once is likely to be polled again.
2961  */
2962 static void fuse_register_polled_file(struct fuse_conn *fc,
2963                                       struct fuse_file *ff)
2964 {
2965         spin_lock(&fc->lock);
2966         if (RB_EMPTY_NODE(&ff->polled_node)) {
2967                 struct rb_node **link, *parent;
2968
2969                 link = fuse_find_polled_node(fc, ff->kh, &parent);
2970                 BUG_ON(*link);
2971                 rb_link_node(&ff->polled_node, parent, link);
2972                 rb_insert_color(&ff->polled_node, &fc->polled_files);
2973         }
2974         spin_unlock(&fc->lock);
2975 }
2976
2977 __poll_t fuse_file_poll(struct file *file, poll_table *wait)
2978 {
2979         struct fuse_file *ff = file->private_data;
2980         struct fuse_mount *fm = ff->fm;
2981         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2982         struct fuse_poll_out outarg;
2983         FUSE_ARGS(args);
2984         int err;
2985
2986         if (fm->fc->no_poll)
2987                 return DEFAULT_POLLMASK;
2988
2989         poll_wait(file, &ff->poll_wait, wait);
2990         inarg.events = mangle_poll(poll_requested_events(wait));
2991
2992         /*
2993          * Ask for notification iff there's someone waiting for it.
2994          * The client may ignore the flag and always notify.
2995          */
2996         if (waitqueue_active(&ff->poll_wait)) {
2997                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2998                 fuse_register_polled_file(fm->fc, ff);
2999         }
3000
3001         args.opcode = FUSE_POLL;
3002         args.nodeid = ff->nodeid;
3003         args.in_numargs = 1;
3004         args.in_args[0].size = sizeof(inarg);
3005         args.in_args[0].value = &inarg;
3006         args.out_numargs = 1;
3007         args.out_args[0].size = sizeof(outarg);
3008         args.out_args[0].value = &outarg;
3009         err = fuse_simple_request(fm, &args);
3010
3011         if (!err)
3012                 return demangle_poll(outarg.revents);
3013         if (err == -ENOSYS) {
3014                 fm->fc->no_poll = 1;
3015                 return DEFAULT_POLLMASK;
3016         }
3017         return EPOLLERR;
3018 }
3019 EXPORT_SYMBOL_GPL(fuse_file_poll);
3020
3021 /*
3022  * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
3023  * wakes up the poll waiters.
3024  */
3025 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
3026                             struct fuse_notify_poll_wakeup_out *outarg)
3027 {
3028         u64 kh = outarg->kh;
3029         struct rb_node **link;
3030
3031         spin_lock(&fc->lock);
3032
3033         link = fuse_find_polled_node(fc, kh, NULL);
3034         if (*link) {
3035                 struct fuse_file *ff;
3036
3037                 ff = rb_entry(*link, struct fuse_file, polled_node);
3038                 wake_up_interruptible_sync(&ff->poll_wait);
3039         }
3040
3041         spin_unlock(&fc->lock);
3042         return 0;
3043 }
3044
3045 static void fuse_do_truncate(struct file *file)
3046 {
3047         struct inode *inode = file->f_mapping->host;
3048         struct iattr attr;
3049
3050         attr.ia_valid = ATTR_SIZE;
3051         attr.ia_size = i_size_read(inode);
3052
3053         attr.ia_file = file;
3054         attr.ia_valid |= ATTR_FILE;
3055
3056         fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file);
3057 }
3058
3059 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
3060 {
3061         return round_up(off, fc->max_pages << PAGE_SHIFT);
3062 }
3063
3064 static ssize_t
3065 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3066 {
3067         DECLARE_COMPLETION_ONSTACK(wait);
3068         ssize_t ret = 0;
3069         struct file *file = iocb->ki_filp;
3070         struct fuse_file *ff = file->private_data;
3071         loff_t pos = 0;
3072         struct inode *inode;
3073         loff_t i_size;
3074         size_t count = iov_iter_count(iter), shortened = 0;
3075         loff_t offset = iocb->ki_pos;
3076         struct fuse_io_priv *io;
3077
3078         pos = offset;
3079         inode = file->f_mapping->host;
3080         i_size = i_size_read(inode);
3081
3082         if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
3083                 return 0;
3084
3085         io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
3086         if (!io)
3087                 return -ENOMEM;
3088         spin_lock_init(&io->lock);
3089         kref_init(&io->refcnt);
3090         io->reqs = 1;
3091         io->bytes = -1;
3092         io->size = 0;
3093         io->offset = offset;
3094         io->write = (iov_iter_rw(iter) == WRITE);
3095         io->err = 0;
3096         /*
3097          * By default, we want to optimize all I/Os with async request
3098          * submission to the client filesystem if supported.
3099          */
3100         io->async = ff->fm->fc->async_dio;
3101         io->iocb = iocb;
3102         io->blocking = is_sync_kiocb(iocb);
3103
3104         /* optimization for short read */
3105         if (io->async && !io->write && offset + count > i_size) {
3106                 iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
3107                 shortened = count - iov_iter_count(iter);
3108                 count -= shortened;
3109         }
3110
3111         /*
3112          * We cannot asynchronously extend the size of a file.
3113          * In such case the aio will behave exactly like sync io.
3114          */
3115         if ((offset + count > i_size) && io->write)
3116                 io->blocking = true;
3117
3118         if (io->async && io->blocking) {
3119                 /*
3120                  * Additional reference to keep io around after
3121                  * calling fuse_aio_complete()
3122                  */
3123                 kref_get(&io->refcnt);
3124                 io->done = &wait;
3125         }
3126
3127         if (iov_iter_rw(iter) == WRITE) {
3128                 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
3129                 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3130         } else {
3131                 ret = __fuse_direct_read(io, iter, &pos);
3132         }
3133         iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
3134
3135         if (io->async) {
3136                 bool blocking = io->blocking;
3137
3138                 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
3139
3140                 /* we have a non-extending, async request, so return */
3141                 if (!blocking)
3142                         return -EIOCBQUEUED;
3143
3144                 wait_for_completion(&wait);
3145                 ret = fuse_get_res_by_io(io);
3146         }
3147
3148         kref_put(&io->refcnt, fuse_io_release);
3149
3150         if (iov_iter_rw(iter) == WRITE) {
3151                 fuse_write_update_attr(inode, pos, ret);
3152                 /* For extending writes we already hold exclusive lock */
3153                 if (ret < 0 && offset + count > i_size)
3154                         fuse_do_truncate(file);
3155         }
3156
3157         return ret;
3158 }
3159
3160 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
3161 {
3162         int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
3163
3164         if (!err)
3165                 fuse_sync_writes(inode);
3166
3167         return err;
3168 }
3169
3170 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3171                                 loff_t length)
3172 {
3173         struct fuse_file *ff = file->private_data;
3174         struct inode *inode = file_inode(file);
3175         struct fuse_inode *fi = get_fuse_inode(inode);
3176         struct fuse_mount *fm = ff->fm;
3177         FUSE_ARGS(args);
3178         struct fuse_fallocate_in inarg = {
3179                 .fh = ff->fh,
3180                 .offset = offset,
3181                 .length = length,
3182                 .mode = mode
3183         };
3184         int err;
3185         bool block_faults = FUSE_IS_DAX(inode) &&
3186                 (!(mode & FALLOC_FL_KEEP_SIZE) ||
3187                  (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
3188
3189         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3190                      FALLOC_FL_ZERO_RANGE))
3191                 return -EOPNOTSUPP;
3192
3193         if (fm->fc->no_fallocate)
3194                 return -EOPNOTSUPP;
3195
3196         inode_lock(inode);
3197         if (block_faults) {
3198                 filemap_invalidate_lock(inode->i_mapping);
3199                 err = fuse_dax_break_layouts(inode, 0, 0);
3200                 if (err)
3201                         goto out;
3202         }
3203
3204         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
3205                 loff_t endbyte = offset + length - 1;
3206
3207                 err = fuse_writeback_range(inode, offset, endbyte);
3208                 if (err)
3209                         goto out;
3210         }
3211
3212         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
3213             offset + length > i_size_read(inode)) {
3214                 err = inode_newsize_ok(inode, offset + length);
3215                 if (err)
3216                         goto out;
3217         }
3218
3219         err = file_modified(file);
3220         if (err)
3221                 goto out;
3222
3223         if (!(mode & FALLOC_FL_KEEP_SIZE))
3224                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3225
3226         args.opcode = FUSE_FALLOCATE;
3227         args.nodeid = ff->nodeid;
3228         args.in_numargs = 1;
3229         args.in_args[0].size = sizeof(inarg);
3230         args.in_args[0].value = &inarg;
3231         err = fuse_simple_request(fm, &args);
3232         if (err == -ENOSYS) {
3233                 fm->fc->no_fallocate = 1;
3234                 err = -EOPNOTSUPP;
3235         }
3236         if (err)
3237                 goto out;
3238
3239         /* we could have extended the file */
3240         if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3241                 if (fuse_write_update_attr(inode, offset + length, length))
3242                         file_update_time(file);
3243         }
3244
3245         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
3246                 truncate_pagecache_range(inode, offset, offset + length - 1);
3247
3248         fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
3249
3250 out:
3251         if (!(mode & FALLOC_FL_KEEP_SIZE))
3252                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3253
3254         if (block_faults)
3255                 filemap_invalidate_unlock(inode->i_mapping);
3256
3257         inode_unlock(inode);
3258
3259         fuse_flush_time_update(inode);
3260
3261         return err;
3262 }
3263
3264 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3265                                       struct file *file_out, loff_t pos_out,
3266                                       size_t len, unsigned int flags)
3267 {
3268         struct fuse_file *ff_in = file_in->private_data;
3269         struct fuse_file *ff_out = file_out->private_data;
3270         struct inode *inode_in = file_inode(file_in);
3271         struct inode *inode_out = file_inode(file_out);
3272         struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3273         struct fuse_mount *fm = ff_in->fm;
3274         struct fuse_conn *fc = fm->fc;
3275         FUSE_ARGS(args);
3276         struct fuse_copy_file_range_in inarg = {
3277                 .fh_in = ff_in->fh,
3278                 .off_in = pos_in,
3279                 .nodeid_out = ff_out->nodeid,
3280                 .fh_out = ff_out->fh,
3281                 .off_out = pos_out,
3282                 .len = len,
3283                 .flags = flags
3284         };
3285         struct fuse_write_out outarg;
3286         ssize_t err;
3287         /* mark unstable when write-back is not used, and file_out gets
3288          * extended */
3289         bool is_unstable = (!fc->writeback_cache) &&
3290                            ((pos_out + len) > inode_out->i_size);
3291
3292         if (fc->no_copy_file_range)
3293                 return -EOPNOTSUPP;
3294
3295         if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3296                 return -EXDEV;
3297
3298         inode_lock(inode_in);
3299         err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
3300         inode_unlock(inode_in);
3301         if (err)
3302                 return err;
3303
3304         inode_lock(inode_out);
3305
3306         err = file_modified(file_out);
3307         if (err)
3308                 goto out;
3309
3310         /*
3311          * Write out dirty pages in the destination file before sending the COPY
3312          * request to userspace.  After the request is completed, truncate off
3313          * pages (including partial ones) from the cache that have been copied,
3314          * since these contain stale data at that point.
3315          *
3316          * This should be mostly correct, but if the COPY writes to partial
3317          * pages (at the start or end) and the parts not covered by the COPY are
3318          * written through a memory map after calling fuse_writeback_range(),
3319          * then these partial page modifications will be lost on truncation.
3320          *
3321          * It is unlikely that someone would rely on such mixed style
3322          * modifications.  Yet this does give less guarantees than if the
3323          * copying was performed with write(2).
3324          *
3325          * To fix this a mapping->invalidate_lock could be used to prevent new
3326          * faults while the copy is ongoing.
3327          */
3328         err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
3329         if (err)
3330                 goto out;
3331
3332         if (is_unstable)
3333                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3334
3335         args.opcode = FUSE_COPY_FILE_RANGE;
3336         args.nodeid = ff_in->nodeid;
3337         args.in_numargs = 1;
3338         args.in_args[0].size = sizeof(inarg);
3339         args.in_args[0].value = &inarg;
3340         args.out_numargs = 1;
3341         args.out_args[0].size = sizeof(outarg);
3342         args.out_args[0].value = &outarg;
3343         err = fuse_simple_request(fm, &args);
3344         if (err == -ENOSYS) {
3345                 fc->no_copy_file_range = 1;
3346                 err = -EOPNOTSUPP;
3347         }
3348         if (err)
3349                 goto out;
3350
3351         truncate_inode_pages_range(inode_out->i_mapping,
3352                                    ALIGN_DOWN(pos_out, PAGE_SIZE),
3353                                    ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
3354
3355         file_update_time(file_out);
3356         fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
3357
3358         err = outarg.size;
3359 out:
3360         if (is_unstable)
3361                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3362
3363         inode_unlock(inode_out);
3364         file_accessed(file_in);
3365
3366         fuse_flush_time_update(inode_out);
3367
3368         return err;
3369 }
3370
3371 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3372                                     struct file *dst_file, loff_t dst_off,
3373                                     size_t len, unsigned int flags)
3374 {
3375         ssize_t ret;
3376
3377         ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3378                                      len, flags);
3379
3380         if (ret == -EOPNOTSUPP || ret == -EXDEV)
3381                 ret = splice_copy_file_range(src_file, src_off, dst_file,
3382                                              dst_off, len);
3383         return ret;
3384 }
3385
3386 static const struct file_operations fuse_file_operations = {
3387         .llseek         = fuse_file_llseek,
3388         .read_iter      = fuse_file_read_iter,
3389         .write_iter     = fuse_file_write_iter,
3390         .mmap           = fuse_file_mmap,
3391         .open           = fuse_open,
3392         .flush          = fuse_flush,
3393         .release        = fuse_release,
3394         .fsync          = fuse_fsync,
3395         .lock           = fuse_file_lock,
3396         .get_unmapped_area = thp_get_unmapped_area,
3397         .flock          = fuse_file_flock,
3398         .splice_read    = fuse_splice_read,
3399         .splice_write   = fuse_splice_write,
3400         .unlocked_ioctl = fuse_file_ioctl,
3401         .compat_ioctl   = fuse_file_compat_ioctl,
3402         .poll           = fuse_file_poll,
3403         .fallocate      = fuse_file_fallocate,
3404         .copy_file_range = fuse_copy_file_range,
3405 };
3406
3407 static const struct address_space_operations fuse_file_aops  = {
3408         .read_folio     = fuse_read_folio,
3409         .readahead      = fuse_readahead,
3410         .writepages     = fuse_writepages,
3411         .launder_folio  = fuse_launder_folio,
3412         .dirty_folio    = filemap_dirty_folio,
3413         .migrate_folio  = filemap_migrate_folio,
3414         .bmap           = fuse_bmap,
3415         .direct_IO      = fuse_direct_IO,
3416         .write_begin    = fuse_write_begin,
3417         .write_end      = fuse_write_end,
3418 };
3419
3420 void fuse_init_file_inode(struct inode *inode, unsigned int flags)
3421 {
3422         struct fuse_inode *fi = get_fuse_inode(inode);
3423
3424         inode->i_fop = &fuse_file_operations;
3425         inode->i_data.a_ops = &fuse_file_aops;
3426
3427         INIT_LIST_HEAD(&fi->write_files);
3428         INIT_LIST_HEAD(&fi->queued_writes);
3429         fi->writectr = 0;
3430         fi->iocachectr = 0;
3431         init_waitqueue_head(&fi->page_waitq);
3432         init_waitqueue_head(&fi->direct_io_waitq);
3433         fi->writepages = RB_ROOT;
3434
3435         if (IS_ENABLED(CONFIG_FUSE_DAX))
3436                 fuse_dax_inode_init(inode, flags);
3437 }