// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"
enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
{
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return BTRFS_QGROUP_MODE_DISABLED;
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
		return BTRFS_QGROUP_MODE_SIMPLE;
	return BTRFS_QGROUP_MODE_FULL;
}
bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
	return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}
bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
{
	return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}
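
/*
 * Illustrative gating pattern (a sketch of a hypothetical caller, mirroring
 * the accounting paths further below): most full-accounting code bails out
 * early unless full mode is active, e.g.:
 *
 *	if (!btrfs_qgroup_full_accounting(fs_info))
 *		return 0;
 *
 * while ioctl-facing paths only check btrfs_qgroup_enabled(), so simple
 * quotas still reach them.
 */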
/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}
#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     const struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 const struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}
static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}
static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}
static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
			      int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *n = fs_info->qgroup_tree.rb_node;
	struct btrfs_qgroup *qgroup;

	while (n) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		if (qgroup->qgroupid < qgroupid)
			n = n->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			n = n->rb_right;
		else
			return qgroup;
	}
	return NULL;
}
/*
 * Add qgroup to the filesystem's qgroup tree.
 *
 * Must be called with qgroup_lock held and @prealloc preallocated.
 *
 * Control of the lifespan of @prealloc is transferred to this function,
 * thus the caller should no longer touch @prealloc.
 */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  struct btrfs_qgroup *prealloc,
					  u64 qgroupid)
{
	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	/* Caller must have pre-allocated @prealloc. */
	ASSERT(prealloc);

	while (*p) {
		parent = *p;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid < qgroupid) {
			p = &(*p)->rb_left;
		} else if (qgroup->qgroupid > qgroupid) {
			p = &(*p)->rb_right;
		} else {
			kfree(prealloc);
			return qgroup;
		}
	}

	qgroup = prealloc;
	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);
	INIT_LIST_HEAD(&qgroup->iterator);
	INIT_LIST_HEAD(&qgroup->nested_iterator);

	rb_link_node(&qgroup->node, parent, p);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
}
/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(qgroup);
	return 0;
}
/*
 * Add relation specified by two qgroups.
 *
 * Must be called with qgroup_lock held, the ownership of @prealloc is
 * transferred to this function and caller should not touch it anymore.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the qgroups is NULL
 */
static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
			     struct btrfs_qgroup *member,
			     struct btrfs_qgroup *parent)
{
	if (!member || !parent) {
		kfree(prealloc);
		return -ENOENT;
	}

	prealloc->group = parent;
	prealloc->member = member;
	list_add_tail(&prealloc->next_group, &member->groups);
	list_add_tail(&prealloc->next_member, &parent->members);

	return 0;
}
/*
 * Add relation specified by two qgroup ids.
 *
 * Must be called with qgroup_lock held.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the ids does not exist
 */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup_list *prealloc,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);

	return __add_relation_rb(prealloc, member, parent);
}
/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		return;
	fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
				  BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
				  BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}
static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf, int slot,
				   struct btrfs_qgroup_status_item *ptr)
{
	ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
	ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
	fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
}
/*
 * The full config is read in one go, only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!fs_info->quota_root)
		return 0;

	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
	if (!fs_info->qgroup_ulist) {
		ret = -ENOMEM;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;
	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
					  "old qgroup version, quota disabled");
				goto out;
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
			if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
				qgroup_read_enable_gen(fs_info, l, slot, ptr);
			} else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
				qgroup_mark_inconsistent(fs_info);
				btrfs_err(fs_info,
					  "qgroup generation mismatch, marked as inconsistent");
			}
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
			btrfs_err(fs_info, "inconsistent qgroup config");
			qgroup_mark_inconsistent(fs_info);
		}
		if (!qgroup) {
			struct btrfs_qgroup *prealloc;
			struct btrfs_root *tree_root = fs_info->tree_root;

			prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
			if (!prealloc) {
				ret = -ENOMEM;
				goto out;
			}
			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
			/*
			 * If a qgroup exists for a subvolume ID, it is possible
			 * that subvolume has been deleted, in which case
			 * reusing that ID would lead to incorrect accounting.
			 *
			 * Ensure that we skip any such subvol ids.
			 *
			 * We don't need to lock because this is only called
			 * during mount before we start doing things like creating
			 * subvolumes.
			 */
			if (is_fstree(qgroup->qgroupid) &&
			    qgroup->qgroupid > tree_root->free_objectid)
				/*
				 * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
				 * as it will get checked on the next call to
				 * btrfs_get_free_objectid.
				 */
				tree_root->free_objectid = qgroup->qgroupid + 1;
		}
		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
		if (ret < 0)
			goto out;

		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		struct btrfs_qgroup_list *list = NULL;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto out;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		list = kzalloc(sizeof(*list), GFP_KERNEL);
		if (!list) {
			ret = -ENOMEM;
			goto out;
		}
		ret = add_relation_rb(fs_info, list, found_key.objectid,
				      found_key.offset);
		list = NULL;
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				   "orphan qgroup relation 0x%llx->0x%llx",
				   found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (ret >= 0) {
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
			set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
			ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
	} else {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		btrfs_sysfs_del_qgroups(fs_info);
	}

	return ret < 0 ? ret : 0;
}
/*
 * Called in close_ctree() when quota is still enabled.  This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab the
	 * qgroup lock.  And here we don't go post-order to provide a more
	 * user-friendly sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
					   btrfs_qgroup_level(qgroup->qgroupid),
					   btrfs_qgroup_subvolid(qgroup->qgroupid),
					   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}
/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable().
 * The first two are in single-threaded paths, and for the third one we have
 * set quota_root to be null with qgroup_lock held before, so it is safe to
 * clean up the in-memory structures without qgroup_lock held.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(qgroup);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
	}
	/*
	 * We call btrfs_free_qgroup_config() when unmounting
	 * filesystem and disabling quota, so we set qgroup_ulist
	 * to be null here to avoid double free.
	 */
	ulist_free(fs_info->qgroup_ulist);
	fs_info->qgroup_ulist = NULL;
	btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}
static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}
static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				       fs_info->qgroup_rescan_progress.objectid);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.offset = 0;
	key.type = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete the leaf items one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
		       struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_qgroup *prealloc = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *ulist = NULL;
	const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
	int ret = 0;
	int slot;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to enable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
	 * and before setting BTRFS_FS_QUOTA_ENABLED.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info,
			  "qgroups are currently unsupported in extent tree v2");
		return -EINVAL;
	}

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ulist = ulist_alloc(GFP_KERNEL);
	if (!ulist) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we started the transaction lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for quota root item
	 * 1 for BTRFS_QGROUP_STATUS item
	 *
	 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However those are not currently reserved since it
	 * would be a lot of overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	fs_info->qgroup_ulist = ulist;
	ulist = NULL;

	/*
	 * initially create the quota tree
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
	if (simple) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
		btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
	} else {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {

			/* Release locks on tree_root before we access quota_root */
			btrfs_release_path(path);

			/* We should not have a stray @prealloc pointer. */
			ASSERT(prealloc == NULL);
			prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
			if (!prealloc) {
				ret = -ENOMEM;
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
			prealloc = NULL;
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_search_slot_for_read(tree_root, &found_key,
							 path, 1, 0);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			if (ret > 0) {
				/*
				 * Shouldn't happen, but in case it does we
				 * don't need to do the btrfs_next_item, just
				 * continue.
				 */
				continue;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	ASSERT(prealloc == NULL);
	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out_free_path;
	}
	qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
	prealloc = NULL;
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	fs_info->qgroup_enable_gen = trans->transid;

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/*
	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
	 * a deadlock with tasks concurrently doing other qgroup operations, such
	 * as adding/removing qgroups or adding/deleting qgroup relations for
	 * example, because all qgroup operations first start or join a transaction
	 * and then lock the qgroup_ioctl_lock mutex.
	 * We are safe from a concurrent task trying to enable quotas, by calling
	 * this function, since we are serialized by fs_info->subvol_sem.
	 */
	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	if (simple)
		btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
	spin_unlock(&fs_info->qgroup_lock);

	/* Skip rescan for simple qgroups. */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		goto out_free_path;

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	} else {
		/*
		 * We have set both BTRFS_FS_QUOTA_ENABLED and
		 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
		 * -EINPROGRESS. That can happen because someone started the
		 * rescan worker by calling quota rescan ioctl before we
		 * attempted to initialize the rescan worker. Failure due to
		 * quotas disabled in the meanwhile is not possible, because
		 * we are holding a write lock on fs_info->subvol_sem, which
		 * is also acquired when disabling quotas.
		 * Ignore such error, and any other error would need to undo
		 * everything we did in the transaction we just committed.
		 */
		ASSERT(ret == -EINPROGRESS);
		ret = 0;
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret)
		btrfs_put_root(quota_root);
out:
	if (ret) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		btrfs_sysfs_del_qgroups(fs_info);
	}
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	ulist_free(ulist);
	kfree(prealloc);
	return ret;
}
/*
 * It is possible to have outstanding ordered extents which reserved bytes
 * before we disabled. We need to fully flush delalloc, ordered extents, and a
 * commit to ensure that we don't leak such reservations, only to have them
 * come back if we re-enable.
 *
 * - enable simple quotas
 * - reserve space
 * - release it, store rsv_bytes in OE
 * - disable quotas
 * - enable simple quotas (qgroup rsv are all 0)
 * - OE finishes
 * - run delayed refs
 * - free rsv_bytes, resulting in miscounting or even underflow
 */
static int flush_reservations(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
	if (ret)
		return ret;
	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);

	return btrfs_commit_current_transaction(fs_info->tree_root);
}
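
/*
 * Illustrative instance of the race above (hypothetical sizes): a 128K
 * buffered write under simple quotas reserves 128K of data rsv; once the
 * ordered extent (OE) is created the reservation is released and the 128K
 * is remembered in the OE.  If quotas were disabled and re-enabled before
 * the OE finished, freeing that remembered 128K would hit qgroups whose rsv
 * counters restarted at 0, which is exactly the underflow the flush above
 * prevents.
 */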
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root = NULL;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	/*
	 * We need to have subvol_sem write locked to prevent races with
	 * snapshot creation.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	/*
	 * Relocation will mess with backrefs, so make sure we have the
	 * cleaner_mutex held to protect us from relocate.
	 */
	lockdep_assert_held(&fs_info->cleaner_mutex);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;

	/*
	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
	 * to lock that mutex while holding a transaction handle and the rescan
	 * worker needs to commit a transaction.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * Request qgroup rescan worker to complete and wait for it. This wait
	 * must be done before transaction start for quota disable since it may
	 * deadlock with transaction by the qgroup rescan worker.
	 */
	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/*
	 * We have nothing held here and no trans handle, just return the error
	 * if there is one.
	 */
	ret = flush_reservations(fs_info);
	if (ret)
		return ret;

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
	fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	spin_lock(&fs_info->trans_lock);
	list_del(&quota_root->dirty_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_tree_lock(quota_root->node);
	btrfs_clear_buffer_dirty(trans, quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
				    quota_root->node, 0, 1);

	if (ret < 0)
		btrfs_abort_transaction(trans, ret);

out:
	btrfs_put_root(quota_root);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_commit_transaction(trans);
	return ret;
}
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
	if (!list_empty(&qgroup->iterator))
		return;

	list_add_tail(&qgroup->iterator, head);
}
static void qgroup_iterator_clean(struct list_head *head)
{
	while (!list_empty(head)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
		list_del_init(&qgroup->iterator);
	}
}
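
/*
 * Illustrative usage of the two iterator helpers above (a sketch, not a
 * verbatim caller): seed the list with one qgroup, walk it while appending
 * parents, then detach every entry in one pass.  Because qgroup->iterator
 * is embedded in the qgroup itself, the walk needs no memory allocation:
 *
 *	LIST_HEAD(qgroup_list);
 *
 *	qgroup_iterator_add(&qgroup_list, qgroup);
 *	list_for_each_entry(cur, &qgroup_list, iterator) {
 *		list_for_each_entry(glist, &cur->groups, next_group)
 *			qgroup_iterator_add(&qgroup_list, glist->group);
 *	}
 *	qgroup_iterator_clean(&qgroup_list);
 */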
/*
 * The easy accounting, we're updating qgroup relationship whose child qgroup
 * only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * So does qgroup reservation space, which should also be added/removed to
 * the parent.
 * Otherwise when the child tries to release reservation space, the parent
 * will underflow its reservation (for the relationship adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *cur;
	LIST_HEAD(qgroup_list);
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(cur, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		cur->rfer += sign * num_bytes;
		cur->rfer_cmpr += sign * num_bytes;

		WARN_ON(sign < 0 && cur->excl < num_bytes);
		cur->excl += sign * num_bytes;
		cur->excl_cmpr += sign * num_bytes;

		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, cur, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, cur, src);
		qgroup_dirty(fs_info, cur);

		/* Append parent qgroups to @qgroup_list. */
		list_for_each_entry(glist, &cur->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	return ret;
}
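
/*
 * Illustrative example (hypothetical numbers): if child qgroup 0/256 has
 * rfer == excl == 1M (everything exclusive) and it is assigned to parent
 * 1/100 (sign = +1), the walk above adds 1M to the parent's rfer/excl and
 * copies the child's reservations up, then repeats for the parent's own
 * parents.  Removing the relation runs the same walk with sign = -1.
 */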
/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 for need to full rescan
 * and mark INCONSISTENT flag.
 * Return < 0 for other error.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   u64 src, u64 dst, int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
		if (ret < 0)
			goto out;
		ret = 0;
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
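
/*
 * For instance (illustrative): a subvolume that was never snapshotted has
 * rfer == excl, shares nothing with other trees, and re-parenting it only
 * needs the exclusive fast path above.  A snapshotted subvolume where
 * rfer > excl shares extents and takes the >0 "full rescan" return instead.
 */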
/*
 * Add relation between @src and @dst qgroup. The @prealloc is allocated by the
 * callers and transferred here (either used or freed on error).
 */
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
			      struct btrfs_qgroup_list *prealloc)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* First check if such a qgroup relation already exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = __add_relation_rb(prealloc, member, parent);
	prealloc = NULL;
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	kfree(prealloc);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	bool found = false;
	int ret = 0;
	int ret2;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * The parent/member pair doesn't exist, then try to delete the dead
	 * relation items only.
	 */
	if (!member || !parent)
		goto delete_item;

	/* First check if such a qgroup relation exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	return ret;
}
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *prealloc = NULL;
	int ret = 0;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
	prealloc = NULL;

	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	kfree(prealloc);
	return ret;
}
/*
 * Return 0 if we can not delete the qgroup (not empty or has children etc).
 * Return >0 if we can delete the qgroup.
 * Return <0 for other errors during tree search.
 */
static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	int ret;

	/*
	 * Squota would never be inconsistent, but there can still be the case
	 * where a dropped subvolume still has qgroup numbers, and squota
	 * relies on such qgroup for future accounting.
	 *
	 * So for squota, do not allow dropping any non-zero qgroup.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
	    (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
		return 0;

	/* For higher level qgroup, we can only delete it if it has no child. */
	if (btrfs_qgroup_level(qgroup->qgroupid)) {
		if (!list_empty(&qgroup->members))
			return 0;
		return 1;
	}

	/*
	 * For level-0 qgroups, we can only delete it if it has no subvolume
	 * for it.
	 * This means even a subvolume is unlinked but not yet fully dropped,
	 * we can not delete the qgroup.
	 */
	key.objectid = qgroup->qgroupid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
	btrfs_free_path(path);
	/*
	 * The @ret from btrfs_find_root() exactly matches our definition for
	 * the return value, thus can be returned directly.
	 */
	return ret;
}
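
/*
 * Example of the squota rule above (illustrative): a subvolume can be
 * deleted while its level-0 qgroup still carries rfer/excl for extents that
 * later subvolumes were booked against; removing that qgroup would orphan
 * those numbers, hence the unconditional "return 0" for non-zero squota
 * qgroups.
 */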
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	ret = can_delete_qgroup(fs_info, qgroup);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	/*
	 * Warn on reserved space. The qgroup should have no child qgroup nor
	 * a corresponding subvolume.
	 * Thus its reserved space should all be zero, no matter whether the
	 * qgroup is consistent or which mode we are in.
	 */
	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
	/*
	 * The same for rfer/excl numbers, but that's only if our qgroup is
	 * consistent and if it's in regular qgroup mode.
	 * For simple mode it's not as accurate thus we can hit non-zero values
	 * very frequently.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
		if (WARN_ON(qgroup->rfer || qgroup->excl ||
			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
			btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
				      btrfs_qgroup_level(qgroup->qgroupid),
				      btrfs_qgroup_subvolid(qgroup->qgroupid),
				      qgroup->rfer, qgroup->rfer_cmpr,
				      qgroup->excl, qgroup->excl_cmpr);
			qgroup_mark_inconsistent(fs_info);
		}
	}
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
{
	struct btrfs_trans_handle *trans;
	int ret;

	if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
		return 0;

	/*
	 * Commit current transaction to make sure all the rfer/excl numbers
	 * get updated.
	 */
	trans = btrfs_start_transaction(fs_info->quota_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_commit_transaction(trans);
	if (ret < 0)
		return ret;

	/* Start new trans to delete the qgroup info and limit items. */
	trans = btrfs_start_transaction(fs_info->quota_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	ret = btrfs_remove_qgroup(trans, subvolid);
	btrfs_end_transaction(trans);
	/*
	 * It's squota and the subvolume still has numbers needed for future
	 * accounting, in this case we can not delete it.  Just skip it.
	 */
	if (ret == -EBUSY)
		ret = 0;
	return ret;
}
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/* Sometimes we would want to clear the limit on this qgroup.
	 * To meet this requirement, we treat the -1 as a special value
	 * which tells the kernel to clear the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		qgroup_mark_inconsistent(fs_info);
		btrfs_info(fs_info, "unable to update quota limit for %llu",
			   qgroupid);
	}

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
/*
 * Inform qgroup to trace one dirty extent, its info is recorded in @record.
 * So qgroup can account it at transaction committing time.
 *
 * No-lock version; the caller must hold the delayed ref lock and have
 * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
 * exiting the lock context.
 *
 * Return 0 for successful insert.
 * Return >0 for existing record, caller can free @record safely.
 * Return <0 for insertion failure, caller can free @record safely.
 */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				     struct btrfs_delayed_ref_root *delayed_refs,
				     struct btrfs_qgroup_extent_record *record,
				     u64 bytenr)
{
	struct btrfs_qgroup_extent_record *existing, *ret;
	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 1;

#if BITS_PER_LONG == 32
	if (bytenr >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
			     bytenr);
		btrfs_err_32bit_limit(fs_info);
		return -EOVERFLOW;
	}
#endif

	trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);

	xa_lock(&delayed_refs->dirty_extents);
	existing = xa_load(&delayed_refs->dirty_extents, index);
	if (existing) {
		if (record->data_rsv && !existing->data_rsv) {
			existing->data_rsv = record->data_rsv;
			existing->data_rsv_refroot = record->data_rsv_refroot;
		}
		xa_unlock(&delayed_refs->dirty_extents);
		return 1;
	}

	ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
	xa_unlock(&delayed_refs->dirty_extents);
	if (xa_is_err(ret)) {
		qgroup_mark_inconsistent(fs_info);
		return xa_err(ret);
	}

	return 0;
}
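
/*
 * Sketch of the expected caller pattern (illustrative, mirroring
 * btrfs_qgroup_trace_extent() below; the allocation happens outside the
 * delayed ref lock):
 *
 *	record = kzalloc(sizeof(*record), GFP_NOFS);
 *	...
 *	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs,
 *					       record, bytenr);
 *	if (ret)			// >0 existing, <0 failure
 *		kfree(record);		// record was not taken over
 *	else
 *		ret = btrfs_qgroup_trace_extent_post(trans, record, bytenr);
 */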
/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * NOTE: Current qgroup does the expensive backref walk at transaction
 * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
 * inserted operations.
 * This is designed to allow btrfs_find_all_roots() to get correct new_roots
 * for accounting.
 * However for old_roots there is no need to do backref walk at that time,
 * since we search commit roots to walk backref and result will always be
 * correct.
 *
 * Due to the nature of no lock version, we can't do backref there.
 * So we must call btrfs_qgroup_trace_extent_post() after exiting
 * spinlock context.
 *
 * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
 * using current root, then we can move all expensive backref walk out of
 * transaction committing, but not now as qgroup accounting will be wrong again.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord,
				   u64 bytenr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_backref_walk_ctx ctx = {
		.bytenr = bytenr,
		.fs_info = fs_info,
	};
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle. Often we are called when adding a data delayed
	 * reference from btrfs_truncate_inode_items() (truncating or unlinking),
	 * in which case we will be holding a write lock on extent buffer from a
	 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
	 * acquire fs_info->commit_root_sem, because that is a higher level lock
	 * that must be acquired before locking any extent buffers.
	 *
	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
	 * but we can't pass it a non-NULL transaction handle, because otherwise
	 * it would not use commit roots and would lock extent buffers, causing
	 * a deadlock if it ends up trying to read lock the same extent buffer
	 * that was previously write locked at btrfs_truncate_inode_items().
	 *
	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
	 * explicitly tell it to not acquire the commit_root_sem - if we are
	 * holding a transaction handle we don't need its protection.
	 */
	ASSERT(trans != NULL);

	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
		return 0;

	ret = btrfs_find_all_roots(&ctx, true);
	if (ret < 0) {
		qgroup_mark_inconsistent(fs_info);
		btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
			   ret);
		return 0;
	}

	/*
	 * Here we don't need to get the lock of
	 * trans->transaction->delayed_refs, since inserted qrecord won't
	 * be deleted, only qrecord->node may be modified (new qrecord insert)
	 *
	 * So modifying qrecord->old_roots is safe here
	 */
	qrecord->old_roots = ctx.roots;
	return 0;
}
/*
 * Inform qgroup to trace one dirty extent, specified by @bytenr and
 * @num_bytes.
 * So qgroup can account it at commit trans time.
 *
 * Better encapsulated version, with memory allocation and backref walk for
 * commit roots.
 * So this can sleep.
 *
 * Return 0 if the operation is done.
 * Return <0 for error, like memory allocation failure or invalid parameter
 * (NULL trans)
 */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
		return 0;
	record = kzalloc(sizeof(*record), GFP_NOFS);
	if (!record)
		return -ENOMEM;

	if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
		kfree(record);
		return -ENOMEM;
	}

	record->num_bytes = num_bytes;

	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr);
	if (ret) {
		/* Clean up if insertion fails or item exists. */
		xa_release(&delayed_refs->dirty_extents, index);
		kfree(record);
		return 0;
	}
	return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
}
/*
 * Inform qgroup to trace all leaf items of data
 *
 * Return 0 for success
 * Return <0 for error(ENOMEM)
 */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents  */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}
/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root nodes slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
	int level = 0;
	int nr, slot;
	struct extent_buffer *eb;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		path->slots[level]++;
		slot = path->slots[level];
		if (slot >= nr || level == 0) {
			/*
			 * Don't free the root -  we will detect this
			 * condition after our loop and return a
			 * positive value for caller to stop walking the tree.
			 */
			if (level != root_level) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;

				free_extent_buffer(eb);
				path->nodes[level] = NULL;
				path->slots[level] = 0;
			}
		} else {
			/*
			 * We have a valid slot to walk back down
			 * from. Stop here so caller can process these
			 * new nodes.
			 */
			break;
		}

		level++;
	}

	eb = path->nodes[root_level];
	if (path->slots[root_level] >= btrfs_header_nritems(eb))
		return 1;

	return 0;
}
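
/*
 * Illustrative walkthrough (hypothetical 2-level tree): with root_level = 1
 * and an exhausted leaf at nodes[0], one call releases the leaf, clears
 * nodes[0], and bumps slots[1]; once slots[1] reaches the root's nritems
 * the function returns 1 and the walk is complete, otherwise the caller
 * reads the next child into nodes[0] and descends again.
 */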
/*
 * Helper function to trace a subtree tree block swap.
 *
 * The swap will happen in the highest tree block, but there may be a lot of
 * tree blocks involved.
 *
 * For example:
 *  OO = Old tree blocks
 *  NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during the previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_key key;
	struct btrfs_path *src_path;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 nodesize = fs_info->nodesize;
	int cur_level = root_level;
	int ret;

	BUG_ON(dst_level > root_level);
	/* Level mismatch */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path) {
		ret = -ENOMEM;
		goto out;
	}

	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* For src_path */
	atomic_inc(&src_eb->refs);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified version of btrfs_search_slot() */
	while (cur_level >= dst_level) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		if (src_path->nodes[cur_level] == NULL) {
			struct extent_buffer *eb;
			int parent_slot;

			eb = src_path->nodes[cur_level + 1];
			parent_slot = src_path->slots[cur_level + 1];

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			src_path->nodes[cur_level] = eb;

			btrfs_tree_read_lock(eb);
			src_path->locks[cur_level] = BTRFS_READ_LOCK;
		}

		src_path->slots[cur_level] = dst_path->slots[cur_level];
		if (cur_level) {
			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		}
		/* Content mismatch, something went wrong */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
			ret = -ENOENT;
			goto out;
		}
		cur_level--;
	}

	/*
	 * Now both @dst_path and @src_path have been populated, record the tree
	 * blocks for qgroup accounting.
	 */
	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		goto out;
	ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		goto out;

	/* Record leaf file extents */
	if (dst_level == 0 && trace_leaf) {
		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
		if (ret < 0)
			goto out;
		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
	}
out:
	btrfs_free_path(src_path);
	return ret;
}
/*
 * Helper function to do recursive generation-aware depth-first search, to
 * locate all new tree blocks in a subtree of reloc tree.
 *
 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
 *         reloc tree
 * L2         NN (a)
 *          /    \
 * L1    OO        NN (b)
 *      /  \      /  \
 * L0  OO  OO   OO    NN
 *               (c)  (d)
 * If we pass:
 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
 * @cur_level = 1
 * @root_level = 1
 *
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * the above tree blocks along with their counterparts in the file tree.
 * While during search, old tree blocks OO(c) will be skipped as tree block swap
 * won't affect OO(c).
 */
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
					   struct extent_buffer *src_eb,
					   struct btrfs_path *dst_path,
					   int cur_level, int root_level,
					   u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *eb;
	bool need_cleanup = false;
	int ret = 0;
	int i;

	/* Level sanity check */
	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
	    root_level < cur_level) {
		btrfs_err_rl(fs_info,
			"%s: bad levels, cur_level=%d root_level=%d",
			__func__, cur_level, root_level);
		return -EUCLEAN;
	}

	/* Read the tree block if needed */
	if (dst_path->nodes[cur_level] == NULL) {
		int parent_slot;
		u64 child_gen;

		/*
		 * dst_path->nodes[root_level] must be initialized before
		 * calling this function.
		 */
		if (cur_level == root_level) {
			btrfs_err_rl(fs_info,
	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
				__func__, root_level, root_level, cur_level);
			return -EUCLEAN;
		}

		/*
		 * We need to get child blockptr/gen from parent before we can
		 * read it.
		 */
		eb = dst_path->nodes[cur_level + 1];
		parent_slot = dst_path->slots[cur_level + 1];
		child_gen = btrfs_node_ptr_generation(eb, parent_slot);

		/* This node is old, no need to trace */
		if (child_gen < last_snapshot)
			goto out;

		eb = btrfs_read_node_slot(eb, parent_slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto out;
		}

		dst_path->nodes[cur_level] = eb;
		dst_path->slots[cur_level] = 0;

		btrfs_tree_read_lock(eb);
		dst_path->locks[cur_level] = BTRFS_READ_LOCK;
		need_cleanup = true;
	}

	/* Now record this tree block and its counter part for qgroups */
	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
				       root_level, trace_leaf);
	if (ret < 0)
		goto cleanup;

	eb = dst_path->nodes[cur_level];

	if (cur_level > 0) {
		/* Iterate all child tree blocks */
		for (i = 0; i < btrfs_header_nritems(eb); i++) {
			/* Skip old tree blocks as they won't be swapped */
			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
				continue;
			dst_path->slots[cur_level] = i;

			/* Recursive call (at most 7 times) */
			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
					dst_path, cur_level - 1, root_level,
					last_snapshot, trace_leaf);
			if (ret < 0)
				goto cleanup;
		}
	}

cleanup:
	if (need_cleanup) {
		/* Clean up */
		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
				     dst_path->locks[cur_level]);
		free_extent_buffer(dst_path->nodes[cur_level]);
		dst_path->nodes[cur_level] = NULL;
		dst_path->slots[cur_level] = 0;
		dst_path->locks[cur_level] = 0;
	}
out:
	return ret;
}
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
				struct extent_buffer *src_eb,
				struct extent_buffer *dst_eb,
				u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *dst_path = NULL;
	int level;
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	/* Wrong parameter order */
	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
			     btrfs_header_generation(src_eb),
			     btrfs_header_generation(dst_eb));
		return -EUCLEAN;
	}

	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
		ret = -EIO;
		goto out;
	}

	level = btrfs_header_level(dst_eb);
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		ret = -ENOMEM;
		goto out;
	}
	/* For dst_path */
	atomic_inc(&dst_eb->refs);
	dst_path->nodes[level] = dst_eb;
	dst_path->slots[level] = 0;
	dst_path->locks[level] = 0;

	/* Do the generation-aware depth-first search */
	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
					      level, last_snapshot, trace_leaf);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	btrfs_free_path(dst_path);
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}
/*
 * Inform qgroup to trace a whole subtree, including all its child tree
 * blocks and data in the leaves.
 *
 * The root tree block is specified by @root_eb.
 *
 * Normally used by relocation (tree block swap) and subvolume deletion.
 *
 * Return 0 for success
 * Return <0 for error(ENOMEM or tree search error)
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
			       struct extent_buffer *root_eb,
			       u64 root_gen, int root_level)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;
	int level;
	u8 drop_subptree_thres;
	struct extent_buffer *eb = root_eb;
	struct btrfs_path *path = NULL;

	ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
	ASSERT(root_eb != NULL);

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	spin_lock(&fs_info->qgroup_lock);
	drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * This function only gets called for snapshot drop, if we hit a high
	 * node here, it means we are going to change ownership for quite a lot
	 * of extents, which will greatly slow down btrfs_commit_transaction().
	 *
	 * So if we find a high tree here, we just skip the accounting and
	 * mark the qgroups inconsistent.
	 */
	if (root_level >= drop_subptree_thres) {
		qgroup_mark_inconsistent(fs_info);
		return 0;
	}

	if (!extent_buffer_uptodate(root_eb)) {
		struct btrfs_tree_parent_check check = {
			.transid = root_gen,
			.level = root_level
		};

		ret = btrfs_read_extent_buffer(root_eb, &check);
		if (ret)
			goto out;
	}

	if (root_level == 0) {
		ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Walk down the tree.  Missing extent blocks are filled in as
	 * we go. Metadata is accounted every time we read a new
	 * extent block.
	 *
	 * When we reach a leaf, we account for file extent items in it,
	 * walk back up the tree (adjusting slot pointers as we go)
	 * and restart the search process.
	 */
	atomic_inc(&root_eb->refs);	/* For path */
	path->nodes[root_level] = root_eb;
	path->slots[root_level] = 0;
	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
	level = root_level;
	while (level >= 0) {
		if (path->nodes[level] == NULL) {
			int parent_slot;
			u64 child_bytenr;

			/*
			 * We need to get child blockptr from parent before we
			 * can read it.
			 */
			eb = path->nodes[level + 1];
			parent_slot = path->slots[level + 1];
			child_bytenr = btrfs_node_blockptr(eb, parent_slot);

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			path->nodes[level] = eb;
			path->slots[level] = 0;

			btrfs_tree_read_lock(eb);
			path->locks[level] = BTRFS_READ_LOCK;

			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
							fs_info->nodesize);
			if (ret)
				goto out;
		}

		if (level == 0) {
			ret = btrfs_qgroup_trace_leaf_items(trans,
							    path->nodes[level]);
			if (ret)
				goto out;

			/* Nonzero return here means we completed our search */
			ret = adjust_slots_upwards(path, root_level);
			if (ret)
				break;

			/* Restart search with new slots */
			goto walk_down;
		}

		level--;
	}

	ret = 0;
out:
	btrfs_free_path(path);

	return ret;
}
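
/*
 * Illustrative note on the threshold check above (editor's addition): with
 * qgroup_drop_subtree_thres set to BTRFS_MAX_LEVEL every subtree gets traced,
 * while lowering the value (it is tunable at runtime through sysfs) makes
 * dropping a snapshot whose root node sits at or above that level skip the
 * tracing entirely and simply mark qgroups inconsistent, trading accounting
 * accuracy for a much faster transaction commit.
 */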
static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
	if (!list_empty(&qgroup->nested_iterator))
		return;

	list_add_tail(&qgroup->nested_iterator, head);
}

static void qgroup_iterator_nested_clean(struct list_head *head)
{
	while (!list_empty(head)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
		list_del_init(&qgroup->nested_iterator);
	}
}
#define UPDATE_NEW	0
#define UPDATE_OLD	1
/*
 * Walk all of the roots that point to the bytenr and adjust their refcnts.
 */
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
				 struct ulist *roots, struct list_head *qgroups,
				 u64 seq, int update_old)
{
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct btrfs_qgroup *qg;

	if (!roots)
		return;
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(roots, &uiter))) {
		LIST_HEAD(tmp);

		qg = find_qgroup_rb(fs_info, unode->val);
		if (!qg)
			continue;

		qgroup_iterator_nested_add(qgroups, qg);
		qgroup_iterator_add(&tmp, qg);
		list_for_each_entry(qg, &tmp, iterator) {
			struct btrfs_qgroup_list *glist;

			if (update_old)
				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
			else
				btrfs_qgroup_update_new_refcnt(qg, seq, 1);

			list_for_each_entry(glist, &qg->groups, next_group) {
				qgroup_iterator_nested_add(qgroups, glist->group);
				qgroup_iterator_add(&tmp, glist->group);
			}
		}
		qgroup_iterator_clean(&tmp);
	}
}
/*
 * Update qgroup rfer/excl counters.
 * Rfer update is easy, the code can explain itself.
 *
 * Excl update is tricky, the update is split into 2 parts.
 * Part 1: Possible exclusive <-> sharing detect:
 *	|	A	|	!A	|
 *  -------------------------------------
 *  B	|	*	|	-	|
 *  -------------------------------------
 *  !B	|	+	|	**	|
 *  -------------------------------------
 *
 * Conditions:
 * A:	cur_old_roots < nr_old_roots	(not exclusive before)
 * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
 * B:	cur_new_roots < nr_new_roots	(not exclusive now)
 * !B:	cur_new_roots == nr_new_roots	(possible exclusive now)
 *
 * Results:
 * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
 * *: Definitely not changed.		**: Possibly unchanged.
 *
 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
 * case.
 *
 * To make the logic clear, we first use condition A and B to split the
 * combination into 4 results.
 *
 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
 * them only one variant may be 0.
 *
 * Lastly, check result **, since there are 2 variants that may be 0, split
 * them again (2x2).
 * But this time we don't need to consider other things, the code and logic
 * are easy to understand now.
 */
static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
				   struct list_head *qgroups, u64 nr_old_roots,
				   u64 nr_new_roots, u64 num_bytes, u64 seq)
{
	struct btrfs_qgroup *qg;

	list_for_each_entry(qg, qgroups, nested_iterator) {
		u64 cur_new_count, cur_old_count;
		bool dirty = false;

		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);

		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
					     cur_new_count);

		/* Rfer update part */
		if (cur_old_count == 0 && cur_new_count > 0) {
			qg->rfer += num_bytes;
			qg->rfer_cmpr += num_bytes;
			dirty = true;
		}
		if (cur_old_count > 0 && cur_new_count == 0) {
			qg->rfer -= num_bytes;
			qg->rfer_cmpr -= num_bytes;
			dirty = true;
		}

		/* Excl update part */
		/* Exclusive/none -> shared case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count < nr_new_roots) {
			/* Exclusive -> shared */
			if (cur_old_count != 0) {
				qg->excl -= num_bytes;
				qg->excl_cmpr -= num_bytes;
				dirty = true;
			}
		}

		/* Shared -> exclusive/none case */
		if (cur_old_count < nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			/* Shared->exclusive */
			if (cur_new_count != 0) {
				qg->excl += num_bytes;
				qg->excl_cmpr += num_bytes;
				dirty = true;
			}
		}

		/* Exclusive/none -> exclusive/none case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			if (cur_old_count == 0) {
				/* None -> exclusive/none */

				if (cur_new_count != 0) {
					/* None -> exclusive */
					qg->excl += num_bytes;
					qg->excl_cmpr += num_bytes;
					dirty = true;
				}
				/* None -> none, nothing changed */
			} else {
				/* Exclusive -> exclusive/none */

				if (cur_new_count == 0) {
					/* Exclusive -> none */
					qg->excl -= num_bytes;
					qg->excl_cmpr -= num_bytes;
					dirty = true;
				}
				/* Exclusive -> exclusive, nothing changed */
			}
		}

		if (dirty)
			qgroup_dirty(fs_info, qg);
	}
}
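
/*
 * Worked example for the rules above (editor's illustration): a 16K extent
 * was referenced by subvolumes A and B (nr_old_roots == 2) and after the
 * current delayed refs only A references it (nr_new_roots == 1).
 *
 * For A's level-0 qgroup: cur_old_count == 1, cur_new_count == 1.  Both are
 * nonzero, so rfer is unchanged.  Since cur_old_count < nr_old_roots and
 * cur_new_count == nr_new_roots != 0, this is the shared -> exclusive case
 * and excl/excl_cmpr grow by 16K.
 *
 * For B's qgroup: cur_old_count == 1, cur_new_count == 0, so rfer drops by
 * 16K, and because B was not exclusive before (1 < 2), excl is untouched.
 */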
/*
 * Check if the @roots potentially is a list of fs tree roots
 *
 * Return 0 for definitely not a fs/subvol tree roots ulist
 * Return 1 for possible fs/subvol tree roots in the list (considering an empty
 *          one is valid)
 */
static int maybe_fs_roots(struct ulist *roots)
{
	struct ulist_node *unode;
	struct ulist_iterator uiter;

	/* Empty one, still possible for fs roots */
	if (!roots || roots->nnodes == 0)
		return 1;

	ULIST_ITER_INIT(&uiter);
	unode = ulist_next(roots, &uiter);
	if (!unode)
		return 1;

	/*
	 * If it contains fs tree roots, then it must belong to fs/subvol
	 * trees.
	 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
	 */
	return is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	LIST_HEAD(qgroups);
	u64 seq;
	u64 nr_new_roots = 0;
	u64 nr_old_roots = 0;
	int ret = 0;

	/*
	 * If quotas get disabled meanwhile, the resources need to be freed and
	 * we can't just exit here.
	 */
	if (!btrfs_qgroup_full_accounting(fs_info) ||
	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
		goto out_free;

	if (new_roots) {
		if (!maybe_fs_roots(new_roots))
			goto out_free;
		nr_new_roots = new_roots->nnodes;
	}
	if (old_roots) {
		if (!maybe_fs_roots(old_roots))
			goto out_free;
		nr_old_roots = old_roots->nnodes;
	}

	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
	if (nr_old_roots == 0 && nr_new_roots == 0)
		goto out_free;

	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
					  num_bytes, nr_old_roots, nr_new_roots);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			ret = 0;
			goto out_free;
		}
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	spin_lock(&fs_info->qgroup_lock);
	seq = fs_info->qgroup_seq;

	/* Update old refcnts using old_roots */
	qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);

	/* Update new refcnts using new_roots */
	qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);

	qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
			       num_bytes, seq);

	/*
	 * We're done using the iterator, release all its qgroups while holding
	 * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
	 * and trigger use-after-free accesses to qgroups.
	 */
	qgroup_iterator_nested_clean(&qgroups);

	/*
	 * Bump qgroup_seq to avoid seq overlap
	 */
	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
	spin_unlock(&fs_info->qgroup_lock);
out_free:
	ulist_free(old_roots);
	ulist_free(new_roots);
	return ret;
}
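
/*
 * Minimal usage sketch (editor's illustration; real callers get the root
 * lists from btrfs_find_all_roots()): account a 16K extent that used to be
 * seen only from subvolume 257 and is now also shared with snapshot 258.
 * Note that both ulists are consumed (freed) by the call, even on the
 * quick-exit paths:
 *
 *	struct ulist *old_roots = ulist_alloc(GFP_NOFS);
 *	struct ulist *new_roots = ulist_alloc(GFP_NOFS);
 *
 *	ulist_add(old_roots, 257, 0, GFP_NOFS);
 *	ulist_add(new_roots, 257, 0, GFP_NOFS);
 *	ulist_add(new_roots, 258, 0, GFP_NOFS);
 *	ret = btrfs_qgroup_account_extent(trans, bytenr, SZ_16K,
 *					  old_roots, new_roots);
 *
 * With nr_old_roots == 1 and nr_new_roots == 2, qgroup 0/257 goes from
 * exclusive to shared (excl -= 16K, rfer unchanged) while 0/258 gains
 * rfer += 16K.
 */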
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct ulist *new_roots = NULL;
	unsigned long index;
	u64 num_dirty_extents = 0;
	u64 qgroup_to_skip;
	int ret = 0;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	qgroup_to_skip = delayed_refs->qgroup_to_skip;
	xa_for_each(&delayed_refs->dirty_extents, index, record) {
		const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);

		num_dirty_extents++;
		trace_btrfs_qgroup_account_extents(fs_info, record, bytenr);

		if (!ret && !(fs_info->qgroup_flags &
			      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
			struct btrfs_backref_walk_ctx ctx = { 0 };

			ctx.bytenr = bytenr;
			ctx.fs_info = fs_info;

			/*
			 * Old roots should be searched when inserting qgroup
			 * extent record.
			 *
			 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
			 * we may have some record inserted during
			 * NO_ACCOUNTING (thus no old_roots populated), but
			 * later we start rescan, which clears NO_ACCOUNTING,
			 * leaving some inserted records without old_roots
			 * populated.
			 *
			 * Those cases are rare and should not cause too much
			 * time spent during commit_transaction().
			 */
			if (!record->old_roots) {
				/* Search commit root to find old_roots */
				ret = btrfs_find_all_roots(&ctx, false);
				if (ret < 0)
					goto cleanup;
				record->old_roots = ctx.roots;
				ctx.roots = NULL;
			}

			/*
			 * Use BTRFS_SEQ_LAST as time_seq to do special search,
			 * which doesn't lock tree or delayed_refs and search
			 * current root. It's safe inside commit_transaction().
			 */
			ctx.trans = trans;
			ctx.time_seq = BTRFS_SEQ_LAST;
			ret = btrfs_find_all_roots(&ctx, false);
			if (ret < 0)
				goto cleanup;
			new_roots = ctx.roots;
			if (qgroup_to_skip) {
				ulist_del(new_roots, qgroup_to_skip, 0);
				ulist_del(record->old_roots, qgroup_to_skip,
					  0);
			}
			ret = btrfs_qgroup_account_extent(trans, bytenr,
							  record->num_bytes,
							  record->old_roots,
							  new_roots);
			record->old_roots = NULL;
			new_roots = NULL;
		}
		/* Free the reserved data space */
		btrfs_qgroup_free_refroot(fs_info,
				record->data_rsv_refroot,
				record->data_rsv,
				BTRFS_QGROUP_RSV_DATA);
cleanup:
		ulist_free(record->old_roots);
		ulist_free(new_roots);
		new_roots = NULL;
		xa_erase(&delayed_refs->dirty_extents, index);
		kfree(record);
	}
	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
				       num_dirty_extents);
	return ret;
}
/*
 * Writes all changed qgroups to disk.
 * Called by the transaction commit path and the qgroup assign ioctl.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	/*
	 * In case we are called from the qgroup assign ioctl, assert that we
	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
	 * disable operation (ioctl) and access a freed quota root.
	 */
	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);

	if (!fs_info->quota_root)
		return ret;

	spin_lock(&fs_info->qgroup_lock);
	while (!list_empty(&fs_info->dirty_qgroups)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(&fs_info->dirty_qgroups,
					  struct btrfs_qgroup, dirty);
		list_del_init(&qgroup->dirty);
		spin_unlock(&fs_info->qgroup_lock);
		ret = update_qgroup_info_item(trans, qgroup);
		if (ret)
			qgroup_mark_inconsistent(fs_info);
		ret = update_qgroup_limit_item(trans, qgroup);
		if (ret)
			qgroup_mark_inconsistent(fs_info);
		spin_lock(&fs_info->qgroup_lock);
	}
	if (btrfs_qgroup_enabled(fs_info))
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
	else
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_status_item(trans);
	if (ret)
		qgroup_mark_inconsistent(fs_info);

	return ret;
}
int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup_inherit *inherit,
			       size_t size)
{
	if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
		return -EOPNOTSUPP;
	if (size < sizeof(*inherit) || size > PAGE_SIZE)
		return -EINVAL;

	/*
	 * In the past we allowed btrfs_qgroup_inherit to specify to copy
	 * rfer/excl numbers directly from other qgroups.  This behavior has
	 * been disabled in userspace for a very long time, but here we should
	 * also disable it in kernel, as this behavior is known to mark qgroup
	 * inconsistent, and a rescan would wipe out the changes anyway.
	 *
	 * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
	 */
	if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0)
		return -EINVAL;

	if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
		return -EINVAL;

	/*
	 * Skip the inherit source qgroups check if qgroup is not enabled.
	 * Qgroup can still be later enabled causing problems, but in that case
	 * btrfs_qgroup_inherit() would just ignore those invalid ones.
	 */
	if (!btrfs_qgroup_enabled(fs_info))
		return 0;

	/*
	 * Now check all the remaining qgroups, they should all:
	 *
	 * - Exist
	 * - Be higher level qgroups.
	 */
	for (int i = 0; i < inherit->num_qgroups; i++) {
		struct btrfs_qgroup *qgroup;
		u64 qgroupid = inherit->qgroups[i];

		if (btrfs_qgroup_level(qgroupid) == 0)
			return -EINVAL;

		spin_lock(&fs_info->qgroup_lock);
		qgroup = find_qgroup_rb(fs_info, qgroupid);
		if (!qgroup) {
			spin_unlock(&fs_info->qgroup_lock);
			return -ENOENT;
		}
		spin_unlock(&fs_info->qgroup_lock);
	}
	return 0;
}
static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
			       u64 inode_rootid,
			       struct btrfs_qgroup_inherit **inherit)
{
	int i = 0;
	u64 num_qgroups = 0;
	struct btrfs_qgroup *inode_qg;
	struct btrfs_qgroup_list *qg_list;
	struct btrfs_qgroup_inherit *res;
	size_t struct_sz;
	u64 *qgids;

	if (*inherit)
		return -EEXIST;

	inode_qg = find_qgroup_rb(fs_info, inode_rootid);
	if (!inode_qg)
		return -ENOENT;

	num_qgroups = list_count_nodes(&inode_qg->groups);

	if (!num_qgroups)
		return 0;

	struct_sz = struct_size(res, qgroups, num_qgroups);
	if (struct_sz == SIZE_MAX)
		return -ERANGE;

	res = kzalloc(struct_sz, GFP_NOFS);
	if (!res)
		return -ENOMEM;
	res->num_qgroups = num_qgroups;
	qgids = res->qgroups;

	list_for_each_entry(qg_list, &inode_qg->groups, next_group)
		qgids[i++] = qg_list->group->qgroupid;

	*inherit = res;
	return 0;
}
/*
 * Check if we can skip rescan when inheriting qgroups.  If @src has a single
 * @parent, and that @parent is owning all its bytes exclusively, we can skip
 * the full rescan, by just adding nodesize to the @parent's excl/rfer.
 *
 * Return <0 for fatal errors (like srcid/parentid has no qgroup).
 * Return 0 if a quick inherit is done.
 * Return >0 if a quick inherit is not possible, and a full rescan is needed.
 */
static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
					 u64 srcid, u64 parentid)
{
	struct btrfs_qgroup *src;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;
	int nr_parents = 0;

	src = find_qgroup_rb(fs_info, srcid);
	if (!src)
		return -ENOENT;
	parent = find_qgroup_rb(fs_info, parentid);
	if (!parent)
		return -ENOENT;

	/*
	 * Source has no parent qgroup, but our new qgroup would have one.
	 * Qgroup numbers would become inconsistent.
	 */
	if (list_empty(&src->groups))
		return 1;

	list_for_each_entry(list, &src->groups, next_group) {
		/* The parent is not the same, quick update is not possible. */
		if (list->group->qgroupid != parentid)
			return 1;
		nr_parents++;
		/*
		 * More than one parent qgroup, we can't be sure about accounting
		 * consistency.
		 */
		if (nr_parents > 1)
			return 1;
	}

	/*
	 * The parent is not exclusively owning all its bytes.  We're not sure
	 * if the source has any bytes not fully owned by the parent.
	 */
	if (parent->excl != parent->rfer)
		return 1;

	parent->excl += fs_info->nodesize;
	parent->rfer += fs_info->nodesize;
	return 0;
}
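
/*
 * Worked example (editor's illustration): subvolume 0/256 has the single
 * parent 1/100, and 1/100 has rfer == excl, i.e. it exclusively owns every
 * byte below it.  Snapshotting 0/256 into a new qgroup under the same 1/100
 * only adds one new tree block (the new root node), so bumping 1/100's rfer
 * and excl by nodesize keeps the numbers exact and the rescan is skipped.
 * If instead excl != rfer, some bytes under 1/100 are shared with qgroups
 * outside of it and we cannot tell whether the snapshot shares them too, so
 * 1 is returned and a full rescan is scheduled.
 */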
/*
 * Copy the accounting information between qgroups. This is necessary
 * when a snapshot or a subvolume is created. Throwing an error will
 * cause a transaction abort so we take extra care here to only error
 * when a readonly fs is a reasonable outcome.
 */
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
			 u64 objectid, u64 inode_rootid,
			 struct btrfs_qgroup_inherit *inherit)
{
	int ret = 0;
	u64 *i_qgroups;
	bool committing = false;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *srcgroup;
	struct btrfs_qgroup *dstgroup;
	struct btrfs_qgroup *prealloc;
	struct btrfs_qgroup_list **qlist_prealloc = NULL;
	bool free_inherit = false;
	bool need_rescan = false;
	u32 level_size = 0;
	u64 nums;

	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc)
		return -ENOMEM;

	/*
	 * There are only two callers of this function.
	 *
	 * One in create_subvol() in the ioctl context, which needs to hold
	 * the qgroup_ioctl_lock.
	 *
	 * The other one in create_pending_snapshot() where no other qgroup
	 * code can modify the fs as they all need to either start a new trans
	 * or hold a trans handler, thus we don't need to hold
	 * qgroup_ioctl_lock.
	 * This would avoid long and complex lock chain and make lockdep happy.
	 */
	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
		committing = true;
	spin_unlock(&fs_info->trans_lock);

	if (!committing)
		mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!btrfs_qgroup_enabled(fs_info))
		goto out;

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
		ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
		if (ret)
			goto out;
		free_inherit = true;
	}

	if (inherit) {
		i_qgroups = (u64 *)(inherit + 1);
		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
		       2 * inherit->num_excl_copies;
		for (int i = 0; i < nums; i++) {
			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);

			/*
			 * Zero out invalid groups so we can ignore
			 * them later.
			 */
			if (!srcgroup ||
			    ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
				*i_qgroups = 0ULL;

			++i_qgroups;
		}
	}

	/*
	 * create a tracking group for the subvol itself
	 */
	ret = add_qgroup_item(trans, quota_root, objectid);
	if (ret)
		goto out;

	/*
	 * add qgroup to all inherited groups
	 */
	if (inherit) {
		i_qgroups = (u64 *)(inherit + 1);
		for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) {
			if (*i_qgroups == 0)
				continue;
			ret = add_qgroup_relation_item(trans, objectid,
						       *i_qgroups);
			if (ret && ret != -EEXIST)
				goto out;
			ret = add_qgroup_relation_item(trans, *i_qgroups,
						       objectid);
			if (ret && ret != -EEXIST)
				goto out;
		}
		ret = 0;

		qlist_prealloc = kcalloc(inherit->num_qgroups,
					 sizeof(struct btrfs_qgroup_list *),
					 GFP_NOFS);
		if (!qlist_prealloc) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < inherit->num_qgroups; i++) {
			qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
						    GFP_NOFS);
			if (!qlist_prealloc[i]) {
				ret = -ENOMEM;
				goto out;
			}
		}
	}

	spin_lock(&fs_info->qgroup_lock);

	dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
	prealloc = NULL;

	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
		dstgroup->lim_flags = inherit->lim.flags;
		dstgroup->max_rfer = inherit->lim.max_rfer;
		dstgroup->max_excl = inherit->lim.max_excl;
		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
		dstgroup->rsv_excl = inherit->lim.rsv_excl;

		qgroup_dirty(fs_info, dstgroup);
	}

	if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
		srcgroup = find_qgroup_rb(fs_info, srcid);
		if (!srcgroup)
			goto unlock;

		/*
		 * We call inherit after we clone the root in order to make sure
		 * our counts don't go crazy, so at this point the only
		 * difference between the two roots should be the root node.
		 */
		level_size = fs_info->nodesize;
		dstgroup->rfer = srcgroup->rfer;
		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
		dstgroup->excl = level_size;
		dstgroup->excl_cmpr = level_size;
		srcgroup->excl = level_size;
		srcgroup->excl_cmpr = level_size;

		/* inherit the limit info */
		dstgroup->lim_flags = srcgroup->lim_flags;
		dstgroup->max_rfer = srcgroup->max_rfer;
		dstgroup->max_excl = srcgroup->max_excl;
		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
		dstgroup->rsv_excl = srcgroup->rsv_excl;

		qgroup_dirty(fs_info, dstgroup);
		qgroup_dirty(fs_info, srcgroup);

		/*
		 * If the source qgroup has parent but the new one doesn't,
		 * we need a full rescan.
		 */
		if (!inherit && !list_empty(&srcgroup->groups))
			need_rescan = true;
	}

	if (!inherit)
		goto unlock;

	i_qgroups = (u64 *)(inherit + 1);
	for (int i = 0; i < inherit->num_qgroups; i++) {
		if (*i_qgroups) {
			ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
					      *i_qgroups);
			qlist_prealloc[i] = NULL;
			if (ret)
				goto unlock;
		}
		if (srcid) {
			/* Check if we can do a quick inherit. */
			ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups);
			if (ret < 0)
				goto unlock;
			if (ret > 0)
				need_rescan = true;
			ret = 0;
		}
		++i_qgroups;
	}

	for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) {
		struct btrfs_qgroup *src;
		struct btrfs_qgroup *dst;

		if (!i_qgroups[0] || !i_qgroups[1])
			continue;

		src = find_qgroup_rb(fs_info, i_qgroups[0]);
		dst = find_qgroup_rb(fs_info, i_qgroups[1]);

		if (!src || !dst) {
			ret = -EINVAL;
			goto unlock;
		}

		dst->rfer = src->rfer - level_size;
		dst->rfer_cmpr = src->rfer_cmpr - level_size;

		/* Manually tweaking numbers certainly needs a rescan */
		need_rescan = true;
	}
	for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) {
		struct btrfs_qgroup *src;
		struct btrfs_qgroup *dst;

		if (!i_qgroups[0] || !i_qgroups[1])
			continue;

		src = find_qgroup_rb(fs_info, i_qgroups[0]);
		dst = find_qgroup_rb(fs_info, i_qgroups[1]);

		if (!src || !dst) {
			ret = -EINVAL;
			goto unlock;
		}

		dst->excl = src->excl + level_size;
		dst->excl_cmpr = src->excl_cmpr + level_size;
		need_rescan = true;
	}

unlock:
	spin_unlock(&fs_info->qgroup_lock);
	if (!ret)
		ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
	if (!committing)
		mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (need_rescan)
		qgroup_mark_inconsistent(fs_info);
	if (qlist_prealloc) {
		for (int i = 0; i < inherit->num_qgroups; i++)
			kfree(qlist_prealloc[i]);
		kfree(qlist_prealloc);
	}
	if (free_inherit)
		kfree(inherit);
	kfree(prealloc);
	return ret;
}
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
	    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
		return false;

	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
	    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
		return false;

	return true;
}
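
/*
 * Worked example (editor's illustration): with BTRFS_QGROUP_LIMIT_MAX_RFER
 * set and max_rfer == 1G, rfer == 900M already accounted and 100M still
 * sitting in reservations, a new 64M reservation is rejected since
 * 100M + 900M + 64M > 1G.  Reserved but unwritten bytes count against the
 * limit exactly like committed ones, so writers see -EDQUOT before the
 * on-disk numbers can ever exceed the configured limit.
 */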
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
			  enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 ref_root = btrfs_root_id(root);
	int ret = 0;
	LIST_HEAD(qgroup_list);

	if (!is_fstree(ref_root))
		return 0;

	if (num_bytes == 0)
		return 0;

	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
	    capable(CAP_SYS_RESOURCE))
		enforce = false;

	spin_lock(&fs_info->qgroup_lock);
	if (!fs_info->quota_root)
		goto out;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
			ret = -EDQUOT;
			goto out;
		}

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}

	/*
	 * no limits exceeded, now record the reservation into all qgroups
	 */
	list_for_each_entry(qgroup, &qgroup_list, iterator)
		qgroup_rsv_add(fs_info, qgroup, num_bytes, type);

out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}
/*
 * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
 * qgroup).
 *
 * Will handle all higher level qgroups too.
 *
 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
 * This special case is only used for META_PERTRANS type.
 */
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
			       u64 ref_root, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (!is_fstree(ref_root))
		return;

	if (num_bytes == 0)
		return;

	if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
		WARN(1, "%s: Invalid type to free", __func__);
		return;
	}
	spin_lock(&fs_info->qgroup_lock);

	if (!fs_info->quota_root)
		goto out;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	if (num_bytes == (u64)-1)
		/*
		 * We're freeing all pertrans rsv, get reserved value from
		 * level 0 qgroup as real num_bytes to free.
		 */
		num_bytes = qgroup->rsv.values[type];

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			qgroup_iterator_add(&qgroup_list, glist->group);
		}
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}
/*
 * Check if the leaf is the last leaf, which means all node pointers
 * are at their last position.
 */
static bool is_last_leaf(struct btrfs_path *path)
{
	int i;

	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
			return false;
	}
	return true;
}
/*
 * returns < 0 on error, 0 when more leafs are to be scanned.
 * returns 1 when done.
 */
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
			      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_key found;
	struct extent_buffer *scratch_leaf = NULL;
	u64 num_bytes;
	bool done;
	int slot;
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 1;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	extent_root = btrfs_extent_root(fs_info,
				fs_info->qgroup_rescan_progress.objectid);
	ret = btrfs_search_slot_for_read(extent_root,
					 &fs_info->qgroup_rescan_progress,
					 path, 1, 0);

	btrfs_debug(fs_info,
		"current progress key (%llu %u %llu), search_slot ret %d",
		fs_info->qgroup_rescan_progress.objectid,
		fs_info->qgroup_rescan_progress.type,
		fs_info->qgroup_rescan_progress.offset, ret);

	if (ret) {
		/*
		 * The rescan is about to end, we will not be scanning any
		 * further blocks. We cannot unset the RESCAN flag here, because
		 * we want to commit the transaction if everything went well.
		 * To make the live accounting work in this phase, we set our
		 * scan progress pointer such that every real extent objectid
		 * will be smaller.
		 */
		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
		btrfs_release_path(path);
		mutex_unlock(&fs_info->qgroup_rescan_lock);
		return ret;
	}
	done = is_last_leaf(path);

	btrfs_item_key_to_cpu(path->nodes[0], &found,
			      btrfs_header_nritems(path->nodes[0]) - 1);
	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;

	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!scratch_leaf) {
		ret = -ENOMEM;
		mutex_unlock(&fs_info->qgroup_rescan_lock);
		goto out;
	}
	slot = path->slots[0];
	btrfs_release_path(path);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
		    found.type != BTRFS_METADATA_ITEM_KEY)
			continue;
		if (found.type == BTRFS_METADATA_ITEM_KEY)
			num_bytes = fs_info->nodesize;
		else
			num_bytes = found.offset;

		ctx.bytenr = found.objectid;
		ctx.fs_info = fs_info;

		ret = btrfs_find_all_roots(&ctx, false);
		if (ret < 0)
			goto out;
		/* For rescan, just pass old_roots as NULL */
		ret = btrfs_qgroup_account_extent(trans, found.objectid,
						  num_bytes, NULL, ctx.roots);
		if (ret < 0)
			goto out;
	}
out:
	free_extent_buffer(scratch_leaf);

	if (done && !ret) {
		ret = 1;
		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
	}
	return ret;
}
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
	if (btrfs_fs_closing(fs_info))
		return true;
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		return true;
	if (!btrfs_qgroup_enabled(fs_info))
		return true;
	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
		return true;
	return false;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
						     qgroup_rescan_work);
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;
	bool stopped = false;
	bool did_leaf_rescans = false;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		return;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	/*
	 * Rescan should only search for commit root, and any later difference
	 * should be recorded by qgroup
	 */
	path->search_commit_root = 1;
	path->skip_locking = 1;

	while (!ret && !(stopped = rescan_should_stop(fs_info))) {
		trans = btrfs_start_transaction(fs_info->fs_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = qgroup_rescan_leaf(trans, path);
		did_leaf_rescans = true;

		if (ret > 0)
			btrfs_commit_transaction(trans);
		else
			btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (ret > 0 &&
	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	} else if (ret < 0 || stopped) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	/*
	 * Only update status, since the previous part has already updated the
	 * qgroup info, and only if we did any actual work. This also prevents
	 * race with a concurrent quota disable, which has already set
	 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
	 * btrfs_quota_disable().
	 */
	if (did_leaf_rescans) {
		trans = btrfs_start_transaction(fs_info->quota_root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			btrfs_err(fs_info,
				  "fail to start transaction for status update: %d",
				  ret);
		}
	} else {
		trans = NULL;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (!stopped ||
	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	if (trans) {
		int ret2 = update_qgroup_status_item(trans);

		if (ret2 < 0) {
			ret = ret2;
			btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
		}
	}
	fs_info->qgroup_rescan_running = false;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
	complete_all(&fs_info->qgroup_rescan_completion);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!trans)
		return;

	btrfs_end_transaction(trans);

	if (stopped) {
		btrfs_info(fs_info, "qgroup scan paused");
	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
		btrfs_info(fs_info, "qgroup scan cancelled");
	} else if (ret >= 0) {
		btrfs_info(fs_info, "qgroup scan completed%s",
			ret > 0 ? " (inconsistency flag cleared)" : "");
	} else {
		btrfs_err(fs_info, "qgroup scan failed with %d", ret);
	}
}
/*
 * Checks that (a) no rescan is running and (b) quota is enabled.  Allocates all
 * memory required for the rescan context.
 */
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags)
{
	int ret = 0;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
		btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
		return -EINVAL;
	}

	if (!init_flags) {
		/* we're resuming qgroup rescan at mount time */
		if (!(fs_info->qgroup_flags &
		      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup rescan is not queued");
			ret = -EINVAL;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -ENOTCONN;
		}

		if (ret)
			return ret;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);

	if (init_flags) {
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
			ret = -EINPROGRESS;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -ENOTCONN;
		} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
			/* Quota disable is in progress */
			ret = -EBUSY;
		}

		if (ret) {
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			return ret;
		}
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	}

	memset(&fs_info->qgroup_rescan_progress, 0,
		sizeof(fs_info->qgroup_rescan_progress));
	fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
				   BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
	init_completion(&fs_info->qgroup_rescan_completion);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	btrfs_init_work(&fs_info->qgroup_rescan_work,
			btrfs_qgroup_rescan_worker, NULL);
	return 0;
}
static void
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	spin_lock(&fs_info->qgroup_lock);
	/* clear all current qgroup tracking information */
	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		qgroup->rfer = 0;
		qgroup->rfer_cmpr = 0;
		qgroup->excl = 0;
		qgroup->excl_cmpr = 0;
		qgroup_dirty(fs_info, qgroup);
	}
	spin_unlock(&fs_info->qgroup_lock);
}
int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (ret)
		return ret;

	/*
	 * We have set the rescan_progress to 0, which means no more
	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
	 * However, btrfs_qgroup_account_ref may be right after its call
	 * to btrfs_find_all_roots, in which case it would still do the
	 * accounting.
	 * To solve this, we're committing the transaction, which will
	 * ensure we run all delayed refs and only after that, we are
	 * going to clear all tracking information for a clean start.
	 */

	ret = btrfs_commit_current_transaction(fs_info->fs_root);
	if (ret) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		return ret;
	}

	qgroup_rescan_zero_tracking(fs_info);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	fs_info->qgroup_rescan_running = true;
	btrfs_queue_work(fs_info->qgroup_rescan_workers,
			 &fs_info->qgroup_rescan_work);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	return 0;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible)
{
	int running;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	running = fs_info->qgroup_rescan_running;
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!running)
		return 0;

	if (interruptible)
		ret = wait_for_completion_interruptible(
					&fs_info->qgroup_rescan_completion);
	else
		wait_for_completion(&fs_info->qgroup_rescan_completion);

	return ret;
}
/*
 * this is only called from open_ctree where we're still single threaded, thus
 * locking is omitted here.
 */
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		mutex_lock(&fs_info->qgroup_rescan_lock);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
		mutex_unlock(&fs_info->qgroup_rescan_lock);
	}
}
#define rbtree_iterate_from_safe(node, next, start)				\
	for (node = start; node && ({ next = rb_next(node); 1;}); node = next)

static int qgroup_unreserve_range(struct btrfs_inode *inode,
				  struct extent_changeset *reserved, u64 start,
				  u64 len)
{
	struct rb_node *node;
	struct rb_node *next;
	struct ulist_node *entry;
	int ret = 0;

	node = reserved->range_changed.root.rb_node;
	if (!node)
		return 0;
	while (node) {
		entry = rb_entry(node, struct ulist_node, rb_node);
		if (entry->val < start)
			node = node->rb_right;
		else
			node = node->rb_left;
	}

	if (entry->val > start && rb_prev(&entry->rb_node))
		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
				 rb_node);

	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
		u64 entry_start;
		u64 entry_end;
		u64 entry_len;
		int clear_ret;

		entry = rb_entry(node, struct ulist_node, rb_node);
		entry_start = entry->val;
		entry_end = entry->aux;
		entry_len = entry_end - entry_start + 1;

		if (entry_start >= start + len)
			break;
		if (entry_start + entry_len <= start)
			continue;
		/*
		 * Now the entry is in [start, start + len), revert the
		 * EXTENT_QGROUP_RESERVED bit.
		 */
		clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
					      entry_end, EXTENT_QGROUP_RESERVED);
		if (!ret && clear_ret < 0)
			ret = clear_ret;

		ulist_del(&reserved->range_changed, entry->val, entry->aux);
		if (likely(reserved->bytes_changed >= entry_len)) {
			reserved->bytes_changed -= entry_len;
		} else {
			WARN_ON(1);
			reserved->bytes_changed = 0;
		}
	}

	return ret;
}
4161 * For qgroup, there are only 3 ways to free qgroup space:
4162 * - Flush nodatacow write
4163 * Any nodatacow write will free its reserved data space at run_delalloc_range().
4164 * In theory, we should only flush nodatacow inodes, but it's not yet
4165 * possible, so we need to flush the whole root.
4167 * - Wait for ordered extents
4168 * When ordered extents are finished, their reserved metadata is finally
4169 * converted to per_trans status, which can be freed by later commit
4172 * - Commit transaction
4173 * This would free the meta_per_trans space.
4174 * In theory this shouldn't provide much space, but any more qgroup space
4177 static int try_flush_qgroup(struct btrfs_root
*root
)
4181 /* Can't hold an open transaction or we run the risk of deadlocking. */
4182 ASSERT(current
->journal_info
== NULL
);
4183 if (WARN_ON(current
->journal_info
))
4187 * We don't want to run flush again and again, so if there is a running
4188 * one, we won't try to start a new flush, but exit directly.
4190 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING
, &root
->state
)) {
4191 wait_event(root
->qgroup_flush_wait
,
4192 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING
, &root
->state
));
4196 ret
= btrfs_start_delalloc_snapshot(root
, true);
4199 btrfs_wait_ordered_extents(root
, U64_MAX
, NULL
);
4202 * After waiting for ordered extents run delayed iputs in order to free
4203 * space from unlinked files before committing the current transaction,
4204 * as ordered extents may have been holding the last reference of an
4205 * inode and they add a delayed iput when they complete.
4207 btrfs_run_delayed_iputs(root
->fs_info
);
4208 btrfs_wait_on_delayed_iputs(root
->fs_info
);
4210 ret
= btrfs_commit_current_transaction(root
);
4212 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING
, &root
->state
);
4213 wake_up(&root
->qgroup_flush_wait
);
static int qgroup_reserve_data(struct btrfs_inode *inode,
			struct extent_changeset **reserved_ret, u64 start,
			u64 len)
{
	struct btrfs_root *root = inode->root;
	struct extent_changeset *reserved;
	bool new_reserved = false;
	u64 orig_reserved;
	u64 to_reserve;
	int ret;

	if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)) || len == 0)
		return 0;

	/* @reserved parameter is mandatory for qgroup */
	if (WARN_ON(!reserved_ret))
		return -EINVAL;
	if (!*reserved_ret) {
		new_reserved = true;
		*reserved_ret = extent_changeset_alloc();
		if (!*reserved_ret)
			return -ENOMEM;
	}
	reserved = *reserved_ret;
	/* Record already reserved space */
	orig_reserved = reserved->bytes_changed;
	ret = set_record_extent_bits(&inode->io_tree, start,
			start + len - 1, EXTENT_QGROUP_RESERVED, reserved);

	/* Newly reserved space */
	to_reserve = reserved->bytes_changed - orig_reserved;
	trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
					to_reserve, QGROUP_RESERVE);
	if (ret < 0)
		goto out;
	ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
	if (ret < 0)
		goto cleanup;

	return ret;

cleanup:
	qgroup_unreserve_range(inode, reserved, start, len);
out:
	if (new_reserved) {
		extent_changeset_free(reserved);
		*reserved_ret = NULL;
	}
	return ret;
}
/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do nothing
 * if the range is already reserved.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing and
 * commit transaction.  So caller should not hold any dirty page locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
			struct extent_changeset **reserved_ret, u64 start,
			u64 len)
{
	int ret;

	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
	if (ret <= 0 && ret != -EDQUOT)
		return ret;

	ret = try_flush_qgroup(inode->root);
	if (ret < 0)
		return ret;
	return qgroup_reserve_data(inode, reserved_ret, start, len);
}
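
/*
 * Typical data reservation life cycle (editor's sketch, not taken from a
 * real caller):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *
 *	(write the data)
 *
 *	On failure, hand the reservation back to the qgroups:
 *		btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
 *	On success, once the file extent item is inserted, only clear the
 *	io_tree bits and keep the rsv for commit-time accounting:
 *		btrfs_qgroup_release_data(inode, start, len, &released);
 *
 *	extent_changeset_free(reserved);
 */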
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
				     struct extent_changeset *reserved,
				     u64 start, u64 len, u64 *freed_ret)
{
	struct btrfs_root *root = inode->root;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct extent_changeset changeset;
	u64 freed = 0;
	int ret;

	extent_changeset_init(&changeset);
	len = round_up(start + len, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
		u64 range_start = unode->val;
		/* unode->aux is the inclusive end */
		u64 range_len = unode->aux - range_start + 1;
		u64 free_start;
		u64 free_len;

		extent_changeset_release(&changeset);

		/* Only free range in range [start, start + len) */
		if (range_start >= start + len ||
		    range_start + range_len <= start)
			continue;
		free_start = max(range_start, start);
		free_len = min(start + len, range_start + range_len) -
			   free_start;
		/*
		 * TODO: To also modify reserved->ranges_reserved to reflect
		 * the modification.
		 *
		 * However as long as we free qgroup reserved according to
		 * EXTENT_QGROUP_RESERVED, we won't double free.
		 * So no need to rush.
		 */
		ret = clear_record_extent_bits(&inode->io_tree, free_start,
					       free_start + free_len - 1,
					       EXTENT_QGROUP_RESERVED, &changeset);
		if (ret < 0)
			goto out;
		freed += changeset.bytes_changed;
	}
	btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
				  BTRFS_QGROUP_RSV_DATA);
	if (freed_ret)
		*freed_ret = freed;
	ret = 0;
out:
	extent_changeset_release(&changeset);
	return ret;
}
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			u64 *released, int free)
{
	struct extent_changeset changeset;
	int trace_op = QGROUP_RELEASE;
	int ret;

	if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
		return clear_record_extent_bits(&inode->io_tree, start,
						start + len - 1,
						EXTENT_QGROUP_RESERVED, NULL);
	}

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len, released);
	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
				       EXTENT_QGROUP_RESERVED, &changeset);
	if (ret < 0)
		goto out;

	if (free)
		trace_op = QGROUP_FREE;
	trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
					changeset.bytes_changed, trace_op);
	if (free)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
				btrfs_root_id(inode->root),
				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	if (released)
		*released = changeset.bytes_changed;
out:
	extent_changeset_release(&changeset);
	return ret;
}
/*
 * Free a reserved space range from io_tree and related qgroups
 *
 * Should be called when a range of pages get invalidated before reaching disk.
 * Or for error cleanup case.
 * If @reserved is given, only the reserved range in [@start, @start + @len)
 * will be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
			   struct extent_changeset *reserved,
			   u64 start, u64 len, u64 *freed)
{
	return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}

/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages get written to disk and the
 * corresponding FILE_EXTENT is inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework will only update qgroup numbers at
 * commit_transaction() time, its reserved space shouldn't be freed from
 * related qgroups.
 *
 * But we should release the range from io_tree, to allow further write to be
 * COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return;
	if (num_bytes == 0)
		return;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		root->qgroup_meta_rsv_prealloc += num_bytes;
	else
		root->qgroup_meta_rsv_pertrans += num_bytes;
	spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			     enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return 0;
	if (num_bytes == 0)
		return 0;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
				  num_bytes);
		root->qgroup_meta_rsv_prealloc -= num_bytes;
	} else {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
				  num_bytes);
		root->qgroup_meta_rsv_pertrans -= num_bytes;
	}
	spin_unlock(&root->qgroup_meta_rsv_lock);
	return num_bytes;
}
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret < 0)
		return ret;
	/*
	 * Record what we have reserved into root.
	 *
	 * To avoid quota disabled->enabled underflow.
	 * In that case, we may try to free space we haven't reserved
	 * (since quota was disabled), so record what we reserved into root.
	 * And ensure later release won't underflow this number.
	 */
	add_root_meta_rsv(root, num_bytes, type);
	return ret;
}

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
/*
 * Per-transaction meta reservation should be all freed at transaction commit
 * time
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;

	/* TODO: Update trace point to handle such free */
	trace_qgroup_meta_free_all_pertrans(root);
	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;

	/*
	 * reservation for META_PREALLOC can happen before quota is enabled,
	 * which can lead to underflow.
	 * Here ensure we will only free what we really have reserved.
	 */
	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (num_bytes == 0)
		return;
	if (!fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		if (!sb_rdonly(fs_info->sb))
			qgroup_rsv_add(fs_info, qgroup, num_bytes,
				       BTRFS_QGROUP_RSV_META_PERTRANS);

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}
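
/*
 * qgroup_convert_meta() walks the qgroup and all of its parent qgroups (via
 * the iterator list) under fs_info->qgroup_lock, moving the bytes from the
 * PREALLOC bucket to the PERTRANS bucket of each affected qgroup.
 */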
/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
 *
 * This is called when preallocated meta reservation needs to be used.
 * Normally after btrfs_join_transaction() call.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
	if (!sb_rdonly(fs_info->sb))
		add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
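
/*
 * Typical lifecycle sketch (illustrative only, simplified from how callers
 * commonly use the META reservation types):
 *
 *	btrfs_qgroup_reserve_meta(root, nbytes,
 *				  BTRFS_QGROUP_RSV_META_PREALLOC, true);
 *	trans = btrfs_join_transaction(root);
 *	btrfs_qgroup_convert_reserved_meta(root, nbytes);
 *	// The PERTRANS bytes are then freed in bulk at transaction commit
 *	// via btrfs_qgroup_free_meta_all_pertrans().
 */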
/*
 * Check qgroup reserved space leaking, normally at destroy inode
 * time
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
				       EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
				btrfs_root_id(inode->root),
				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	}
	extent_changeset_release(&changeset);
}
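
/*
 * In btrfs_qgroup_check_reserved_leak() above, any EXTENT_QGROUP_RESERVED
 * bits still set at inode destruction mean some reserved data space was
 * never freed or released; the ranges are reported and then force-freed so
 * the qgroup counters do not leak permanently.
 */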
void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}
/*
 * Delete all swapped blocks record of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}
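
/*
 * rbtree_postorder_for_each_entry_safe() is used above because the whole
 * tree is being torn down: post-order visits each node after its children,
 * so every entry can be freed without any rebalancing of the rb-tree.
 */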
/*
 * Add subtree roots record into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree getting swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **cur;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
	 * no one else can modify tree blocks thus we qgroup will not change
	 * no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	cur = &blocks->blocks[level].rb_node;
	while (*cur) {
		struct btrfs_qgroup_swapped_block *entry;

		parent = *cur;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			cur = &(*cur)->rb_left;
		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
			cur = &(*cur)->rb_right;
		} else {
			if (entry->subvol_generation !=
					block->subvol_generation ||
			    entry->reloc_bytenr != block->reloc_bytenr ||
			    entry->reloc_generation !=
					block->reloc_generation) {
				/*
				 * Duplicated but mismatch entry found.
				 * Shouldn't happen.
				 *
				 * Marking qgroup inconsistent should be enough
				 * for end users.
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}
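
/*
 * The records inserted above are keyed by the post-swap subvolume bytenr,
 * so that btrfs_qgroup_trace_subtree_after_cow() below can look a block up
 * by the extent buffer start it sees at COW time.
 */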
/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	check.level = block->level;
	check.transid = block->reloc_generation;
	check.has_first_key = true;
	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
					block->last_snapshot, block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		qgroup_mark_inconsistent(fs_info);
	}
	return ret;
}
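
/*
 * Together, btrfs_qgroup_add_swapped_blocks() and
 * btrfs_qgroup_trace_subtree_after_cow() implement delayed subtree tracing:
 * balance only records the swapped subtree roots, and the expensive qgroup
 * trace is deferred until the block is actually COWed again.
 */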
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	unsigned long index;

	xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
	xa_destroy(&trans->delayed_refs.dirty_extents);
}
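
/*
 * This cleanup handles extent records that were queued for qgroup
 * accounting but never processed, e.g. when a transaction is aborted; both
 * the old_roots ulist and the record itself must be freed.
 */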
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
			      const struct btrfs_squota_delta *delta)
{
	int ret;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *qg;
	LIST_HEAD(qgroup_list);
	u64 root = delta->root;
	u64 num_bytes = delta->num_bytes;
	const int sign = (delta->is_inc ? 1 : -1);

	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
		return 0;

	if (!is_fstree(root))
		return 0;

	/* If the extent predates enabling quotas, don't count it. */
	if (delta->generation < fs_info->qgroup_enable_gen)
		return 0;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, root);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}
	ret = 0;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qg, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qg->excl += num_bytes * sign;
		qg->rfer += num_bytes * sign;
		qgroup_dirty(fs_info, qg);

		list_for_each_entry(glist, &qg->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
	qgroup_iterator_clean(&qgroup_list);

out:
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}
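
/*
 * In simple quotas mode there is no old_roots/new_roots walk: each extent
 * delta is applied once, at extent creation or deletion, with the sign
 * taken from delta->is_inc, and propagated to all parent qgroups via the
 * iterator list.
 */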