// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"

/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}

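/*
 * Illustrative sketch (not called in this file as-is): per the locking rule
 * documented above, a caller summing every reservation type with
 * qgroup_rsv_total() would hold fs_info->qgroup_lock:
 *
 *	spin_lock(&fs_info->qgroup_lock);
 *	total = qgroup_rsv_total(qgroup);
 *	spin_unlock(&fs_info->qgroup_lock);
 */
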
#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}

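/*
 * Usage note on the two helpers above (illustrative, hypothetical sizes):
 * reservations are tracked per type, so an add must be paired with a
 * release of the same type, e.g.:
 *
 *	qgroup_rsv_add(fs_info, qg, SZ_1M, BTRFS_QGROUP_RSV_META_PREALLOC);
 *	...
 *	qgroup_rsv_release(fs_info, qg, SZ_1M, BTRFS_QGROUP_RSV_META_PREALLOC);
 *
 * Releasing more than was added only underflows (and clamps to zero) that
 * one type, with a ratelimited warning under CONFIG_BTRFS_DEBUG.
 */
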
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}

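/*
 * Worked example for the refcnt helpers above (illustrative): the seq value
 * acts as a per-round baseline so the counters never need an explicit
 * reset.  With seq = 100 and a stale old_refcnt = 42 left over from an
 * earlier round:
 *
 *	btrfs_qgroup_update_old_refcnt(qg, 100, 1);	// old_refcnt = 101
 *	btrfs_qgroup_get_old_refcnt(qg, 100);		// returns 1
 *
 * A qgroup untouched in the current round still reads back as 0.
 */
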
/*
 * glue structure to represent the relations between qgroups.
 */
struct btrfs_qgroup_list {
	struct list_head next_group;
	struct list_head next_member;
	struct btrfs_qgroup *group;
	struct btrfs_qgroup *member;
};

static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
	return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
{
	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

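/*
 * Illustrative sketch: the two helpers above let a struct btrfs_qgroup
 * pointer ride along in a ulist's u64 aux field, which is how the
 * accounting walks later in this file visit parent qgroups:
 *
 *	ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
 *	...
 *	while ((unode = ulist_next(tmp, &uiter)))
 *		qg = unode_aux_to_qgroup(unode);
 */
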
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *n = fs_info->qgroup_tree.rb_node;
	struct btrfs_qgroup *qgroup;

	while (n) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		if (qgroup->qgroupid < qgroupid)
			n = n->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			n = n->rb_right;
		else
			return qgroup;
	}
	return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  u64 qgroupid)
{
	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	while (*p) {
		parent = *p;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid < qgroupid)
			p = &(*p)->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			p = &(*p)->rb_right;
		else
			return qgroup;
	}

	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
	if (!qgroup)
		return ERR_PTR(-ENOMEM);

	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);

	rb_link_node(&qgroup->node, parent, p);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}

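/*
 * Usage sketch for the rbtree helpers above (illustrative): the common
 * pattern in this file is lookup-then-create under the appropriate lock,
 * with an IS_ERR() check since add_qgroup_rb() returns ERR_PTR(-ENOMEM):
 *
 *	qgroup = find_qgroup_rb(fs_info, qgroupid);
 *	if (!qgroup) {
 *		qgroup = add_qgroup_rb(fs_info, qgroupid);
 *		if (IS_ERR(qgroup))
 *			return PTR_ERR(qgroup);
 *	}
 */
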
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
	kfree(qgroup);
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(qgroup);
	return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list = kzalloc(sizeof(*list), GFP_ATOMIC);
	if (!list)
		return -ENOMEM;

	list->group = parent;
	list->member = member;
	list_add_tail(&list->next_group, &member->groups);
	list_add_tail(&list->next_member, &parent->members);

	return 0;
}

/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}

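/*
 * Illustrative sketch: each btrfs_qgroup_list node above is one allocation
 * linked onto two lists at once, so a relation can be walked from either
 * end:
 *
 *	list_for_each_entry(list, &member->groups, next_group)
 *		parent = list->group;
 *	list_for_each_entry(list, &parent->members, next_member)
 *		child = list->member;
 *
 * That is also why del_relation_rb() must list_del() both link fields
 * before freeing the node.
 */
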
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif

/*
 * The full config is read in one go; this is only called from open_ctree().
 * It doesn't take any locks, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
	if (!fs_info->qgroup_ulist) {
		ret = -ENOMEM;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
					  "old qgroup version, quota disabled");
				goto out;
			}
			if (btrfs_qgroup_status_generation(l, ptr) !=
			    fs_info->generation) {
				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
				btrfs_err(fs_info,
					  "qgroup generation mismatch, marked as inconsistent");
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
									  ptr);
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
			btrfs_err(fs_info, "inconsistent qgroup config");
			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		}
		if (!qgroup) {
			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				goto out;
			}
		}
		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto next2;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		ret = add_relation_rb(fs_info, found_key.objectid,
				      found_key.offset);
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				   "orphan qgroup relation 0x%llx->0x%llx",
				   found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
		 ret >= 0)
		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

	if (ret < 0) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	}

	return ret < 0 ? ret : 0;
}

static u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
}

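/*
 * Worked example for btrfs_qgroup_subvolid() above (illustrative): a
 * qgroupid packs the level into the top 16 bits (BTRFS_QGROUP_LEVEL_SHIFT
 * is 48), so the qgroup "1/5" is stored as (1ULL << 48) | 5, and:
 *
 *	btrfs_qgroup_level(qgroupid)	-> 1
 *	btrfs_qgroup_subvolid(qgroupid)	-> 5
 */
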
/*
 * Called in close_ctree() when quota is still enabled.  This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab the
	 * qgroup lock.  We iterate in-order rather than post-order to
	 * provide a more user-friendly sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
					   "qgroup %llu/%llu has unreleased space, type %d rsv %llu",
					   btrfs_qgroup_level(qgroup->qgroupid),
					   btrfs_qgroup_subvolid(qgroup->qgroupid),
					   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}

/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
 * the first two of which are single-threaded paths.  For the third one, we
 * have already set quota_root to NULL with qgroup_lock held, so it is safe
 * to clean up the in-memory structures without holding qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(qgroup);
	}
	/*
	 * We call btrfs_free_qgroup_config() when unmounting the
	 * filesystem and when disabling quota, so we set qgroup_ulist
	 * to NULL here to avoid a double free.
	 */
	ulist_free(fs_info->qgroup_ulist);
	fs_info->qgroup_ulist = NULL;
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}

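/*
 * On-disk layout note for add_qgroup_relation_item() above (illustrative):
 * a relation is a zero-size item whose key is
 * (src, BTRFS_QGROUP_RELATION_KEY, dst).  Callers insert it twice, once per
 * direction, which is why pass 2 of btrfs_read_qgroup_config() skips keys
 * with objectid > offset:
 *
 *	add_qgroup_relation_item(trans, src, dst);
 *	add_qgroup_relation_item(trans, dst, src);
 */
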
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_mark_buffer_dirty(leaf);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				       fs_info->qgroup_rescan_progress.objectid);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	key.objectid = 0;
	key.offset = 0;
	key.type = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete the leaf one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *ulist = NULL;
	int ret = 0;
	int slot;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to enable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
	 * and before setting BTRFS_FS_QUOTA_ENABLED.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ulist = ulist_alloc(GFP_KERNEL);
	if (!ulist) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we have started the transaction, lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for quota root item
	 * 1 for BTRFS_QGROUP_STATUS item
	 *
	 * We also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However, those are not currently reserved since
	 * reserving them would be overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	fs_info->qgroup_ulist = ulist;
	ulist = NULL;

	/*
	 * initially create the quota tree
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	btrfs_mark_buffer_dirty(leaf);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {
			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/*
	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
	 * a deadlock with tasks concurrently doing other qgroup operations,
	 * such as adding/removing qgroups or adding/deleting qgroup relations,
	 * because all qgroup operations first start or join a transaction and
	 * then lock the qgroup_ioctl_lock mutex.
	 * We are safe from a concurrent task trying to enable quotas by calling
	 * this function, since we are serialized by fs_info->subvol_sem.
	 */
	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	spin_unlock(&fs_info->qgroup_lock);

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	} else {
		/*
		 * We have set both BTRFS_FS_QUOTA_ENABLED and
		 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
		 * -EINPROGRESS. That can happen because someone started the
		 * rescan worker by calling quota rescan ioctl before we
		 * attempted to initialize the rescan worker. Failure due to
		 * quotas disabled in the meanwhile is not possible, because
		 * we are holding a write lock on fs_info->subvol_sem, which
		 * is also acquired when disabling quotas.
		 * Ignore such an error; any other error would need to undo
		 * everything we did in the transaction we just committed.
		 */
		ASSERT(ret == -EINPROGRESS);
		ret = 0;
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret) {
		free_extent_buffer(quota_root->node);
		free_extent_buffer(quota_root->commit_root);
		kfree(quota_root);
	}
out:
	if (ret) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
	}
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	ulist_free(ulist);
	return ret;
}

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	/*
	 * We need to have subvol_sem write locked to prevent races with
	 * snapshot creation.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	/*
	 * Lock the cleaner mutex to prevent races with concurrent relocation,
	 * because relocation may be building backrefs for blocks of the quota
	 * root while we are deleting the root. This is like dropping fs roots
	 * of deleted snapshots/subvolumes, we need the same protection.
	 *
	 * This also prevents races between concurrent tasks trying to disable
	 * quotas, because we will unlock and relock qgroup_ioctl_lock across
	 * BTRFS_FS_QUOTA_ENABLED changes.
	 */
	mutex_lock(&fs_info->cleaner_mutex);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;

	/*
	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan
	 * worker to complete. Otherwise we can deadlock, because
	 * btrfs_remove_qgroup() needs to lock that mutex while holding a
	 * transaction handle and the rescan worker needs to commit a
	 * transaction.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * Request the qgroup rescan worker to complete and wait for it. This
	 * wait must be done before starting the transaction for quota disable,
	 * since the qgroup rescan worker may otherwise deadlock with the
	 * transaction.
	 */
	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	spin_lock(&fs_info->trans_lock);
	list_del(&quota_root->dirty_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_tree_lock(quota_root->node);
	btrfs_clean_tree_block(quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

	free_extent_buffer(quota_root->node);
	free_extent_buffer(quota_root->commit_root);
	kfree(quota_root);

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->cleaner_mutex);

	return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

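/*
 * Note on qgroup_dirty() above (illustrative): it only queues the qgroup on
 * fs_info->dirty_qgroups; the updated counters are written back to the
 * quota tree later, via update_qgroup_info_item(), when the dirty list is
 * drained at transaction commit time.
 */
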
/*
 * The easy accounting: we're updating the qgroup relationship for a child
 * qgroup that only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same holds for qgroup reservation space, which should also be
 * added to/removed from the parent.
 * Otherwise, when the child tries to release reservation space, the parent
 * would underflow its reservation (for the relationship-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *glist;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup->rfer += sign * num_bytes;
	qgroup->rfer_cmpr += sign * num_bytes;

	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
	qgroup->excl += sign * num_bytes;
	qgroup->excl_cmpr += sign * num_bytes;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

	qgroup_dirty(fs_info, qgroup);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(glist, &qgroup->groups, next_group) {
		ret = ulist_add(tmp, glist->group->qgroupid,
				qgroup_to_aux(glist->group), GFP_ATOMIC);
		if (ret < 0)
			goto out;
	}

	/* Iterate all of the parents and adjust their reference counts */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		qgroup = unode_aux_to_qgroup(unode);
		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;
		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup->excl_cmpr += sign * num_bytes;
		qgroup_dirty(fs_info, qgroup);

		/* Add any parents of the parents */
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			ret = ulist_add(tmp, glist->group->qgroupid,
					qgroup_to_aux(glist->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Quick path for updating a qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough;
 * otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for a quick update; return >0 if a full rescan is needed,
 * and mark the INCONSISTENT flag.
 * Return < 0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;
	int err = 0;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = 0;
		err = __qgroup_excl_accounting(fs_info, tmp, dst,
					       qgroup, sign);
		if (err < 0) {
			ret = err;
			goto out;
		}
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}

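/*
 * Worked example for quick_update_accounting() above (illustrative,
 * hypothetical sizes): a child qgroup with excl == rfer == 1MiB is attached
 * to a parent (sign = +1).  The quick path simply bumps the parent's rfer
 * and excl by 1MiB and copies the child's reservations up.  If the child
 * shares any extent (excl != rfer), ownership cannot be decided locally, so
 * the function returns 1 and the quota state is marked inconsistent until
 * a full rescan.
 */
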
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* First check if such a qgroup relation already exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = add_relation_rb(fs_info, src, dst);
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	int ret = 0;
	int ret2;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, then just try to delete
	 * the dead relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* First check if such a qgroup relation exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	ulist_free(tmp);
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	if (IS_ERR(qgroup))
		ret = PTR_ERR(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
{
	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
		ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we would want to clear the limit on this qgroup.
	 * To meet this requirement, we treat -1 as a special value
	 * which tells the kernel to clear the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_info(fs_info, "unable to update quota limit for %llu",
			   qgroupid);
	}

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

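/*
 * Usage sketch for btrfs_limit_qgroup() above (illustrative, hypothetical
 * values): to clear a previously set referenced-bytes limit, pass
 * max_rfer = (u64)-1:
 *
 *	struct btrfs_qgroup_limit lim = {
 *		.flags = BTRFS_QGROUP_LIMIT_MAX_RFER,
 *		.max_rfer = (u64)-1,
 *	};
 *	btrfs_limit_qgroup(trans, qgroupid, &lim);
 *
 * This drops BTRFS_QGROUP_LIMIT_MAX_RFER from lim_flags and zeroes
 * qgroup->max_rfer instead of storing the -1 verbatim.
 */
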
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				struct btrfs_delayed_ref_root *delayed_refs,
				struct btrfs_qgroup_extent_record *record)
{
	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_qgroup_extent_record *entry;
	u64 bytenr = record->bytenr;

	lockdep_assert_held(&delayed_refs->lock);
	trace_btrfs_qgroup_trace_extent(fs_info, record);

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
				 node);
		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
		} else {
			if (record->data_rsv && !entry->data_rsv) {
				entry->data_rsv = record->data_rsv;
				entry->data_rsv_refroot =
					record->data_rsv_refroot;
			}
			return 1;
		}
	}

	rb_link_node(&record->node, parent_node, p);
	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
	return 0;
}

int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
				   struct btrfs_qgroup_extent_record *qrecord)
{
	struct ulist *old_root;
	u64 bytenr = qrecord->bytenr;
	int ret;

	ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
	if (ret < 0) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
			   ret);
		return 0;
	}

	/*
	 * Here we don't need to take the lock of
	 * trans->transaction->delayed_refs, since the inserted qrecord won't
	 * be deleted and only qrecord->node may be modified (by a new qrecord
	 * insert).
	 *
	 * So modifying qrecord->old_roots is safe here.
	 */
	qrecord->old_roots = old_root;
	return 0;
}

int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
	    || bytenr == 0 || num_bytes == 0)
		return 0;
	record = kzalloc(sizeof(*record), gfp_flag);
	if (!record)
		return -ENOMEM;

	delayed_refs = &trans->transaction->delayed_refs;
	record->bytenr = bytenr;
	record->num_bytes = num_bytes;
	record->old_roots = NULL;

	spin_lock(&delayed_refs->lock);
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	spin_unlock(&delayed_refs->lock);
	if (ret > 0) {
		kfree(record);
		return 0;
	}
	return btrfs_qgroup_trace_extent_post(fs_info, record);
}

int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non-qgroup-accountable extents */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
						GFP_NOFS);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}

/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root node's slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
	int level = 0;
	int nr, slot;
	struct extent_buffer *eb;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		path->slots[level]++;
		slot = path->slots[level];
		if (slot >= nr || level == 0) {
			/*
			 * Don't free the root - we will detect this
			 * condition after our loop and return a
			 * positive value for caller to stop walking the tree.
			 */
			if (level != root_level) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;

				free_extent_buffer(eb);
				path->nodes[level] = NULL;
				path->slots[level] = 0;
			}
		} else {
			/*
			 * We have a valid slot to walk back down
			 * from. Stop here so caller can process these
			 * new nodes.
			 */
			break;
		}

		level++;
	}

	eb = path->nodes[root_level];
	if (path->slots[root_level] >= btrfs_header_nritems(eb))
		return 1;

	return 0;
}

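/*
 * Worked example for adjust_slots_upwards() above (illustrative): with
 * root_level = 1, once the last slot of the current leaf has been
 * processed, the loop unlocks and frees path->nodes[0], sets it to NULL,
 * and bumps path->slots[1].  The caller then sees nodes[0] == NULL and
 * reads the child at the already-advanced slots[1]; once slots[1] runs
 * past the root's item count, the function returns 1 and the walk is done.
 */
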
1896/*
1897 * Helper function to trace a subtree tree block swap.
1898 *
1899 * The swap will happen in highest tree block, but there may be a lot of
1900 * tree blocks involved.
1901 *
1902 * For example:
1903 * OO = Old tree blocks
1904 * NN = New tree blocks allocated during balance
1905 *
1906 * File tree (257) Reloc tree for 257
1907 * L2 OO NN
1908 * / \ / \
1909 * L1 OO OO (a) OO NN (a)
1910 * / \ / \ / \ / \
1911 * L0 OO OO OO OO OO OO NN NN
1912 * (b) (c) (b) (c)
1913 *
1914 * When calling qgroup_trace_extent_swap(), we will pass:
1915 * @src_eb = OO(a)
1916 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1917 * @dst_level = 0
1918 * @root_level = 1
1919 *
1920 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1921 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1922 *
1923 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1924 *
1925 * 1) Tree search from @src_eb
1926 * It should acts as a simplified btrfs_search_slot().
1927 * The key for search can be extracted from @dst_path->nodes[dst_level]
1928 * (first key).
1929 *
1930 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1931 * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
1932 * They should be marked during previous (@dst_level = 1) iteration.
1933 *
1934 * 3) Mark file extents in leaves dirty
1935 * We don't have good way to pick out new file extents only.
1936 * So we still follow the old method by scanning all file extents in
1937 * the leave.
1938 *
1939 * This function can free us from keeping two paths, thus later we only need
1940 * to care about how to iterate all new tree blocks in reloc tree.
1941 */
1942static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1943 struct extent_buffer *src_eb,
1944 struct btrfs_path *dst_path,
1945 int dst_level, int root_level,
1946 bool trace_leaf)
1947{
1948 struct btrfs_key key;
1949 struct btrfs_path *src_path;
1950 struct btrfs_fs_info *fs_info = trans->fs_info;
1951 u32 nodesize = fs_info->nodesize;
1952 int cur_level = root_level;
1953 int ret;
1954
1955 BUG_ON(dst_level > root_level);
1956 /* Level mismatch */
1957 if (btrfs_header_level(src_eb) != root_level)
1958 return -EINVAL;
1959
1960 src_path = btrfs_alloc_path();
1961 if (!src_path) {
1962 ret = -ENOMEM;
1963 goto out;
1964 }
1965
1966 if (dst_level)
1967 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1968 else
1969 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1970
1971 /* For src_path */
1972 extent_buffer_get(src_eb);
1973 src_path->nodes[root_level] = src_eb;
1974 src_path->slots[root_level] = dst_path->slots[root_level];
1975 src_path->locks[root_level] = 0;
1976
1977 /* A simplified version of btrfs_search_slot() */
1978 while (cur_level >= dst_level) {
1979 struct btrfs_key src_key;
1980 struct btrfs_key dst_key;
1981
1982 if (src_path->nodes[cur_level] == NULL) {
1983 struct btrfs_key first_key;
1984 struct extent_buffer *eb;
1985 int parent_slot;
1986 u64 child_gen;
1987 u64 child_bytenr;
1988
1989 eb = src_path->nodes[cur_level + 1];
1990 parent_slot = src_path->slots[cur_level + 1];
1991 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
1992 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
1993 btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
1994
1995 eb = read_tree_block(fs_info, child_bytenr, child_gen,
1996 cur_level, &first_key);
1997 if (IS_ERR(eb)) {
1998 ret = PTR_ERR(eb);
1999 goto out;
2000 } else if (!extent_buffer_uptodate(eb)) {
2001 free_extent_buffer(eb);
2002 ret = -EIO;
2003 goto out;
2004 }
2005
2006 src_path->nodes[cur_level] = eb;
2007
2008 btrfs_tree_read_lock(eb);
2009 btrfs_set_lock_blocking_read(eb);
2010 src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2011 }
2012
2013 src_path->slots[cur_level] = dst_path->slots[cur_level];
2014 if (cur_level) {
2015 btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2016 &dst_key, dst_path->slots[cur_level]);
2017 btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2018 &src_key, src_path->slots[cur_level]);
2019 } else {
2020 btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2021 &dst_key, dst_path->slots[cur_level]);
2022 btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2023 &src_key, src_path->slots[cur_level]);
2024 }
2025 /* Content mismatch, something went wrong */
2026 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2027 ret = -ENOENT;
2028 goto out;
2029 }
2030 cur_level--;
2031 }
2032
2033 /*
2034 * Now both @dst_path and @src_path have been populated, record the tree
2035 * blocks for qgroup accounting.
2036 */
2037 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2038 nodesize, GFP_NOFS);
2039 if (ret < 0)
2040 goto out;
2041 ret = btrfs_qgroup_trace_extent(trans,
2042 dst_path->nodes[dst_level]->start,
2043 nodesize, GFP_NOFS);
2044 if (ret < 0)
2045 goto out;
2046
2047 /* Record leaf file extents */
2048 if (dst_level == 0 && trace_leaf) {
2049 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2050 if (ret < 0)
2051 goto out;
2052 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2053 }
2054out:
2055 btrfs_free_path(src_path);
2056 return ret;
2057}
2058
2059/*
2060 * Helper function to do recursive generation-aware depth-first search, to
2061 * locate all new tree blocks in a subtree of reloc tree.
2062 *
2063 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2064 * reloc tree
2065 * L2 NN (a)
2066 * / \
2067 * L1 OO NN (b)
2068 * / \ / \
2069 * L0 OO OO OO NN
2070 * (c) (d)
2071 * If we pass:
2072 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2073 * @cur_level = 1
2074 * @root_level = 1
2075 *
 2076 * We will iterate through tree blocks NN(b) and NN(d) and inform qgroup to
 2077 * trace the above tree blocks along with their counterparts in the file tree.
 2078 * During the search, old tree blocks like OO(c) will be skipped, as the tree
 2079 * block swap won't affect OO(c).
2080 */
2081static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2082 struct extent_buffer *src_eb,
2083 struct btrfs_path *dst_path,
2084 int cur_level, int root_level,
2085 u64 last_snapshot, bool trace_leaf)
2086{
2087 struct btrfs_fs_info *fs_info = trans->fs_info;
2088 struct extent_buffer *eb;
2089 bool need_cleanup = false;
2090 int ret = 0;
2091 int i;
2092
2093 /* Level sanity check */
2094 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2095 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2096 root_level < cur_level) {
2097 btrfs_err_rl(fs_info,
2098 "%s: bad levels, cur_level=%d root_level=%d",
2099 __func__, cur_level, root_level);
2100 return -EUCLEAN;
2101 }
2102
2103 /* Read the tree block if needed */
2104 if (dst_path->nodes[cur_level] == NULL) {
2105 struct btrfs_key first_key;
2106 int parent_slot;
2107 u64 child_gen;
2108 u64 child_bytenr;
2109
2110 /*
2111 * dst_path->nodes[root_level] must be initialized before
2112 * calling this function.
2113 */
2114 if (cur_level == root_level) {
2115 btrfs_err_rl(fs_info,
2116 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2117 __func__, root_level, root_level, cur_level);
2118 return -EUCLEAN;
2119 }
2120
2121 /*
2122 * We need to get child blockptr/gen from parent before we can
2123 * read it.
2124 */
2125 eb = dst_path->nodes[cur_level + 1];
2126 parent_slot = dst_path->slots[cur_level + 1];
2127 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2128 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2129 btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2130
2131 /* This node is old, no need to trace */
2132 if (child_gen < last_snapshot)
2133 goto out;
2134
2135 eb = read_tree_block(fs_info, child_bytenr, child_gen,
2136 cur_level, &first_key);
2137 if (IS_ERR(eb)) {
2138 ret = PTR_ERR(eb);
2139 goto out;
2140 } else if (!extent_buffer_uptodate(eb)) {
2141 free_extent_buffer(eb);
2142 ret = -EIO;
2143 goto out;
2144 }
2145
2146 dst_path->nodes[cur_level] = eb;
2147 dst_path->slots[cur_level] = 0;
2148
2149 btrfs_tree_read_lock(eb);
2150 btrfs_set_lock_blocking_read(eb);
2151 dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2152 need_cleanup = true;
2153 }
2154
 2155 /* Now record this tree block and its counterpart for qgroups */
2156 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2157 root_level, trace_leaf);
2158 if (ret < 0)
2159 goto cleanup;
2160
2161 eb = dst_path->nodes[cur_level];
2162
2163 if (cur_level > 0) {
2164 /* Iterate all child tree blocks */
2165 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2166 /* Skip old tree blocks as they won't be swapped */
2167 if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2168 continue;
2169 dst_path->slots[cur_level] = i;
2170
2171 /* Recursive call (at most 7 times) */
2172 ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2173 dst_path, cur_level - 1, root_level,
2174 last_snapshot, trace_leaf);
2175 if (ret < 0)
2176 goto cleanup;
2177 }
2178 }
2179
2180cleanup:
2181 if (need_cleanup) {
2182 /* Clean up */
2183 btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2184 dst_path->locks[cur_level]);
2185 free_extent_buffer(dst_path->nodes[cur_level]);
2186 dst_path->nodes[cur_level] = NULL;
2187 dst_path->slots[cur_level] = 0;
2188 dst_path->locks[cur_level] = 0;
2189 }
2190out:
2191 return ret;
2192}
2193
2194static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2195 struct extent_buffer *src_eb,
2196 struct extent_buffer *dst_eb,
2197 u64 last_snapshot, bool trace_leaf)
2198{
2199 struct btrfs_fs_info *fs_info = trans->fs_info;
2200 struct btrfs_path *dst_path = NULL;
2201 int level;
2202 int ret;
2203
2204 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2205 return 0;
2206
2207 /* Wrong parameter order */
2208 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2209 btrfs_err_rl(fs_info,
2210 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2211 btrfs_header_generation(src_eb),
2212 btrfs_header_generation(dst_eb));
2213 return -EUCLEAN;
2214 }
2215
2216 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2217 ret = -EIO;
2218 goto out;
2219 }
2220
2221 level = btrfs_header_level(dst_eb);
2222 dst_path = btrfs_alloc_path();
2223 if (!dst_path) {
2224 ret = -ENOMEM;
2225 goto out;
2226 }
2227 /* For dst_path */
2228 extent_buffer_get(dst_eb);
2229 dst_path->nodes[level] = dst_eb;
2230 dst_path->slots[level] = 0;
2231 dst_path->locks[level] = 0;
2232
 2233 /* Do the generation-aware depth-first search */
2234 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2235 level, last_snapshot, trace_leaf);
2236 if (ret < 0)
2237 goto out;
2238 ret = 0;
2239
2240out:
2241 btrfs_free_path(dst_path);
2242 if (ret < 0)
2243 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2244 return ret;
2245}
2246
2247int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2248 struct extent_buffer *root_eb,
2249 u64 root_gen, int root_level)
2250{
2251 struct btrfs_fs_info *fs_info = trans->fs_info;
2252 int ret = 0;
2253 int level;
2254 struct extent_buffer *eb = root_eb;
2255 struct btrfs_path *path = NULL;
2256
2257 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2258 BUG_ON(root_eb == NULL);
2259
2260 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2261 return 0;
2262
2263 if (!extent_buffer_uptodate(root_eb)) {
2264 ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2265 if (ret)
2266 goto out;
2267 }
2268
2269 if (root_level == 0) {
2270 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2271 goto out;
2272 }
2273
2274 path = btrfs_alloc_path();
2275 if (!path)
2276 return -ENOMEM;
2277
2278 /*
2279 * Walk down the tree. Missing extent blocks are filled in as
2280 * we go. Metadata is accounted every time we read a new
2281 * extent block.
2282 *
2283 * When we reach a leaf, we account for file extent items in it,
2284 * walk back up the tree (adjusting slot pointers as we go)
2285 * and restart the search process.
2286 */
2287 extent_buffer_get(root_eb); /* For path */
2288 path->nodes[root_level] = root_eb;
2289 path->slots[root_level] = 0;
2290 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2291walk_down:
2292 level = root_level;
2293 while (level >= 0) {
2294 if (path->nodes[level] == NULL) {
2295 struct btrfs_key first_key;
2296 int parent_slot;
2297 u64 child_gen;
2298 u64 child_bytenr;
2299
2300 /*
2301 * We need to get child blockptr/gen from parent before
2302 * we can read it.
2303 */
2304 eb = path->nodes[level + 1];
2305 parent_slot = path->slots[level + 1];
2306 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2307 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2308 btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2309
2310 eb = read_tree_block(fs_info, child_bytenr, child_gen,
2311 level, &first_key);
2312 if (IS_ERR(eb)) {
2313 ret = PTR_ERR(eb);
2314 goto out;
2315 } else if (!extent_buffer_uptodate(eb)) {
2316 free_extent_buffer(eb);
2317 ret = -EIO;
2318 goto out;
2319 }
2320
2321 path->nodes[level] = eb;
2322 path->slots[level] = 0;
2323
2324 btrfs_tree_read_lock(eb);
2325 btrfs_set_lock_blocking_read(eb);
2326 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
2327
2328 ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2329 fs_info->nodesize,
2330 GFP_NOFS);
2331 if (ret)
2332 goto out;
2333 }
2334
2335 if (level == 0) {
2336 ret = btrfs_qgroup_trace_leaf_items(trans,
2337 path->nodes[level]);
2338 if (ret)
2339 goto out;
2340
2341 /* Nonzero return here means we completed our search */
2342 ret = adjust_slots_upwards(path, root_level);
2343 if (ret)
2344 break;
2345
2346 /* Restart search with new slots */
2347 goto walk_down;
2348 }
2349
2350 level--;
2351 }
2352
2353 ret = 0;
2354out:
2355 btrfs_free_path(path);
2356
2357 return ret;
2358}
2359
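/* Values for the @update_old argument of qgroup_update_refcnt() below */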
2360#define UPDATE_NEW 0
2361#define UPDATE_OLD 1
2362/*
 2363 * Walk all of the roots that point to the bytenr and adjust their refcnts.
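 * For each root, the refcnt of its own qgroup and of all ancestor qgroups
 * is bumped by one; the tmp ulist ensures each ancestor is visited only
 * once per root.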
2364 */
2365static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2366 struct ulist *roots, struct ulist *tmp,
2367 struct ulist *qgroups, u64 seq, int update_old)
2368{
2369 struct ulist_node *unode;
2370 struct ulist_iterator uiter;
2371 struct ulist_node *tmp_unode;
2372 struct ulist_iterator tmp_uiter;
2373 struct btrfs_qgroup *qg;
2374 int ret = 0;
2375
2376 if (!roots)
2377 return 0;
2378 ULIST_ITER_INIT(&uiter);
2379 while ((unode = ulist_next(roots, &uiter))) {
2380 qg = find_qgroup_rb(fs_info, unode->val);
2381 if (!qg)
2382 continue;
2383
2384 ulist_reinit(tmp);
2385 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2386 GFP_ATOMIC);
2387 if (ret < 0)
2388 return ret;
2389 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2390 if (ret < 0)
2391 return ret;
2392 ULIST_ITER_INIT(&tmp_uiter);
2393 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2394 struct btrfs_qgroup_list *glist;
2395
2396 qg = unode_aux_to_qgroup(tmp_unode);
2397 if (update_old)
2398 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2399 else
2400 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2401 list_for_each_entry(glist, &qg->groups, next_group) {
2402 ret = ulist_add(qgroups, glist->group->qgroupid,
2403 qgroup_to_aux(glist->group),
2404 GFP_ATOMIC);
2405 if (ret < 0)
2406 return ret;
2407 ret = ulist_add(tmp, glist->group->qgroupid,
2408 qgroup_to_aux(glist->group),
2409 GFP_ATOMIC);
2410 if (ret < 0)
2411 return ret;
2412 }
2413 }
2414 }
2415 return 0;
2416}
2417
2418/*
2419 * Update qgroup rfer/excl counters.
 2420 * The rfer update is easy; the code explains itself.
 2421 *
 2422 * The excl update is tricky; it is split into 2 parts.
2423 * Part 1: Possible exclusive <-> sharing detect:
2424 * | A | !A |
2425 * -------------------------------------
2426 * B | * | - |
2427 * -------------------------------------
2428 * !B | + | ** |
2429 * -------------------------------------
2430 *
2431 * Conditions:
2432 * A: cur_old_roots < nr_old_roots (not exclusive before)
2433 * !A: cur_old_roots == nr_old_roots (possible exclusive before)
2434 * B: cur_new_roots < nr_new_roots (not exclusive now)
2435 * !B: cur_new_roots == nr_new_roots (possible exclusive now)
2436 *
2437 * Results:
2438 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
2439 * *: Definitely not changed. **: Possible unchanged.
2440 *
 2441 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case.
2442 *
 2443 * To make the logic clear, we first use conditions A and B to split the
 2444 * combinations into 4 results.
 2445 *
 2446 * Then, for results "+" and "-", check the old/new_roots == 0 case, as
 2447 * there only one variant may be 0.
 2448 *
 2449 * Lastly, check result **; since there are 2 variants that may be 0, split
 2450 * them again (2x2).
 2451 * But this time we don't need to consider other things; the code and logic
 2452 * are easy to understand now.
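 *
 * Worked example (hypothetical numbers): a 16K extent is referenced by fs
 * roots {A, B} before the operation (nr_old_roots = 2) and only by {A}
 * afterwards (nr_new_roots = 1). For qgroup A, cur_old_count = 1 and
 * cur_new_count = 1 == nr_new_roots, so A goes shared -> exclusive and its
 * excl grows by 16K (rfer is unchanged). For qgroup B, cur_new_count = 0
 * while cur_old_count = 1, so B's rfer shrinks by 16K and its excl stays
 * untouched.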
2453 */
2454static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2455 struct ulist *qgroups,
2456 u64 nr_old_roots,
2457 u64 nr_new_roots,
2458 u64 num_bytes, u64 seq)
2459{
2460 struct ulist_node *unode;
2461 struct ulist_iterator uiter;
2462 struct btrfs_qgroup *qg;
2463 u64 cur_new_count, cur_old_count;
2464
2465 ULIST_ITER_INIT(&uiter);
2466 while ((unode = ulist_next(qgroups, &uiter))) {
2467 bool dirty = false;
2468
2469 qg = unode_aux_to_qgroup(unode);
2470 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2471 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2472
2473 trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2474 cur_new_count);
2475
2476 /* Rfer update part */
2477 if (cur_old_count == 0 && cur_new_count > 0) {
2478 qg->rfer += num_bytes;
2479 qg->rfer_cmpr += num_bytes;
2480 dirty = true;
2481 }
2482 if (cur_old_count > 0 && cur_new_count == 0) {
2483 qg->rfer -= num_bytes;
2484 qg->rfer_cmpr -= num_bytes;
2485 dirty = true;
2486 }
2487
2488 /* Excl update part */
2489 /* Exclusive/none -> shared case */
2490 if (cur_old_count == nr_old_roots &&
2491 cur_new_count < nr_new_roots) {
2492 /* Exclusive -> shared */
2493 if (cur_old_count != 0) {
2494 qg->excl -= num_bytes;
2495 qg->excl_cmpr -= num_bytes;
2496 dirty = true;
2497 }
2498 }
2499
2500 /* Shared -> exclusive/none case */
2501 if (cur_old_count < nr_old_roots &&
2502 cur_new_count == nr_new_roots) {
2503 /* Shared->exclusive */
2504 if (cur_new_count != 0) {
2505 qg->excl += num_bytes;
2506 qg->excl_cmpr += num_bytes;
2507 dirty = true;
2508 }
2509 }
2510
2511 /* Exclusive/none -> exclusive/none case */
2512 if (cur_old_count == nr_old_roots &&
2513 cur_new_count == nr_new_roots) {
2514 if (cur_old_count == 0) {
2515 /* None -> exclusive/none */
2516
2517 if (cur_new_count != 0) {
2518 /* None -> exclusive */
2519 qg->excl += num_bytes;
2520 qg->excl_cmpr += num_bytes;
2521 dirty = true;
2522 }
2523 /* None -> none, nothing changed */
2524 } else {
2525 /* Exclusive -> exclusive/none */
2526
2527 if (cur_new_count == 0) {
2528 /* Exclusive -> none */
2529 qg->excl -= num_bytes;
2530 qg->excl_cmpr -= num_bytes;
2531 dirty = true;
2532 }
2533 /* Exclusive -> exclusive, nothing changed */
2534 }
2535 }
2536
2537 if (dirty)
2538 qgroup_dirty(fs_info, qg);
2539 }
2540 return 0;
2541}
2542
2543/*
 2544 * Check if @roots could potentially be a list of fs tree roots
 2545 *
 2546 * Return 0 if it is definitely not an fs/subvol tree roots ulist
 2547 * Return 1 if there may be fs/subvol tree roots in the list (an empty
 2548 * list counts as well)
2549 */
2550static int maybe_fs_roots(struct ulist *roots)
2551{
2552 struct ulist_node *unode;
2553 struct ulist_iterator uiter;
2554
2555 /* Empty one, still possible for fs roots */
2556 if (!roots || roots->nnodes == 0)
2557 return 1;
2558
2559 ULIST_ITER_INIT(&uiter);
2560 unode = ulist_next(roots, &uiter);
2561 if (!unode)
2562 return 1;
2563
2564 /*
2565 * If it contains fs tree roots, then it must belong to fs/subvol
2566 * trees.
2567 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2568 */
2569 return is_fstree(unode->val);
2570}
2571
2572int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2573 u64 num_bytes, struct ulist *old_roots,
2574 struct ulist *new_roots)
2575{
2576 struct btrfs_fs_info *fs_info = trans->fs_info;
2577 struct ulist *qgroups = NULL;
2578 struct ulist *tmp = NULL;
2579 u64 seq;
2580 u64 nr_new_roots = 0;
2581 u64 nr_old_roots = 0;
2582 int ret = 0;
2583
2584 /*
 2585 * If quotas get disabled meanwhile, the resources need to be freed and
2586 * we can't just exit here.
2587 */
2588 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2589 goto out_free;
2590
2591 if (new_roots) {
2592 if (!maybe_fs_roots(new_roots))
2593 goto out_free;
2594 nr_new_roots = new_roots->nnodes;
2595 }
2596 if (old_roots) {
2597 if (!maybe_fs_roots(old_roots))
2598 goto out_free;
2599 nr_old_roots = old_roots->nnodes;
2600 }
2601
2602 /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2603 if (nr_old_roots == 0 && nr_new_roots == 0)
2604 goto out_free;
2605
2606 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2607 num_bytes, nr_old_roots, nr_new_roots);
2608
2609 qgroups = ulist_alloc(GFP_NOFS);
2610 if (!qgroups) {
2611 ret = -ENOMEM;
2612 goto out_free;
2613 }
2614 tmp = ulist_alloc(GFP_NOFS);
2615 if (!tmp) {
2616 ret = -ENOMEM;
2617 goto out_free;
2618 }
2619
2620 mutex_lock(&fs_info->qgroup_rescan_lock);
2621 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2622 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2623 mutex_unlock(&fs_info->qgroup_rescan_lock);
2624 ret = 0;
2625 goto out_free;
2626 }
2627 }
2628 mutex_unlock(&fs_info->qgroup_rescan_lock);
2629
2630 spin_lock(&fs_info->qgroup_lock);
2631 seq = fs_info->qgroup_seq;
2632
2633 /* Update old refcnts using old_roots */
2634 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2635 UPDATE_OLD);
2636 if (ret < 0)
2637 goto out;
2638
2639 /* Update new refcnts using new_roots */
2640 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2641 UPDATE_NEW);
2642 if (ret < 0)
2643 goto out;
2644
2645 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2646 num_bytes, seq);
2647
2648 /*
 2649 * Bump qgroup_seq past every refcnt slot used above to avoid seq overlap
2650 */
2651 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2652out:
2653 spin_unlock(&fs_info->qgroup_lock);
2654out_free:
2655 ulist_free(tmp);
2656 ulist_free(qgroups);
2657 ulist_free(old_roots);
2658 ulist_free(new_roots);
2659 return ret;
2660}
2661
2662int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2663{
2664 struct btrfs_fs_info *fs_info = trans->fs_info;
2665 struct btrfs_qgroup_extent_record *record;
2666 struct btrfs_delayed_ref_root *delayed_refs;
2667 struct ulist *new_roots = NULL;
2668 struct rb_node *node;
2669 u64 num_dirty_extents = 0;
2670 u64 qgroup_to_skip;
2671 int ret = 0;
2672
2673 delayed_refs = &trans->transaction->delayed_refs;
2674 qgroup_to_skip = delayed_refs->qgroup_to_skip;
2675 while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2676 record = rb_entry(node, struct btrfs_qgroup_extent_record,
2677 node);
2678
2679 num_dirty_extents++;
2680 trace_btrfs_qgroup_account_extents(fs_info, record);
2681
2682 if (!ret) {
2683 /*
 2684 * Old roots should have been searched when inserting the
 2685 * qgroup extent record
2686 */
2687 if (WARN_ON(!record->old_roots)) {
2688 /* Search commit root to find old_roots */
2689 ret = btrfs_find_all_roots(NULL, fs_info,
2690 record->bytenr, 0,
2691 &record->old_roots, false);
2692 if (ret < 0)
2693 goto cleanup;
2694 }
2695
2696 /* Free the reserved data space */
2697 btrfs_qgroup_free_refroot(fs_info,
2698 record->data_rsv_refroot,
2699 record->data_rsv,
2700 BTRFS_QGROUP_RSV_DATA);
2701 /*
 2702 * Use SEQ_LAST as time_seq to do a special search which
 2703 * doesn't lock the tree or delayed_refs and searches the
 2704 * current root. It's safe inside commit_transaction().
2705 */
2706 ret = btrfs_find_all_roots(trans, fs_info,
2707 record->bytenr, SEQ_LAST, &new_roots, false);
2708 if (ret < 0)
2709 goto cleanup;
2710 if (qgroup_to_skip) {
2711 ulist_del(new_roots, qgroup_to_skip, 0);
2712 ulist_del(record->old_roots, qgroup_to_skip,
2713 0);
2714 }
2715 ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2716 record->num_bytes,
2717 record->old_roots,
2718 new_roots);
2719 record->old_roots = NULL;
2720 new_roots = NULL;
2721 }
2722cleanup:
2723 ulist_free(record->old_roots);
2724 ulist_free(new_roots);
2725 new_roots = NULL;
2726 rb_erase(node, &delayed_refs->dirty_extent_root);
2727 kfree(record);
2729 }
2730 trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2731 num_dirty_extents);
2732 return ret;
2733}
2734
2735/*
2736 * Writes all changed qgroups to disk.
2737 * Called by the transaction commit path and the qgroup assign ioctl.
2738 */
2739int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2740{
2741 struct btrfs_fs_info *fs_info = trans->fs_info;
2742 int ret = 0;
2743
2744 /*
2745 * In case we are called from the qgroup assign ioctl, assert that we
2746 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
2747 * disable operation (ioctl) and access a freed quota root.
2748 */
2749 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
2750 lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
2751
2752 if (!fs_info->quota_root)
2753 return ret;
2754
2755 spin_lock(&fs_info->qgroup_lock);
2756 while (!list_empty(&fs_info->dirty_qgroups)) {
2757 struct btrfs_qgroup *qgroup;
2758 qgroup = list_first_entry(&fs_info->dirty_qgroups,
2759 struct btrfs_qgroup, dirty);
2760 list_del_init(&qgroup->dirty);
2761 spin_unlock(&fs_info->qgroup_lock);
2762 ret = update_qgroup_info_item(trans, qgroup);
2763 if (ret)
2764 fs_info->qgroup_flags |=
2765 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2766 ret = update_qgroup_limit_item(trans, qgroup);
2767 if (ret)
2768 fs_info->qgroup_flags |=
2769 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2770 spin_lock(&fs_info->qgroup_lock);
2771 }
2772 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2773 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2774 else
2775 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2776 spin_unlock(&fs_info->qgroup_lock);
2777
2778 ret = update_qgroup_status_item(trans);
2779 if (ret)
2780 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2781
2782 return ret;
2783}
2784
2785/*
2786 * Copy the accounting information between qgroups. This is necessary
2787 * when a snapshot or a subvolume is created. Throwing an error will
2788 * cause a transaction abort so we take extra care here to only error
2789 * when a readonly fs is a reasonable outcome.
2790 */
2791int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2792 u64 objectid, struct btrfs_qgroup_inherit *inherit)
2793{
2794 int ret = 0;
2795 int i;
2796 u64 *i_qgroups;
2797 bool committing = false;
2798 struct btrfs_fs_info *fs_info = trans->fs_info;
2799 struct btrfs_root *quota_root;
2800 struct btrfs_qgroup *srcgroup;
2801 struct btrfs_qgroup *dstgroup;
2802 bool need_rescan = false;
2803 u32 level_size = 0;
2804 u64 nums;
2805
2806 /*
2807 * There are only two callers of this function.
2808 *
2809 * One in create_subvol() in the ioctl context, which needs to hold
2810 * the qgroup_ioctl_lock.
2811 *
 2812 * The other one in create_pending_snapshot(), where no other qgroup
 2813 * code can modify the fs, as they all need to either start a new trans
 2814 * or hold a trans handle, thus we don't need to hold the
 2815 * qgroup_ioctl_lock.
 2816 * This avoids a long and complex lock chain and makes lockdep happy.
2817 */
2818 spin_lock(&fs_info->trans_lock);
2819 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2820 committing = true;
2821 spin_unlock(&fs_info->trans_lock);
2822
2823 if (!committing)
2824 mutex_lock(&fs_info->qgroup_ioctl_lock);
2825 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2826 goto out;
2827
2828 quota_root = fs_info->quota_root;
2829 if (!quota_root) {
2830 ret = -EINVAL;
2831 goto out;
2832 }
2833
2834 if (inherit) {
2835 i_qgroups = (u64 *)(inherit + 1);
2836 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2837 2 * inherit->num_excl_copies;
2838 for (i = 0; i < nums; ++i) {
2839 srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2840
2841 /*
2842 * Zero out invalid groups so we can ignore
2843 * them later.
2844 */
2845 if (!srcgroup ||
2846 ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2847 *i_qgroups = 0ULL;
2848
2849 ++i_qgroups;
2850 }
2851 }
2852
2853 /*
2854 * create a tracking group for the subvol itself
2855 */
2856 ret = add_qgroup_item(trans, quota_root, objectid);
2857 if (ret)
2858 goto out;
2859
2860 /*
2861 * add qgroup to all inherited groups
2862 */
2863 if (inherit) {
2864 i_qgroups = (u64 *)(inherit + 1);
2865 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2866 if (*i_qgroups == 0)
2867 continue;
2868 ret = add_qgroup_relation_item(trans, objectid,
2869 *i_qgroups);
2870 if (ret && ret != -EEXIST)
2871 goto out;
2872 ret = add_qgroup_relation_item(trans, *i_qgroups,
2873 objectid);
2874 if (ret && ret != -EEXIST)
2875 goto out;
2876 }
2877 ret = 0;
2878 }
2879
2881 spin_lock(&fs_info->qgroup_lock);
2882
2883 dstgroup = add_qgroup_rb(fs_info, objectid);
2884 if (IS_ERR(dstgroup)) {
2885 ret = PTR_ERR(dstgroup);
2886 goto unlock;
2887 }
2888
2889 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2890 dstgroup->lim_flags = inherit->lim.flags;
2891 dstgroup->max_rfer = inherit->lim.max_rfer;
2892 dstgroup->max_excl = inherit->lim.max_excl;
2893 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2894 dstgroup->rsv_excl = inherit->lim.rsv_excl;
2895
2896 qgroup_dirty(fs_info, dstgroup);
2897 }
2898
2899 if (srcid) {
2900 srcgroup = find_qgroup_rb(fs_info, srcid);
2901 if (!srcgroup)
2902 goto unlock;
2903
2904 /*
2905 * We call inherit after we clone the root in order to make sure
2906 * our counts don't go crazy, so at this point the only
2907 * difference between the two roots should be the root node.
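 * Since only the root nodes differ, each side owns exactly one tree
 * block (the root node) exclusively, which is why both excl values are
 * set to a single nodesize below.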
2908 */
2909 level_size = fs_info->nodesize;
2910 dstgroup->rfer = srcgroup->rfer;
2911 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2912 dstgroup->excl = level_size;
2913 dstgroup->excl_cmpr = level_size;
2914 srcgroup->excl = level_size;
2915 srcgroup->excl_cmpr = level_size;
2916
2917 /* inherit the limit info */
2918 dstgroup->lim_flags = srcgroup->lim_flags;
2919 dstgroup->max_rfer = srcgroup->max_rfer;
2920 dstgroup->max_excl = srcgroup->max_excl;
2921 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2922 dstgroup->rsv_excl = srcgroup->rsv_excl;
2923
2924 qgroup_dirty(fs_info, dstgroup);
2925 qgroup_dirty(fs_info, srcgroup);
2926 }
2927
2928 if (!inherit)
2929 goto unlock;
2930
2931 i_qgroups = (u64 *)(inherit + 1);
2932 for (i = 0; i < inherit->num_qgroups; ++i) {
2933 if (*i_qgroups) {
2934 ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2935 if (ret)
2936 goto unlock;
2937 }
2938 ++i_qgroups;
2939
2940 /*
2941 * If we're doing a snapshot, and adding the snapshot to a new
2942 * qgroup, the numbers are guaranteed to be incorrect.
2943 */
2944 if (srcid)
2945 need_rescan = true;
2946 }
2947
2948 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
2949 struct btrfs_qgroup *src;
2950 struct btrfs_qgroup *dst;
2951
2952 if (!i_qgroups[0] || !i_qgroups[1])
2953 continue;
2954
2955 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2956 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2957
2958 if (!src || !dst) {
2959 ret = -EINVAL;
2960 goto unlock;
2961 }
2962
2963 dst->rfer = src->rfer - level_size;
2964 dst->rfer_cmpr = src->rfer_cmpr - level_size;
2965
2966 /* Manually tweaking numbers certainly needs a rescan */
2967 need_rescan = true;
2968 }
2969 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
2970 struct btrfs_qgroup *src;
2971 struct btrfs_qgroup *dst;
2972
2973 if (!i_qgroups[0] || !i_qgroups[1])
2974 continue;
2975
2976 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2977 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2978
2979 if (!src || !dst) {
2980 ret = -EINVAL;
2981 goto unlock;
2982 }
2983
2984 dst->excl = src->excl + level_size;
2985 dst->excl_cmpr = src->excl_cmpr + level_size;
2986 need_rescan = true;
2987 }
2988
2989unlock:
2990 spin_unlock(&fs_info->qgroup_lock);
2991out:
2992 if (!committing)
2993 mutex_unlock(&fs_info->qgroup_ioctl_lock);
2994 if (need_rescan)
2995 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2996 return ret;
2997}
2998
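/*
 * Check whether reserving @num_bytes more would push @qg over its
 * configured max_rfer/max_excl limit; current usage plus all outstanding
 * reservations count against the limit.
 */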
2999static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3000{
3001 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3002 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3003 return false;
3004
3005 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3006 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3007 return false;
3008
3009 return true;
3010}
3011
3012static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3013 enum btrfs_qgroup_rsv_type type)
3014{
3015 struct btrfs_qgroup *qgroup;
3016 struct btrfs_fs_info *fs_info = root->fs_info;
3017 u64 ref_root = root->root_key.objectid;
3018 int ret = 0;
3019 struct ulist_node *unode;
3020 struct ulist_iterator uiter;
3021
3022 if (!is_fstree(ref_root))
3023 return 0;
3024
3025 if (num_bytes == 0)
3026 return 0;
3027
3028 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3029 capable(CAP_SYS_RESOURCE))
3030 enforce = false;
3031
3032 spin_lock(&fs_info->qgroup_lock);
3033 if (!fs_info->quota_root)
3034 goto out;
3035
3036 qgroup = find_qgroup_rb(fs_info, ref_root);
3037 if (!qgroup)
3038 goto out;
3039
3040 /*
 3041 * In a first step, check for all affected qgroups whether any limit
 3042 * would be exceeded
3043 */
3044 ulist_reinit(fs_info->qgroup_ulist);
3045 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3046 qgroup_to_aux(qgroup), GFP_ATOMIC);
3047 if (ret < 0)
3048 goto out;
3049 ULIST_ITER_INIT(&uiter);
3050 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3051 struct btrfs_qgroup *qg;
3052 struct btrfs_qgroup_list *glist;
3053
3054 qg = unode_aux_to_qgroup(unode);
3055
3056 if (enforce && !qgroup_check_limits(qg, num_bytes)) {
3057 ret = -EDQUOT;
3058 goto out;
3059 }
3060
3061 list_for_each_entry(glist, &qg->groups, next_group) {
3062 ret = ulist_add(fs_info->qgroup_ulist,
3063 glist->group->qgroupid,
3064 qgroup_to_aux(glist->group), GFP_ATOMIC);
3065 if (ret < 0)
3066 goto out;
3067 }
3068 }
3069 ret = 0;
3070 /*
3071 * no limits exceeded, now record the reservation into all qgroups
3072 */
3073 ULIST_ITER_INIT(&uiter);
3074 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3075 struct btrfs_qgroup *qg;
3076
3077 qg = unode_aux_to_qgroup(unode);
3078
3079 qgroup_rsv_add(fs_info, qg, num_bytes, type);
3080 }
3081
3082out:
3083 spin_unlock(&fs_info->qgroup_lock);
3084 return ret;
3085}
3086
3087/*
 3088 * Free @num_bytes of reserved space with @type for a qgroup (normally a
 3089 * level 0 qgroup).
 3090 *
 3091 * Will handle all higher-level qgroups too.
3092 *
3093 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3094 * This special case is only used for META_PERTRANS type.
3095 */
3096void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3097 u64 ref_root, u64 num_bytes,
3098 enum btrfs_qgroup_rsv_type type)
3099{
3100 struct btrfs_qgroup *qgroup;
3101 struct ulist_node *unode;
3102 struct ulist_iterator uiter;
3103 int ret = 0;
3104
3105 if (!is_fstree(ref_root))
3106 return;
3107
3108 if (num_bytes == 0)
3109 return;
3110
3111 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3112 WARN(1, "%s: Invalid type to free", __func__);
3113 return;
3114 }
3115 spin_lock(&fs_info->qgroup_lock);
3116
3117 if (!fs_info->quota_root)
3118 goto out;
3119
3120 qgroup = find_qgroup_rb(fs_info, ref_root);
3121 if (!qgroup)
3122 goto out;
3123
3124 if (num_bytes == (u64)-1)
3125 /*
3126 * We're freeing all pertrans rsv, get reserved value from
3127 * level 0 qgroup as real num_bytes to free.
3128 */
3129 num_bytes = qgroup->rsv.values[type];
3130
3131 ulist_reinit(fs_info->qgroup_ulist);
3132 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3133 qgroup_to_aux(qgroup), GFP_ATOMIC);
3134 if (ret < 0)
3135 goto out;
3136 ULIST_ITER_INIT(&uiter);
3137 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3138 struct btrfs_qgroup *qg;
3139 struct btrfs_qgroup_list *glist;
3140
3141 qg = unode_aux_to_qgroup(unode);
3142
3143 qgroup_rsv_release(fs_info, qg, num_bytes, type);
3144
3145 list_for_each_entry(glist, &qg->groups, next_group) {
3146 ret = ulist_add(fs_info->qgroup_ulist,
3147 glist->group->qgroupid,
3148 qgroup_to_aux(glist->group), GFP_ATOMIC);
3149 if (ret < 0)
3150 goto out;
3151 }
3152 }
3153
3154out:
3155 spin_unlock(&fs_info->qgroup_lock);
3156}
3157
3158/*
 3159 * Check if the leaf is the last leaf, which means all node pointers
 3160 * are at their last position.
3161 */
3162static bool is_last_leaf(struct btrfs_path *path)
3163{
3164 int i;
3165
3166 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3167 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3168 return false;
3169 }
3170 return true;
3171}
3172
3173/*
 3174 * Returns < 0 on error, 0 when more leaves are to be scanned.
 3175 * Returns 1 when done.
3176 */
3177static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3178 struct btrfs_path *path)
3179{
3180 struct btrfs_fs_info *fs_info = trans->fs_info;
3181 struct btrfs_key found;
3182 struct extent_buffer *scratch_leaf = NULL;
3183 struct ulist *roots = NULL;
3184 u64 num_bytes;
3185 bool done;
3186 int slot;
3187 int ret;
3188
3189 mutex_lock(&fs_info->qgroup_rescan_lock);
3190 ret = btrfs_search_slot_for_read(fs_info->extent_root,
3191 &fs_info->qgroup_rescan_progress,
3192 path, 1, 0);
3193
3194 btrfs_debug(fs_info,
3195 "current progress key (%llu %u %llu), search_slot ret %d",
3196 fs_info->qgroup_rescan_progress.objectid,
3197 fs_info->qgroup_rescan_progress.type,
3198 fs_info->qgroup_rescan_progress.offset, ret);
3199
3200 if (ret) {
3201 /*
3202 * The rescan is about to end, we will not be scanning any
3203 * further blocks. We cannot unset the RESCAN flag here, because
3204 * we want to commit the transaction if everything went well.
3205 * To make the live accounting work in this phase, we set our
3206 * scan progress pointer such that every real extent objectid
3207 * will be smaller.
3208 */
3209 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3210 btrfs_release_path(path);
3211 mutex_unlock(&fs_info->qgroup_rescan_lock);
3212 return ret;
3213 }
3214 done = is_last_leaf(path);
3215
3216 btrfs_item_key_to_cpu(path->nodes[0], &found,
3217 btrfs_header_nritems(path->nodes[0]) - 1);
3218 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3219
3220 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3221 if (!scratch_leaf) {
3222 ret = -ENOMEM;
3223 mutex_unlock(&fs_info->qgroup_rescan_lock);
3224 goto out;
3225 }
3226 slot = path->slots[0];
3227 btrfs_release_path(path);
3228 mutex_unlock(&fs_info->qgroup_rescan_lock);
3229
3230 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3231 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3232 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3233 found.type != BTRFS_METADATA_ITEM_KEY)
3234 continue;
3235 if (found.type == BTRFS_METADATA_ITEM_KEY)
3236 num_bytes = fs_info->nodesize;
3237 else
3238 num_bytes = found.offset;
3239
3240 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3241 &roots, false);
3242 if (ret < 0)
3243 goto out;
3244 /* For rescan, just pass old_roots as NULL */
3245 ret = btrfs_qgroup_account_extent(trans, found.objectid,
3246 num_bytes, NULL, roots);
3247 if (ret < 0)
3248 goto out;
3249 }
3250out:
3251 if (scratch_leaf)
3252 free_extent_buffer(scratch_leaf);
3253
3254 if (done && !ret) {
3255 ret = 1;
3256 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3257 }
3258 return ret;
3259}
3260
3261static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3262{
3263 return btrfs_fs_closing(fs_info) ||
3264 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3265 !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
3266}
3267
3268static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3269{
3270 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3271 qgroup_rescan_work);
3272 struct btrfs_path *path;
3273 struct btrfs_trans_handle *trans = NULL;
3274 int err = -ENOMEM;
3275 int ret = 0;
3276 bool stopped = false;
3277 bool did_leaf_rescans = false;
3278
3279 path = btrfs_alloc_path();
3280 if (!path)
3281 goto out;
3282 /*
 3283 * Rescan should only search the commit root; any later difference
 3284 * is recorded by the qgroup tracking anyway
3285 */
3286 path->search_commit_root = 1;
3287 path->skip_locking = 1;
3288
3289 err = 0;
3290 while (!err && !(stopped = rescan_should_stop(fs_info))) {
3291 trans = btrfs_start_transaction(fs_info->fs_root, 0);
3292 if (IS_ERR(trans)) {
3293 err = PTR_ERR(trans);
3294 break;
3295 }
3296
3297 err = qgroup_rescan_leaf(trans, path);
3298 did_leaf_rescans = true;
3299
3300 if (err > 0)
3301 btrfs_commit_transaction(trans);
3302 else
3303 btrfs_end_transaction(trans);
3304 }
3305
3306out:
3307 btrfs_free_path(path);
3308
3309 mutex_lock(&fs_info->qgroup_rescan_lock);
3310 if (err > 0 &&
3311 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3312 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3313 } else if (err < 0 || stopped) {
3314 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3315 }
3316 mutex_unlock(&fs_info->qgroup_rescan_lock);
3317
3318 /*
3319 * Only update status, since the previous part has already updated the
3320 * qgroup info, and only if we did any actual work. This also prevents
3321 * race with a concurrent quota disable, which has already set
3322 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
3323 * btrfs_quota_disable().
3324 */
3325 if (did_leaf_rescans) {
3326 trans = btrfs_start_transaction(fs_info->quota_root, 1);
3327 if (IS_ERR(trans)) {
3328 err = PTR_ERR(trans);
3329 trans = NULL;
3330 btrfs_err(fs_info,
3331 "fail to start transaction for status update: %d",
3332 err);
3333 }
3334 } else {
3335 trans = NULL;
3336 }
3337
3338 mutex_lock(&fs_info->qgroup_rescan_lock);
3339 if (!stopped)
3340 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3341 if (trans) {
3342 ret = update_qgroup_status_item(trans);
3343 if (ret < 0) {
3344 err = ret;
3345 btrfs_err(fs_info, "fail to update qgroup status: %d",
3346 err);
3347 }
3348 }
3349 fs_info->qgroup_rescan_running = false;
3350 complete_all(&fs_info->qgroup_rescan_completion);
3351 mutex_unlock(&fs_info->qgroup_rescan_lock);
3352
3353 if (!trans)
3354 return;
3355
3356 btrfs_end_transaction(trans);
3357
3358 if (stopped) {
3359 btrfs_info(fs_info, "qgroup scan paused");
3360 } else if (err >= 0) {
3361 btrfs_info(fs_info, "qgroup scan completed%s",
3362 err > 0 ? " (inconsistency flag cleared)" : "");
3363 } else {
3364 btrfs_err(fs_info, "qgroup scan failed with %d", err);
3365 }
3366}
3367
3368/*
3369 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3370 * memory required for the rescan context.
3371 */
3372static int
3373qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3374 int init_flags)
3375{
3376 int ret = 0;
3377
3378 if (!init_flags) {
3379 /* we're resuming qgroup rescan at mount time */
3380 if (!(fs_info->qgroup_flags &
3381 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3382 btrfs_warn(fs_info,
3383 "qgroup rescan init failed, qgroup rescan is not queued");
3384 ret = -EINVAL;
3385 } else if (!(fs_info->qgroup_flags &
3386 BTRFS_QGROUP_STATUS_FLAG_ON)) {
3387 btrfs_warn(fs_info,
3388 "qgroup rescan init failed, qgroup is not enabled");
3389 ret = -EINVAL;
3390 }
3391
3392 if (ret)
3393 return ret;
3394 }
3395
3396 mutex_lock(&fs_info->qgroup_rescan_lock);
3397 spin_lock(&fs_info->qgroup_lock);
3398
3399 if (init_flags) {
3400 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3401 btrfs_warn(fs_info,
3402 "qgroup rescan is already in progress");
3403 ret = -EINPROGRESS;
3404 } else if (!(fs_info->qgroup_flags &
3405 BTRFS_QGROUP_STATUS_FLAG_ON)) {
3406 btrfs_warn(fs_info,
3407 "qgroup rescan init failed, qgroup is not enabled");
3408 ret = -EINVAL;
3409 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3410 /* Quota disable is in progress */
3411 ret = -EBUSY;
3412 }
3413
3414 if (ret) {
3415 spin_unlock(&fs_info->qgroup_lock);
3416 mutex_unlock(&fs_info->qgroup_rescan_lock);
3417 return ret;
3418 }
3419 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3420 }
3421
3422 memset(&fs_info->qgroup_rescan_progress, 0,
3423 sizeof(fs_info->qgroup_rescan_progress));
3424 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3425 init_completion(&fs_info->qgroup_rescan_completion);
3426
3427 spin_unlock(&fs_info->qgroup_lock);
3428 mutex_unlock(&fs_info->qgroup_rescan_lock);
3429
3430 memset(&fs_info->qgroup_rescan_work, 0,
3431 sizeof(fs_info->qgroup_rescan_work));
3432 btrfs_init_work(&fs_info->qgroup_rescan_work,
3433 btrfs_qgroup_rescan_worker, NULL, NULL);
3434 return 0;
3435}
3436
3437static void
3438qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3439{
3440 struct rb_node *n;
3441 struct btrfs_qgroup *qgroup;
3442
3443 spin_lock(&fs_info->qgroup_lock);
3444 /* clear all current qgroup tracking information */
3445 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3446 qgroup = rb_entry(n, struct btrfs_qgroup, node);
3447 qgroup->rfer = 0;
3448 qgroup->rfer_cmpr = 0;
3449 qgroup->excl = 0;
3450 qgroup->excl_cmpr = 0;
3451 qgroup_dirty(fs_info, qgroup);
3452 }
3453 spin_unlock(&fs_info->qgroup_lock);
3454}
3455
3456int
3457btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3458{
3459 int ret = 0;
3460 struct btrfs_trans_handle *trans;
3461
3462 ret = qgroup_rescan_init(fs_info, 0, 1);
3463 if (ret)
3464 return ret;
3465
3466 /*
3467 * We have set the rescan_progress to 0, which means no more
3468 * delayed refs will be accounted by btrfs_qgroup_account_ref.
 3469 * However, btrfs_qgroup_account_ref may be running right after its
 3470 * call to btrfs_find_all_roots, in which case it would still do the
 3471 * accounting.
3472 * To solve this, we're committing the transaction, which will
3473 * ensure we run all delayed refs and only after that, we are
3474 * going to clear all tracking information for a clean start.
3475 */
3476
3477 trans = btrfs_join_transaction(fs_info->fs_root);
3478 if (IS_ERR(trans)) {
3479 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3480 return PTR_ERR(trans);
3481 }
3482 ret = btrfs_commit_transaction(trans);
3483 if (ret) {
3484 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3485 return ret;
3486 }
3487
3488 qgroup_rescan_zero_tracking(fs_info);
3489
3490 mutex_lock(&fs_info->qgroup_rescan_lock);
3491 fs_info->qgroup_rescan_running = true;
3492 btrfs_queue_work(fs_info->qgroup_rescan_workers,
3493 &fs_info->qgroup_rescan_work);
3494 mutex_unlock(&fs_info->qgroup_rescan_lock);
3495
3496 return 0;
3497}
3498
3499int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3500 bool interruptible)
3501{
3502 int running;
3503 int ret = 0;
3504
3505 mutex_lock(&fs_info->qgroup_rescan_lock);
3506 spin_lock(&fs_info->qgroup_lock);
3507 running = fs_info->qgroup_rescan_running;
3508 spin_unlock(&fs_info->qgroup_lock);
3509 mutex_unlock(&fs_info->qgroup_rescan_lock);
3510
3511 if (!running)
3512 return 0;
3513
3514 if (interruptible)
3515 ret = wait_for_completion_interruptible(
3516 &fs_info->qgroup_rescan_completion);
3517 else
3518 wait_for_completion(&fs_info->qgroup_rescan_completion);
3519
3520 return ret;
3521}
3522
3523/*
3524 * this is only called from open_ctree where we're still single threaded, thus
3525 * locking is omitted here.
3526 */
3527void
3528btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3529{
3530 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3531 mutex_lock(&fs_info->qgroup_rescan_lock);
3532 fs_info->qgroup_rescan_running = true;
3533 btrfs_queue_work(fs_info->qgroup_rescan_workers,
3534 &fs_info->qgroup_rescan_work);
3535 mutex_unlock(&fs_info->qgroup_rescan_lock);
3536 }
3537}
3538
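/*
 * Iterate an rbtree starting from @start, caching the next node before the
 * loop body runs so that the current node may be erased safely inside it.
 */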
3539#define rbtree_iterate_from_safe(node, next, start) \
3540 for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
3541
3542static int qgroup_unreserve_range(struct btrfs_inode *inode,
3543 struct extent_changeset *reserved, u64 start,
3544 u64 len)
3545{
3546 struct rb_node *node;
3547 struct rb_node *next;
3548 struct ulist_node *entry = NULL;
3549 int ret = 0;
3550
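	/*
	 * Binary search down to the first entry whose start is >= @start;
	 * the rb_prev() step below catches an earlier entry that still
	 * overlaps @start.
	 */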
3551 node = reserved->range_changed.root.rb_node;
3552 while (node) {
3553 entry = rb_entry(node, struct ulist_node, rb_node);
3554 if (entry->val < start)
3555 node = node->rb_right;
 3556 else
 3557 node = node->rb_left;
3560 }
3561
3562 /* Empty changeset */
3563 if (!entry)
3564 return 0;
3565
3566 if (entry->val > start && rb_prev(&entry->rb_node))
3567 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3568 rb_node);
3569
3570 rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3571 u64 entry_start;
3572 u64 entry_end;
3573 u64 entry_len;
3574 int clear_ret;
3575
3576 entry = rb_entry(node, struct ulist_node, rb_node);
3577 entry_start = entry->val;
3578 entry_end = entry->aux;
3579 entry_len = entry_end - entry_start + 1;
3580
3581 if (entry_start >= start + len)
3582 break;
3583 if (entry_start + entry_len <= start)
3584 continue;
3585 /*
3586 * Now the entry is in [start, start + len), revert the
3587 * EXTENT_QGROUP_RESERVED bit.
3588 */
3589 clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3590 entry_end, EXTENT_QGROUP_RESERVED);
3591 if (!ret && clear_ret < 0)
3592 ret = clear_ret;
3593
3594 ulist_del(&reserved->range_changed, entry->val, entry->aux);
3595 if (likely(reserved->bytes_changed >= entry_len)) {
3596 reserved->bytes_changed -= entry_len;
3597 } else {
3598 WARN_ON(1);
3599 reserved->bytes_changed = 0;
3600 }
3601 }
3602
3603 return ret;
3604}
3605
3606/*
3607 * Try to free some space for qgroup.
3608 *
3609 * For qgroup, there are only 3 ways to free qgroup space:
3610 * - Flush nodatacow write
3611 * Any nodatacow write will free its reserved data space at run_delalloc_range().
3612 * In theory, we should only flush nodatacow inodes, but it's not yet
3613 * possible, so we need to flush the whole root.
3614 *
3615 * - Wait for ordered extents
3616 * When ordered extents are finished, their reserved metadata is finally
 3617 * converted to per_trans status, which can be freed by a later
 3618 * transaction commit.
3619 *
3620 * - Commit transaction
3621 * This would free the meta_per_trans space.
 3622 * In theory this shouldn't provide much space, but any extra qgroup
 3623 * space is better than nothing.
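 *
 * Callers retry the reservation once after the flush; see
 * btrfs_qgroup_reserve_data() and __btrfs_qgroup_reserve_meta() below.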
3624 */
3625static int try_flush_qgroup(struct btrfs_root *root)
3626{
3627 struct btrfs_trans_handle *trans;
3628 int ret;
3629 bool can_commit = true;
3630
3631 /*
3632 * We don't want to run flush again and again, so if there is a running
 3633 * one, we simply wait for it to finish instead of starting a new flush.
3634 */
3635 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3636 wait_event(root->qgroup_flush_wait,
3637 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3638 return 0;
3639 }
3640
3641 /*
3642 * If current process holds a transaction, we shouldn't flush, as we
3643 * assume all space reservation happens before a transaction handle is
3644 * held.
3645 *
3646 * But there are cases like btrfs_delayed_item_reserve_metadata() where
 3647 * we try to reserve space with a transaction handle already held.
 3648 * In that case we can't commit the transaction, but at least try to end it
3649 * and hope the started data writes can free some space.
3650 */
3651 if (current->journal_info &&
3652 current->journal_info != BTRFS_SEND_TRANS_STUB)
3653 can_commit = false;
3654
3655 ret = btrfs_start_delalloc_snapshot(root);
3656 if (ret < 0)
3657 goto out;
3658 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3659
3660 trans = btrfs_join_transaction(root);
3661 if (IS_ERR(trans)) {
3662 ret = PTR_ERR(trans);
3663 goto out;
3664 }
3665
3666 if (can_commit)
3667 ret = btrfs_commit_transaction(trans);
3668 else
3669 ret = btrfs_end_transaction(trans);
3670out:
3671 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3672 wake_up(&root->qgroup_flush_wait);
3673 return ret;
3674}
3675
3676static int qgroup_reserve_data(struct btrfs_inode *inode,
3677 struct extent_changeset **reserved_ret, u64 start,
3678 u64 len)
3679{
3680 struct btrfs_root *root = inode->root;
3681 struct extent_changeset *reserved;
3682 bool new_reserved = false;
3683 u64 orig_reserved;
3684 u64 to_reserve;
3685 int ret;
3686
3687 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3688 !is_fstree(root->root_key.objectid) || len == 0)
3689 return 0;
3690
3691 /* @reserved parameter is mandatory for qgroup */
3692 if (WARN_ON(!reserved_ret))
3693 return -EINVAL;
3694 if (!*reserved_ret) {
3695 new_reserved = true;
3696 *reserved_ret = extent_changeset_alloc();
3697 if (!*reserved_ret)
3698 return -ENOMEM;
3699 }
3700 reserved = *reserved_ret;
3701 /* Record already reserved space */
3702 orig_reserved = reserved->bytes_changed;
3703 ret = set_record_extent_bits(&inode->io_tree, start,
 3704 start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
3705
3706 /* Newly reserved space */
3707 to_reserve = reserved->bytes_changed - orig_reserved;
3708 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3709 to_reserve, QGROUP_RESERVE);
3710 if (ret < 0)
3711 goto out;
3712 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3713 if (ret < 0)
3714 goto cleanup;
3715
3716 return ret;
3717
3718cleanup:
3719 qgroup_unreserve_range(inode, reserved, start, len);
3720out:
3721 if (new_reserved) {
3722 extent_changeset_release(reserved);
3723 kfree(reserved);
3724 *reserved_ret = NULL;
3725 }
3726 return ret;
3727}
3728
3729/*
3730 * Reserve qgroup space for range [start, start + len).
3731 *
3732 * This function will either reserve space from related qgroups or do nothing
3733 * if the range is already reserved.
3734 *
3735 * Return 0 for successful reservation
 3736 * Return <0 for error (including -EDQUOT)
3737 *
3738 * NOTE: This function may sleep for memory allocation, dirty page flushing and
3739 * commit transaction. So caller should not hold any dirty page locked.
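 *
 * Typical lifecycle of the reservation: reserved here at buffered write
 * time, then either released via btrfs_qgroup_release_data() once the
 * data hits disk and the file extent item is inserted, or freed via
 * btrfs_qgroup_free_data() on error or page invalidation.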
3740 */
3741int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3742 struct extent_changeset **reserved_ret, u64 start,
3743 u64 len)
3744{
3745 int ret;
3746
3747 ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3748 if (ret <= 0 && ret != -EDQUOT)
3749 return ret;
3750
3751 ret = try_flush_qgroup(inode->root);
3752 if (ret < 0)
3753 return ret;
3754 return qgroup_reserve_data(inode, reserved_ret, start, len);
3755}
3756
3757/* Free ranges specified by @reserved, normally in error path */
3758static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3759 struct extent_changeset *reserved, u64 start, u64 len)
3760{
3761 struct btrfs_root *root = inode->root;
3762 struct ulist_node *unode;
3763 struct ulist_iterator uiter;
3764 struct extent_changeset changeset;
3765 int freed = 0;
3766 int ret;
3767
3768 extent_changeset_init(&changeset);
3769 len = round_up(start + len, root->fs_info->sectorsize);
3770 start = round_down(start, root->fs_info->sectorsize);
3771
3772 ULIST_ITER_INIT(&uiter);
3773 while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3774 u64 range_start = unode->val;
3775 /* unode->aux is the inclusive end */
3776 u64 range_len = unode->aux - range_start + 1;
3777 u64 free_start;
3778 u64 free_len;
3779
3780 extent_changeset_release(&changeset);
3781
3782 /* Only free range in range [start, start + len) */
3783 if (range_start >= start + len ||
3784 range_start + range_len <= start)
3785 continue;
3786 free_start = max(range_start, start);
3787 free_len = min(start + len, range_start + range_len) -
3788 free_start;
3789 /*
3790 * TODO: To also modify reserved->ranges_reserved to reflect
3791 * the modification.
3792 *
 3793 * However, as long as we free qgroup reserved space according to
 3794 * EXTENT_QGROUP_RESERVED, we won't double free.
 3795 * So there is no need to rush.
3796 */
3797 ret = clear_record_extent_bits(&inode->io_tree, free_start,
3798 free_start + free_len - 1,
3799 EXTENT_QGROUP_RESERVED, &changeset);
3800 if (ret < 0)
3801 goto out;
3802 freed += changeset.bytes_changed;
3803 }
3804 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3805 BTRFS_QGROUP_RSV_DATA);
3806 ret = freed;
3807out:
3808 extent_changeset_release(&changeset);
3809 return ret;
3810}
3811
3812static int __btrfs_qgroup_release_data(struct inode *inode,
3813 struct extent_changeset *reserved, u64 start, u64 len,
3814 int free)
3815{
3816 struct extent_changeset changeset;
3817 int trace_op = QGROUP_RELEASE;
3818 int ret;
3819
3820 if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
3821 &BTRFS_I(inode)->root->fs_info->flags))
3822 return 0;
3823
3824 /* In release case, we shouldn't have @reserved */
3825 WARN_ON(!free && reserved);
3826 if (free && reserved)
3827 return qgroup_free_reserved_data(BTRFS_I(inode), reserved,
3828 start, len);
3829 extent_changeset_init(&changeset);
3830 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
 3831 start + len - 1, EXTENT_QGROUP_RESERVED, &changeset);
3832 if (ret < 0)
3833 goto out;
3834
3835 if (free)
3836 trace_op = QGROUP_FREE;
3837 trace_btrfs_qgroup_release_data(inode, start, len,
3838 changeset.bytes_changed, trace_op);
3839 if (free)
3840 btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3841 BTRFS_I(inode)->root->root_key.objectid,
3842 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3843 ret = changeset.bytes_changed;
3844out:
3845 extent_changeset_release(&changeset);
3846 return ret;
3847}
3848
3849/*
3850 * Free a reserved space range from io_tree and related qgroups
3851 *
 3852 * Should be called when a range of pages gets invalidated before reaching
 3853 * disk, or for the error cleanup case.
 3854 * If @reserved is given, only the reserved range in [@start, @start + @len)
 3855 * will be freed.
3856 *
3857 * For data written to disk, use btrfs_qgroup_release_data().
3858 *
3859 * NOTE: This function may sleep for memory allocation.
3860 */
3861int btrfs_qgroup_free_data(struct inode *inode,
3862 struct extent_changeset *reserved, u64 start, u64 len)
3863{
3864 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3865}
3866
3867/*
3868 * Release a reserved space range from io_tree only.
3869 *
3870 * Should be called when a range of pages get written to disk and corresponding
3871 * FILE_EXTENT is inserted into corresponding root.
3872 *
 3873 * Since the new qgroup accounting framework will only update qgroup numbers at
3874 * commit_transaction() time, its reserved space shouldn't be freed from
3875 * related qgroups.
3876 *
3877 * But we should release the range from io_tree, to allow further write to be
3878 * COWed.
3879 *
3880 * NOTE: This function may sleep for memory allocation.
3881 */
3882int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
3883{
3884 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3885}
3886
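/*
 * Track per-root meta reservations in root->qgroup_meta_rsv_prealloc and
 * root->qgroup_meta_rsv_pertrans, so that a later free can be clamped to
 * what was actually reserved; this guards against underflow when quotas
 * are disabled and re-enabled in between (see the comments in
 * btrfs_qgroup_reserve_meta() and __btrfs_qgroup_free_meta()).
 */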
3887static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3888 enum btrfs_qgroup_rsv_type type)
3889{
3890 if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3891 type != BTRFS_QGROUP_RSV_META_PERTRANS)
3892 return;
3893 if (num_bytes == 0)
3894 return;
3895
3896 spin_lock(&root->qgroup_meta_rsv_lock);
3897 if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3898 root->qgroup_meta_rsv_prealloc += num_bytes;
3899 else
3900 root->qgroup_meta_rsv_pertrans += num_bytes;
3901 spin_unlock(&root->qgroup_meta_rsv_lock);
3902}
3903
3904static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3905 enum btrfs_qgroup_rsv_type type)
3906{
3907 if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3908 type != BTRFS_QGROUP_RSV_META_PERTRANS)
3909 return 0;
3910 if (num_bytes == 0)
3911 return 0;
3912
3913 spin_lock(&root->qgroup_meta_rsv_lock);
3914 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3915 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3916 num_bytes);
3917 root->qgroup_meta_rsv_prealloc -= num_bytes;
3918 } else {
3919 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3920 num_bytes);
3921 root->qgroup_meta_rsv_pertrans -= num_bytes;
3922 }
3923 spin_unlock(&root->qgroup_meta_rsv_lock);
3924 return num_bytes;
3925}
3926
3927int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3928 enum btrfs_qgroup_rsv_type type, bool enforce)
3929{
3930 struct btrfs_fs_info *fs_info = root->fs_info;
3931 int ret;
3932
3933 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3934 !is_fstree(root->root_key.objectid) || num_bytes == 0)
3935 return 0;
3936
3937 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3938 trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3939 ret = qgroup_reserve(root, num_bytes, enforce, type);
3940 if (ret < 0)
3941 return ret;
3942 /*
3943	 * Record what we have reserved into the root, to avoid an underflow
3944	 * across a quota disabled->enabled transition.
3945	 *
3946	 * In that case we may try to free space we haven't reserved (since
3947	 * quota was disabled), so record what we reserved into the root and
3948	 * ensure a later release won't underflow this number.
3949 */
3950 add_root_meta_rsv(root, num_bytes, type);
3951 return ret;
3952}
3953
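/*
 * Editor's note: a hedged sketch, not part of the original file, showing
 * why the per-root counters above prevent underflow when quotas get
 * enabled between a reserve and its matching free.
 */
#if 0
static void example_meta_rsv_clamp(struct btrfs_root *root, u32 nodesize)
{
	/* Quota disabled: this returns 0 early and records nothing */
	btrfs_qgroup_reserve_meta(root, nodesize,
				  BTRFS_QGROUP_RSV_META_PREALLOC, true);

	/* ... quotas get enabled here ... */

	/*
	 * sub_root_meta_rsv() clamps the free to the recorded amount (0),
	 * so no qgroup bytes are freed and nothing underflows.
	 */
	__btrfs_qgroup_free_meta(root, nodesize,
				 BTRFS_QGROUP_RSV_META_PREALLOC);
}
#endif
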
3954int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3955 enum btrfs_qgroup_rsv_type type, bool enforce)
3956{
3957 int ret;
3958
3959 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
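	/* Only -EDQUOT is worth a flush and retry; pass other results through */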
3960 if (ret <= 0 && ret != -EDQUOT)
3961 return ret;
3962
3963 ret = try_flush_qgroup(root);
3964 if (ret < 0)
3965 return ret;
3966 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3967}
3968
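/*
 * Free all META_PERTRANS reservations of @root in one go, typically when
 * the current transaction is being committed.
 */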
3969void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3970{
3971 struct btrfs_fs_info *fs_info = root->fs_info;
3972
3973 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3974 !is_fstree(root->root_key.objectid))
3975 return;
3976
3977 /* TODO: Update trace point to handle such free */
3978 trace_qgroup_meta_free_all_pertrans(root);
3979 /* Special value -1 means to free all reserved space */
3980 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3981 BTRFS_QGROUP_RSV_META_PERTRANS);
3982}
3983
3984void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3985 enum btrfs_qgroup_rsv_type type)
3986{
3987 struct btrfs_fs_info *fs_info = root->fs_info;
3988
3989 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3990 !is_fstree(root->root_key.objectid))
3991 return;
3992
3993 /*
3994	 * A reservation for META_PREALLOC can happen before quota is enabled,
3995	 * which can lead to underflow.
3996	 * Ensure here that we only free what we have really reserved.
3997 */
3998 num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3999 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
4000 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
4001 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
4002 num_bytes, type);
4003}
4004
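/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS for
 * @ref_root and all qgroups above it in the hierarchy, walked through
 * fs_info->qgroup_ulist.
 */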
4005static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
4006 int num_bytes)
4007{
4008 struct btrfs_qgroup *qgroup;
4009 struct ulist_node *unode;
4010 struct ulist_iterator uiter;
4011 int ret = 0;
4012
4013 if (num_bytes == 0)
4014 return;
4015 if (!fs_info->quota_root)
4016 return;
4017
4018 spin_lock(&fs_info->qgroup_lock);
4019 qgroup = find_qgroup_rb(fs_info, ref_root);
4020 if (!qgroup)
4021 goto out;
4022 ulist_reinit(fs_info->qgroup_ulist);
4023 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
4024 qgroup_to_aux(qgroup), GFP_ATOMIC);
4025 if (ret < 0)
4026 goto out;
4027 ULIST_ITER_INIT(&uiter);
4028 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
4029 struct btrfs_qgroup *qg;
4030 struct btrfs_qgroup_list *glist;
4031
4032 qg = unode_aux_to_qgroup(unode);
4033
4034 qgroup_rsv_release(fs_info, qg, num_bytes,
4035 BTRFS_QGROUP_RSV_META_PREALLOC);
4036 qgroup_rsv_add(fs_info, qg, num_bytes,
4037 BTRFS_QGROUP_RSV_META_PERTRANS);
4038 list_for_each_entry(glist, &qg->groups, next_group) {
4039 ret = ulist_add(fs_info->qgroup_ulist,
4040 glist->group->qgroupid,
4041 qgroup_to_aux(glist->group), GFP_ATOMIC);
4042 if (ret < 0)
4043 goto out;
4044 }
4045 }
4046out:
4047 spin_unlock(&fs_info->qgroup_lock);
4048}
4049
4050void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
4051{
4052 struct btrfs_fs_info *fs_info = root->fs_info;
4053
4054 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4055 !is_fstree(root->root_key.objectid))
4056 return;
4057 /* Same as btrfs_qgroup_free_meta_prealloc() */
4058 num_bytes = sub_root_meta_rsv(root, num_bytes,
4059 BTRFS_QGROUP_RSV_META_PREALLOC);
4060 trace_qgroup_meta_convert(root, num_bytes);
4061 qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
4062 if (!sb_rdonly(fs_info->sb))
4063 add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
4064}
4065
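/*
 * Editor's note: an illustrative sketch, not part of the original file.
 * When PREALLOC metadata reservation is actually used inside a transaction,
 * it is converted to PERTRANS rather than freed, and then dropped in bulk
 * at commit time by btrfs_qgroup_free_meta_all_pertrans().
 */
#if 0
static void example_meta_convert(struct btrfs_root *root, u32 nodesize)
{
	/* Reserved up front, before joining a transaction */
	btrfs_qgroup_reserve_meta(root, nodesize,
				  BTRFS_QGROUP_RSV_META_PREALLOC, true);

	/* The reservation got consumed inside the transaction: convert it */
	btrfs_qgroup_convert_reserved_meta(root, nodesize);
}
#endif
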
4066/*
4067 * Check for leaked qgroup reserved space, normally at inode destruction
4068 * time.
4069 */
4070void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4071{
4072 struct extent_changeset changeset;
4073 struct ulist_node *unode;
4074 struct ulist_iterator iter;
4075 int ret;
4076
4077 extent_changeset_init(&changeset);
4078 ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4079 EXTENT_QGROUP_RESERVED, &changeset);
4080
4081 WARN_ON(ret < 0);
4082 if (WARN_ON(changeset.bytes_changed)) {
4083 ULIST_ITER_INIT(&iter);
4084 while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4085 btrfs_warn(inode->root->fs_info,
4086 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4087 btrfs_ino(inode), unode->val, unode->aux);
4088 }
4089 btrfs_qgroup_free_refroot(inode->root->fs_info,
4090 inode->root->root_key.objectid,
4091 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4092
4093 }
4094 extent_changeset_release(&changeset);
4095}
4096
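/*
 * Editor's note: a hedged sketch, not part of the original file.  The
 * inode eviction path is assumed as the caller; the point is only that
 * the leak check runs once no further writes to the inode can happen.
 */
#if 0
static void example_destroy_inode(struct inode *inode)
{
	/* All pages are gone; a leftover EXTENT_QGROUP_RESERVED bit is a leak */
	btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
}
#endif
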
4097void btrfs_qgroup_init_swapped_blocks(
4098 struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4099{
4100 int i;
4101
4102 spin_lock_init(&swapped_blocks->lock);
4103 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4104 swapped_blocks->blocks[i] = RB_ROOT;
4105 swapped_blocks->swapped = false;
4106}
4107
4108/*
4109 * Delete all swapped block records of @root.
4110 * Every record here means we skipped a full subtree scan for qgroups.
4111 *
4112 * Called when committing a transaction.
4113 */
4114void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4115{
4116 struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4117 int i;
4118
4119 swapped_blocks = &root->swapped_blocks;
4120
4121 spin_lock(&swapped_blocks->lock);
4122 if (!swapped_blocks->swapped)
4123 goto out;
4124 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4125 struct rb_root *cur_root = &swapped_blocks->blocks[i];
4126 struct btrfs_qgroup_swapped_block *entry;
4127 struct btrfs_qgroup_swapped_block *next;
4128
4129 rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4130 node)
4131 kfree(entry);
4132 swapped_blocks->blocks[i] = RB_ROOT;
4133 }
4134 swapped_blocks->swapped = false;
4135out:
4136 spin_unlock(&swapped_blocks->lock);
4137}
4138
4139/*
4140 * Add a subtree root record into @subvol_root.
4141 *
4142 * @subvol_root: tree root of the subvolume tree that gets swapped
4143 * @bg: block group under balance
4144 * @subvol_parent/slot: pointer to the subtree root in the subvolume tree
4145 * @reloc_parent/slot: pointer to the subtree root in the reloc tree
4146 * BOTH POINTERS ARE BEFORE TREE SWAP
4147 * @last_snapshot: last snapshot generation of the subvolume tree
4148 */
4149int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *subvol_root,
4151 struct btrfs_block_group_cache *bg,
4152 struct extent_buffer *subvol_parent, int subvol_slot,
4153 struct extent_buffer *reloc_parent, int reloc_slot,
4154 u64 last_snapshot)
4155{
4156 struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4157 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4158 struct btrfs_qgroup_swapped_block *block;
4159 struct rb_node **cur;
4160 struct rb_node *parent = NULL;
4161 int level = btrfs_header_level(subvol_parent) - 1;
4162 int ret = 0;
4163
4164 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4165 return 0;
4166
4167 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4168 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4169 btrfs_err_rl(fs_info,
4170 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4171 __func__,
4172 btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4173 btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4174 return -EUCLEAN;
4175 }
4176
4177 block = kmalloc(sizeof(*block), GFP_NOFS);
4178 if (!block) {
4179 ret = -ENOMEM;
4180 goto out;
4181 }
4182
4183 /*
4184	 * @reloc_parent/slot still points at the pre-swap tree, while @block
4185	 * records the bytenr after the swap, so we do the swap here.
4186 */
4187 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4188 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4189 reloc_slot);
4190 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4191 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4192 subvol_slot);
4193 block->last_snapshot = last_snapshot;
4194 block->level = level;
4195
4196 /*
4197	 * If we have bg == NULL, we're called from btrfs_recover_relocation();
4198	 * no one else can modify tree blocks, thus the qgroup numbers will not
4199	 * change no matter the value of trace_leaf.
4200 */
4201 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4202 block->trace_leaf = true;
4203 else
4204 block->trace_leaf = false;
4205 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4206
4207 /* Insert @block into @blocks */
4208 spin_lock(&blocks->lock);
4209 cur = &blocks->blocks[level].rb_node;
4210 while (*cur) {
4211 struct btrfs_qgroup_swapped_block *entry;
4212
4213 parent = *cur;
4214 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4215 node);
4216
4217 if (entry->subvol_bytenr < block->subvol_bytenr) {
4218 cur = &(*cur)->rb_left;
4219 } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4220 cur = &(*cur)->rb_right;
4221 } else {
4222 if (entry->subvol_generation !=
4223 block->subvol_generation ||
4224 entry->reloc_bytenr != block->reloc_bytenr ||
4225 entry->reloc_generation !=
4226 block->reloc_generation) {
4227 /*
4228				 * A duplicate entry with mismatched data was
4229				 * found.  This shouldn't happen.
4230 *
4231 * Marking qgroup inconsistent should be enough
4232 * for end users.
4233 */
4234 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4235 ret = -EEXIST;
4236 }
4237 kfree(block);
4238 goto out_unlock;
4239 }
4240 }
4241 rb_link_node(&block->node, parent, cur);
4242 rb_insert_color(&block->node, &blocks->blocks[level]);
4243 blocks->swapped = true;
4244out_unlock:
4245 spin_unlock(&blocks->lock);
4246out:
4247 if (ret < 0)
4248 fs_info->qgroup_flags |=
4249 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4250 return ret;
4251}
4252
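/*
 * Editor's note: a worked example with invented numbers.  Suppose the
 * subvolume tree points at a subtree root at bytenr 4096 (gen 10) and the
 * reloc tree at bytenr 8192 (gen 12).  After relocation swaps the two
 * pointers, the subvolume tree owns bytenr 8192, so the record stores
 * subvol_bytenr = 8192 / subvol_generation = 12 and reloc_bytenr = 4096 /
 * reloc_generation = 10, matching the swap done when filling in @block
 * below.
 */
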
4253/*
4254 * Check if the tree block is a subtree root, and if so, do the needed
4255 * delayed subtree tracing for qgroup.
4256 *
4257 * This is called during btrfs_cow_block().
4258 */
4259int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4260 struct btrfs_root *root,
4261 struct extent_buffer *subvol_eb)
4262{
4263 struct btrfs_fs_info *fs_info = root->fs_info;
4264 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4265 struct btrfs_qgroup_swapped_block *block;
4266 struct extent_buffer *reloc_eb = NULL;
4267 struct rb_node *node;
4268 bool found = false;
4269 bool swapped = false;
4270 int level = btrfs_header_level(subvol_eb);
4271 int ret = 0;
4272 int i;
4273
4274 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4275 return 0;
4276 if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4277 return 0;
4278
4279 spin_lock(&blocks->lock);
4280 if (!blocks->swapped) {
4281 spin_unlock(&blocks->lock);
4282 return 0;
4283 }
4284 node = blocks->blocks[level].rb_node;
4285
4286 while (node) {
4287 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4288 if (block->subvol_bytenr < subvol_eb->start) {
4289 node = node->rb_left;
4290 } else if (block->subvol_bytenr > subvol_eb->start) {
4291 node = node->rb_right;
4292 } else {
4293 found = true;
4294 break;
4295 }
4296 }
4297 if (!found) {
4298 spin_unlock(&blocks->lock);
4299 goto out;
4300 }
4301 /* Found one, remove it from @blocks first and update blocks->swapped */
4302 rb_erase(&block->node, &blocks->blocks[level]);
4303 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4304 if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4305 swapped = true;
4306 break;
4307 }
4308 }
4309 blocks->swapped = swapped;
4310 spin_unlock(&blocks->lock);
4311
4312 /* Read out reloc subtree root */
4313 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
4314 block->reloc_generation, block->level,
4315 &block->first_key);
4316 if (IS_ERR(reloc_eb)) {
4317 ret = PTR_ERR(reloc_eb);
4318 reloc_eb = NULL;
4319 goto free_out;
4320 }
4321 if (!extent_buffer_uptodate(reloc_eb)) {
4322 ret = -EIO;
4323 goto free_out;
4324 }
4325
4326 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4327 block->last_snapshot, block->trace_leaf);
4328free_out:
4329 kfree(block);
4330 free_extent_buffer(reloc_eb);
4331out:
4332 if (ret < 0) {
4333 btrfs_err_rl(fs_info,
4334 "failed to account subtree at bytenr %llu: %d",
4335 subvol_eb->start, ret);
4336 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4337 }
4338 return ret;
4339}
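
/*
 * Editor's note: an illustrative call-site sketch, not part of the original
 * file.  btrfs_cow_block() is assumed to invoke the hook right after COWing
 * a block, so a recorded swapped subtree root gets its delayed qgroup trace
 * exactly once.
 */
#if 0
static int example_cow_hook(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct extent_buffer *buf)
{
	return btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
}
#endif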
4340
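/*
 * Free all qgroup extent records attached to @trans without accounting
 * them, used on transaction abort/cleanup paths.
 */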
4341void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4342{
4343 struct btrfs_qgroup_extent_record *entry;
4344 struct btrfs_qgroup_extent_record *next;
4345 struct rb_root *root;
4346
4347 root = &trans->delayed_refs.dirty_extent_root;
4348 rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4349 ulist_free(entry->old_roots);
4350 kfree(entry);
4351 }
4352 *root = RB_ROOT;
4353}