1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 * Written by Alex Tomas <alex@clusterfs.com>
5 *
6 * Architecture independence:
7 * Copyright (c) 2005, Bull S.A.
8 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
9 */
10
11/*
12 * Extents support for EXT4
13 *
14 * TODO:
15 * - ext4*_error() should be used in some situations
16 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
17 * - smart tree reduction
18 */
19
20#include <linux/fs.h>
21#include <linux/time.h>
22#include <linux/jbd2.h>
23#include <linux/highuid.h>
24#include <linux/pagemap.h>
25#include <linux/quotaops.h>
26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/uaccess.h>
29#include <linux/fiemap.h>
30#include <linux/backing-dev.h>
31#include "ext4_jbd2.h"
32#include "ext4_extents.h"
33#include "xattr.h"
34
35#include <trace/events/ext4.h>
36
37/*
38 * used by extent splitting.
39 */
40#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
41 due to ENOSPC */
42#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
43#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
44
45#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
46#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
47
48static __le32 ext4_extent_block_csum(struct inode *inode,
49 struct ext4_extent_header *eh)
50{
51 struct ext4_inode_info *ei = EXT4_I(inode);
52 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
53 __u32 csum;
54
55 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
56 EXT4_EXTENT_TAIL_OFFSET(eh));
57 return cpu_to_le32(csum);
58}
59
60static int ext4_extent_block_csum_verify(struct inode *inode,
61 struct ext4_extent_header *eh)
62{
63 struct ext4_extent_tail *et;
64
65 if (!ext4_has_metadata_csum(inode->i_sb))
66 return 1;
67
68 et = find_ext4_extent_tail(eh);
69 if (et->et_checksum != ext4_extent_block_csum(inode, eh))
70 return 0;
71 return 1;
72}
73
74static void ext4_extent_block_csum_set(struct inode *inode,
75 struct ext4_extent_header *eh)
76{
77 struct ext4_extent_tail *et;
78
79 if (!ext4_has_metadata_csum(inode->i_sb))
80 return;
81
82 et = find_ext4_extent_tail(eh);
83 et->et_checksum = ext4_extent_block_csum(inode, eh);
84}
85
86static int ext4_split_extent(handle_t *handle,
87 struct inode *inode,
88 struct ext4_ext_path **ppath,
89 struct ext4_map_blocks *map,
90 int split_flag,
91 int flags);
92
93static int ext4_split_extent_at(handle_t *handle,
94 struct inode *inode,
95 struct ext4_ext_path **ppath,
96 ext4_lblk_t split,
97 int split_flag,
98 int flags);
99
100static int ext4_find_delayed_extent(struct inode *inode,
101 struct extent_status *newes);
102
103static int ext4_ext_truncate_extend_restart(handle_t *handle,
104 struct inode *inode,
105 int needed)
106{
107 int err;
108
109 if (!ext4_handle_valid(handle))
110 return 0;
111 if (handle->h_buffer_credits >= needed)
112 return 0;
113 /*
114 * If we need to extend the journal, get a few extra blocks
115 * while we're at it for efficiency's sake.
116 */
117 needed += 3;
118 err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
119 if (err <= 0)
120 return err;
121 err = ext4_truncate_restart_trans(handle, inode, needed);
122 if (err == 0)
123 err = -EAGAIN;
124
125 return err;
126}
127
128/*
129 * could return:
130 * - EROFS
131 * - ENOMEM
132 */
133static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
134 struct ext4_ext_path *path)
135{
136 int err = 0;
137
138 if (path->p_bh) {
139 /* path points to block */
140 BUFFER_TRACE(path->p_bh, "get_write_access");
141 err = ext4_journal_get_write_access(handle, path->p_bh);
142
143 /*
144 * The extent buffer's verified bit will be set again in
145 * __ext4_ext_dirty(). We could leave an inconsistent
146 * buffer if the extent updating procedure breaks off due
147 * to an error, so force a re-check.
148 */
149 if (!err)
150 clear_buffer_verified(path->p_bh);
151 }
152 /* path points to leaf/index in inode body */
153 /* we use in-core data, no need to protect them */
154 return err;
155}
156
157/*
158 * could return:
159 * - EROFS
160 * - ENOMEM
161 * - EIO
162 */
163int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
164 struct inode *inode, struct ext4_ext_path *path)
165{
166 int err;
167
168 WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
169 if (path->p_bh) {
170 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
171 /* path points to block */
172 err = __ext4_handle_dirty_metadata(where, line, handle,
173 inode, path->p_bh);
174 /* Extents updating done, re-set verified flag */
175 if (!err)
176 set_buffer_verified(path->p_bh);
177 } else {
178 /* path points to leaf/index in inode body */
179 err = ext4_mark_inode_dirty(handle, inode);
180 }
181 return err;
182}
183
184static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 struct ext4_ext_path *path,
186 ext4_lblk_t block)
187{
188 if (path) {
189 int depth = path->p_depth;
190 struct ext4_extent *ex;
191
192 /*
193 * Try to predict block placement assuming that we are
194 * filling in a file which will eventually be
195 * non-sparse --- i.e., in the case of libbfd writing
196 * an ELF object's sections out-of-order but in a way
197 * that eventually results in a contiguous object or
198 * executable file, or some database extending a table
199 * space file. However, this is actually somewhat
200 * non-ideal if we are writing a sparse file such as
201 * qemu or KVM writing a raw image file that is going
202 * to stay fairly sparse, since it will end up
203 * fragmenting the file system's free space. Maybe we
204 * should have some heuristics or some way to allow
205 * userspace to pass a hint to the file system,
206 * especially if the latter case turns out to be
207 * common.
208 */
209 ex = path[depth].p_ext;
210 if (ex) {
211 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
212 ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
213
214 if (block > ext_block)
215 return ext_pblk + (block - ext_block);
216 else
217 return ext_pblk - (ext_block - block);
218 }
219
220 /* it looks like index is empty;
221 * try to find starting block from index itself */
222 if (path[depth].p_bh)
223 return path[depth].p_bh->b_blocknr;
224 }
225
226 /* OK. use inode's group */
227 return ext4_inode_to_goal_block(inode);
228}
229
230/*
231 * Allocation for a meta data block
232 */
233static ext4_fsblk_t
234ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
235 struct ext4_ext_path *path,
236 struct ext4_extent *ex, int *err, unsigned int flags)
237{
238 ext4_fsblk_t goal, newblock;
239
240 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
241 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
242 NULL, err);
243 return newblock;
244}
245
246static inline int ext4_ext_space_block(struct inode *inode, int check)
247{
248 int size;
249
250 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
251 / sizeof(struct ext4_extent);
252#ifdef AGGRESSIVE_TEST
253 if (!check && size > 6)
254 size = 6;
255#endif
256 return size;
257}
258
259static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
260{
261 int size;
262
263 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
264 / sizeof(struct ext4_extent_idx);
265#ifdef AGGRESSIVE_TEST
266 if (!check && size > 5)
267 size = 5;
268#endif
269 return size;
270}
271
272static inline int ext4_ext_space_root(struct inode *inode, int check)
273{
274 int size;
275
276 size = sizeof(EXT4_I(inode)->i_data);
277 size -= sizeof(struct ext4_extent_header);
278 size /= sizeof(struct ext4_extent);
279#ifdef AGGRESSIVE_TEST
280 if (!check && size > 3)
281 size = 3;
282#endif
283 return size;
284}
285
286static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
287{
288 int size;
289
290 size = sizeof(EXT4_I(inode)->i_data);
291 size -= sizeof(struct ext4_extent_header);
292 size /= sizeof(struct ext4_extent_idx);
293#ifdef AGGRESSIVE_TEST
294 if (!check && size > 4)
295 size = 4;
296#endif
297 return size;
298}
299
300static inline int
301ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
302 struct ext4_ext_path **ppath, ext4_lblk_t lblk,
303 int nofail)
304{
305 struct ext4_ext_path *path = *ppath;
306 int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
307 int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
308
309 if (nofail)
310 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
311
312 return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
313 EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
314 flags);
315}
316
317/*
318 * Calculate the number of metadata blocks needed
319 * to allocate @blocks
320 * Worst case is one block per extent
321 */
322int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
323{
324 struct ext4_inode_info *ei = EXT4_I(inode);
325 int idxs;
326
327 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
328 / sizeof(struct ext4_extent_idx));
329
330 /*
331 * If the new delayed allocation block is contiguous with the
332 * previous da block, it can share index blocks with the
333 * previous block, so we only need to allocate a new index
334 * block every idxs leaf blocks. At idxs**2 blocks, we need
335 * an additional index block, and at idxs**3 blocks, yet
336 * another index block.
337 */
338 if (ei->i_da_metadata_calc_len &&
339 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
340 int num = 0;
341
342 if ((ei->i_da_metadata_calc_len % idxs) == 0)
343 num++;
344 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
345 num++;
346 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
347 num++;
348 ei->i_da_metadata_calc_len = 0;
349 } else
350 ei->i_da_metadata_calc_len++;
351 ei->i_da_metadata_calc_last_lblock++;
352 return num;
353 }
354
355 /*
356 * In the worst case we need a new set of index blocks at
357 * every level of the inode's extent tree.
358 */
359 ei->i_da_metadata_calc_len = 1;
360 ei->i_da_metadata_calc_last_lblock = lblock;
361 return ext_depth(inode) + 1;
362}
363
364static int
365ext4_ext_max_entries(struct inode *inode, int depth)
366{
367 int max;
368
369 if (depth == ext_depth(inode)) {
370 if (depth == 0)
371 max = ext4_ext_space_root(inode, 1);
372 else
373 max = ext4_ext_space_root_idx(inode, 1);
374 } else {
375 if (depth == 0)
376 max = ext4_ext_space_block(inode, 1);
377 else
378 max = ext4_ext_space_block_idx(inode, 1);
379 }
380
381 return max;
382}
383
384static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
385{
386 ext4_fsblk_t block = ext4_ext_pblock(ext);
387 int len = ext4_ext_get_actual_len(ext);
388 ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
389
390 /*
391 * We allow neither:
392 * - zero length
393 * - overflow/wrap-around
394 */
395 if (lblock + len <= lblock)
396 return 0;
397 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
398}
399
400static int ext4_valid_extent_idx(struct inode *inode,
401 struct ext4_extent_idx *ext_idx)
402{
403 ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
404
405 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
406}
407
408static int ext4_valid_extent_entries(struct inode *inode,
409 struct ext4_extent_header *eh,
410 ext4_lblk_t lblk, ext4_fsblk_t *pblk,
411 int depth)
412{
413 unsigned short entries;
414 ext4_lblk_t lblock = 0;
415 ext4_lblk_t cur = 0;
416
417 if (eh->eh_entries == 0)
418 return 1;
419
420 entries = le16_to_cpu(eh->eh_entries);
421
422 if (depth == 0) {
423 /* leaf entries */
424 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
425 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
426 ext4_fsblk_t pblock = 0;
427
428 /*
429 * The logical block in the first entry should be equal to
430 * the number in the index block.
431 */
432 if (depth != ext_depth(inode) &&
433 lblk != le32_to_cpu(ext->ee_block))
434 return 0;
435 while (entries) {
436 if (!ext4_valid_extent(inode, ext))
437 return 0;
438
439 /* Check for overlapping extents */
440 lblock = le32_to_cpu(ext->ee_block);
441 if (lblock < cur) {
442 pblock = ext4_ext_pblock(ext);
443 es->s_last_error_block = cpu_to_le64(pblock);
444 return 0;
445 }
446 cur = lblock + ext4_ext_get_actual_len(ext);
447 ext++;
448 entries--;
449 }
450 } else {
451 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
452
453 /*
454 * The logical block in the first entry should be equal to
455 * the number in the parent index block.
456 */
457 if (depth != ext_depth(inode) &&
458 lblk != le32_to_cpu(ext_idx->ei_block))
459 return 0;
460 while (entries) {
461 if (!ext4_valid_extent_idx(inode, ext_idx))
462 return 0;
463
464 /* Check for overlapping index extents */
465 lblock = le32_to_cpu(ext_idx->ei_block);
466 if (lblock < cur) {
467 *pblk = ext4_idx_pblock(ext_idx);
468 return 0;
469 }
470 ext_idx++;
471 entries--;
472 cur = lblock + 1;
473 }
474 }
475 return 1;
476}
477
478static int __ext4_ext_check(const char *function, unsigned int line,
479 struct inode *inode, struct ext4_extent_header *eh,
480 int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
481{
482 const char *error_msg;
483 int max = 0, err = -EFSCORRUPTED;
484
485 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
486 error_msg = "invalid magic";
487 goto corrupted;
488 }
489 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
490 error_msg = "unexpected eh_depth";
491 goto corrupted;
492 }
493 if (unlikely(eh->eh_max == 0)) {
494 error_msg = "invalid eh_max";
495 goto corrupted;
496 }
497 max = ext4_ext_max_entries(inode, depth);
498 if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
499 error_msg = "too large eh_max";
500 goto corrupted;
501 }
502 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
503 error_msg = "invalid eh_entries";
504 goto corrupted;
505 }
506 if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
507 error_msg = "eh_entries is 0 but eh_depth is > 0";
508 goto corrupted;
509 }
510 if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
511 error_msg = "invalid extent entries";
512 goto corrupted;
513 }
514 if (unlikely(depth > 32)) {
515 error_msg = "too large eh_depth";
516 goto corrupted;
517 }
518 /* Verify checksum on non-root extent tree nodes */
519 if (ext_depth(inode) != depth &&
520 !ext4_extent_block_csum_verify(inode, eh)) {
521 error_msg = "extent tree corrupted";
522 err = -EFSBADCRC;
523 goto corrupted;
524 }
525 return 0;
526
527corrupted:
528 ext4_error_inode(inode, function, line, 0,
529 "pblk %llu bad header/extent: %s - magic %x, "
530 "entries %u, max %u(%u), depth %u(%u)",
531 (unsigned long long) pblk, error_msg,
532 le16_to_cpu(eh->eh_magic),
533 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
534 max, le16_to_cpu(eh->eh_depth), depth);
535 return err;
536}
537
538#define ext4_ext_check(inode, eh, depth, pblk) \
539 __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
540
541int ext4_ext_check_inode(struct inode *inode)
542{
543 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
544}
545
546static void ext4_cache_extents(struct inode *inode,
547 struct ext4_extent_header *eh)
548{
549 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
550 ext4_lblk_t prev = 0;
551 int i;
552
553 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
554 unsigned int status = EXTENT_STATUS_WRITTEN;
555 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
556 int len = ext4_ext_get_actual_len(ex);
557
558 if (prev && (prev != lblk))
559 ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
560 EXTENT_STATUS_HOLE);
561
562 if (ext4_ext_is_unwritten(ex))
563 status = EXTENT_STATUS_UNWRITTEN;
564 ext4_es_cache_extent(inode, lblk, len,
565 ext4_ext_pblock(ex), status);
566 prev = lblk + len;
567 }
568}
569
570static struct buffer_head *
571__read_extent_tree_block(const char *function, unsigned int line,
572 struct inode *inode, struct ext4_extent_idx *idx,
573 int depth, int flags)
574{
575 struct buffer_head *bh;
576 int err;
577 ext4_fsblk_t pblk;
578 gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
579
580 if (flags & EXT4_EX_NOFAIL)
581 gfp_flags |= __GFP_NOFAIL;
582
583 pblk = ext4_idx_pblock(idx);
584 bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
585 if (unlikely(!bh))
586 return ERR_PTR(-ENOMEM);
587
588 if (!bh_uptodate_or_lock(bh)) {
589 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
590 err = bh_submit_read(bh);
591 if (err < 0)
592 goto errout;
593 }
594 if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
595 return bh;
596 if (!ext4_has_feature_journal(inode->i_sb) ||
597 (inode->i_ino !=
598 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
599 err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
600 depth, pblk, le32_to_cpu(idx->ei_block));
601 if (err)
602 goto errout;
603 }
604 set_buffer_verified(bh);
605 /*
606 * If this is a leaf block, cache all of its entries
607 */
608 if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
609 struct ext4_extent_header *eh = ext_block_hdr(bh);
610 ext4_cache_extents(inode, eh);
611 }
612 return bh;
613errout:
614 put_bh(bh);
615 return ERR_PTR(err);
616
617}
618
619#define read_extent_tree_block(inode, idx, depth, flags) \
620 __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
621 (depth), (flags))
622
623/*
624 * This function is called to cache a file's extent information in the
625 * extent status tree
626 */
627int ext4_ext_precache(struct inode *inode)
628{
629 struct ext4_inode_info *ei = EXT4_I(inode);
630 struct ext4_ext_path *path = NULL;
631 struct buffer_head *bh;
632 int i = 0, depth, ret = 0;
633
634 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
635 return 0; /* not an extent-mapped inode */
636
637 down_read(&ei->i_data_sem);
638 depth = ext_depth(inode);
639
640 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
641 GFP_NOFS);
642 if (path == NULL) {
643 up_read(&ei->i_data_sem);
644 return -ENOMEM;
645 }
646
647 /* Don't cache anything if there are no external extent blocks */
648 if (depth == 0)
649 goto out;
650 path[0].p_hdr = ext_inode_hdr(inode);
651 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
652 if (ret)
653 goto out;
654 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
655 while (i >= 0) {
656 /*
657 * If this is a leaf block or we've reached the end of
658 * the index block, go up
659 */
660 if ((i == depth) ||
661 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
662 brelse(path[i].p_bh);
663 path[i].p_bh = NULL;
664 i--;
665 continue;
666 }
667 bh = read_extent_tree_block(inode, path[i].p_idx++,
668 depth - i - 1,
669 EXT4_EX_FORCE_CACHE);
670 if (IS_ERR(bh)) {
671 ret = PTR_ERR(bh);
672 break;
673 }
674 i++;
675 path[i].p_bh = bh;
676 path[i].p_hdr = ext_block_hdr(bh);
677 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
678 }
679 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
680out:
681 up_read(&ei->i_data_sem);
682 ext4_ext_drop_refs(path);
683 kfree(path);
684 return ret;
685}
686
687#ifdef EXT_DEBUG
688static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
689{
690 int k, l = path->p_depth;
691
692 ext_debug("path:");
693 for (k = 0; k <= l; k++, path++) {
694 if (path->p_idx) {
695 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
696 ext4_idx_pblock(path->p_idx));
697 } else if (path->p_ext) {
698 ext_debug(" %d:[%d]%d:%llu ",
699 le32_to_cpu(path->p_ext->ee_block),
700 ext4_ext_is_unwritten(path->p_ext),
701 ext4_ext_get_actual_len(path->p_ext),
702 ext4_ext_pblock(path->p_ext));
703 } else
704 ext_debug(" []");
705 }
706 ext_debug("\n");
707}
708
709static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
710{
711 int depth = ext_depth(inode);
712 struct ext4_extent_header *eh;
713 struct ext4_extent *ex;
714 int i;
715
716 if (!path)
717 return;
718
719 eh = path[depth].p_hdr;
720 ex = EXT_FIRST_EXTENT(eh);
721
722 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
723
724 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
725 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
726 ext4_ext_is_unwritten(ex),
727 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
728 }
729 ext_debug("\n");
730}
731
732static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
733 ext4_fsblk_t newblock, int level)
734{
735 int depth = ext_depth(inode);
736 struct ext4_extent *ex;
737
738 if (depth != level) {
739 struct ext4_extent_idx *idx;
740 idx = path[level].p_idx;
741 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
742 ext_debug("%d: move %d:%llu in new index %llu\n", level,
743 le32_to_cpu(idx->ei_block),
744 ext4_idx_pblock(idx),
745 newblock);
746 idx++;
747 }
748
749 return;
750 }
751
752 ex = path[depth].p_ext;
753 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
754 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
755 le32_to_cpu(ex->ee_block),
756 ext4_ext_pblock(ex),
757 ext4_ext_is_unwritten(ex),
758 ext4_ext_get_actual_len(ex),
759 newblock);
760 ex++;
761 }
762}
763
764#else
765#define ext4_ext_show_path(inode, path)
766#define ext4_ext_show_leaf(inode, path)
767#define ext4_ext_show_move(inode, path, newblock, level)
768#endif
769
770void ext4_ext_drop_refs(struct ext4_ext_path *path)
771{
772 int depth, i;
773
774 if (!path)
775 return;
776 depth = path->p_depth;
777 for (i = 0; i <= depth; i++, path++)
778 if (path->p_bh) {
779 brelse(path->p_bh);
780 path->p_bh = NULL;
781 }
782}
783
784/*
785 * ext4_ext_binsearch_idx:
786 * binary search for the closest index of the given block;
787 * the header must be checked before calling this
788 */
789static void
790ext4_ext_binsearch_idx(struct inode *inode,
791 struct ext4_ext_path *path, ext4_lblk_t block)
792{
793 struct ext4_extent_header *eh = path->p_hdr;
794 struct ext4_extent_idx *r, *l, *m;
795
796
797 ext_debug("binsearch for %u(idx): ", block);
798
799 l = EXT_FIRST_INDEX(eh) + 1;
800 r = EXT_LAST_INDEX(eh);
801 while (l <= r) {
802 m = l + (r - l) / 2;
803 if (block < le32_to_cpu(m->ei_block))
804 r = m - 1;
805 else
806 l = m + 1;
807 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
808 m, le32_to_cpu(m->ei_block),
809 r, le32_to_cpu(r->ei_block));
810 }
811
812 path->p_idx = l - 1;
813 ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
814 ext4_idx_pblock(path->p_idx));
815
816#ifdef CHECK_BINSEARCH
817 {
818 struct ext4_extent_idx *chix, *ix;
819 int k;
820
821 chix = ix = EXT_FIRST_INDEX(eh);
822 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
823 if (k != 0 &&
824 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
825 printk(KERN_DEBUG "k=%d, ix=0x%p, "
826 "first=0x%p\n", k,
827 ix, EXT_FIRST_INDEX(eh));
828 printk(KERN_DEBUG "%u <= %u\n",
829 le32_to_cpu(ix->ei_block),
830 le32_to_cpu(ix[-1].ei_block));
831 }
832 BUG_ON(k && le32_to_cpu(ix->ei_block)
833 <= le32_to_cpu(ix[-1].ei_block));
834 if (block < le32_to_cpu(ix->ei_block))
835 break;
836 chix = ix;
837 }
838 BUG_ON(chix != path->p_idx);
839 }
840#endif
841
842}
843
844/*
845 * ext4_ext_binsearch:
846 * binary search for the closest extent of the given block;
847 * the header must be checked before calling this
848 */
849static void
850ext4_ext_binsearch(struct inode *inode,
851 struct ext4_ext_path *path, ext4_lblk_t block)
852{
853 struct ext4_extent_header *eh = path->p_hdr;
854 struct ext4_extent *r, *l, *m;
855
856 if (eh->eh_entries == 0) {
857 /*
858 * this leaf is empty:
859 * we get such a leaf in split/add case
860 */
861 return;
862 }
863
864 ext_debug("binsearch for %u: ", block);
865
866 l = EXT_FIRST_EXTENT(eh) + 1;
867 r = EXT_LAST_EXTENT(eh);
868
869 while (l <= r) {
870 m = l + (r - l) / 2;
871 if (block < le32_to_cpu(m->ee_block))
872 r = m - 1;
873 else
874 l = m + 1;
875 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
876 m, le32_to_cpu(m->ee_block),
877 r, le32_to_cpu(r->ee_block));
878 }
879
880 path->p_ext = l - 1;
881 ext_debug(" -> %d:%llu:[%d]%d ",
882 le32_to_cpu(path->p_ext->ee_block),
883 ext4_ext_pblock(path->p_ext),
884 ext4_ext_is_unwritten(path->p_ext),
885 ext4_ext_get_actual_len(path->p_ext));
886
887#ifdef CHECK_BINSEARCH
888 {
889 struct ext4_extent *chex, *ex;
890 int k;
891
892 chex = ex = EXT_FIRST_EXTENT(eh);
893 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
894 BUG_ON(k && le32_to_cpu(ex->ee_block)
895 <= le32_to_cpu(ex[-1].ee_block));
896 if (block < le32_to_cpu(ex->ee_block))
897 break;
898 chex = ex;
899 }
900 BUG_ON(chex != path->p_ext);
901 }
902#endif
903
904}
905
906int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
907{
908 struct ext4_extent_header *eh;
909
910 eh = ext_inode_hdr(inode);
911 eh->eh_depth = 0;
912 eh->eh_entries = 0;
913 eh->eh_magic = EXT4_EXT_MAGIC;
914 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
915 eh->eh_generation = 0;
916 ext4_mark_inode_dirty(handle, inode);
917 return 0;
918}
919
920struct ext4_ext_path *
921ext4_find_extent(struct inode *inode, ext4_lblk_t block,
922 struct ext4_ext_path **orig_path, int flags)
923{
924 struct ext4_extent_header *eh;
925 struct buffer_head *bh;
926 struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
927 short int depth, i, ppos = 0;
928 int ret;
929 gfp_t gfp_flags = GFP_NOFS;
930
931 if (flags & EXT4_EX_NOFAIL)
932 gfp_flags |= __GFP_NOFAIL;
933
934 eh = ext_inode_hdr(inode);
935 depth = ext_depth(inode);
936 if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
937 EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
938 depth);
939 ret = -EFSCORRUPTED;
940 goto err;
941 }
942
943 if (path) {
944 ext4_ext_drop_refs(path);
945 if (depth > path[0].p_maxdepth) {
946 kfree(path);
947 *orig_path = path = NULL;
948 }
949 }
950 if (!path) {
951 /* account possible depth increase */
952 path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
953 gfp_flags);
954 if (unlikely(!path))
955 return ERR_PTR(-ENOMEM);
956 path[0].p_maxdepth = depth + 1;
957 }
958 path[0].p_hdr = eh;
959 path[0].p_bh = NULL;
960
961 i = depth;
962 if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
963 ext4_cache_extents(inode, eh);
964 /* walk through the tree */
965 while (i) {
966 ext_debug("depth %d: num %d, max %d\n",
967 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
968
969 ext4_ext_binsearch_idx(inode, path + ppos, block);
970 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
971 path[ppos].p_depth = i;
972 path[ppos].p_ext = NULL;
973
974 bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
975 if (IS_ERR(bh)) {
976 ret = PTR_ERR(bh);
977 goto err;
978 }
979
980 eh = ext_block_hdr(bh);
981 ppos++;
982 path[ppos].p_bh = bh;
983 path[ppos].p_hdr = eh;
984 }
985
986 path[ppos].p_depth = i;
987 path[ppos].p_ext = NULL;
988 path[ppos].p_idx = NULL;
989
990 /* find extent */
991 ext4_ext_binsearch(inode, path + ppos, block);
992 /* if not an empty leaf */
993 if (path[ppos].p_ext)
994 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
995
996 ext4_ext_show_path(inode, path);
997
998 return path;
999
1000err:
1001 ext4_ext_drop_refs(path);
1002 kfree(path);
1003 if (orig_path)
1004 *orig_path = NULL;
1005 return ERR_PTR(ret);
1006}
1007
1008/*
1009 * ext4_ext_insert_index:
1010 * insert new index [@logical;@ptr] into the block at @curp;
1011 * check where to insert: before @curp or after @curp
1012 */
1013static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
1014 struct ext4_ext_path *curp,
1015 int logical, ext4_fsblk_t ptr)
1016{
1017 struct ext4_extent_idx *ix;
1018 int len, err;
1019
1020 err = ext4_ext_get_access(handle, inode, curp);
1021 if (err)
1022 return err;
1023
1024 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
1025 EXT4_ERROR_INODE(inode,
1026 "logical %d == ei_block %d!",
1027 logical, le32_to_cpu(curp->p_idx->ei_block));
1028 return -EFSCORRUPTED;
1029 }
1030
1031 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
1032 >= le16_to_cpu(curp->p_hdr->eh_max))) {
1033 EXT4_ERROR_INODE(inode,
1034 "eh_entries %d >= eh_max %d!",
1035 le16_to_cpu(curp->p_hdr->eh_entries),
1036 le16_to_cpu(curp->p_hdr->eh_max));
1037 return -EFSCORRUPTED;
1038 }
1039
1040 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
1041 /* insert after */
1042 ext_debug("insert new index %d after: %llu\n", logical, ptr);
1043 ix = curp->p_idx + 1;
1044 } else {
1045 /* insert before */
1046 ext_debug("insert new index %d before: %llu\n", logical, ptr);
1047 ix = curp->p_idx;
1048 }
1049
1050 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
1051 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
1052 return -EFSCORRUPTED;
1053 }
1054
1055 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
1056 BUG_ON(len < 0);
1057 if (len > 0) {
1058 ext_debug("insert new index %d: "
1059 "move %d indices from 0x%p to 0x%p\n",
1060 logical, len, ix, ix + 1);
1061 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
1062 }
1063
1064 ix->ei_block = cpu_to_le32(logical);
1065 ext4_idx_store_pblock(ix, ptr);
1066 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1067
1068 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1069 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1070 return -EFSCORRUPTED;
1071 }
1072
1073 err = ext4_ext_dirty(handle, inode, curp);
1074 ext4_std_error(inode->i_sb, err);
1075
1076 return err;
1077}
1078
1079/*
1080 * ext4_ext_split:
1081 * inserts new subtree into the path, using free index entry
1082 * at depth @at:
1083 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1084 * - makes decision where to split
1085 * - moves remaining extents and index entries (right to the split point)
1086 * into the newly allocated blocks
1087 * - initializes subtree
1088 */
1089static int ext4_ext_split(handle_t *handle, struct inode *inode,
1090 unsigned int flags,
1091 struct ext4_ext_path *path,
1092 struct ext4_extent *newext, int at)
1093{
1094 struct buffer_head *bh = NULL;
1095 int depth = ext_depth(inode);
1096 struct ext4_extent_header *neh;
1097 struct ext4_extent_idx *fidx;
1098 int i = at, k, m, a;
1099 ext4_fsblk_t newblock, oldblock;
1100 __le32 border;
1101 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1102 gfp_t gfp_flags = GFP_NOFS;
1103 int err = 0;
1104 size_t ext_size = 0;
1105
1106 if (flags & EXT4_EX_NOFAIL)
1107 gfp_flags |= __GFP_NOFAIL;
1108
1109 /* make decision: where to split? */
1110 /* FIXME: now decision is simplest: at current extent */
1111
1112 /* if current leaf will be split, then we should use
1113 * border from split point */
1114 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1115 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1116 return -EFSCORRUPTED;
1117 }
1118 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1119 border = path[depth].p_ext[1].ee_block;
1120 ext_debug("leaf will be split."
1121 " next leaf starts at %d\n",
1122 le32_to_cpu(border));
1123 } else {
1124 border = newext->ee_block;
1125 ext_debug("leaf will be added."
1126 " next leaf starts at %d\n",
1127 le32_to_cpu(border));
1128 }
1129
1130 /*
1131 * If an error occurs, then we break processing
1132 * and mark the filesystem read-only. The index won't
1133 * be inserted and the tree will be in a consistent
1134 * state. The next mount will repair the buffers too.
1135 */
1136
1137 /*
1138 * Get array to track all allocated blocks.
1139 * We need this to handle errors and to free the
1140 * allocated blocks on failure.
1141 */
1142 ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
1143 if (!ablocks)
1144 return -ENOMEM;
1145
1146 /* allocate all needed blocks */
1147 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1148 for (a = 0; a < depth - at; a++) {
1149 newblock = ext4_ext_new_meta_block(handle, inode, path,
1150 newext, &err, flags);
1151 if (newblock == 0)
1152 goto cleanup;
1153 ablocks[a] = newblock;
1154 }
1155
1156 /* initialize new leaf */
1157 newblock = ablocks[--a];
1158 if (unlikely(newblock == 0)) {
1159 EXT4_ERROR_INODE(inode, "newblock == 0!");
1160 err = -EFSCORRUPTED;
1161 goto cleanup;
1162 }
1163 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1164 if (unlikely(!bh)) {
1165 err = -ENOMEM;
1166 goto cleanup;
1167 }
1168 lock_buffer(bh);
1169
1170 err = ext4_journal_get_create_access(handle, bh);
1171 if (err)
1172 goto cleanup;
1173
1174 neh = ext_block_hdr(bh);
1175 neh->eh_entries = 0;
1176 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1177 neh->eh_magic = EXT4_EXT_MAGIC;
1178 neh->eh_depth = 0;
1179 neh->eh_generation = 0;
1180
1181 /* move remainder of path[depth] to the new leaf */
1182 if (unlikely(path[depth].p_hdr->eh_entries !=
1183 path[depth].p_hdr->eh_max)) {
1184 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1185 path[depth].p_hdr->eh_entries,
1186 path[depth].p_hdr->eh_max);
1187 err = -EFSCORRUPTED;
1188 goto cleanup;
1189 }
1190 /* start copy from next extent */
1191 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1192 ext4_ext_show_move(inode, path, newblock, depth);
1193 if (m) {
1194 struct ext4_extent *ex;
1195 ex = EXT_FIRST_EXTENT(neh);
1196 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1197 le16_add_cpu(&neh->eh_entries, m);
1198 }
1199
1200 /* zero out unused area in the extent block */
1201 ext_size = sizeof(struct ext4_extent_header) +
1202 sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1203 memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1204 ext4_extent_block_csum_set(inode, neh);
1205 set_buffer_uptodate(bh);
1206 unlock_buffer(bh);
1207
1208 err = ext4_handle_dirty_metadata(handle, inode, bh);
1209 if (err)
1210 goto cleanup;
1211 brelse(bh);
1212 bh = NULL;
1213
1214 /* correct old leaf */
1215 if (m) {
1216 err = ext4_ext_get_access(handle, inode, path + depth);
1217 if (err)
1218 goto cleanup;
1219 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1220 err = ext4_ext_dirty(handle, inode, path + depth);
1221 if (err)
1222 goto cleanup;
1223
1224 }
1225
1226 /* create intermediate indexes */
1227 k = depth - at - 1;
1228 if (unlikely(k < 0)) {
1229 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1230 err = -EFSCORRUPTED;
1231 goto cleanup;
1232 }
1233 if (k)
1234 ext_debug("create %d intermediate indices\n", k);
1235 /* insert new index into current index block */
1236 /* current depth stored in i var */
1237 i = depth - 1;
1238 while (k--) {
1239 oldblock = newblock;
1240 newblock = ablocks[--a];
1241 bh = sb_getblk(inode->i_sb, newblock);
1242 if (unlikely(!bh)) {
1243 err = -ENOMEM;
1244 goto cleanup;
1245 }
1246 lock_buffer(bh);
1247
1248 err = ext4_journal_get_create_access(handle, bh);
1249 if (err)
1250 goto cleanup;
1251
1252 neh = ext_block_hdr(bh);
1253 neh->eh_entries = cpu_to_le16(1);
1254 neh->eh_magic = EXT4_EXT_MAGIC;
1255 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1256 neh->eh_depth = cpu_to_le16(depth - i);
1257 neh->eh_generation = 0;
1258 fidx = EXT_FIRST_INDEX(neh);
1259 fidx->ei_block = border;
1260 ext4_idx_store_pblock(fidx, oldblock);
1261
1262 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1263 i, newblock, le32_to_cpu(border), oldblock);
1264
1265 /* move remainder of path[i] to the new index block */
1266 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1267 EXT_LAST_INDEX(path[i].p_hdr))) {
1268 EXT4_ERROR_INODE(inode,
1269 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1270 le32_to_cpu(path[i].p_ext->ee_block));
1271 err = -EFSCORRUPTED;
1272 goto cleanup;
1273 }
1274 /* start copy indexes */
1275 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1276 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1277 EXT_MAX_INDEX(path[i].p_hdr));
1278 ext4_ext_show_move(inode, path, newblock, i);
1279 if (m) {
1280 memmove(++fidx, path[i].p_idx,
1281 sizeof(struct ext4_extent_idx) * m);
1282 le16_add_cpu(&neh->eh_entries, m);
1283 }
1284 /* zero out unused area in the extent block */
1285 ext_size = sizeof(struct ext4_extent_header) +
1286 (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1287 memset(bh->b_data + ext_size, 0,
1288 inode->i_sb->s_blocksize - ext_size);
1289 ext4_extent_block_csum_set(inode, neh);
1290 set_buffer_uptodate(bh);
1291 unlock_buffer(bh);
1292
1293 err = ext4_handle_dirty_metadata(handle, inode, bh);
1294 if (err)
1295 goto cleanup;
1296 brelse(bh);
1297 bh = NULL;
1298
1299 /* correct old index */
1300 if (m) {
1301 err = ext4_ext_get_access(handle, inode, path + i);
1302 if (err)
1303 goto cleanup;
1304 le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1305 err = ext4_ext_dirty(handle, inode, path + i);
1306 if (err)
1307 goto cleanup;
1308 }
1309
1310 i--;
1311 }
1312
1313 /* insert new index */
1314 err = ext4_ext_insert_index(handle, inode, path + at,
1315 le32_to_cpu(border), newblock);
1316
1317cleanup:
1318 if (bh) {
1319 if (buffer_locked(bh))
1320 unlock_buffer(bh);
1321 brelse(bh);
1322 }
1323
1324 if (err) {
1325 /* free all allocated blocks in error case */
1326 for (i = 0; i < depth; i++) {
1327 if (!ablocks[i])
1328 continue;
1329 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1330 EXT4_FREE_BLOCKS_METADATA);
1331 }
1332 }
1333 kfree(ablocks);
1334
1335 return err;
1336}
1337
1338/*
1339 * ext4_ext_grow_indepth:
1340 * implements tree growing procedure:
1341 * - allocates new block
1342 * - moves top-level data (index block or leaf) into the new block
1343 * - initializes new top-level, creating index that points to the
1344 * just created block
1345 */
1346static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1347 unsigned int flags)
1348{
1349 struct ext4_extent_header *neh;
1350 struct buffer_head *bh;
1351 ext4_fsblk_t newblock, goal = 0;
1352 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1353 int err = 0;
1354 size_t ext_size = 0;
1355
1356 /* Try to prepend new index to old one */
1357 if (ext_depth(inode))
1358 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1359 if (goal > le32_to_cpu(es->s_first_data_block)) {
1360 flags |= EXT4_MB_HINT_TRY_GOAL;
1361 goal--;
1362 } else
1363 goal = ext4_inode_to_goal_block(inode);
1364 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1365 NULL, &err);
1366 if (newblock == 0)
1367 return err;
1368
1369 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1370 if (unlikely(!bh))
1371 return -ENOMEM;
1372 lock_buffer(bh);
1373
1374 err = ext4_journal_get_create_access(handle, bh);
1375 if (err) {
1376 unlock_buffer(bh);
1377 goto out;
1378 }
1379
1380 ext_size = sizeof(EXT4_I(inode)->i_data);
1381 /* move top-level index/leaf into new block */
1382 memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1383 /* zero out unused area in the extent block */
1384 memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1385
1386 /* set size of new block */
1387 neh = ext_block_hdr(bh);
1388 /* old root could have indexes or leaves
1389 * so calculate eh_max the right way */
1390 if (ext_depth(inode))
1391 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1392 else
1393 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1394 neh->eh_magic = EXT4_EXT_MAGIC;
1395 ext4_extent_block_csum_set(inode, neh);
1396 set_buffer_uptodate(bh);
1397 unlock_buffer(bh);
1398
1399 err = ext4_handle_dirty_metadata(handle, inode, bh);
1400 if (err)
1401 goto out;
1402
1403 /* Update top-level index: num,max,pointer */
1404 neh = ext_inode_hdr(inode);
1405 neh->eh_entries = cpu_to_le16(1);
1406 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1407 if (neh->eh_depth == 0) {
1408 /* Root extent block becomes index block */
1409 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1410 EXT_FIRST_INDEX(neh)->ei_block =
1411 EXT_FIRST_EXTENT(neh)->ee_block;
1412 }
1413 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1414 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1415 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1416 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1417
1418 le16_add_cpu(&neh->eh_depth, 1);
1419 ext4_mark_inode_dirty(handle, inode);
1420out:
1421 brelse(bh);
1422
1423 return err;
1424}
1425
1426/*
1427 * ext4_ext_create_new_leaf:
1428 * finds empty index and adds new leaf.
1429 * if no free index is found, then it requests in-depth growing.
1430 */
1431static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1432 unsigned int mb_flags,
1433 unsigned int gb_flags,
1434 struct ext4_ext_path **ppath,
1435 struct ext4_extent *newext)
1436{
1437 struct ext4_ext_path *path = *ppath;
1438 struct ext4_ext_path *curp;
1439 int depth, i, err = 0;
1440
1441repeat:
1442 i = depth = ext_depth(inode);
1443
1444 /* walk up the tree and look for a free index entry */
1445 curp = path + depth;
1446 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1447 i--;
1448 curp--;
1449 }
1450
1451 /* we use an already allocated block for the index block,
1452 * so subsequent data blocks should be contiguous */
1453 if (EXT_HAS_FREE_INDEX(curp)) {
1454 /* if we found index with free entry, then use that
1455 * entry: create all needed subtree and add new leaf */
1456 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1457 if (err)
1458 goto out;
1459
1460 /* refill path */
1461 path = ext4_find_extent(inode,
1462 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1463 ppath, gb_flags);
1464 if (IS_ERR(path))
1465 err = PTR_ERR(path);
1466 } else {
1467 /* tree is full, time to grow in depth */
1468 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1469 if (err)
1470 goto out;
1471
1472 /* refill path */
1473 path = ext4_find_extent(inode,
1474 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1475 ppath, gb_flags);
1476 if (IS_ERR(path)) {
1477 err = PTR_ERR(path);
1478 goto out;
1479 }
1480
1481 /*
1482 * only first (depth 0 -> 1) produces free space;
1483 * in all other cases we have to split the grown tree
1484 */
1485 depth = ext_depth(inode);
1486 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1487 /* now we need to split */
1488 goto repeat;
1489 }
1490 }
1491
1492out:
1493 return err;
1494}
1495
1496/*
1497 * search the closest allocated block to the left of *logical
1498 * and return it at @logical plus its physical address at @phys;
1499 * if *logical is the smallest allocated block, the function
1500 * returns 0 at @phys
1501 * the return value contains 0 (success) or an error code
1502 */
1503static int ext4_ext_search_left(struct inode *inode,
1504 struct ext4_ext_path *path,
1505 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1506{
1507 struct ext4_extent_idx *ix;
1508 struct ext4_extent *ex;
1509 int depth, ee_len;
1510
1511 if (unlikely(path == NULL)) {
1512 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1513 return -EFSCORRUPTED;
1514 }
1515 depth = path->p_depth;
1516 *phys = 0;
1517
1518 if (depth == 0 && path->p_ext == NULL)
1519 return 0;
1520
1521 /* usually the extent in the path covers blocks smaller
1522 * than *logical, but it can be that the extent is the
1523 * first one in the file */
1524
1525 ex = path[depth].p_ext;
1526 ee_len = ext4_ext_get_actual_len(ex);
1527 if (*logical < le32_to_cpu(ex->ee_block)) {
1528 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1529 EXT4_ERROR_INODE(inode,
1530 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1531 *logical, le32_to_cpu(ex->ee_block));
1532 return -EFSCORRUPTED;
1533 }
1534 while (--depth >= 0) {
1535 ix = path[depth].p_idx;
1536 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1537 EXT4_ERROR_INODE(inode,
1538 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1539 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1540 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1541 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1542 depth);
1543 return -EFSCORRUPTED;
1544 }
1545 }
1546 return 0;
1547 }
1548
1549 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1550 EXT4_ERROR_INODE(inode,
1551 "logical %d < ee_block %d + ee_len %d!",
1552 *logical, le32_to_cpu(ex->ee_block), ee_len);
1553 return -EFSCORRUPTED;
1554 }
1555
1556 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1557 *phys = ext4_ext_pblock(ex) + ee_len - 1;
1558 return 0;
1559}
1560
1561/*
1562 * search the closest allocated block to the right of *logical
1563 * and return it at @logical plus its physical address at @phys;
1564 * if *logical is the largest allocated block, the function
1565 * returns 0 at @phys
1566 * the return value contains 0 (success) or an error code
1567 */
1568static int ext4_ext_search_right(struct inode *inode,
1569 struct ext4_ext_path *path,
1570 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1571 struct ext4_extent **ret_ex)
1572{
1573 struct buffer_head *bh = NULL;
1574 struct ext4_extent_header *eh;
1575 struct ext4_extent_idx *ix;
1576 struct ext4_extent *ex;
1577 int depth; /* Note, NOT eh_depth; depth from top of tree */
1578 int ee_len;
1579
1580 if (unlikely(path == NULL)) {
1581 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1582 return -EFSCORRUPTED;
1583 }
1584 depth = path->p_depth;
1585 *phys = 0;
1586
1587 if (depth == 0 && path->p_ext == NULL)
1588 return 0;
1589
1590 /* usually the extent in the path covers blocks smaller
1591 * than *logical, but it can be that the extent is the
1592 * first one in the file */
1593
1594 ex = path[depth].p_ext;
1595 ee_len = ext4_ext_get_actual_len(ex);
1596 if (*logical < le32_to_cpu(ex->ee_block)) {
1597 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1598 EXT4_ERROR_INODE(inode,
1599 "first_extent(path[%d].p_hdr) != ex",
1600 depth);
1601 return -EFSCORRUPTED;
1602 }
1603 while (--depth >= 0) {
1604 ix = path[depth].p_idx;
1605 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1606 EXT4_ERROR_INODE(inode,
1607 "ix != EXT_FIRST_INDEX *logical %d!",
1608 *logical);
1609 return -EFSCORRUPTED;
1610 }
1611 }
1612 goto found_extent;
1613 }
1614
1615 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1616 EXT4_ERROR_INODE(inode,
1617 "logical %d < ee_block %d + ee_len %d!",
1618 *logical, le32_to_cpu(ex->ee_block), ee_len);
1619 return -EFSCORRUPTED;
1620 }
1621
1622 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1623 /* next allocated block in this leaf */
1624 ex++;
1625 goto found_extent;
1626 }
1627
1628 /* go up and search for index to the right */
1629 while (--depth >= 0) {
1630 ix = path[depth].p_idx;
1631 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1632 goto got_index;
1633 }
1634
1635 /* we've gone up to the root and found no index to the right */
1636 return 0;
1637
1638got_index:
1639 /* we've found index to the right, let's
1640 * follow it and find the closest allocated
1641 * block to the right */
1642 ix++;
1643 while (++depth < path->p_depth) {
1644 /* subtract from p_depth to get proper eh_depth */
1645 bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1646 if (IS_ERR(bh))
1647 return PTR_ERR(bh);
1648 eh = ext_block_hdr(bh);
1649 ix = EXT_FIRST_INDEX(eh);
1650 put_bh(bh);
1651 }
1652
1653 bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1654 if (IS_ERR(bh))
1655 return PTR_ERR(bh);
1656 eh = ext_block_hdr(bh);
1657 ex = EXT_FIRST_EXTENT(eh);
1658found_extent:
1659 *logical = le32_to_cpu(ex->ee_block);
1660 *phys = ext4_ext_pblock(ex);
1661 *ret_ex = ex;
1662 if (bh)
1663 put_bh(bh);
1664 return 0;
1665}
1666
1667/*
1668 * ext4_ext_next_allocated_block:
1669 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1670 * NOTE: it considers block number from index entry as
1671 * allocated block. Thus, index entries have to be consistent
1672 * with leaves.
1673 */
1674ext4_lblk_t
1675ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1676{
1677 int depth;
1678
1679 BUG_ON(path == NULL);
1680 depth = path->p_depth;
1681
1682 if (depth == 0 && path->p_ext == NULL)
1683 return EXT_MAX_BLOCKS;
1684
1685 while (depth >= 0) {
1686 if (depth == path->p_depth) {
1687 /* leaf */
1688 if (path[depth].p_ext &&
1689 path[depth].p_ext !=
1690 EXT_LAST_EXTENT(path[depth].p_hdr))
1691 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1692 } else {
1693 /* index */
1694 if (path[depth].p_idx !=
1695 EXT_LAST_INDEX(path[depth].p_hdr))
1696 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1697 }
1698 depth--;
1699 }
1700
1701 return EXT_MAX_BLOCKS;
1702}
1703
1704/*
1705 * ext4_ext_next_leaf_block:
1706 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1707 */
1708static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1709{
1710 int depth;
1711
1712 BUG_ON(path == NULL);
1713 depth = path->p_depth;
1714
1715 /* a zero-depth tree has no leaf blocks at all */
1716 if (depth == 0)
1717 return EXT_MAX_BLOCKS;
1718
1719 /* go to index block */
1720 depth--;
1721
1722 while (depth >= 0) {
1723 if (path[depth].p_idx !=
1724 EXT_LAST_INDEX(path[depth].p_hdr))
1725 return (ext4_lblk_t)
1726 le32_to_cpu(path[depth].p_idx[1].ei_block);
1727 depth--;
1728 }
1729
1730 return EXT_MAX_BLOCKS;
1731}
1732
1733/*
1734 * ext4_ext_correct_indexes:
1735 * if a leaf gets modified and the modified extent is the first in the leaf,
1736 * then we have to correct all indexes above it.
1737 * TODO: do we need to correct tree in all cases?
1738 */
1739static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1740 struct ext4_ext_path *path)
1741{
1742 struct ext4_extent_header *eh;
1743 int depth = ext_depth(inode);
1744 struct ext4_extent *ex;
1745 __le32 border;
1746 int k, err = 0;
1747
1748 eh = path[depth].p_hdr;
1749 ex = path[depth].p_ext;
1750
1751 if (unlikely(ex == NULL || eh == NULL)) {
1752 EXT4_ERROR_INODE(inode,
1753 "ex %p == NULL or eh %p == NULL", ex, eh);
1754 return -EFSCORRUPTED;
1755 }
1756
1757 if (depth == 0) {
1758 /* there is no tree at all */
1759 return 0;
1760 }
1761
1762 if (ex != EXT_FIRST_EXTENT(eh)) {
1763 /* we correct the tree only if the first extent in the leaf got modified */
1764 return 0;
1765 }
1766
1767 /*
1768 * TODO: we need correction if border is smaller than current one
1769 */
1770 k = depth - 1;
1771 border = path[depth].p_ext->ee_block;
1772 err = ext4_ext_get_access(handle, inode, path + k);
1773 if (err)
1774 return err;
1775 path[k].p_idx->ei_block = border;
1776 err = ext4_ext_dirty(handle, inode, path + k);
1777 if (err)
1778 return err;
1779
1780 while (k--) {
1781 /* change all left-side indexes */
1782 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1783 break;
1784 err = ext4_ext_get_access(handle, inode, path + k);
1785 if (err)
1786 break;
1787 path[k].p_idx->ei_block = border;
1788 err = ext4_ext_dirty(handle, inode, path + k);
1789 if (err)
1790 break;
1791 }
1792
1793 return err;
1794}
1795
1796int
1797ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1798 struct ext4_extent *ex2)
1799{
1800 unsigned short ext1_ee_len, ext2_ee_len;
1801
1802 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1803 return 0;
1804
1805 ext1_ee_len = ext4_ext_get_actual_len(ex1);
1806 ext2_ee_len = ext4_ext_get_actual_len(ex2);
1807
1808 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1809 le32_to_cpu(ex2->ee_block))
1810 return 0;
1811
1812 /*
1813 * To allow future support for preallocated extents to be added
1814 * as an RO_COMPAT feature, refuse to merge two extents if
1815 * this can result in the top bit of ee_len being set.
1816 */
1817 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1818 return 0;
1819 /*
1820 * The check for IO to unwritten extent is somewhat racy as we
1821 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1822 * dropping i_data_sem. But reserved blocks should save us in that
1823 * case.
1824 */
1825 if (ext4_ext_is_unwritten(ex1) &&
1826 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1827 atomic_read(&EXT4_I(inode)->i_unwritten) ||
1828 (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1829 return 0;
1830#ifdef AGGRESSIVE_TEST
1831 if (ext1_ee_len >= 4)
1832 return 0;
1833#endif
1834
1835 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1836 return 1;
1837 return 0;
1838}
1839
1840/*
1841 * This function tries to merge the "ex" extent to the next extent in the tree.
1842 * It always tries to merge towards the right. If you want to merge towards
1843 * the left, pass "ex - 1" as the argument instead of "ex".
1844 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1845 * 1 if they got merged.
1846 */
1847static int ext4_ext_try_to_merge_right(struct inode *inode,
1848 struct ext4_ext_path *path,
1849 struct ext4_extent *ex)
1850{
1851 struct ext4_extent_header *eh;
1852 unsigned int depth, len;
1853 int merge_done = 0, unwritten;
1854
1855 depth = ext_depth(inode);
1856 BUG_ON(path[depth].p_hdr == NULL);
1857 eh = path[depth].p_hdr;
1858
1859 while (ex < EXT_LAST_EXTENT(eh)) {
1860 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1861 break;
1862 /* merge with next extent! */
1863 unwritten = ext4_ext_is_unwritten(ex);
1864 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1865 + ext4_ext_get_actual_len(ex + 1));
1866 if (unwritten)
1867 ext4_ext_mark_unwritten(ex);
1868
1869 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1870 len = (EXT_LAST_EXTENT(eh) - ex - 1)
1871 * sizeof(struct ext4_extent);
1872 memmove(ex + 1, ex + 2, len);
1873 }
1874 le16_add_cpu(&eh->eh_entries, -1);
1875 merge_done = 1;
1876 WARN_ON(eh->eh_entries == 0);
1877 if (!eh->eh_entries)
1878 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1879 }
1880
1881 return merge_done;
1882}
1883
1884/*
1885 * This function does a very simple check to see if we can collapse
1886 * an extent tree with a single extent tree leaf block into the inode.
1887 */
1888static void ext4_ext_try_to_merge_up(handle_t *handle,
1889 struct inode *inode,
1890 struct ext4_ext_path *path)
1891{
1892 size_t s;
1893 unsigned max_root = ext4_ext_space_root(inode, 0);
1894 ext4_fsblk_t blk;
1895
1896 if ((path[0].p_depth != 1) ||
1897 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1898 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1899 return;
1900
1901 /*
1902 * We need to modify the block allocation bitmap and the block
1903 * group descriptor to release the extent tree block. If we
1904 * can't get the journal credits, give up.
1905 */
1906 if (ext4_journal_extend(handle, 2))
1907 return;
1908
1909 /*
1910 * Copy the extent data up to the inode
1911 */
1912 blk = ext4_idx_pblock(path[0].p_idx);
1913 s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1914 sizeof(struct ext4_extent_idx);
1915 s += sizeof(struct ext4_extent_header);
1916
1917 path[1].p_maxdepth = path[0].p_maxdepth;
1918 memcpy(path[0].p_hdr, path[1].p_hdr, s);
1919 path[0].p_depth = 0;
1920 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1921 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1922 path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1923
1924 brelse(path[1].p_bh);
1925 path[1].p_bh = NULL;
1926 ext4_free_blocks(handle, inode, NULL, blk, 1,
1927 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1928}
1929
1930/*
1931 * This function tries to merge the @ex extent to neighbours in the tree.
1932 * It merges with the left neighbour first, then with the right one.
1933 */
1934static void ext4_ext_try_to_merge(handle_t *handle,
1935 struct inode *inode,
1936 struct ext4_ext_path *path,
1937 struct ext4_extent *ex) {
1938 struct ext4_extent_header *eh;
1939 unsigned int depth;
1940 int merge_done = 0;
1941
1942 depth = ext_depth(inode);
1943 BUG_ON(path[depth].p_hdr == NULL);
1944 eh = path[depth].p_hdr;
1945
1946 if (ex > EXT_FIRST_EXTENT(eh))
1947 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1948
1949 if (!merge_done)
1950 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1951
1952 ext4_ext_try_to_merge_up(handle, inode, path);
1953}
1954
1955/*
1956 * check if a portion of the "newext" extent overlaps with an
1957 * existing extent.
1958 *
1959 * If there is an overlap discovered, it updates the length of the newext
1960 * such that there will be no overlap, and then returns 1.
1961 * If there is no overlap found, it returns 0.
1962 */
1963static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1964 struct inode *inode,
1965 struct ext4_extent *newext,
1966 struct ext4_ext_path *path)
1967{
1968 ext4_lblk_t b1, b2;
1969 unsigned int depth, len1;
1970 unsigned int ret = 0;
1971
1972 b1 = le32_to_cpu(newext->ee_block);
1973 len1 = ext4_ext_get_actual_len(newext);
1974 depth = ext_depth(inode);
1975 if (!path[depth].p_ext)
1976 goto out;
1977 b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1978
1979 /*
1980 * get the next allocated block if the extent in the path
1981 * is before the requested block(s)
1982 */
1983 if (b2 < b1) {
1984 b2 = ext4_ext_next_allocated_block(path);
1985 if (b2 == EXT_MAX_BLOCKS)
1986 goto out;
1987 b2 = EXT4_LBLK_CMASK(sbi, b2);
1988 }
1989
1990 /* check for wrap through zero on extent logical start block */
1991 if (b1 + len1 < b1) {
1992 len1 = EXT_MAX_BLOCKS - b1;
1993 newext->ee_len = cpu_to_le16(len1);
1994 ret = 1;
1995 }
1996
1997 /* check for overlap */
1998 if (b1 + len1 > b2) {
1999 newext->ee_len = cpu_to_le16(b2 - b1);
2000 ret = 1;
2001 }
2002out:
2003 return ret;
2004}
2005
2006/*
2007 * ext4_ext_insert_extent:
2008 * tries to merge the requested extent into an existing extent or
2009 * inserts the requested extent as a new one into the tree,
2010 * creating a new leaf in the no-space case.
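 *
 * For example (illustrative only): if the leaf already holds an extent
 * mapping logical blocks 100..109 to physical blocks 500..509 and the
 * requested extent maps 110..114 to 510..514 with the same written state,
 * the two are merged by extending the existing extent's length to 15
 * instead of inserting a new entry.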
2011 */
2012int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
2013 struct ext4_ext_path **ppath,
2014 struct ext4_extent *newext, int gb_flags)
2015{
2016 struct ext4_ext_path *path = *ppath;
2017 struct ext4_extent_header *eh;
2018 struct ext4_extent *ex, *fex;
2019 struct ext4_extent *nearex; /* nearest extent */
2020 struct ext4_ext_path *npath = NULL;
2021 int depth, len, err;
2022 ext4_lblk_t next;
2023 int mb_flags = 0, unwritten;
2024
2025 if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
2026 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
2027 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
2028 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
2029 return -EFSCORRUPTED;
2030 }
2031 depth = ext_depth(inode);
2032 ex = path[depth].p_ext;
2033 eh = path[depth].p_hdr;
2034 if (unlikely(path[depth].p_hdr == NULL)) {
2035 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2036 return -EFSCORRUPTED;
2037 }
2038
2039 /* try to insert block into found extent and return */
2040 if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
2041
2042 /*
2043 * Try to see whether we should rather test the extent to the
2044 * right of ex, or to the left of ex. This is because
2045 * ext4_find_extent() can return either the extent on the
2046 * left or on the right of the searched position. This
2047 * will make merging more effective.
2048 */
2049 if (ex < EXT_LAST_EXTENT(eh) &&
2050 (le32_to_cpu(ex->ee_block) +
2051 ext4_ext_get_actual_len(ex) <
2052 le32_to_cpu(newext->ee_block))) {
2053 ex += 1;
2054 goto prepend;
2055 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
2056 (le32_to_cpu(newext->ee_block) +
2057 ext4_ext_get_actual_len(newext) <
2058 le32_to_cpu(ex->ee_block)))
2059 ex -= 1;
2060
2061 /* Try to append newex to the ex */
2062 if (ext4_can_extents_be_merged(inode, ex, newext)) {
2063 ext_debug("append [%d]%d block to %u:[%d]%d"
2064 "(from %llu)\n",
2065 ext4_ext_is_unwritten(newext),
2066 ext4_ext_get_actual_len(newext),
2067 le32_to_cpu(ex->ee_block),
2068 ext4_ext_is_unwritten(ex),
2069 ext4_ext_get_actual_len(ex),
2070 ext4_ext_pblock(ex));
2071 err = ext4_ext_get_access(handle, inode,
2072 path + depth);
2073 if (err)
2074 return err;
2075 unwritten = ext4_ext_is_unwritten(ex);
2076 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2077 + ext4_ext_get_actual_len(newext));
2078 if (unwritten)
2079 ext4_ext_mark_unwritten(ex);
2080 eh = path[depth].p_hdr;
2081 nearex = ex;
2082 goto merge;
2083 }
2084
2085prepend:
2086 /* Try to prepend newex to the ex */
2087 if (ext4_can_extents_be_merged(inode, newext, ex)) {
2088 ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
2089 "(from %llu)\n",
2090 le32_to_cpu(newext->ee_block),
2091 ext4_ext_is_unwritten(newext),
2092 ext4_ext_get_actual_len(newext),
2093 le32_to_cpu(ex->ee_block),
2094 ext4_ext_is_unwritten(ex),
2095 ext4_ext_get_actual_len(ex),
2096 ext4_ext_pblock(ex));
2097 err = ext4_ext_get_access(handle, inode,
2098 path + depth);
2099 if (err)
2100 return err;
2101
2102 unwritten = ext4_ext_is_unwritten(ex);
2103 ex->ee_block = newext->ee_block;
2104 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2105 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2106 + ext4_ext_get_actual_len(newext));
2107 if (unwritten)
2108 ext4_ext_mark_unwritten(ex);
2109 eh = path[depth].p_hdr;
2110 nearex = ex;
2111 goto merge;
2112 }
2113 }
2114
2115 depth = ext_depth(inode);
2116 eh = path[depth].p_hdr;
2117 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2118 goto has_space;
2119
2120 /* probably next leaf has space for us? */
2121 fex = EXT_LAST_EXTENT(eh);
2122 next = EXT_MAX_BLOCKS;
2123 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2124 next = ext4_ext_next_leaf_block(path);
2125 if (next != EXT_MAX_BLOCKS) {
2126 ext_debug("next leaf block - %u\n", next);
2127 BUG_ON(npath != NULL);
2128 npath = ext4_find_extent(inode, next, NULL, gb_flags);
2129 if (IS_ERR(npath))
2130 return PTR_ERR(npath);
2131 BUG_ON(npath->p_depth != path->p_depth);
2132 eh = npath[depth].p_hdr;
2133 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2134 ext_debug("next leaf isn't full(%d)\n",
2135 le16_to_cpu(eh->eh_entries));
2136 path = npath;
2137 goto has_space;
2138 }
2139 ext_debug("next leaf has no free space(%d,%d)\n",
2140 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2141 }
2142
2143 /*
2144 * There is no free space in the found leaf.
2145 * We're gonna add a new leaf in the tree.
2146 */
2147 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2148 mb_flags |= EXT4_MB_USE_RESERVED;
2149 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2150 ppath, newext);
2151 if (err)
2152 goto cleanup;
2153 path = *ppath;
2154 depth = ext_depth(inode);
2155 eh = path[depth].p_hdr;
2156
2157has_space:
2158 nearex = path[depth].p_ext;
2159
2160 err = ext4_ext_get_access(handle, inode, path + depth);
2161 if (err)
2162 goto cleanup;
2163
2164 if (!nearex) {
2165 /* there is no extent in this leaf, create first one */
2166 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2167 le32_to_cpu(newext->ee_block),
2168 ext4_ext_pblock(newext),
2169 ext4_ext_is_unwritten(newext),
2170 ext4_ext_get_actual_len(newext));
2171 nearex = EXT_FIRST_EXTENT(eh);
2172 } else {
2173 if (le32_to_cpu(newext->ee_block)
2174 > le32_to_cpu(nearex->ee_block)) {
2175 /* Insert after */
2176 ext_debug("insert %u:%llu:[%d]%d before: "
2177 "nearest %p\n",
2178 le32_to_cpu(newext->ee_block),
2179 ext4_ext_pblock(newext),
2180 ext4_ext_is_unwritten(newext),
2181 ext4_ext_get_actual_len(newext),
2182 nearex);
2183 nearex++;
2184 } else {
2185 /* Insert before */
2186 BUG_ON(newext->ee_block == nearex->ee_block);
2187 ext_debug("insert %u:%llu:[%d]%d after: "
2188 "nearest %p\n",
2189 le32_to_cpu(newext->ee_block),
2190 ext4_ext_pblock(newext),
2191 ext4_ext_is_unwritten(newext),
2192 ext4_ext_get_actual_len(newext),
2193 nearex);
2194 }
2195 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2196 if (len > 0) {
2197 ext_debug("insert %u:%llu:[%d]%d: "
2198 "move %d extents from 0x%p to 0x%p\n",
2199 le32_to_cpu(newext->ee_block),
2200 ext4_ext_pblock(newext),
2201 ext4_ext_is_unwritten(newext),
2202 ext4_ext_get_actual_len(newext),
2203 len, nearex, nearex + 1);
2204 memmove(nearex + 1, nearex,
2205 len * sizeof(struct ext4_extent));
2206 }
2207 }
2208
2209 le16_add_cpu(&eh->eh_entries, 1);
2210 path[depth].p_ext = nearex;
2211 nearex->ee_block = newext->ee_block;
2212 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2213 nearex->ee_len = newext->ee_len;
2214
2215merge:
2216 /* try to merge extents */
2217 if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2218 ext4_ext_try_to_merge(handle, inode, path, nearex);
2219
2220
2221 /* time to correct all indexes above */
2222 err = ext4_ext_correct_indexes(handle, inode, path);
2223 if (err)
2224 goto cleanup;
2225
2226 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2227
2228cleanup:
2229 ext4_ext_drop_refs(npath);
2230 kfree(npath);
2231 return err;
2232}
2233
2234static int ext4_fill_fiemap_extents(struct inode *inode,
2235 ext4_lblk_t block, ext4_lblk_t num,
2236 struct fiemap_extent_info *fieinfo)
2237{
2238 struct ext4_ext_path *path = NULL;
2239 struct ext4_extent *ex;
2240 struct extent_status es;
2241 ext4_lblk_t next, next_del, start = 0, end = 0;
2242 ext4_lblk_t last = block + num;
2243 int exists, depth = 0, err = 0;
2244 unsigned int flags = 0;
2245 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2246
2247 while (block < last && block != EXT_MAX_BLOCKS) {
2248 num = last - block;
2249 /* find extent for this block */
2250 down_read(&EXT4_I(inode)->i_data_sem);
2251
2252 path = ext4_find_extent(inode, block, &path, 0);
2253 if (IS_ERR(path)) {
2254 up_read(&EXT4_I(inode)->i_data_sem);
2255 err = PTR_ERR(path);
2256 path = NULL;
2257 break;
2258 }
2259
2260 depth = ext_depth(inode);
2261 if (unlikely(path[depth].p_hdr == NULL)) {
2262 up_read(&EXT4_I(inode)->i_data_sem);
2263 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2264 err = -EFSCORRUPTED;
2265 break;
2266 }
2267 ex = path[depth].p_ext;
2268 next = ext4_ext_next_allocated_block(path);
2269
2270 flags = 0;
2271 exists = 0;
2272 if (!ex) {
2273 /* there is no extent yet, so try to allocate
2274 * all requested space */
2275 start = block;
2276 end = block + num;
2277 } else if (le32_to_cpu(ex->ee_block) > block) {
2278 /* need to allocate space before found extent */
2279 start = block;
2280 end = le32_to_cpu(ex->ee_block);
2281 if (block + num < end)
2282 end = block + num;
2283 } else if (block >= le32_to_cpu(ex->ee_block)
2284 + ext4_ext_get_actual_len(ex)) {
2285 /* need to allocate space after found extent */
2286 start = block;
2287 end = block + num;
2288 if (end >= next)
2289 end = next;
2290 } else if (block >= le32_to_cpu(ex->ee_block)) {
2291 /*
2292 * some part of requested space is covered
2293 * by found extent
2294 */
2295 start = block;
2296 end = le32_to_cpu(ex->ee_block)
2297 + ext4_ext_get_actual_len(ex);
2298 if (block + num < end)
2299 end = block + num;
2300 exists = 1;
2301 } else {
2302 BUG();
2303 }
2304 BUG_ON(end <= start);
2305
2306 if (!exists) {
2307 es.es_lblk = start;
2308 es.es_len = end - start;
2309 es.es_pblk = 0;
2310 } else {
2311 es.es_lblk = le32_to_cpu(ex->ee_block);
2312 es.es_len = ext4_ext_get_actual_len(ex);
2313 es.es_pblk = ext4_ext_pblock(ex);
2314 if (ext4_ext_is_unwritten(ex))
2315 flags |= FIEMAP_EXTENT_UNWRITTEN;
2316 }
2317
2318 /*
2319 * Find delayed extent and update es accordingly. We call
2320 * it even in !exists case to find out whether es is the
2321 * last existing extent or not.
2322 */
2323 next_del = ext4_find_delayed_extent(inode, &es);
2324 if (!exists && next_del) {
2325 exists = 1;
2326 flags |= (FIEMAP_EXTENT_DELALLOC |
2327 FIEMAP_EXTENT_UNKNOWN);
2328 }
2329 up_read(&EXT4_I(inode)->i_data_sem);
2330
2331 if (unlikely(es.es_len == 0)) {
2332 EXT4_ERROR_INODE(inode, "es.es_len == 0");
2333 err = -EFSCORRUPTED;
2334 break;
2335 }
2336
2337 /*
2338 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2339 * We need to check next == EXT_MAX_BLOCKS because an extent
2340 * can carry both unwritten and delayed status: when an
2341 * extent is delayed allocated and is then also allocated by
2342 * fallocate, the status tree will track both states in a
2343 * single extent.
2344 *
2345 * So we could return an unwritten and delayed extent whose
2346 * block is equal to 'next'.
2347 */
2348 if (next == next_del && next == EXT_MAX_BLOCKS) {
2349 flags |= FIEMAP_EXTENT_LAST;
2350 if (unlikely(next_del != EXT_MAX_BLOCKS ||
2351 next != EXT_MAX_BLOCKS)) {
2352 EXT4_ERROR_INODE(inode,
2353 "next extent == %u, next "
2354 "delalloc extent = %u",
2355 next, next_del);
2356 err = -EFSCORRUPTED;
2357 break;
2358 }
2359 }
2360
2361 if (exists) {
2362 err = fiemap_fill_next_extent(fieinfo,
2363 (__u64)es.es_lblk << blksize_bits,
2364 (__u64)es.es_pblk << blksize_bits,
2365 (__u64)es.es_len << blksize_bits,
2366 flags);
2367 if (err < 0)
2368 break;
2369 if (err == 1) {
2370 err = 0;
2371 break;
2372 }
2373 }
2374
2375 block = es.es_lblk + es.es_len;
2376 }
2377
2378 ext4_ext_drop_refs(path);
2379 kfree(path);
2380 return err;
2381}
2382
2383static int ext4_fill_es_cache_info(struct inode *inode,
2384 ext4_lblk_t block, ext4_lblk_t num,
2385 struct fiemap_extent_info *fieinfo)
2386{
2387 ext4_lblk_t next, end = block + num - 1;
2388 struct extent_status es;
2389 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2390 unsigned int flags;
2391 int err;
2392
2393 while (block <= end) {
2394 next = 0;
2395 flags = 0;
2396 if (!ext4_es_lookup_extent(inode, block, &next, &es))
2397 break;
2398 if (ext4_es_is_unwritten(&es))
2399 flags |= FIEMAP_EXTENT_UNWRITTEN;
2400 if (ext4_es_is_delayed(&es))
2401 flags |= (FIEMAP_EXTENT_DELALLOC |
2402 FIEMAP_EXTENT_UNKNOWN);
2403 if (ext4_es_is_hole(&es))
2404 flags |= EXT4_FIEMAP_EXTENT_HOLE;
2405 if (next == 0)
2406 flags |= FIEMAP_EXTENT_LAST;
2407 if (flags & (FIEMAP_EXTENT_DELALLOC|
2408 EXT4_FIEMAP_EXTENT_HOLE))
2409 es.es_pblk = 0;
2410 else
2411 es.es_pblk = ext4_es_pblock(&es);
2412 err = fiemap_fill_next_extent(fieinfo,
2413 (__u64)es.es_lblk << blksize_bits,
2414 (__u64)es.es_pblk << blksize_bits,
2415 (__u64)es.es_len << blksize_bits,
2416 flags);
2417 if (next == 0)
2418 break;
2419 block = next;
2420 if (err < 0)
2421 return err;
2422 if (err == 1)
2423 return 0;
2424 }
2425 return 0;
2426}
2427
2428
2429/*
2430 * ext4_ext_determine_hole - determine hole around given block
2431 * @inode: inode we lookup in
2432 * @path: path in extent tree to @lblk
2433 * @lblk: pointer to logical block around which we want to determine hole
2434 *
2435 * Determine hole length (and start if easily possible) around given logical
2436 * block. We don't try too hard to find the beginning of the hole, but when
2437 * @path already points to the extent before @lblk, we provide it.
2438 *
2439 * The function returns the length of a hole starting at @lblk. We update @lblk
2440 * to the beginning of the hole if we managed to find it.
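 *
 * For example (illustrative only): with extents covering blocks 0..9 and
 * 50..59 and *lblk == 20, the extent before 20 is 0..9, so *lblk is set to
 * 10 and the returned hole length is 40 (up to the next allocated block 50).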
2441 */
2442static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2443 struct ext4_ext_path *path,
2444 ext4_lblk_t *lblk)
2445{
2446 int depth = ext_depth(inode);
2447 struct ext4_extent *ex;
2448 ext4_lblk_t len;
2449
2450 ex = path[depth].p_ext;
2451 if (ex == NULL) {
2452 /* there is no extent yet, so gap is [0;-] */
2453 *lblk = 0;
2454 len = EXT_MAX_BLOCKS;
2455 } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2456 len = le32_to_cpu(ex->ee_block) - *lblk;
2457 } else if (*lblk >= le32_to_cpu(ex->ee_block)
2458 + ext4_ext_get_actual_len(ex)) {
2459 ext4_lblk_t next;
2460
2461 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2462 next = ext4_ext_next_allocated_block(path);
2463 BUG_ON(next == *lblk);
2464 len = next - *lblk;
2465 } else {
2466 BUG();
2467 }
2468 return len;
2469}
2470
2471/*
2472 * ext4_ext_put_gap_in_cache:
2473 * calculate boundaries of the gap that the requested block fits into
2474 * and cache this gap
2475 */
2476static void
2477ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2478 ext4_lblk_t hole_len)
2479{
2480 struct extent_status es;
2481
2482 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2483 hole_start + hole_len - 1, &es);
2484 if (es.es_len) {
2485 /* Is there a delayed extent containing hole_start? */
2486 if (es.es_lblk <= hole_start)
2487 return;
2488 hole_len = min(es.es_lblk - hole_start, hole_len);
2489 }
2490 ext_debug(" -> %u:%u\n", hole_start, hole_len);
2491 ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2492 EXTENT_STATUS_HOLE);
2493}
2494
2495/*
2496 * ext4_ext_rm_idx:
2497 * removes index from the index block.
2498 */
2499static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2500 struct ext4_ext_path *path, int depth)
2501{
2502 int err;
2503 ext4_fsblk_t leaf;
2504
2505 /* free index block */
2506 depth--;
2507 path = path + depth;
2508 leaf = ext4_idx_pblock(path->p_idx);
2509 if (unlikely(path->p_hdr->eh_entries == 0)) {
2510 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2511 return -EFSCORRUPTED;
2512 }
2513 err = ext4_ext_get_access(handle, inode, path);
2514 if (err)
2515 return err;
2516
2517 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2518 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2519 len *= sizeof(struct ext4_extent_idx);
2520 memmove(path->p_idx, path->p_idx + 1, len);
2521 }
2522
2523 le16_add_cpu(&path->p_hdr->eh_entries, -1);
2524 err = ext4_ext_dirty(handle, inode, path);
2525 if (err)
2526 return err;
2527 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2528 trace_ext4_ext_rm_idx(inode, leaf);
2529
2530 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2531 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2532
2533 while (--depth >= 0) {
2534 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2535 break;
2536 path--;
2537 err = ext4_ext_get_access(handle, inode, path);
2538 if (err)
2539 break;
2540 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2541 err = ext4_ext_dirty(handle, inode, path);
2542 if (err)
2543 break;
2544 }
2545 return err;
2546}
2547
2548/*
2549 * ext4_ext_calc_credits_for_single_extent:
2550 * This routine returns the maximum number of credits needed to insert
2551 * an extent into the extent tree.
2552 * When passing the actual path, the caller should calculate credits
2553 * under i_data_sem.
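 *
 * For example, when the leaf pointed to by @path still has free entries,
 * the estimate is simply 2 blocks (block bitmap + group descriptor) plus
 * EXT4_META_TRANS_BLOCKS() for the remaining metadata; otherwise the
 * generic ext4_chunk_trans_blocks() estimate for nrblocks is used.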
2554 */
2555int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2556 struct ext4_ext_path *path)
2557{
2558 if (path) {
2559 int depth = ext_depth(inode);
2560 int ret = 0;
2561
2562 /* probably there is space in leaf? */
2563 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2564 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2565
2566 /*
2567 * There is some space left in the leaf, so no
2568 * need to account for a leaf block credit.
2569 *
2570 * Bitmaps, block group descriptor blocks
2571 * and other metadata blocks still need to be
2572 * accounted for.
2573 */
2574 /* 1 bitmap, 1 block group descriptor */
2575 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2576 return ret;
2577 }
2578 }
2579
2580 return ext4_chunk_trans_blocks(inode, nrblocks);
2581}
2582
2583/*
2584 * How many index/leaf blocks need to change/allocate to add @extents extents?
2585 *
2586 * If we add a single extent, then in the worst case, each tree level
2587 * index/leaf needs to be changed in case the tree splits.
2588 *
2589 * If more extents are inserted, they could cause the whole tree to split
2590 * more than once, but this is really rare.
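 *
 * For example, for a tree of depth 2 this estimates 4 blocks (2 per level)
 * when inserting a single extent, and 6 blocks (3 per level) when inserting
 * several extents.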
2591 */
2592int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2593{
2594 int index;
2595 int depth;
2596
2597 /* If we are converting the inline data, only one is needed here. */
2598 if (ext4_has_inline_data(inode))
2599 return 1;
2600
2601 depth = ext_depth(inode);
2602
2603 if (extents <= 1)
2604 index = depth * 2;
2605 else
2606 index = depth * 3;
2607
2608 return index;
2609}
2610
2611static inline int get_default_free_blocks_flags(struct inode *inode)
2612{
2613 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2614 ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2615 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2616 else if (ext4_should_journal_data(inode))
2617 return EXT4_FREE_BLOCKS_FORGET;
2618 return 0;
2619}
2620
2621/*
2622 * ext4_rereserve_cluster - increment the reserved cluster count when
2623 * freeing a cluster with a pending reservation
2624 *
2625 * @inode - file containing the cluster
2626 * @lblk - logical block in cluster to be reserved
2627 *
2628 * Increments the reserved cluster count and adjusts quota in a bigalloc
2629 * file system when freeing a partial cluster containing at least one
2630 * delayed and unwritten block. A partial cluster meeting that
2631 * requirement will have a pending reservation. If so, the
2632 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2633 * defer reserved and allocated space accounting to a subsequent call
2634 * to this function.
2635 */
2636static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2637{
2638 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2639 struct ext4_inode_info *ei = EXT4_I(inode);
2640
2641 dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2642
2643 spin_lock(&ei->i_block_reservation_lock);
2644 ei->i_reserved_data_blocks++;
2645 percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2646 spin_unlock(&ei->i_block_reservation_lock);
2647
2648 percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2649 ext4_remove_pending(inode, lblk);
2650}
2651
2652static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2653 struct ext4_extent *ex,
2654 struct partial_cluster *partial,
2655 ext4_lblk_t from, ext4_lblk_t to)
2656{
2657 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2658 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2659 ext4_fsblk_t last_pblk, pblk;
2660 ext4_lblk_t num;
2661 int flags;
2662
2663 /* only extent tail removal is allowed */
2664 if (from < le32_to_cpu(ex->ee_block) ||
2665 to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2666 ext4_error(sbi->s_sb,
2667 "strange request: removal(2) %u-%u from %u:%u",
2668 from, to, le32_to_cpu(ex->ee_block), ee_len);
2669 return 0;
2670 }
2671
2672#ifdef EXTENTS_STATS
2673 spin_lock(&sbi->s_ext_stats_lock);
2674 sbi->s_ext_blocks += ee_len;
2675 sbi->s_ext_extents++;
2676 if (ee_len < sbi->s_ext_min)
2677 sbi->s_ext_min = ee_len;
2678 if (ee_len > sbi->s_ext_max)
2679 sbi->s_ext_max = ee_len;
2680 if (ext_depth(inode) > sbi->s_depth_max)
2681 sbi->s_depth_max = ext_depth(inode);
2682 spin_unlock(&sbi->s_ext_stats_lock);
2683#endif
2684
2685 trace_ext4_remove_blocks(inode, ex, from, to, partial);
2686
2687 /*
2688 * if we have a partial cluster, and it's different from the
2689 * cluster of the last block in the extent, we free it
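 * (for example, with an 8-block cluster ratio: if a previously scanned
 * extent left cluster 10 recorded as tofree and the last block of this
 * extent lies in cluster 7, cluster 10 is released here, re-reserving it
 * if it still has a pending reservation)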
2690 */
2691 last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2692
2693 if (partial->state != initial &&
2694 partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2695 if (partial->state == tofree) {
2696 flags = get_default_free_blocks_flags(inode);
2697 if (ext4_is_pending(inode, partial->lblk))
2698 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2699 ext4_free_blocks(handle, inode, NULL,
2700 EXT4_C2B(sbi, partial->pclu),
2701 sbi->s_cluster_ratio, flags);
2702 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2703 ext4_rereserve_cluster(inode, partial->lblk);
2704 }
2705 partial->state = initial;
2706 }
2707
2708 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2709 pblk = ext4_ext_pblock(ex) + ee_len - num;
2710
2711 /*
2712 * We free the partial cluster at the end of the extent (if any),
2713 * unless the cluster is used by another extent (partial_cluster
2714 * state is nofree). If a partial cluster exists here, it must be
2715 * shared with the last block in the extent.
2716 */
2717 flags = get_default_free_blocks_flags(inode);
2718
2719 /* partial, left end cluster aligned, right end unaligned */
2720 if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2721 (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2722 (partial->state != nofree)) {
2723 if (ext4_is_pending(inode, to))
2724 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2725 ext4_free_blocks(handle, inode, NULL,
2726 EXT4_PBLK_CMASK(sbi, last_pblk),
2727 sbi->s_cluster_ratio, flags);
2728 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2729 ext4_rereserve_cluster(inode, to);
2730 partial->state = initial;
2731 flags = get_default_free_blocks_flags(inode);
2732 }
2733
2734 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2735
2736 /*
2737 * For bigalloc file systems, we never free a partial cluster
2738 * at the beginning of the extent. Instead, we check to see if we
2739 * need to free it on a subsequent call to ext4_remove_blocks,
2740 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2741 */
2742 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2743 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2744
2745 /* reset the partial cluster if we've freed past it */
2746 if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2747 partial->state = initial;
2748
2749 /*
2750 * If we've freed the entire extent but the beginning is not left
2751 * cluster aligned and is not marked as ineligible for freeing we
2752 * record the partial cluster at the beginning of the extent. It
2753 * wasn't freed by the preceding ext4_free_blocks() call, and we
2754 * need to look farther to the left to determine if it's to be freed
2755 * (not shared with another extent). Else, reset the partial
2756 * cluster - we're either done freeing or the beginning of the
2757 * extent is left cluster aligned.
2758 */
2759 if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2760 if (partial->state == initial) {
2761 partial->pclu = EXT4_B2C(sbi, pblk);
2762 partial->lblk = from;
2763 partial->state = tofree;
2764 }
2765 } else {
2766 partial->state = initial;
2767 }
2768
2769 return 0;
2770}
2771
2772/*
2773 * ext4_ext_rm_leaf() Removes the extents associated with the
2774 * blocks appearing between "start" and "end". Both "start"
2775 * and "end" must appear in the same extent or EIO is returned.
2776 *
2777 * @handle: The journal handle
2778 * @inode: The file's inode
2779 * @path: The path to the leaf
2780 * @partial: The cluster which we'll have to free if all extents
2781 * have been released from it (state tofree). However, if its
2782 * state is nofree, it's a cluster just to the right of the
2783 * punched region and it must not be freed.
2784 * @start: The first block to remove
2785 * @end: The last block to remove
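 *
 * For example, truncating from @start == 100 through an extent covering
 * blocks 90..149 removes only the tail 100..149 and shrinks the extent to
 * 90..99; removing a range that ends strictly before the extent's last
 * block is not handled here.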
2786 */
2787static int
2788ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2789 struct ext4_ext_path *path,
2790 struct partial_cluster *partial,
2791 ext4_lblk_t start, ext4_lblk_t end)
2792{
2793 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2794 int err = 0, correct_index = 0;
2795 int depth = ext_depth(inode), credits;
2796 struct ext4_extent_header *eh;
2797 ext4_lblk_t a, b;
2798 unsigned num;
2799 ext4_lblk_t ex_ee_block;
2800 unsigned short ex_ee_len;
2801 unsigned unwritten = 0;
2802 struct ext4_extent *ex;
2803 ext4_fsblk_t pblk;
2804
2805 /* the header must be checked already in ext4_ext_remove_space() */
2806 ext_debug("truncate since %u in leaf to %u\n", start, end);
2807 if (!path[depth].p_hdr)
2808 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2809 eh = path[depth].p_hdr;
2810 if (unlikely(path[depth].p_hdr == NULL)) {
2811 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2812 return -EFSCORRUPTED;
2813 }
2814 /* find where to start removing */
2815 ex = path[depth].p_ext;
2816 if (!ex)
2817 ex = EXT_LAST_EXTENT(eh);
2818
2819 ex_ee_block = le32_to_cpu(ex->ee_block);
2820 ex_ee_len = ext4_ext_get_actual_len(ex);
2821
2822 trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2823
2824 while (ex >= EXT_FIRST_EXTENT(eh) &&
2825 ex_ee_block + ex_ee_len > start) {
2826
2827 if (ext4_ext_is_unwritten(ex))
2828 unwritten = 1;
2829 else
2830 unwritten = 0;
2831
2832 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2833 unwritten, ex_ee_len);
2834 path[depth].p_ext = ex;
2835
2836 a = ex_ee_block > start ? ex_ee_block : start;
2837 b = ex_ee_block+ex_ee_len - 1 < end ?
2838 ex_ee_block+ex_ee_len - 1 : end;
2839
2840 ext_debug(" border %u:%u\n", a, b);
2841
2842 /* If this extent is beyond the end of the hole, skip it */
2843 if (end < ex_ee_block) {
2844 /*
2845 * We're going to skip this extent and move to another,
2846 * so note that its first cluster is in use to avoid
2847 * freeing it when removing blocks. Eventually, the
2848 * right edge of the truncated/punched region will
2849 * be just to the left.
2850 */
2851 if (sbi->s_cluster_ratio > 1) {
2852 pblk = ext4_ext_pblock(ex);
2853 partial->pclu = EXT4_B2C(sbi, pblk);
2854 partial->state = nofree;
2855 }
2856 ex--;
2857 ex_ee_block = le32_to_cpu(ex->ee_block);
2858 ex_ee_len = ext4_ext_get_actual_len(ex);
2859 continue;
2860 } else if (b != ex_ee_block + ex_ee_len - 1) {
2861 EXT4_ERROR_INODE(inode,
2862 "can not handle truncate %u:%u "
2863 "on extent %u:%u",
2864 start, end, ex_ee_block,
2865 ex_ee_block + ex_ee_len - 1);
2866 err = -EFSCORRUPTED;
2867 goto out;
2868 } else if (a != ex_ee_block) {
2869 /* remove tail of the extent */
2870 num = a - ex_ee_block;
2871 } else {
2872 /* remove whole extent: excellent! */
2873 num = 0;
2874 }
2875 /*
2876 * 3 for leaf, sb, and inode plus 2 (bmap and group
2877 * descriptor) for each block group; assume two block
2878 * groups plus ex_ee_len/blocks_per_block_group for
2879 * the worst case
2880 */
2881 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2882 if (ex == EXT_FIRST_EXTENT(eh)) {
2883 correct_index = 1;
2884 credits += (ext_depth(inode)) + 1;
2885 }
2886 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2887
2888 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2889 if (err)
2890 goto out;
2891
2892 err = ext4_ext_get_access(handle, inode, path + depth);
2893 if (err)
2894 goto out;
2895
2896 err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2897 if (err)
2898 goto out;
2899
2900 if (num == 0)
2901 /* this extent is removed; mark slot entirely unused */
2902 ext4_ext_store_pblock(ex, 0);
2903
2904 ex->ee_len = cpu_to_le16(num);
2905 /*
2906 * Do not mark unwritten if all the blocks in the
2907 * extent have been removed.
2908 */
2909 if (unwritten && num)
2910 ext4_ext_mark_unwritten(ex);
2911 /*
2912 * If the extent was completely released,
2913 * we need to remove it from the leaf
2914 */
2915 if (num == 0) {
2916 if (end != EXT_MAX_BLOCKS - 1) {
2917 /*
2918 * For hole punching, we need to scoot all the
2919 * extents up when an extent is removed so that
2920 * we don't have blank extents in the middle
2921 */
2922 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2923 sizeof(struct ext4_extent));
2924
2925 /* Now get rid of the one at the end */
2926 memset(EXT_LAST_EXTENT(eh), 0,
2927 sizeof(struct ext4_extent));
2928 }
2929 le16_add_cpu(&eh->eh_entries, -1);
2930 }
2931
2932 err = ext4_ext_dirty(handle, inode, path + depth);
2933 if (err)
2934 goto out;
2935
2936 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2937 ext4_ext_pblock(ex));
2938 ex--;
2939 ex_ee_block = le32_to_cpu(ex->ee_block);
2940 ex_ee_len = ext4_ext_get_actual_len(ex);
2941 }
2942
2943 if (correct_index && eh->eh_entries)
2944 err = ext4_ext_correct_indexes(handle, inode, path);
2945
2946 /*
2947 * If there's a partial cluster and at least one extent remains in
2948 * the leaf, free the partial cluster if it isn't shared with the
2949 * current extent. If it is shared with the current extent
2950 * we reset the partial cluster because we've reached the start of the
2951 * truncated/punched region and we're done removing blocks.
2952 */
2953 if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2954 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2955 if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2956 int flags = get_default_free_blocks_flags(inode);
2957
2958 if (ext4_is_pending(inode, partial->lblk))
2959 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2960 ext4_free_blocks(handle, inode, NULL,
2961 EXT4_C2B(sbi, partial->pclu),
2962 sbi->s_cluster_ratio, flags);
2963 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2964 ext4_rereserve_cluster(inode, partial->lblk);
2965 }
2966 partial->state = initial;
2967 }
2968
2969 /* if this leaf is free, then we should
2970 * remove it from index block above */
2971 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2972 err = ext4_ext_rm_idx(handle, inode, path, depth);
2973
2974out:
2975 return err;
2976}
2977
2978/*
2979 * ext4_ext_more_to_rm:
2980 * returns 1 if current index has to be freed (even partial)
2981 */
2982static int
2983ext4_ext_more_to_rm(struct ext4_ext_path *path)
2984{
2985 BUG_ON(path->p_idx == NULL);
2986
2987 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2988 return 0;
2989
2990 /*
2991 * if truncate on deeper level happened, it wasn't partial,
2992 * so we have to consider current index for truncation
2993 */
2994 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2995 return 0;
2996 return 1;
2997}
2998
2999int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
3000 ext4_lblk_t end)
3001{
3002 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3003 int depth = ext_depth(inode);
3004 struct ext4_ext_path *path = NULL;
3005 struct partial_cluster partial;
3006 handle_t *handle;
3007 int i = 0, err = 0;
3008
3009 partial.pclu = 0;
3010 partial.lblk = 0;
3011 partial.state = initial;
3012
3013 ext_debug("truncate since %u to %u\n", start, end);
3014
3015 /* probably first extent we're gonna free will be last in block */
3016 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
3017 if (IS_ERR(handle))
3018 return PTR_ERR(handle);
3019
3020again:
3021 trace_ext4_ext_remove_space(inode, start, end, depth);
3022
3023 /*
3024 * Check if we are removing extents inside the extent tree. If that
3025 * is the case, we are going to punch a hole inside the extent tree
3026 * so we have to check whether we need to split the extent covering
3027 * the last block to remove so we can easily remove the part of it
3028 * in ext4_ext_rm_leaf().
3029 */
3030 if (end < EXT_MAX_BLOCKS - 1) {
3031 struct ext4_extent *ex;
3032 ext4_lblk_t ee_block, ex_end, lblk;
3033 ext4_fsblk_t pblk;
3034
3035 /* find extent for or closest extent to this block */
3036 path = ext4_find_extent(inode, end, NULL,
3037 EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
3038 if (IS_ERR(path)) {
3039 ext4_journal_stop(handle);
3040 return PTR_ERR(path);
3041 }
3042 depth = ext_depth(inode);
3043 /* A leaf may not exist only if the inode has no blocks at all */
3044 ex = path[depth].p_ext;
3045 if (!ex) {
3046 if (depth) {
3047 EXT4_ERROR_INODE(inode,
3048 "path[%d].p_hdr == NULL",
3049 depth);
3050 err = -EFSCORRUPTED;
3051 }
3052 goto out;
3053 }
3054
3055 ee_block = le32_to_cpu(ex->ee_block);
3056 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
3057
3058 /*
3059 * See if the last block is inside the extent, if so split
3060 * the extent at 'end' block so we can easily remove the
3061 * tail of the first part of the split extent in
3062 * ext4_ext_rm_leaf().
3063 */
3064 if (end >= ee_block && end < ex_end) {
3065
3066 /*
3067 * If we're going to split the extent, note that
3068 * the cluster containing the block after 'end' is
3069 * in use to avoid freeing it when removing blocks.
3070 */
3071 if (sbi->s_cluster_ratio > 1) {
3072 pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
3073 partial.pclu = EXT4_B2C(sbi, pblk);
3074 partial.state = nofree;
3075 }
3076
3077 /*
3078 * Split the extent in two so that 'end' is the last
3079 * block in the first new extent. Also we should not
3080 * fail removing space due to ENOSPC so try to use
3081 * reserved block if that happens.
3082 */
3083 err = ext4_force_split_extent_at(handle, inode, &path,
3084 end + 1, 1);
3085 if (err < 0)
3086 goto out;
3087
3088 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
3089 partial.state == initial) {
3090 /*
3091 * If we're punching, there's an extent to the right.
3092 * If the partial cluster hasn't been set, set it to
3093 * that extent's first cluster and its state to nofree
3094 * so it won't be freed should it contain blocks to be
3095 * removed. If it's already set (tofree/nofree), we're
3096 * retrying and keep the original partial cluster info
3097 * so a cluster marked tofree as a result of earlier
3098 * extent removal is not lost.
3099 */
3100 lblk = ex_end + 1;
3101 err = ext4_ext_search_right(inode, path, &lblk, &pblk,
3102 &ex);
3103 if (err)
3104 goto out;
3105 if (pblk) {
3106 partial.pclu = EXT4_B2C(sbi, pblk);
3107 partial.state = nofree;
3108 }
3109 }
3110 }
3111 /*
3112 * We start scanning from the right side, freeing all the blocks
3113 * after i_size and walking into the tree depth-wise.
3114 */
3115 depth = ext_depth(inode);
3116 if (path) {
3117 int k = i = depth;
3118 while (--k > 0)
3119 path[k].p_block =
3120 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
3121 } else {
3122 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
3123 GFP_NOFS | __GFP_NOFAIL);
3124 if (path == NULL) {
3125 ext4_journal_stop(handle);
3126 return -ENOMEM;
3127 }
3128 path[0].p_maxdepth = path[0].p_depth = depth;
3129 path[0].p_hdr = ext_inode_hdr(inode);
3130 i = 0;
3131
3132 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
3133 err = -EFSCORRUPTED;
3134 goto out;
3135 }
3136 }
3137 err = 0;
3138
3139 while (i >= 0 && err == 0) {
3140 if (i == depth) {
3141 /* this is leaf block */
3142 err = ext4_ext_rm_leaf(handle, inode, path,
3143 &partial, start, end);
3144 /* root level has p_bh == NULL, brelse() eats this */
3145 brelse(path[i].p_bh);
3146 path[i].p_bh = NULL;
3147 i--;
3148 continue;
3149 }
3150
3151 /* this is index block */
3152 if (!path[i].p_hdr) {
3153 ext_debug("initialize header\n");
3154 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
3155 }
3156
3157 if (!path[i].p_idx) {
3158 /* this level hasn't been touched yet */
3159 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
3160 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
3161 ext_debug("init index ptr: hdr 0x%p, num %d\n",
3162 path[i].p_hdr,
3163 le16_to_cpu(path[i].p_hdr->eh_entries));
3164 } else {
3165 /* we were already here, see at next index */
3166 path[i].p_idx--;
3167 }
3168
3169 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
3170 i, EXT_FIRST_INDEX(path[i].p_hdr),
3171 path[i].p_idx);
3172 if (ext4_ext_more_to_rm(path + i)) {
3173 struct buffer_head *bh;
3174 /* go to the next level */
3175 ext_debug("move to level %d (block %llu)\n",
3176 i + 1, ext4_idx_pblock(path[i].p_idx));
3177 memset(path + i + 1, 0, sizeof(*path));
3178 bh = read_extent_tree_block(inode, path[i].p_idx,
3179 depth - i - 1,
3180 EXT4_EX_NOCACHE);
3181 if (IS_ERR(bh)) {
3182 /* should we reset i_size? */
3183 err = PTR_ERR(bh);
3184 break;
3185 }
3186 /* Yield here to deal with large extent trees.
3187 * Should be a no-op if we did IO above. */
3188 cond_resched();
3189 if (WARN_ON(i + 1 > depth)) {
3190 err = -EFSCORRUPTED;
3191 break;
3192 }
3193 path[i + 1].p_bh = bh;
3194
3195 /* save actual number of indexes since this
3196 * number is changed at the next iteration */
3197 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
3198 i++;
3199 } else {
3200 /* we finished processing this index, go up */
3201 if (path[i].p_hdr->eh_entries == 0 && i > 0) {
3202 /* index is empty, remove it;
3203 * the handle has already been prepared while
3204 * truncating the leaf */
3205 err = ext4_ext_rm_idx(handle, inode, path, i);
3206 }
3207 /* root level has p_bh == NULL, brelse() eats this */
3208 brelse(path[i].p_bh);
3209 path[i].p_bh = NULL;
3210 i--;
3211 ext_debug("return to level %d\n", i);
3212 }
3213 }
3214
3215 trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3216 path->p_hdr->eh_entries);
3217
3218 /*
3219 * if there's a partial cluster and we have removed the first extent
3220 * in the file, then we also free the partial cluster, if any
3221 */
3222 if (partial.state == tofree && err == 0) {
3223 int flags = get_default_free_blocks_flags(inode);
3224
3225 if (ext4_is_pending(inode, partial.lblk))
3226 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3227 ext4_free_blocks(handle, inode, NULL,
3228 EXT4_C2B(sbi, partial.pclu),
3229 sbi->s_cluster_ratio, flags);
3230 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3231 ext4_rereserve_cluster(inode, partial.lblk);
3232 partial.state = initial;
3233 }
3234
3235 /* TODO: flexible tree reduction should be here */
3236 if (path->p_hdr->eh_entries == 0) {
3237 /*
3238 * truncate to zero freed all the tree,
3239 * so we need to correct eh_depth
3240 */
3241 err = ext4_ext_get_access(handle, inode, path);
3242 if (err == 0) {
3243 ext_inode_hdr(inode)->eh_depth = 0;
3244 ext_inode_hdr(inode)->eh_max =
3245 cpu_to_le16(ext4_ext_space_root(inode, 0));
3246 err = ext4_ext_dirty(handle, inode, path);
3247 }
3248 }
3249out:
3250 ext4_ext_drop_refs(path);
3251 kfree(path);
3252 path = NULL;
3253 if (err == -EAGAIN)
3254 goto again;
3255 ext4_journal_stop(handle);
3256
3257 return err;
3258}
3259
3260/*
3261 * called at mount time
3262 */
3263void ext4_ext_init(struct super_block *sb)
3264{
3265 /*
3266 * possible initialization would be here
3267 */
3268
3269 if (ext4_has_feature_extents(sb)) {
3270#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3271 printk(KERN_INFO "EXT4-fs: file extents enabled"
3272#ifdef AGGRESSIVE_TEST
3273 ", aggressive tests"
3274#endif
3275#ifdef CHECK_BINSEARCH
3276 ", check binsearch"
3277#endif
3278#ifdef EXTENTS_STATS
3279 ", stats"
3280#endif
3281 "\n");
3282#endif
3283#ifdef EXTENTS_STATS
3284 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3285 EXT4_SB(sb)->s_ext_min = 1 << 30;
3286 EXT4_SB(sb)->s_ext_max = 0;
3287#endif
3288 }
3289}
3290
3291/*
3292 * called at umount time
3293 */
3294void ext4_ext_release(struct super_block *sb)
3295{
3296 if (!ext4_has_feature_extents(sb))
3297 return;
3298
3299#ifdef EXTENTS_STATS
3300 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3301 struct ext4_sb_info *sbi = EXT4_SB(sb);
3302 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3303 sbi->s_ext_blocks, sbi->s_ext_extents,
3304 sbi->s_ext_blocks / sbi->s_ext_extents);
3305 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3306 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3307 }
3308#endif
3309}
3310
3311static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3312{
3313 ext4_lblk_t ee_block;
3314 ext4_fsblk_t ee_pblock;
3315 unsigned int ee_len;
3316
3317 ee_block = le32_to_cpu(ex->ee_block);
3318 ee_len = ext4_ext_get_actual_len(ex);
3319 ee_pblock = ext4_ext_pblock(ex);
3320
3321 if (ee_len == 0)
3322 return 0;
3323
3324 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3325 EXTENT_STATUS_WRITTEN);
3326}
3327
3328/* FIXME!! we need to try to merge to left or right after zero-out */
3329static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3330{
3331 ext4_fsblk_t ee_pblock;
3332 unsigned int ee_len;
3333
3334 ee_len = ext4_ext_get_actual_len(ex);
3335 ee_pblock = ext4_ext_pblock(ex);
3336 return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3337 ee_len);
3338}
3339
3340/*
3341 * ext4_split_extent_at() splits an extent at given block.
3342 *
3343 * @handle: the journal handle
3344 * @inode: the file inode
3345 * @path: the path to the extent
3346 * @split: the logical block where the extent is split.
3347 * @split_flag: indicates if the extent could be zeroed out if the split
3348 * fails, and the states (initialized or unwritten) of the new extents.
3349 * @flags: flags used to insert new extent to extent tree.
3350 *
3351 *
3352 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
3353 * states of which are determined by @split_flag.
3354 *
3355 * There are two cases:
3356 * a> the extent is split into two extents.
3357 * b> split is not needed, and just mark the extent.
3358 *
3359 * return 0 on success.
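 *
 * For example, splitting the extent covering blocks 100..119 at @split ==
 * 108 produces 100..107 and 108..119; with EXT4_EXT_MARK_UNWRIT2 set, only
 * the second half is marked unwritten.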
3360 */
3361static int ext4_split_extent_at(handle_t *handle,
3362 struct inode *inode,
3363 struct ext4_ext_path **ppath,
3364 ext4_lblk_t split,
3365 int split_flag,
3366 int flags)
3367{
3368 struct ext4_ext_path *path = *ppath;
3369 ext4_fsblk_t newblock;
3370 ext4_lblk_t ee_block;
3371 struct ext4_extent *ex, newex, orig_ex, zero_ex;
3372 struct ext4_extent *ex2 = NULL;
3373 unsigned int ee_len, depth;
3374 int err = 0;
3375
3376 BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3377 (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3378
3379 ext_debug("ext4_split_extents_at: inode %lu, logical"
3380 "block %llu\n", inode->i_ino, (unsigned long long)split);
3381
3382 ext4_ext_show_leaf(inode, path);
3383
3384 depth = ext_depth(inode);
3385 ex = path[depth].p_ext;
3386 ee_block = le32_to_cpu(ex->ee_block);
3387 ee_len = ext4_ext_get_actual_len(ex);
3388 newblock = split - ee_block + ext4_ext_pblock(ex);
3389
3390 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3391 BUG_ON(!ext4_ext_is_unwritten(ex) &&
3392 split_flag & (EXT4_EXT_MAY_ZEROOUT |
3393 EXT4_EXT_MARK_UNWRIT1 |
3394 EXT4_EXT_MARK_UNWRIT2));
3395
3396 err = ext4_ext_get_access(handle, inode, path + depth);
3397 if (err)
3398 goto out;
3399
3400 if (split == ee_block) {
3401 /*
3402 * case b: block @split is the block that the extent begins with
3403 * then we just change the state of the extent, and splitting
3404 * is not needed.
3405 */
3406 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3407 ext4_ext_mark_unwritten(ex);
3408 else
3409 ext4_ext_mark_initialized(ex);
3410
3411 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3412 ext4_ext_try_to_merge(handle, inode, path, ex);
3413
3414 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3415 goto out;
3416 }
3417
3418 /* case a */
3419 memcpy(&orig_ex, ex, sizeof(orig_ex));
3420 ex->ee_len = cpu_to_le16(split - ee_block);
3421 if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3422 ext4_ext_mark_unwritten(ex);
3423
3424 /*
3425 * the path may lead to a new leaf, not to the original leaf
3426 * any more, after ext4_ext_insert_extent() returns
3427 */
3428 err = ext4_ext_dirty(handle, inode, path + depth);
3429 if (err)
3430 goto fix_extent_len;
3431
3432 ex2 = &newex;
3433 ex2->ee_block = cpu_to_le32(split);
3434 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
3435 ext4_ext_store_pblock(ex2, newblock);
3436 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3437 ext4_ext_mark_unwritten(ex2);
3438
3439 err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3440 if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
3441 goto out;
3442
3443 /*
3444 * Updating the path is required because the previous ext4_ext_insert_extent()
3445 * may have freed or reallocated the path. Using EXT4_EX_NOFAIL
3446 * guarantees that ext4_find_extent() will not return -ENOMEM,
3447 * otherwise -ENOMEM will cause a retry in do_writepages(), and a
3448 * WARN_ON may be triggered in ext4_da_update_reserve_space() due to
3449 * an incorrect ee_len causing the i_reserved_data_blocks exception.
3450 */
3451 path = ext4_find_extent(inode, ee_block, ppath,
3452 flags | EXT4_EX_NOFAIL);
3453 if (IS_ERR(path)) {
3454 EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
3455 split, PTR_ERR(path));
3456 return PTR_ERR(path);
3457 }
3458 depth = ext_depth(inode);
3459 ex = path[depth].p_ext;
3460 *ppath = path;
3461
3462 if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
3463 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3464 if (split_flag & EXT4_EXT_DATA_VALID1) {
3465 err = ext4_ext_zeroout(inode, ex2);
3466 zero_ex.ee_block = ex2->ee_block;
3467 zero_ex.ee_len = cpu_to_le16(
3468 ext4_ext_get_actual_len(ex2));
3469 ext4_ext_store_pblock(&zero_ex,
3470 ext4_ext_pblock(ex2));
3471 } else {
3472 err = ext4_ext_zeroout(inode, ex);
3473 zero_ex.ee_block = ex->ee_block;
3474 zero_ex.ee_len = cpu_to_le16(
3475 ext4_ext_get_actual_len(ex));
3476 ext4_ext_store_pblock(&zero_ex,
3477 ext4_ext_pblock(ex));
3478 }
3479 } else {
3480 err = ext4_ext_zeroout(inode, &orig_ex);
3481 zero_ex.ee_block = orig_ex.ee_block;
3482 zero_ex.ee_len = cpu_to_le16(
3483 ext4_ext_get_actual_len(&orig_ex));
3484 ext4_ext_store_pblock(&zero_ex,
3485 ext4_ext_pblock(&orig_ex));
3486 }
3487
3488 if (!err) {
3489 /* update the extent length and mark as initialized */
3490 ex->ee_len = cpu_to_le16(ee_len);
3491 ext4_ext_try_to_merge(handle, inode, path, ex);
3492 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3493 if (!err)
3494 /* update extent status tree */
3495 err = ext4_zeroout_es(inode, &zero_ex);
3496 /* If we failed at this point, we don't know exactly what
3497 * state the extent tree is in, so don't try to fix the
3498 * length of the original extent as that may do even more
3499 * damage.
3500 */
3501 goto out;
3502 }
3503 }
3504
3505fix_extent_len:
3506 ex->ee_len = orig_ex.ee_len;
3507 ext4_ext_dirty(handle, inode, path + path->p_depth);
3508 return err;
3509out:
3510 ext4_ext_show_leaf(inode, *ppath);
3511 return err;
3512}
3513
3514/*
3515 * ext4_split_extent() splits an extent and marks the extent covered
3516 * by @map as @split_flag indicates.
3517 *
3518 * It may result in splitting the extent into multiple extents (up to three).
3519 * There are three possibilities:
3520 * a> There is no split required
3521 * b> Splits in two extents: Split is happening at either end of the extent
3522 * c> Splits in three extents: Someone is splitting in the middle of the extent
3523 *
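 * For example, if the extent covers blocks 100..199 and @map spans
 * 120..139, case c applies: two calls to ext4_split_extent_at() leave
 * 100..119, 120..139 and 140..199 as separate extents.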
3524 */
3525static int ext4_split_extent(handle_t *handle,
3526 struct inode *inode,
3527 struct ext4_ext_path **ppath,
3528 struct ext4_map_blocks *map,
3529 int split_flag,
3530 int flags)
3531{
3532 struct ext4_ext_path *path = *ppath;
3533 ext4_lblk_t ee_block;
3534 struct ext4_extent *ex;
3535 unsigned int ee_len, depth;
3536 int err = 0;
3537 int unwritten;
3538 int split_flag1, flags1;
3539 int allocated = map->m_len;
3540
3541 depth = ext_depth(inode);
3542 ex = path[depth].p_ext;
3543 ee_block = le32_to_cpu(ex->ee_block);
3544 ee_len = ext4_ext_get_actual_len(ex);
3545 unwritten = ext4_ext_is_unwritten(ex);
3546
3547 if (map->m_lblk + map->m_len < ee_block + ee_len) {
3548 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3549 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3550 if (unwritten)
3551 split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3552 EXT4_EXT_MARK_UNWRIT2;
3553 if (split_flag & EXT4_EXT_DATA_VALID2)
3554 split_flag1 |= EXT4_EXT_DATA_VALID1;
3555 err = ext4_split_extent_at(handle, inode, ppath,
3556 map->m_lblk + map->m_len, split_flag1, flags1);
3557 if (err)
3558 goto out;
3559 } else {
3560 allocated = ee_len - (map->m_lblk - ee_block);
3561 }
3562 /*
3563 * Updating the path is required because the previous ext4_split_extent_at()
3564 * may have split the original leaf or zeroed out the extent.
3565 */
3566 path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
3567 if (IS_ERR(path))
3568 return PTR_ERR(path);
3569 depth = ext_depth(inode);
3570 ex = path[depth].p_ext;
3571 if (!ex) {
3572 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3573 (unsigned long) map->m_lblk);
3574 return -EFSCORRUPTED;
3575 }
3576 unwritten = ext4_ext_is_unwritten(ex);
3577 split_flag1 = 0;
3578
3579 if (map->m_lblk >= ee_block) {
3580 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3581 if (unwritten) {
3582 split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3583 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3584 EXT4_EXT_MARK_UNWRIT2);
3585 }
3586 err = ext4_split_extent_at(handle, inode, ppath,
3587 map->m_lblk, split_flag1, flags);
3588 if (err)
3589 goto out;
3590 }
3591
3592 ext4_ext_show_leaf(inode, path);
3593out:
3594 return err ? err : allocated;
3595}
3596
3597/*
3598 * This function is called by ext4_ext_map_blocks() if someone tries to write
3599 * to an unwritten extent. It may result in splitting the unwritten
3600 * extent into multiple extents (up to three - one initialized and two
3601 * unwritten).
3602 * There are three possibilities:
3603 * a> There is no split required: Entire extent should be initialized
3604 * b> Splits in two extents: Write is happening at either end of the extent
3605 * c> Splits in three extents: Someone is writing in the middle of the extent
3606 *
3607 * Pre-conditions:
3608 * - The extent pointed to by 'path' is unwritten.
3609 * - The extent pointed to by 'path' contains a superset
3610 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3611 *
3612 * Post-conditions on success:
3613 * - the returned value is the number of blocks beyond map->m_lblk
3614 * that are allocated and initialized.
3615 * It is guaranteed to be >= map->m_len.
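 *
 * For example (ignoring the neighbour-merge fast path and zeroout
 * optimizations below), a write to blocks 110..114 inside the unwritten
 * extent 100..129 is case c: the extent ends up as 100..109 unwritten,
 * 110..114 initialized and 115..129 unwritten.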
3616 */
3617static int ext4_ext_convert_to_initialized(handle_t *handle,
3618 struct inode *inode,
3619 struct ext4_map_blocks *map,
3620 struct ext4_ext_path **ppath,
3621 int flags)
3622{
3623 struct ext4_ext_path *path = *ppath;
3624 struct ext4_sb_info *sbi;
3625 struct ext4_extent_header *eh;
3626 struct ext4_map_blocks split_map;
3627 struct ext4_extent zero_ex1, zero_ex2;
3628 struct ext4_extent *ex, *abut_ex;
3629 ext4_lblk_t ee_block, eof_block;
3630 unsigned int ee_len, depth, map_len = map->m_len;
3631 int err = 0;
3632 int split_flag = EXT4_EXT_DATA_VALID2;
3633 int allocated = 0;
3634 unsigned int max_zeroout = 0;
3635
3636 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3637 "block %llu, max_blocks %u\n", inode->i_ino,
3638 (unsigned long long)map->m_lblk, map_len);
3639
3640 sbi = EXT4_SB(inode->i_sb);
3641 eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3642 >> inode->i_sb->s_blocksize_bits;
3643 if (eof_block < map->m_lblk + map_len)
3644 eof_block = map->m_lblk + map_len;
3645
3646 depth = ext_depth(inode);
3647 eh = path[depth].p_hdr;
3648 ex = path[depth].p_ext;
3649 ee_block = le32_to_cpu(ex->ee_block);
3650 ee_len = ext4_ext_get_actual_len(ex);
3651 zero_ex1.ee_len = 0;
3652 zero_ex2.ee_len = 0;
3653
3654 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3655
3656 /* Pre-conditions */
3657 BUG_ON(!ext4_ext_is_unwritten(ex));
3658 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3659
3660 /*
3661 * Attempt to transfer newly initialized blocks from the currently
3662 * unwritten extent to its neighbor. This is much cheaper
3663 * than an insertion followed by a merge as those involve costly
3664 * memmove() calls. Transferring to the left is the common case in
3665 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3666 * followed by append writes.
3667 *
3668 * Limitations of the current logic:
3669 * - L1: we do not deal with writes covering the whole extent.
3670 * This would require removing the extent if the transfer
3671 * is possible.
3672 * - L2: we only attempt to merge with an extent stored in the
3673 * same extent tree node.
3674 */
3675 if ((map->m_lblk == ee_block) &&
3676 /* See if we can merge left */
3677 (map_len < ee_len) && /*L1*/
3678 (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/
3679 ext4_lblk_t prev_lblk;
3680 ext4_fsblk_t prev_pblk, ee_pblk;
3681 unsigned int prev_len;
3682
3683 abut_ex = ex - 1;
3684 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3685 prev_len = ext4_ext_get_actual_len(abut_ex);
3686 prev_pblk = ext4_ext_pblock(abut_ex);
3687 ee_pblk = ext4_ext_pblock(ex);
3688
3689 /*
3690 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3691 * upon those conditions:
3692 * - C1: abut_ex is initialized,
3693 * - C2: abut_ex is logically abutting ex,
3694 * - C3: abut_ex is physically abutting ex,
3695 * - C4: abut_ex can receive the additional blocks without
3696 * overflowing the (initialized) length limit.
3697 */
3698 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
3699 ((prev_lblk + prev_len) == ee_block) && /*C2*/
3700 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
3701 (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
3702 err = ext4_ext_get_access(handle, inode, path + depth);
3703 if (err)
3704 goto out;
3705
3706 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3707 map, ex, abut_ex);
3708
3709 /* Shift the start of ex by 'map_len' blocks */
3710 ex->ee_block = cpu_to_le32(ee_block + map_len);
3711 ext4_ext_store_pblock(ex, ee_pblk + map_len);
3712 ex->ee_len = cpu_to_le16(ee_len - map_len);
3713 ext4_ext_mark_unwritten(ex); /* Restore the flag */
3714
3715 /* Extend abut_ex by 'map_len' blocks */
3716 abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3717
3718 /* Result: number of initialized blocks past m_lblk */
3719 allocated = map_len;
3720 }
3721 } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3722 (map_len < ee_len) && /*L1*/
3723 ex < EXT_LAST_EXTENT(eh)) { /*L2*/
3724 /* See if we can merge right */
3725 ext4_lblk_t next_lblk;
3726 ext4_fsblk_t next_pblk, ee_pblk;
3727 unsigned int next_len;
3728
3729 abut_ex = ex + 1;
3730 next_lblk = le32_to_cpu(abut_ex->ee_block);
3731 next_len = ext4_ext_get_actual_len(abut_ex);
3732 next_pblk = ext4_ext_pblock(abut_ex);
3733 ee_pblk = ext4_ext_pblock(ex);
3734
3735 /*
3736 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3737 * upon those conditions:
3738 * - C1: abut_ex is initialized,
3739 * - C2: abut_ex is logically abutting ex,
3740 * - C3: abut_ex is physically abutting ex,
3741 * - C4: abut_ex can receive the additional blocks without
3742 * overflowing the (initialized) length limit.
3743 */
3744 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
3745 ((map->m_lblk + map_len) == next_lblk) && /*C2*/
3746 ((ee_pblk + ee_len) == next_pblk) && /*C3*/
3747 (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
3748 err = ext4_ext_get_access(handle, inode, path + depth);
3749 if (err)
3750 goto out;
3751
3752 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3753 map, ex, abut_ex);
3754
3755 /* Shift the start of abut_ex by 'map_len' blocks */
3756 abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3757 ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3758 ex->ee_len = cpu_to_le16(ee_len - map_len);
3759 ext4_ext_mark_unwritten(ex); /* Restore the flag */
3760
3761 /* Extend abut_ex by 'map_len' blocks */
3762 abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3763
3764 /* Result: number of initialized blocks past m_lblk */
3765 allocated = map_len;
3766 }
3767 }
3768 if (allocated) {
3769 /* Mark the block containing both extents as dirty */
3770 ext4_ext_dirty(handle, inode, path + depth);
3771
3772 /* Update path to point to the right extent */
3773 path[depth].p_ext = abut_ex;
3774 goto out;
3775 } else
3776 allocated = ee_len - (map->m_lblk - ee_block);
3777
3778 WARN_ON(map->m_lblk < ee_block);
3779 /*
3780 * It is safe to convert extent to initialized via explicit
3781 * zeroout only if extent is fully inside i_size or new_size.
3782 */
3783 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3784
3785 if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3786 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3787 (inode->i_sb->s_blocksize_bits - 10);
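	/*
	 * Editorial note (illustrative, not part of the original source):
	 * s_extent_max_zeroout_kb is expressed in KiB, so the shift above
	 * converts it to filesystem blocks. For example, with 4 KiB blocks
	 * (s_blocksize_bits == 12) and the commonly used default of 32 KiB,
	 * max_zeroout = 32 >> (12 - 10) = 8 blocks.
	 */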
3788
3789 if (IS_ENCRYPTED(inode))
3790 max_zeroout = 0;
3791
3792 /*
3793 * five cases:
3794 * 1. split the extent into three extents.
3795 * 2. split the extent into two extents, zeroout the head of the first
3796 * extent.
3797 * 3. split the extent into two extents, zeroout the tail of the second
3798 * extent.
3799 * 4. split the extent into two extents without zeroout.
3800 * 5. no splitting needed, just possibly zeroout the head and / or the
3801 * tail of the extent.
3802 */
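	/*
	 * Editorial note (illustrative, not part of the original source):
	 * zero_ex1 below describes the region zeroed out between the end of
	 * the written range and the end of the extent (cases 3 and 5), and
	 * zero_ex2 the region zeroed out between the start of the extent and
	 * the written range (cases 2 and 5). Whenever a region is zeroed,
	 * split_map is widened to cover it, so if both regions are zeroed the
	 * map spans the whole extent and ext4_split_extent() below has
	 * nothing left to split (case 5).
	 */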
3803 split_map.m_lblk = map->m_lblk;
3804 split_map.m_len = map->m_len;
3805
3806 if (max_zeroout && (allocated > split_map.m_len)) {
3807 if (allocated <= max_zeroout) {
3808 /* case 3 or 5 */
3809 zero_ex1.ee_block =
3810 cpu_to_le32(split_map.m_lblk +
3811 split_map.m_len);
3812 zero_ex1.ee_len =
3813 cpu_to_le16(allocated - split_map.m_len);
3814 ext4_ext_store_pblock(&zero_ex1,
3815 ext4_ext_pblock(ex) + split_map.m_lblk +
3816 split_map.m_len - ee_block);
3817 err = ext4_ext_zeroout(inode, &zero_ex1);
3818 if (err)
3819 goto out;
3820 split_map.m_len = allocated;
3821 }
3822 if (split_map.m_lblk - ee_block + split_map.m_len <
3823 max_zeroout) {
3824 /* case 2 or 5 */
3825 if (split_map.m_lblk != ee_block) {
3826 zero_ex2.ee_block = ex->ee_block;
3827 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3828 ee_block);
3829 ext4_ext_store_pblock(&zero_ex2,
3830 ext4_ext_pblock(ex));
3831 err = ext4_ext_zeroout(inode, &zero_ex2);
3832 if (err)
3833 goto out;
3834 }
3835
3836 split_map.m_len += split_map.m_lblk - ee_block;
3837 split_map.m_lblk = ee_block;
3838 allocated = map->m_len;
3839 }
3840 }
3841
3842 err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3843 flags);
3844 if (err > 0)
3845 err = 0;
3846out:
3847 /* If we have gotten a failure, don't zero out status tree */
3848 if (!err) {
3849 err = ext4_zeroout_es(inode, &zero_ex1);
3850 if (!err)
3851 err = ext4_zeroout_es(inode, &zero_ex2);
3852 }
3853 return err ? err : allocated;
3854}
3855
3856/*
3857 * This function is called by ext4_ext_map_blocks() from
3858 * ext4_get_blocks_dio_write() when DIO is used to write
3859 * to an unwritten extent.
3860 *
3861 * Writing to an unwritten extent may result in splitting the unwritten
3862 * extent into multiple initialized/unwritten extents (up to three)
3863 * There are three possibilities:
3864 * a> There is no split required: Entire extent should be unwritten
3865 * b> Splits in two extents: Write is happening at either end of the extent
3866 * c> Splits in three extents: Someone is writing in the middle of the extent
3867 *
3868 * This works the same way in the case of initialized -> unwritten conversion.
3869 *
3870 * One or more index blocks may be needed if the extent tree grows after
3871 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3872 * completion time, we need to split the unwritten extent before the DIO
3873 * is submitted. The unwritten extent handled here will be split into at
3874 * most three unwritten extents. After the IO completes, the part that
3875 * was written will be converted to initialized by the end_io callback
3876 * via ext4_convert_unwritten_extents().
3877 *
3878 * Returns the size of unwritten extent to be written on success.
3879 */
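/*
 * Editorial sketch (illustrative, not part of the original source): for
 * case c above, a DIO write into the middle of an unwritten extent is
 * split before the IO is submitted:
 *
 *	before:	|<------------- unwritten ------------->|
 *	write:	            |<---- IO ---->|
 *	after:	|<-unwritten->|<-unwritten->|<-unwritten->|
 *
 * and only the middle piece is converted to initialized once the IO
 * completes.
 */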
3880static int ext4_split_convert_extents(handle_t *handle,
3881 struct inode *inode,
3882 struct ext4_map_blocks *map,
3883 struct ext4_ext_path **ppath,
3884 int flags)
3885{
3886 struct ext4_ext_path *path = *ppath;
3887 ext4_lblk_t eof_block;
3888 ext4_lblk_t ee_block;
3889 struct ext4_extent *ex;
3890 unsigned int ee_len;
3891 int split_flag = 0, depth;
3892
3893 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3894 __func__, inode->i_ino,
3895 (unsigned long long)map->m_lblk, map->m_len);
3896
3897 eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3898 >> inode->i_sb->s_blocksize_bits;
3899 if (eof_block < map->m_lblk + map->m_len)
3900 eof_block = map->m_lblk + map->m_len;
3901 /*
3902 * It is safe to convert extent to initialized via explicit
3903 * zeroout only if extent is fully inside i_size or new_size.
3904 */
3905 depth = ext_depth(inode);
3906 ex = path[depth].p_ext;
3907 ee_block = le32_to_cpu(ex->ee_block);
3908 ee_len = ext4_ext_get_actual_len(ex);
3909
3910 /* Convert to unwritten */
3911 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3912 split_flag |= EXT4_EXT_DATA_VALID1;
3913 /* Convert to initialized */
3914 } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3915 split_flag |= ee_block + ee_len <= eof_block ?
3916 EXT4_EXT_MAY_ZEROOUT : 0;
3917 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3918 }
3919 flags |= EXT4_GET_BLOCKS_PRE_IO;
3920 return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3921}
3922
3923static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3924 struct inode *inode,
3925 struct ext4_map_blocks *map,
3926 struct ext4_ext_path **ppath)
3927{
3928 struct ext4_ext_path *path = *ppath;
3929 struct ext4_extent *ex;
3930 ext4_lblk_t ee_block;
3931 unsigned int ee_len;
3932 int depth;
3933 int err = 0;
3934
3935 depth = ext_depth(inode);
3936 ex = path[depth].p_ext;
3937 ee_block = le32_to_cpu(ex->ee_block);
3938 ee_len = ext4_ext_get_actual_len(ex);
3939
3940	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3941 "block %llu, max_blocks %u\n", inode->i_ino,
3942 (unsigned long long)ee_block, ee_len);
3943
3944	/* If the extent is larger than requested, it is a clear sign that we
3945	 * still have some extent state machine issues left, so an extent split
3946	 * is still required.
3947	 * TODO: Once all related issues are fixed, this situation should be
3948	 * treated as illegal.
3949 */
3950 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3951#ifdef CONFIG_EXT4_DEBUG
3952 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3953 " len %u; IO logical block %llu, len %u",
3954 inode->i_ino, (unsigned long long)ee_block, ee_len,
3955 (unsigned long long)map->m_lblk, map->m_len);
3956#endif
3957 err = ext4_split_convert_extents(handle, inode, map, ppath,
3958 EXT4_GET_BLOCKS_CONVERT);
3959 if (err < 0)
3960 return err;
3961 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3962 if (IS_ERR(path))
3963 return PTR_ERR(path);
3964 depth = ext_depth(inode);
3965 ex = path[depth].p_ext;
3966 }
3967
3968 err = ext4_ext_get_access(handle, inode, path + depth);
3969 if (err)
3970 goto out;
3971 /* first mark the extent as initialized */
3972 ext4_ext_mark_initialized(ex);
3973
3974 /* note: ext4_ext_correct_indexes() isn't needed here because
3975 * borders are not changed
3976 */
3977 ext4_ext_try_to_merge(handle, inode, path, ex);
3978
3979 /* Mark modified extent as dirty */
3980 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3981out:
3982 ext4_ext_show_leaf(inode, path);
3983 return err;
3984}
3985
3986/*
3987 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3988 */
3989static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3990 ext4_lblk_t lblk,
3991 struct ext4_ext_path *path,
3992 unsigned int len)
3993{
3994 int i, depth;
3995 struct ext4_extent_header *eh;
3996 struct ext4_extent *last_ex;
3997
3998 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3999 return 0;
4000
4001 depth = ext_depth(inode);
4002 eh = path[depth].p_hdr;
4003
4004 /*
4005	 * We're going to remove EOFBLOCKS_FL entirely in the future, so we
4006	 * do not care about this case anymore. Simply remove the flag
4007 * if there are no extents.
4008 */
4009 if (unlikely(!eh->eh_entries))
4010 goto out;
4011 last_ex = EXT_LAST_EXTENT(eh);
4012 /*
4013 * We should clear the EOFBLOCKS_FL flag if we are writing the
4014 * last block in the last extent in the file. We test this by
4015 * first checking to see if the caller to
4016 * ext4_ext_get_blocks() was interested in the last block (or
4017 * a block beyond the last block) in the current extent. If
4018 * this turns out to be false, we can bail out from this
4019 * function immediately.
4020 */
4021 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
4022 ext4_ext_get_actual_len(last_ex))
4023 return 0;
4024 /*
4025 * If the caller does appear to be planning to write at or
4026 * beyond the end of the current extent, we then test to see
4027 * if the current extent is the last extent in the file, by
4028 * checking to make sure it was reached via the rightmost node
4029 * at each level of the tree.
4030 */
4031 for (i = depth-1; i >= 0; i--)
4032 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
4033 return 0;
4034out:
4035 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4036 return ext4_mark_inode_dirty(handle, inode);
4037}
4038
4039static int
4040convert_initialized_extent(handle_t *handle, struct inode *inode,
4041 struct ext4_map_blocks *map,
4042 struct ext4_ext_path **ppath,
4043 unsigned int allocated)
4044{
4045 struct ext4_ext_path *path = *ppath;
4046 struct ext4_extent *ex;
4047 ext4_lblk_t ee_block;
4048 unsigned int ee_len;
4049 int depth;
4050 int err = 0;
4051
4052 /*
4053 * Make sure that the extent is no bigger than we support with
4054	 * unwritten extents
4055 */
4056 if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
4057 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
4058
4059 depth = ext_depth(inode);
4060 ex = path[depth].p_ext;
4061 ee_block = le32_to_cpu(ex->ee_block);
4062 ee_len = ext4_ext_get_actual_len(ex);
4063
4064	ext_debug("%s: inode %lu, logical "
4065 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
4066 (unsigned long long)ee_block, ee_len);
4067
4068 if (ee_block != map->m_lblk || ee_len > map->m_len) {
4069 err = ext4_split_convert_extents(handle, inode, map, ppath,
4070 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
4071 if (err < 0)
4072 return err;
4073 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
4074 if (IS_ERR(path))
4075 return PTR_ERR(path);
4076 depth = ext_depth(inode);
4077 ex = path[depth].p_ext;
4078 if (!ex) {
4079 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
4080 (unsigned long) map->m_lblk);
4081 return -EFSCORRUPTED;
4082 }
4083 }
4084
4085 err = ext4_ext_get_access(handle, inode, path + depth);
4086 if (err)
4087 return err;
4088 /* first mark the extent as unwritten */
4089 ext4_ext_mark_unwritten(ex);
4090
4091 /* note: ext4_ext_correct_indexes() isn't needed here because
4092 * borders are not changed
4093 */
4094 ext4_ext_try_to_merge(handle, inode, path, ex);
4095
4096 /* Mark modified extent as dirty */
4097 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
4098 if (err)
4099 return err;
4100 ext4_ext_show_leaf(inode, path);
4101
4102 ext4_update_inode_fsync_trans(handle, inode, 1);
4103 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
4104 if (err)
4105 return err;
4106 map->m_flags |= EXT4_MAP_UNWRITTEN;
4107 if (allocated > map->m_len)
4108 allocated = map->m_len;
4109 map->m_len = allocated;
4110 return allocated;
4111}
4112
4113static int
4114ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4115 struct ext4_map_blocks *map,
4116 struct ext4_ext_path **ppath, int flags,
4117 unsigned int allocated, ext4_fsblk_t newblock)
4118{
4119 struct ext4_ext_path *path = *ppath;
4120 int ret = 0;
4121 int err = 0;
4122
4123 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4124 "block %llu, max_blocks %u, flags %x, allocated %u\n",
4125 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4126 flags, allocated);
4127 ext4_ext_show_leaf(inode, path);
4128
4129 /*
4130 * When writing into unwritten space, we should not fail to
4131 * allocate metadata blocks for the new extent block if needed.
4132 */
4133 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4134
4135 trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4136 allocated, newblock);
4137
4138 /* get_block() before submit the IO, split the extent */
4139 if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4140 ret = ext4_split_convert_extents(handle, inode, map, ppath,
4141 flags | EXT4_GET_BLOCKS_CONVERT);
4142 if (ret <= 0)
4143 goto out;
4144 map->m_flags |= EXT4_MAP_UNWRITTEN;
4145 goto out;
4146 }
4147 /* IO end_io complete, convert the filled extent to written */
4148 if (flags & EXT4_GET_BLOCKS_CONVERT) {
4149 if (flags & EXT4_GET_BLOCKS_ZERO) {
4150 if (allocated > map->m_len)
4151 allocated = map->m_len;
4152 err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
4153 allocated);
4154 if (err < 0)
4155 goto out2;
4156 }
4157 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4158 ppath);
4159 if (ret >= 0) {
4160 ext4_update_inode_fsync_trans(handle, inode, 1);
4161 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4162 path, map->m_len);
4163 } else
4164 err = ret;
4165 map->m_flags |= EXT4_MAP_MAPPED;
4166 map->m_pblk = newblock;
4167 if (allocated > map->m_len)
4168 allocated = map->m_len;
4169 map->m_len = allocated;
4170 goto out2;
4171 }
4172 /* buffered IO case */
4173 /*
4174 * repeat fallocate creation request
4175 * we already have an unwritten extent
4176 */
4177 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4178 map->m_flags |= EXT4_MAP_UNWRITTEN;
4179 goto map_out;
4180 }
4181
4182 /* buffered READ or buffered write_begin() lookup */
4183 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4184 /*
4185 * We have blocks reserved already. We
4186 * return allocated blocks so that delalloc
4187 * won't do block reservation for us. But
4188 * the buffer head will be unmapped so that
4189 * a read from the block returns 0s.
4190 */
4191 map->m_flags |= EXT4_MAP_UNWRITTEN;
4192 goto out1;
4193 }
4194
4195 /* buffered write, writepage time, convert*/
4196 ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4197 if (ret >= 0)
4198 ext4_update_inode_fsync_trans(handle, inode, 1);
4199out:
4200 if (ret <= 0) {
4201 err = ret;
4202 goto out2;
4203 } else
4204 allocated = ret;
4205 map->m_flags |= EXT4_MAP_NEW;
4206 if (allocated > map->m_len)
4207 allocated = map->m_len;
4208 map->m_len = allocated;
4209
4210map_out:
4211 map->m_flags |= EXT4_MAP_MAPPED;
4212 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4213 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4214 map->m_len);
4215 if (err < 0)
4216 goto out2;
4217 }
4218out1:
4219 if (allocated > map->m_len)
4220 allocated = map->m_len;
4221 ext4_ext_show_leaf(inode, path);
4222 map->m_pblk = newblock;
4223 map->m_len = allocated;
4224out2:
4225 return err ? err : allocated;
4226}
4227
4228/*
4229 * get_implied_cluster_alloc - check to see if the requested
4230 * allocation (in the map structure) overlaps with a cluster already
4231 * allocated in an extent.
4232 * @sb The filesystem superblock structure
4233 * @map The requested lblk->pblk mapping
4234 * @ex The extent structure which might contain an implied
4235 * cluster allocation
4236 *
4237 * This function is called by ext4_ext_map_blocks() after we failed to
4238 * find blocks that were already in the inode's extent tree. Hence,
4239 * we know that the beginning of the requested region cannot overlap
4240 * the extent from the inode's extent tree. There are three cases we
4241 * want to catch. The first is this case:
4242 *
4243 * |--- cluster # N--|
4244 * |--- extent ---| |---- requested region ---|
4245 * |==========|
4246 *
4247 * The second case that we need to test for is this one:
4248 *
4249 * |--------- cluster # N ----------------|
4250 * |--- requested region --| |------- extent ----|
4251 * |=======================|
4252 *
4253 * The third case is when the requested region lies between two extents
4254 * within the same cluster:
4255 * |------------- cluster # N-------------|
4256 * |----- ex -----| |---- ex_right ----|
4257 * |------ requested region ------|
4258 * |================|
4259 *
4260 * In each of the above cases, we need to set the map->m_pblk and
4261 * map->m_len so they correspond to the extent labelled as
4262 * "|====|" from cluster #N, since it is already in use for data in
4263 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
4264 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4265 * as a new "allocated" block region. Otherwise, we will return 0 and
4266 * ext4_ext_map_blocks() will then allocate one or more new clusters
4267 * by calling ext4_mb_new_blocks().
4268 */
4269static int get_implied_cluster_alloc(struct super_block *sb,
4270 struct ext4_map_blocks *map,
4271 struct ext4_extent *ex,
4272 struct ext4_ext_path *path)
4273{
4274 struct ext4_sb_info *sbi = EXT4_SB(sb);
4275 ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4276 ext4_lblk_t ex_cluster_start, ex_cluster_end;
4277 ext4_lblk_t rr_cluster_start;
4278 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4279 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4280 unsigned short ee_len = ext4_ext_get_actual_len(ex);
4281
4282 /* The extent passed in that we are trying to match */
4283 ex_cluster_start = EXT4_B2C(sbi, ee_block);
4284 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4285
4286 /* The requested region passed into ext4_map_blocks() */
4287 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4288
4289 if ((rr_cluster_start == ex_cluster_end) ||
4290 (rr_cluster_start == ex_cluster_start)) {
4291 if (rr_cluster_start == ex_cluster_end)
4292 ee_start += ee_len - 1;
4293 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4294 map->m_len = min(map->m_len,
4295 (unsigned) sbi->s_cluster_ratio - c_offset);
4296 /*
4297 * Check for and handle this case:
4298 *
4299 * |--------- cluster # N-------------|
4300 * |------- extent ----|
4301 * |--- requested region ---|
4302 * |===========|
4303 */
4304
4305 if (map->m_lblk < ee_block)
4306 map->m_len = min(map->m_len, ee_block - map->m_lblk);
4307
4308 /*
4309 * Check for the case where there is already another allocated
4310 * block to the right of 'ex' but before the end of the cluster.
4311 *
4312 * |------------- cluster # N-------------|
4313 * |----- ex -----| |---- ex_right ----|
4314 * |------ requested region ------|
4315 * |================|
4316 */
4317 if (map->m_lblk > ee_block) {
4318 ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4319 map->m_len = min(map->m_len, next - map->m_lblk);
4320 }
4321
4322 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4323 return 1;
4324 }
4325
4326 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4327 return 0;
4328}
4329
4330
4331/*
4332 * Block allocation/map/preallocation routine for extents based files
4333 *
4334 *
4335 * Need to be called with
4336 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4337 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4338 *
4339 * return > 0, number of blocks already mapped/allocated
4340 * if create == 0 and these are pre-allocated blocks
4341 * buffer head is unmapped
4342 * otherwise blocks are mapped
4343 *
4344 * return = 0, if plain look up failed (blocks have not been allocated)
4345 * buffer head is unmapped
4346 *
4347 * return < 0, error case.
4348 */
4349int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4350 struct ext4_map_blocks *map, int flags)
4351{
4352 struct ext4_ext_path *path = NULL;
4353 struct ext4_extent newex, *ex, *ex2;
4354 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4355 ext4_fsblk_t newblock = 0;
4356 int free_on_err = 0, err = 0, depth, ret;
4357 unsigned int allocated = 0, offset = 0;
4358 unsigned int allocated_clusters = 0;
4359 struct ext4_allocation_request ar;
4360 ext4_lblk_t cluster_offset;
4361 bool map_from_cluster = false;
4362
4363 ext_debug("blocks %u/%u requested for inode %lu\n",
4364 map->m_lblk, map->m_len, inode->i_ino);
4365 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4366
4367 /* find extent for this block */
4368 path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4369 if (IS_ERR(path)) {
4370 err = PTR_ERR(path);
4371 path = NULL;
4372 goto out2;
4373 }
4374
4375 depth = ext_depth(inode);
4376
4377 /*
4378 * consistent leaf must not be empty;
4379 * this situation is possible, though, _during_ tree modification;
4380 * this is why assert can't be put in ext4_find_extent()
4381 */
4382 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4383 EXT4_ERROR_INODE(inode, "bad extent address "
4384 "lblock: %lu, depth: %d pblock %lld",
4385 (unsigned long) map->m_lblk, depth,
4386 path[depth].p_block);
4387 err = -EFSCORRUPTED;
4388 goto out2;
4389 }
4390
4391 ex = path[depth].p_ext;
4392 if (ex) {
4393 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4394 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4395 unsigned short ee_len;
4396
4397
4398 /*
4399 * unwritten extents are treated as holes, except that
4400 * we split out initialized portions during a write.
4401 */
4402 ee_len = ext4_ext_get_actual_len(ex);
4403
4404 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4405
4406 /* if found extent covers block, simply return it */
4407 if (in_range(map->m_lblk, ee_block, ee_len)) {
4408 newblock = map->m_lblk - ee_block + ee_start;
4409 /* number of remaining blocks in the extent */
4410 allocated = ee_len - (map->m_lblk - ee_block);
4411 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4412 ee_block, ee_len, newblock);
4413
4414 /*
4415 * If the extent is initialized check whether the
4416 * caller wants to convert it to unwritten.
4417 */
4418 if ((!ext4_ext_is_unwritten(ex)) &&
4419 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4420 allocated = convert_initialized_extent(
4421 handle, inode, map, &path,
4422 allocated);
4423 goto out2;
4424 } else if (!ext4_ext_is_unwritten(ex))
4425 goto out;
4426
4427 ret = ext4_ext_handle_unwritten_extents(
4428 handle, inode, map, &path, flags,
4429 allocated, newblock);
4430 if (ret < 0)
4431 err = ret;
4432 else
4433 allocated = ret;
4434 goto out2;
4435 }
4436 }
4437
4438 /*
4439 * requested block isn't allocated yet;
4440	 * we can't try to create blocks if the create flag is zero
4441 */
4442 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4443 ext4_lblk_t hole_start, hole_len;
4444
4445 hole_start = map->m_lblk;
4446 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4447 /*
4448 * put just found gap into cache to speed up
4449 * subsequent requests
4450 */
4451 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4452
4453 /* Update hole_len to reflect hole size after map->m_lblk */
4454 if (hole_start != map->m_lblk)
4455 hole_len -= map->m_lblk - hole_start;
4456 map->m_pblk = 0;
4457 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4458
4459 goto out2;
4460 }
4461
4462 /*
4463 * Okay, we need to do block allocation.
4464 */
4465 newex.ee_block = cpu_to_le32(map->m_lblk);
4466 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4467
4468 /*
4469 * If we are doing bigalloc, check to see if the extent returned
4470 * by ext4_find_extent() implies a cluster we can use.
4471 */
4472 if (cluster_offset && ex &&
4473 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4474 ar.len = allocated = map->m_len;
4475 newblock = map->m_pblk;
4476 map_from_cluster = true;
4477 goto got_allocated_blocks;
4478 }
4479
4480 /* find neighbour allocated blocks */
4481 ar.lleft = map->m_lblk;
4482 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4483 if (err)
4484 goto out2;
4485 ar.lright = map->m_lblk;
4486 ex2 = NULL;
4487 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4488 if (err)
4489 goto out2;
4490
4491 /* Check if the extent after searching to the right implies a
4492 * cluster we can use. */
4493 if ((sbi->s_cluster_ratio > 1) && ex2 &&
4494 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4495 ar.len = allocated = map->m_len;
4496 newblock = map->m_pblk;
4497 map_from_cluster = true;
4498 goto got_allocated_blocks;
4499 }
4500
4501 /*
4502 * See if request is beyond maximum number of blocks we can have in
4503 * a single extent. For an initialized extent this limit is
4504 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4505 * EXT_UNWRITTEN_MAX_LEN.
4506 */
4507 if (map->m_len > EXT_INIT_MAX_LEN &&
4508 !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4509 map->m_len = EXT_INIT_MAX_LEN;
4510 else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4511 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4512 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4513
4514 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4515 newex.ee_len = cpu_to_le16(map->m_len);
4516 err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4517 if (err)
4518 allocated = ext4_ext_get_actual_len(&newex);
4519 else
4520 allocated = map->m_len;
4521
4522 /* allocate new block */
4523 ar.inode = inode;
4524 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4525 ar.logical = map->m_lblk;
4526 /*
4527 * We calculate the offset from the beginning of the cluster
4528 * for the logical block number, since when we allocate a
4529 * physical cluster, the physical block should start at the
4530 * same offset from the beginning of the cluster. This is
4531 * needed so that future calls to get_implied_cluster_alloc()
4532 * work correctly.
4533 */
4534 offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4535 ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4536 ar.goal -= offset;
4537 ar.logical -= offset;
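	/*
	 * Editorial note (illustrative, not part of the original source):
	 * with a cluster ratio of 16, a request at map->m_lblk == 35 for
	 * allocated == 10 blocks gives offset == 3, so ar.goal and ar.logical
	 * are pulled back to the cluster boundary (logical block 32) and
	 * ar.len = EXT4_NUM_B2C(sbi, 3 + 10) = 1 cluster is requested.
	 */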
4538 if (S_ISREG(inode->i_mode))
4539 ar.flags = EXT4_MB_HINT_DATA;
4540 else
4541 /* disable in-core preallocation for non-regular files */
4542 ar.flags = 0;
4543 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4544 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4545 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4546 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4547 if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4548 ar.flags |= EXT4_MB_USE_RESERVED;
4549 newblock = ext4_mb_new_blocks(handle, &ar, &err);
4550 if (!newblock)
4551 goto out2;
4552 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4553 ar.goal, newblock, allocated);
4554 free_on_err = 1;
4555 allocated_clusters = ar.len;
4556 ar.len = EXT4_C2B(sbi, ar.len) - offset;
4557 if (ar.len > allocated)
4558 ar.len = allocated;
4559
4560got_allocated_blocks:
4561 /* try to insert new extent into found leaf and return */
4562 ext4_ext_store_pblock(&newex, newblock + offset);
4563 newex.ee_len = cpu_to_le16(ar.len);
4564 /* Mark unwritten */
4565 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4566 ext4_ext_mark_unwritten(&newex);
4567 map->m_flags |= EXT4_MAP_UNWRITTEN;
4568 }
4569
4570 err = 0;
4571 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4572 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4573 path, ar.len);
4574 if (!err)
4575 err = ext4_ext_insert_extent(handle, inode, &path,
4576 &newex, flags);
4577
4578 if (err && free_on_err) {
4579 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4580 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4581 /* free data blocks we just allocated */
4582 /* not a good idea to call discard here directly,
4583		 * but otherwise we'd need to call it on every free() */
4584 ext4_discard_preallocations(inode);
4585 ext4_free_blocks(handle, inode, NULL, newblock,
4586 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4587 goto out2;
4588 }
4589
4590 /* previous routine could use block we allocated */
4591 newblock = ext4_ext_pblock(&newex);
4592 allocated = ext4_ext_get_actual_len(&newex);
4593 if (allocated > map->m_len)
4594 allocated = map->m_len;
4595 map->m_flags |= EXT4_MAP_NEW;
4596
4597 /*
4598 * Reduce the reserved cluster count to reflect successful deferred
4599 * allocation of delayed allocated clusters or direct allocation of
4600 * clusters discovered to be delayed allocated. Once allocated, a
4601 * cluster is not included in the reserved count.
4602 */
4603 if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
4604 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4605 /*
4606 * When allocating delayed allocated clusters, simply
4607 * reduce the reserved cluster count and claim quota
4608 */
4609 ext4_da_update_reserve_space(inode, allocated_clusters,
4610 1);
4611 } else {
4612 ext4_lblk_t lblk, len;
4613 unsigned int n;
4614
4615 /*
4616 * When allocating non-delayed allocated clusters
4617 * (from fallocate, filemap, DIO, or clusters
4618 * allocated when delalloc has been disabled by
4619 * ext4_nonda_switch), reduce the reserved cluster
4620 * count by the number of allocated clusters that
4621 * have previously been delayed allocated. Quota
4622 * has been claimed by ext4_mb_new_blocks() above,
4623 * so release the quota reservations made for any
4624 * previously delayed allocated clusters.
4625 */
4626 lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4627 len = allocated_clusters << sbi->s_cluster_bits;
4628 n = ext4_es_delayed_clu(inode, lblk, len);
4629 if (n > 0)
4630 ext4_da_update_reserve_space(inode, (int) n, 0);
4631 }
4632 }
4633
4634 /*
4635 * Cache the extent and update transaction to commit on fdatasync only
4636 * when it is _not_ an unwritten extent.
4637 */
4638 if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4639 ext4_update_inode_fsync_trans(handle, inode, 1);
4640 else
4641 ext4_update_inode_fsync_trans(handle, inode, 0);
4642out:
4643 if (allocated > map->m_len)
4644 allocated = map->m_len;
4645 ext4_ext_show_leaf(inode, path);
4646 map->m_flags |= EXT4_MAP_MAPPED;
4647 map->m_pblk = newblock;
4648 map->m_len = allocated;
4649out2:
4650 ext4_ext_drop_refs(path);
4651 kfree(path);
4652
4653 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4654 err ? err : allocated);
4655 return err ? err : allocated;
4656}
4657
4658int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4659{
4660 struct super_block *sb = inode->i_sb;
4661 ext4_lblk_t last_block;
4662 int err = 0;
4663
4664 /*
4665 * TODO: optimization is possible here.
4666 * Probably we need not scan at all,
4667 * because page truncation is enough.
4668 */
4669
4670 /* we have to know where to truncate from in crash case */
4671 EXT4_I(inode)->i_disksize = inode->i_size;
4672 err = ext4_mark_inode_dirty(handle, inode);
4673 if (err)
4674 return err;
4675
4676 last_block = (inode->i_size + sb->s_blocksize - 1)
4677 >> EXT4_BLOCK_SIZE_BITS(sb);
4678retry:
4679 err = ext4_es_remove_extent(inode, last_block,
4680 EXT_MAX_BLOCKS - last_block);
4681 if (err == -ENOMEM) {
4682 cond_resched();
4683 congestion_wait(BLK_RW_ASYNC, HZ/50);
4684 goto retry;
4685 }
4686 if (err)
4687 return err;
4688retry_remove_space:
4689 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4690 if (err == -ENOMEM) {
4691 cond_resched();
4692 congestion_wait(BLK_RW_ASYNC, HZ/50);
4693 goto retry_remove_space;
4694 }
4695 return err;
4696}
4697
4698static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4699 ext4_lblk_t len, loff_t new_size,
4700 int flags)
4701{
4702 struct inode *inode = file_inode(file);
4703 handle_t *handle;
4704 int ret = 0;
4705 int ret2 = 0;
4706 int retries = 0;
4707 int depth = 0;
4708 struct ext4_map_blocks map;
4709 unsigned int credits;
4710 loff_t epos;
4711
4712 BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4713 map.m_lblk = offset;
4714 map.m_len = len;
4715 /*
4716 * Don't normalize the request if it can fit in one extent so
4717 * that it doesn't get unnecessarily split into multiple
4718 * extents.
4719 */
4720 if (len <= EXT_UNWRITTEN_MAX_LEN)
4721 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4722
4723 /*
4724 * credits to insert 1 extent into extent tree
4725 */
4726 credits = ext4_chunk_trans_blocks(inode, len);
4727 depth = ext_depth(inode);
4728
4729retry:
4730 while (ret >= 0 && len) {
4731 /*
4732 * Recalculate credits when extent tree depth changes.
4733 */
4734 if (depth != ext_depth(inode)) {
4735 credits = ext4_chunk_trans_blocks(inode, len);
4736 depth = ext_depth(inode);
4737 }
4738
4739 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4740 credits);
4741 if (IS_ERR(handle)) {
4742 ret = PTR_ERR(handle);
4743 break;
4744 }
4745 ret = ext4_map_blocks(handle, inode, &map, flags);
4746 if (ret <= 0) {
4747 ext4_debug("inode #%lu: block %u: len %u: "
4748 "ext4_ext_map_blocks returned %d",
4749 inode->i_ino, map.m_lblk,
4750 map.m_len, ret);
4751 ext4_mark_inode_dirty(handle, inode);
4752 ret2 = ext4_journal_stop(handle);
4753 break;
4754 }
4755 map.m_lblk += ret;
4756 map.m_len = len = len - ret;
4757 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4758 inode->i_ctime = current_time(inode);
4759 if (new_size) {
4760 if (epos > new_size)
4761 epos = new_size;
4762 if (ext4_update_inode_size(inode, epos) & 0x1)
4763 inode->i_mtime = inode->i_ctime;
4764 } else {
4765 if (epos > inode->i_size)
4766 ext4_set_inode_flag(inode,
4767 EXT4_INODE_EOFBLOCKS);
4768 }
4769 ext4_mark_inode_dirty(handle, inode);
4770 ext4_update_inode_fsync_trans(handle, inode, 1);
4771 ret2 = ext4_journal_stop(handle);
4772 if (ret2)
4773 break;
4774 }
4775 if (ret == -ENOSPC &&
4776 ext4_should_retry_alloc(inode->i_sb, &retries)) {
4777 ret = 0;
4778 goto retry;
4779 }
4780
4781 return ret > 0 ? ret2 : ret;
4782}
4783
4784static long ext4_zero_range(struct file *file, loff_t offset,
4785 loff_t len, int mode)
4786{
4787 struct inode *inode = file_inode(file);
4788 handle_t *handle = NULL;
4789 unsigned int max_blocks;
4790 loff_t new_size = 0;
4791 int ret = 0;
4792 int flags;
4793 int credits;
4794 int partial_begin, partial_end;
4795 loff_t start, end;
4796 ext4_lblk_t lblk;
4797 unsigned int blkbits = inode->i_blkbits;
4798
4799 trace_ext4_zero_range(inode, offset, len, mode);
4800
4801 if (!S_ISREG(inode->i_mode))
4802 return -EINVAL;
4803
4804 /* Call ext4_force_commit to flush all data in case of data=journal. */
4805 if (ext4_should_journal_data(inode)) {
4806 ret = ext4_force_commit(inode->i_sb);
4807 if (ret)
4808 return ret;
4809 }
4810
4811 /*
4812	 * Round up offset. This is not fallocate, we need to zero out
4813	 * blocks, so convert the interior block-aligned part of the range to
4814 * unwritten and possibly manually zero out unaligned parts of the
4815 * range.
4816 */
4817 start = round_up(offset, 1 << blkbits);
4818 end = round_down((offset + len), 1 << blkbits);
4819
4820 if (start < offset || end > offset + len)
4821 return -EINVAL;
4822 partial_begin = offset & ((1 << blkbits) - 1);
4823 partial_end = (offset + len) & ((1 << blkbits) - 1);
4824
4825 lblk = start >> blkbits;
4826 max_blocks = (end >> blkbits);
4827 if (max_blocks < lblk)
4828 max_blocks = 0;
4829 else
4830 max_blocks -= lblk;
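	/*
	 * Editorial note (illustrative, not part of the original source):
	 * with 4 KiB blocks, a zero range of offset == 1000, len == 10000
	 * gives start == 4096, end == 8192, partial_begin == 1000 and
	 * partial_end == 2808, so only block 1 (lblk == 1, max_blocks == 1)
	 * goes through the unwritten-extent path below, while the unaligned
	 * head and tail are zeroed by ext4_zero_partial_blocks() at the end.
	 */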
4831
4832 inode_lock(inode);
4833
4834 /*
4835	 * Indirect files do not support unwritten extents
4836 */
4837 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4838 ret = -EOPNOTSUPP;
4839 goto out_mutex;
4840 }
4841
4842 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4843 (offset + len > i_size_read(inode) ||
4844 offset + len > EXT4_I(inode)->i_disksize)) {
4845 new_size = offset + len;
4846 ret = inode_newsize_ok(inode, new_size);
4847 if (ret)
4848 goto out_mutex;
4849 }
4850
4851 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4852 if (mode & FALLOC_FL_KEEP_SIZE)
4853 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4854
4855	/* Wait for all existing dio workers; newcomers will block on i_mutex */
4856 inode_dio_wait(inode);
4857
4858 /* Preallocate the range including the unaligned edges */
4859 if (partial_begin || partial_end) {
4860 ret = ext4_alloc_file_blocks(file,
4861 round_down(offset, 1 << blkbits) >> blkbits,
4862 (round_up((offset + len), 1 << blkbits) -
4863 round_down(offset, 1 << blkbits)) >> blkbits,
4864 new_size, flags);
4865 if (ret)
4866 goto out_mutex;
4867
4868 }
4869
4870 /* Zero range excluding the unaligned edges */
4871 if (max_blocks > 0) {
4872 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4873 EXT4_EX_NOCACHE);
4874
4875 /*
4876 * Prevent page faults from reinstantiating pages we have
4877 * released from page cache.
4878 */
4879 down_write(&EXT4_I(inode)->i_mmap_sem);
4880
4881 ret = ext4_break_layouts(inode);
4882 if (ret) {
4883 up_write(&EXT4_I(inode)->i_mmap_sem);
4884 goto out_mutex;
4885 }
4886
4887 ret = ext4_update_disksize_before_punch(inode, offset, len);
4888 if (ret) {
4889 up_write(&EXT4_I(inode)->i_mmap_sem);
4890 goto out_mutex;
4891 }
4892 /* Now release the pages and zero block aligned part of pages */
4893 truncate_pagecache_range(inode, start, end - 1);
4894 inode->i_mtime = inode->i_ctime = current_time(inode);
4895
4896 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4897 flags);
4898 up_write(&EXT4_I(inode)->i_mmap_sem);
4899 if (ret)
4900 goto out_mutex;
4901 }
4902 if (!partial_begin && !partial_end)
4903 goto out_mutex;
4904
4905 /*
4906	 * In the worst case we have to write out two nonadjacent unwritten
4907	 * blocks and update the inode.
4908 */
4909 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4910 if (ext4_should_journal_data(inode))
4911 credits += 2;
4912 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4913 if (IS_ERR(handle)) {
4914 ret = PTR_ERR(handle);
4915 ext4_std_error(inode->i_sb, ret);
4916 goto out_mutex;
4917 }
4918
4919 inode->i_mtime = inode->i_ctime = current_time(inode);
4920 if (new_size) {
4921 ext4_update_inode_size(inode, new_size);
4922 } else {
4923 /*
4924 * Mark that we allocate beyond EOF so the subsequent truncate
4925 * can proceed even if the new size is the same as i_size.
4926 */
4927 if ((offset + len) > i_size_read(inode))
4928 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4929 }
4930 ext4_mark_inode_dirty(handle, inode);
4931
4932 /* Zero out partial block at the edges of the range */
4933 ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4934 if (ret >= 0)
4935 ext4_update_inode_fsync_trans(handle, inode, 1);
4936
4937 if (file->f_flags & O_SYNC)
4938 ext4_handle_sync(handle);
4939
4940 ext4_journal_stop(handle);
4941out_mutex:
4942 inode_unlock(inode);
4943 return ret;
4944}
4945
4946/*
4947 * Preallocate space for a file. This implements ext4's fallocate file
4948 * operation, which gets called from the sys_fallocate system call.
4949 * For block-mapped files, posix_fallocate should fall back to the method
4950 * of writing zeroes to the required new blocks (the same behavior that is
4951 * expected of file systems which do not support the fallocate() system call).
4952 */
4953long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4954{
4955 struct inode *inode = file_inode(file);
4956 loff_t new_size = 0;
4957 unsigned int max_blocks;
4958 int ret = 0;
4959 int flags;
4960 ext4_lblk_t lblk;
4961 unsigned int blkbits = inode->i_blkbits;
4962
4963 /*
4964 * Encrypted inodes can't handle collapse range or insert
4965 * range since we would need to re-encrypt blocks with a
4966 * different IV or XTS tweak (which are based on the logical
4967 * block number).
4968 *
4969 * XXX It's not clear why zero range isn't working, but we'll
4970 * leave it disabled for encrypted inodes for now. This is a
4971 * bug we should fix....
4972 */
4973 if (IS_ENCRYPTED(inode) &&
4974 (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
4975 FALLOC_FL_ZERO_RANGE)))
4976 return -EOPNOTSUPP;
4977
4978 /* Return error if mode is not supported */
4979 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4980 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4981 FALLOC_FL_INSERT_RANGE))
4982 return -EOPNOTSUPP;
4983
4984 inode_lock(inode);
4985 ret = ext4_convert_inline_data(inode);
4986 inode_unlock(inode);
4987 if (ret)
4988 return ret;
4989
4990 if (mode & FALLOC_FL_PUNCH_HOLE)
4991 return ext4_punch_hole(inode, offset, len);
4992
4993 if (mode & FALLOC_FL_COLLAPSE_RANGE)
4994 return ext4_collapse_range(inode, offset, len);
4995
4996 if (mode & FALLOC_FL_INSERT_RANGE)
4997 return ext4_insert_range(inode, offset, len);
4998
4999 if (mode & FALLOC_FL_ZERO_RANGE)
5000 return ext4_zero_range(file, offset, len, mode);
5001
5002 trace_ext4_fallocate_enter(inode, offset, len, mode);
5003 lblk = offset >> blkbits;
5004
5005 max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
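	/*
	 * Editorial note (illustrative, not part of the original source):
	 * assuming EXT4_MAX_BLOCKS() counts the blocks touched by the byte
	 * range [offset, offset + len), an fallocate of offset == 1000 and
	 * len == 10000 with 4 KiB blocks gives lblk == 0 and max_blocks == 3
	 * (blocks 0, 1 and 2).
	 */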
5006 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
5007 if (mode & FALLOC_FL_KEEP_SIZE)
5008 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
5009
5010 inode_lock(inode);
5011
5012 /*
5013	 * We only support preallocation for extent-based files
5014 */
5015 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5016 ret = -EOPNOTSUPP;
5017 goto out;
5018 }
5019
5020 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5021 (offset + len > i_size_read(inode) ||
5022 offset + len > EXT4_I(inode)->i_disksize)) {
5023 new_size = offset + len;
5024 ret = inode_newsize_ok(inode, new_size);
5025 if (ret)
5026 goto out;
5027 }
5028
5029	/* Wait for all existing dio workers; newcomers will block on i_mutex */
5030 inode_dio_wait(inode);
5031
5032 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
5033 if (ret)
5034 goto out;
5035
5036 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
5037 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
5038 EXT4_I(inode)->i_sync_tid);
5039 }
5040out:
5041 inode_unlock(inode);
5042 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
5043 return ret;
5044}
5045
5046/*
5047 * This function converts a range of blocks to written extents.
5048 * The caller of this function will pass the start offset and the size;
5049 * all unwritten extents within this range will be converted to
5050 * written extents.
5051 *
5052 * This function is called from the direct IO end_io callback
5053 * function, to convert the fallocated extents after IO is completed.
5054 * Returns 0 on success.
5055 */
5056int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
5057 loff_t offset, ssize_t len)
5058{
5059 unsigned int max_blocks;
5060 int ret = 0;
5061 int ret2 = 0;
5062 struct ext4_map_blocks map;
5063 unsigned int credits, blkbits = inode->i_blkbits;
5064
5065 map.m_lblk = offset >> blkbits;
5066 max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
5067
5068 /*
5069	 * This is somewhat ugly but the idea is clear: when a transaction is
5070	 * reserved, everything goes into it. Otherwise we'd rather start several
5071	 * smaller transactions to convert each extent separately.
5072 */
5073 if (handle) {
5074 handle = ext4_journal_start_reserved(handle,
5075 EXT4_HT_EXT_CONVERT);
5076 if (IS_ERR(handle))
5077 return PTR_ERR(handle);
5078 credits = 0;
5079 } else {
5080 /*
5081 * credits to insert 1 extent into extent tree
5082 */
5083 credits = ext4_chunk_trans_blocks(inode, max_blocks);
5084 }
5085 while (ret >= 0 && ret < max_blocks) {
5086 map.m_lblk += ret;
5087 map.m_len = (max_blocks -= ret);
5088 if (credits) {
5089 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5090 credits);
5091 if (IS_ERR(handle)) {
5092 ret = PTR_ERR(handle);
5093 break;
5094 }
5095 }
5096 ret = ext4_map_blocks(handle, inode, &map,
5097 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5098 if (ret <= 0)
5099 ext4_warning(inode->i_sb,
5100 "inode #%lu: block %u: len %u: "
5101 "ext4_ext_map_blocks returned %d",
5102 inode->i_ino, map.m_lblk,
5103 map.m_len, ret);
5104 ext4_mark_inode_dirty(handle, inode);
5105 if (credits)
5106 ret2 = ext4_journal_stop(handle);
5107 if (ret <= 0 || ret2)
5108 break;
5109 }
5110 if (!credits)
5111 ret2 = ext4_journal_stop(handle);
5112 return ret > 0 ? ret2 : ret;
5113}
5114
5115/*
5116 * If newes is not an existing extent (newes->es_pblk equals zero), find
5117 * the delayed extent at the start of newes, update newes accordingly, and
5118 * return the start of the next delayed extent.
5119 *
5120 * If newes is an existing extent (newes->es_pblk is not equal to zero),
5121 * return the start of the next delayed extent or EXT_MAX_BLOCKS if no
5122 * delayed extent is found. Leave newes unmodified.
5123 */
5124static int ext4_find_delayed_extent(struct inode *inode,
5125 struct extent_status *newes)
5126{
5127 struct extent_status es;
5128 ext4_lblk_t block, next_del;
5129
5130 if (newes->es_pblk == 0) {
5131 ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
5132 newes->es_lblk,
5133 newes->es_lblk + newes->es_len - 1,
5134 &es);
5135
5136 /*
5137		 * No extent in the extent tree contains block @newes->es_pblk, so
5138		 * the block may lie in 1) a hole or 2) a delayed extent.
5139 */
5140 if (es.es_len == 0)
5141 /* A hole found. */
5142 return 0;
5143
5144 if (es.es_lblk > newes->es_lblk) {
5145 /* A hole found. */
5146 newes->es_len = min(es.es_lblk - newes->es_lblk,
5147 newes->es_len);
5148 return 0;
5149 }
5150
5151 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5152 }
5153
5154 block = newes->es_lblk + newes->es_len;
5155 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
5156 EXT_MAX_BLOCKS, &es);
5157 if (es.es_len == 0)
5158 next_del = EXT_MAX_BLOCKS;
5159 else
5160 next_del = es.es_lblk;
5161
5162 return next_del;
5163}
5164
5165static int ext4_xattr_fiemap(struct inode *inode,
5166 struct fiemap_extent_info *fieinfo)
5167{
5168 __u64 physical = 0;
5169 __u64 length;
5170 __u32 flags = FIEMAP_EXTENT_LAST;
5171 int blockbits = inode->i_sb->s_blocksize_bits;
5172 int error = 0;
5173
5174 /* in-inode? */
5175 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5176 struct ext4_iloc iloc;
5177 int offset; /* offset of xattr in inode */
5178
5179 error = ext4_get_inode_loc(inode, &iloc);
5180 if (error)
5181 return error;
5182 physical = (__u64)iloc.bh->b_blocknr << blockbits;
5183 offset = EXT4_GOOD_OLD_INODE_SIZE +
5184 EXT4_I(inode)->i_extra_isize;
5185 physical += offset;
5186 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5187 flags |= FIEMAP_EXTENT_DATA_INLINE;
5188 brelse(iloc.bh);
5189 } else { /* external block */
5190 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5191 length = inode->i_sb->s_blocksize;
5192 }
5193
5194 if (physical)
5195 error = fiemap_fill_next_extent(fieinfo, 0, physical,
5196 length, flags);
5197 return (error < 0 ? error : 0);
5198}
5199
5200static int _ext4_fiemap(struct inode *inode,
5201 struct fiemap_extent_info *fieinfo,
5202 __u64 start, __u64 len,
5203 int (*fill)(struct inode *, ext4_lblk_t,
5204 ext4_lblk_t,
5205 struct fiemap_extent_info *))
5206{
5207 ext4_lblk_t start_blk;
5208 u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
5209
5210 int error = 0;
5211
5212 if (ext4_has_inline_data(inode)) {
5213 int has_inline = 1;
5214
5215 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5216 start, len);
5217
5218 if (has_inline)
5219 return error;
5220 }
5221
5222 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5223 error = ext4_ext_precache(inode);
5224 if (error)
5225 return error;
5226 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5227 }
5228
5229 /* fallback to generic here if not in extents fmt */
5230 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
5231 fill == ext4_fill_fiemap_extents)
5232 return generic_block_fiemap(inode, fieinfo, start, len,
5233 ext4_get_block);
5234
5235 if (fill == ext4_fill_es_cache_info)
5236 ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
5237 if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
5238 return -EBADR;
5239
5240 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5241 error = ext4_xattr_fiemap(inode, fieinfo);
5242 } else {
5243 ext4_lblk_t len_blks;
5244 __u64 last_blk;
5245
5246 start_blk = start >> inode->i_sb->s_blocksize_bits;
5247 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5248 if (last_blk >= EXT_MAX_BLOCKS)
5249 last_blk = EXT_MAX_BLOCKS-1;
5250 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5251
5252 /*
5253 * Walk the extent tree gathering extent information
5254 * and pushing extents back to the user.
5255 */
5256 error = fill(inode, start_blk, len_blks, fieinfo);
5257 }
5258 return error;
5259}
5260
5261int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5262 __u64 start, __u64 len)
5263{
5264 return _ext4_fiemap(inode, fieinfo, start, len,
5265 ext4_fill_fiemap_extents);
5266}
5267
5268int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
5269 __u64 start, __u64 len)
5270{
5271 if (ext4_has_inline_data(inode)) {
5272 int has_inline;
5273
5274 down_read(&EXT4_I(inode)->xattr_sem);
5275 has_inline = ext4_has_inline_data(inode);
5276 up_read(&EXT4_I(inode)->xattr_sem);
5277 if (has_inline)
5278 return 0;
5279 }
5280
5281 return _ext4_fiemap(inode, fieinfo, start, len,
5282 ext4_fill_es_cache_info);
5283}
5284
5285
5286/*
5287 * ext4_access_path:
5288 * Function to access the path buffer for marking it dirty.
5289 * It also checks if there are sufficient credits left in the journal handle
5290 * to update path.
5291 */
5292static int
5293ext4_access_path(handle_t *handle, struct inode *inode,
5294 struct ext4_ext_path *path)
5295{
5296 int credits, err;
5297
5298 if (!ext4_handle_valid(handle))
5299 return 0;
5300
5301 /*
5302	 * Check if we need to extend the journal credits:
5303	 * 3 for the leaf, sb, and inode, plus 2 (block bitmap and group
5304	 * descriptor) for each block group; assume two block
5305	 * groups
5306 */
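	/*
	 * Editorial note (illustrative, not part of the original source):
	 * under the assumption above this works out to 3 + 2 * 2 = 7
	 * credits, which is the threshold checked below.
	 */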
5307 if (handle->h_buffer_credits < 7) {
5308 credits = ext4_writepage_trans_blocks(inode);
5309 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5310 /* EAGAIN is success */
5311 if (err && err != -EAGAIN)
5312 return err;
5313 }
5314
5315 err = ext4_ext_get_access(handle, inode, path);
5316 return err;
5317}
5318
5319/*
5320 * ext4_ext_shift_path_extents:
5321 * Shift the extents of a path structure lying between path[depth].p_ext
5322 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5323 * whether it is a right shift or a left shift operation.
5324 */
5325static int
5326ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5327 struct inode *inode, handle_t *handle,
5328 enum SHIFT_DIRECTION SHIFT)
5329{
5330 int depth, err = 0;
5331 struct ext4_extent *ex_start, *ex_last;
5332 bool update = 0;
5333 depth = path->p_depth;
5334
5335 while (depth >= 0) {
5336 if (depth == path->p_depth) {
5337 ex_start = path[depth].p_ext;
5338 if (!ex_start)
5339 return -EFSCORRUPTED;
5340
5341 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5342
5343 err = ext4_access_path(handle, inode, path + depth);
5344 if (err)
5345 goto out;
5346
5347 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5348 update = 1;
5349
5350 while (ex_start <= ex_last) {
5351 if (SHIFT == SHIFT_LEFT) {
5352 le32_add_cpu(&ex_start->ee_block,
5353 -shift);
5354 /* Try to merge to the left. */
5355 if ((ex_start >
5356 EXT_FIRST_EXTENT(path[depth].p_hdr))
5357 &&
5358 ext4_ext_try_to_merge_right(inode,
5359 path, ex_start - 1))
5360 ex_last--;
5361 else
5362 ex_start++;
5363 } else {
5364 le32_add_cpu(&ex_last->ee_block, shift);
5365 ext4_ext_try_to_merge_right(inode, path,
5366 ex_last);
5367 ex_last--;
5368 }
5369 }
5370 err = ext4_ext_dirty(handle, inode, path + depth);
5371 if (err)
5372 goto out;
5373
5374 if (--depth < 0 || !update)
5375 break;
5376 }
5377
5378 /* Update index too */
5379 err = ext4_access_path(handle, inode, path + depth);
5380 if (err)
5381 goto out;
5382
5383 if (SHIFT == SHIFT_LEFT)
5384 le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5385 else
5386 le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5387 err = ext4_ext_dirty(handle, inode, path + depth);
5388 if (err)
5389 goto out;
5390
5391 /* we are done if current index is not a starting index */
5392 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5393 break;
5394
5395 depth--;
5396 }
5397
5398out:
5399 return err;
5400}
5401
5402/*
5403 * ext4_ext_shift_extents:
5404 * All the extents which lie in the range from @start to the last allocated
5405 * block for the @inode are shifted either towards left or right (depending
5406 * upon @SHIFT) by @shift blocks.
5407 * On success, 0 is returned, error otherwise.
5408 */
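/*
 * Editorial sketch (illustrative, not part of the original source): a left
 * shift is what collapse range uses; e.g. extents starting at logical
 * blocks 20 and 40 shifted left by 10 end up at 10 and 30, closing the
 * hole left by the removed range. A right shift is the mirror operation
 * used by insert range to open up a hole.
 */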
5409static int
5410ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5411 ext4_lblk_t start, ext4_lblk_t shift,
5412 enum SHIFT_DIRECTION SHIFT)
5413{
5414 struct ext4_ext_path *path;
5415 int ret = 0, depth;
5416 struct ext4_extent *extent;
5417 ext4_lblk_t stop, *iterator, ex_start, ex_end;
5418
5419 /* Let path point to the last extent */
5420 path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5421 EXT4_EX_NOCACHE);
5422 if (IS_ERR(path))
5423 return PTR_ERR(path);
5424
5425 depth = path->p_depth;
5426 extent = path[depth].p_ext;
5427 if (!extent)
5428 goto out;
5429
5430 stop = le32_to_cpu(extent->ee_block);
5431
5432 /*
5433 * For left shifts, make sure the hole on the left is big enough to
5434 * accommodate the shift. For right shifts, make sure the last extent
5435 * won't be shifted beyond EXT_MAX_BLOCKS.
5436 */
5437 if (SHIFT == SHIFT_LEFT) {
5438 path = ext4_find_extent(inode, start - 1, &path,
5439 EXT4_EX_NOCACHE);
5440 if (IS_ERR(path))
5441 return PTR_ERR(path);
5442 depth = path->p_depth;
5443 extent = path[depth].p_ext;
5444 if (extent) {
5445 ex_start = le32_to_cpu(extent->ee_block);
5446 ex_end = le32_to_cpu(extent->ee_block) +
5447 ext4_ext_get_actual_len(extent);
5448 } else {
5449 ex_start = 0;
5450 ex_end = 0;
5451 }
5452
5453 if ((start == ex_start && shift > ex_start) ||
5454 (shift > start - ex_end)) {
5455 ret = -EINVAL;
5456 goto out;
5457 }
5458 } else {
5459 if (shift > EXT_MAX_BLOCKS -
5460 (stop + ext4_ext_get_actual_len(extent))) {
5461 ret = -EINVAL;
5462 goto out;
5463 }
5464 }
5465
5466 /*
5467 * In case of left shift, iterator points to start and it is increased
5468 * till we reach stop. In case of right shift, iterator points to stop
5469 * and it is decreased till we reach start.
5470 */
5471 if (SHIFT == SHIFT_LEFT)
5472 iterator = &start;
5473 else
5474 iterator = &stop;
5475
5476 /*
5477	 * It's safe to start updating extents. start and stop are unsigned, so
5478	 * in the case of a right shift, if an extent at logical block 0 is
5479	 * reached, iterator becomes NULL to indicate the end of the loop.
5480 */
5481 while (iterator && start <= stop) {
5482 path = ext4_find_extent(inode, *iterator, &path,
5483 EXT4_EX_NOCACHE);
5484 if (IS_ERR(path))
5485 return PTR_ERR(path);
5486 depth = path->p_depth;
5487 extent = path[depth].p_ext;
5488 if (!extent) {
5489 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5490 (unsigned long) *iterator);
5491 return -EFSCORRUPTED;
5492 }
5493 if (SHIFT == SHIFT_LEFT && *iterator >
5494 le32_to_cpu(extent->ee_block)) {
5495 /* Hole, move to the next extent */
5496 if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5497 path[depth].p_ext++;
5498 } else {
5499 *iterator = ext4_ext_next_allocated_block(path);
5500 continue;
5501 }
5502 }
5503
5504 if (SHIFT == SHIFT_LEFT) {
5505 extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5506 *iterator = le32_to_cpu(extent->ee_block) +
5507 ext4_ext_get_actual_len(extent);
5508 } else {
5509 extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5510 if (le32_to_cpu(extent->ee_block) > 0)
5511 *iterator = le32_to_cpu(extent->ee_block) - 1;
5512 else
5513 /* Beginning is reached, end of the loop */
5514 iterator = NULL;
5515 /* Update path extent in case we need to stop */
5516 while (le32_to_cpu(extent->ee_block) < start)
5517 extent++;
5518 path[depth].p_ext = extent;
5519 }
5520 ret = ext4_ext_shift_path_extents(path, shift, inode,
5521 handle, SHIFT);
5522 if (ret)
5523 break;
5524 }
5525out:
5526 ext4_ext_drop_refs(path);
5527 kfree(path);
5528 return ret;
5529}
5530
5531/*
5532 * ext4_collapse_range:
5533 * This implements fallocate's collapse range functionality for ext4.
5534 * Returns 0 on success, non-zero on error.
5535 */
5536int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5537{
5538 struct super_block *sb = inode->i_sb;
5539 ext4_lblk_t punch_start, punch_stop;
5540 handle_t *handle;
5541 unsigned int credits;
5542 loff_t new_size, ioffset;
5543 int ret;
5544
5545 /*
5546 * We need to test this early because xfstests assumes that a
5547 * collapse range of (0, 1) will return EOPNOTSUPP if the file
5548 * system does not support collapse range.
5549 */
5550 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5551 return -EOPNOTSUPP;
5552
5553 /* Collapse range works only on fs block size aligned offsets. */
5554 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5555 len & (EXT4_CLUSTER_SIZE(sb) - 1))
5556 return -EINVAL;
5557
5558 if (!S_ISREG(inode->i_mode))
5559 return -EINVAL;
5560
5561 trace_ext4_collapse_range(inode, offset, len);
5562
5563 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5564 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
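	/*
	 * Editorial note (illustrative, not part of the original source):
	 * with 4 KiB blocks, collapsing offset == 8192, len == 4096 gives
	 * punch_start == 2 and punch_stop == 3: block 2 is removed and every
	 * extent from block 3 onwards is shifted left by
	 * punch_stop - punch_start == 1 block.
	 */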
5565
5566 /* Call ext4_force_commit to flush all data in case of data=journal. */
5567 if (ext4_should_journal_data(inode)) {
5568 ret = ext4_force_commit(inode->i_sb);
5569 if (ret)
5570 return ret;
5571 }
5572
5573 inode_lock(inode);
5574 /*
5575	 * The collapse range must not reach or extend beyond EOF; that would
5576	 * effectively be a truncate operation, so it is rejected.
5577 */
5578 if (offset + len >= i_size_read(inode)) {
5579 ret = -EINVAL;
5580 goto out_mutex;
5581 }
5582
5583 /* Currently just for extent based files */
5584 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5585 ret = -EOPNOTSUPP;
5586 goto out_mutex;
5587 }
5588
5589 /* Wait for existing dio to complete */
5590 inode_dio_wait(inode);
5591
5592 /*
5593 * Prevent page faults from reinstantiating pages we have released from
5594 * page cache.
5595 */
5596 down_write(&EXT4_I(inode)->i_mmap_sem);
5597
5598 ret = ext4_break_layouts(inode);
5599 if (ret)
5600 goto out_mmap;
5601
5602 /*
5603	 * We need to round offset down to a page size boundary to handle
5604	 * the case where the page size is larger than the block size.
5605 */
5606 ioffset = round_down(offset, PAGE_SIZE);
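	/*
	 * For example (hypothetical sizes): with 1K blocks and 4K pages, a
	 * block-aligned offset of 5120 gives ioffset == 4096, so the partial
	 * page range [4096, 5120) is written back just below before the page
	 * cache from 4096 onwards is truncated.
	 */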
5607 /*
5608	 * Write out the tail of the last page before the removed range, since
5609	 * that page will get removed from the page cache below.
5610 */
5611 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5612 if (ret)
5613 goto out_mmap;
5614 /*
5615	 * Write out the data that will be shifted so that it is preserved when
5616	 * the page cache is discarded below.  i_mmap_sem also protects us from
5617	 * pages becoming dirty in the meantime.
5618 */
5619 ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5620 LLONG_MAX);
5621 if (ret)
5622 goto out_mmap;
5623 truncate_pagecache(inode, ioffset);
5624
5625 credits = ext4_writepage_trans_blocks(inode);
5626 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5627 if (IS_ERR(handle)) {
5628 ret = PTR_ERR(handle);
5629 goto out_mmap;
5630 }
5631
5632 down_write(&EXT4_I(inode)->i_data_sem);
5633 ext4_discard_preallocations(inode);
5634
5635 ret = ext4_es_remove_extent(inode, punch_start,
5636 EXT_MAX_BLOCKS - punch_start);
5637 if (ret) {
5638 up_write(&EXT4_I(inode)->i_data_sem);
5639 goto out_stop;
5640 }
5641
5642 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5643 if (ret) {
5644 up_write(&EXT4_I(inode)->i_data_sem);
5645 goto out_stop;
5646 }
5647 ext4_discard_preallocations(inode);
5648
5649 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5650 punch_stop - punch_start, SHIFT_LEFT);
5651 if (ret) {
5652 up_write(&EXT4_I(inode)->i_data_sem);
5653 goto out_stop;
5654 }
5655
5656 new_size = i_size_read(inode) - len;
5657 i_size_write(inode, new_size);
5658 EXT4_I(inode)->i_disksize = new_size;
5659
5660 up_write(&EXT4_I(inode)->i_data_sem);
5661 if (IS_SYNC(inode))
5662 ext4_handle_sync(handle);
5663 inode->i_mtime = inode->i_ctime = current_time(inode);
5664 ext4_mark_inode_dirty(handle, inode);
5665 ext4_update_inode_fsync_trans(handle, inode, 1);
5666
5667out_stop:
5668 ext4_journal_stop(handle);
5669out_mmap:
5670 up_write(&EXT4_I(inode)->i_mmap_sem);
5671out_mutex:
5672 inode_unlock(inode);
5673 return ret;
5674}
5675
5676/*
5677 * ext4_insert_range:
5678 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5679 * The data blocks starting from @offset up to EOF are shifted to the
5680 * right by @len to create a hole in @inode.  The inode size is increased
5681 * by @len bytes.
5682 * Returns 0 on success, error otherwise.
5683 */
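/*
 * Illustrative user-space example (hypothetical file and sizes): on a
 * file system with 4K blocks, the call
 *
 *	int fd = open("/path/to/file", O_RDWR);
 *
 *	if (fd >= 0)
 *		fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 16384);
 *
 * opens a 16K hole at offset 4K and grows the file by 16K.  Below this
 * corresponds to offset_lblk == 1 and len_lblk == 4, so extents at or
 * after logical block 1 are shifted right by four blocks.
 */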
5684int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5685{
5686 struct super_block *sb = inode->i_sb;
5687 handle_t *handle;
5688 struct ext4_ext_path *path;
5689 struct ext4_extent *extent;
5690 ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5691 unsigned int credits, ee_len;
5692 int ret = 0, depth, split_flag = 0;
5693 loff_t ioffset;
5694
5695 /*
5696 * We need to test this early because xfstests assumes that an
5697 * insert range of (0, 1) will return EOPNOTSUPP if the file
5698 * system does not support insert range.
5699 */
5700 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5701 return -EOPNOTSUPP;
5702
5703	/* Insert range works only on fs cluster size aligned offsets. */
5704 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5705 len & (EXT4_CLUSTER_SIZE(sb) - 1))
5706 return -EINVAL;
5707
5708 if (!S_ISREG(inode->i_mode))
5709 return -EOPNOTSUPP;
5710
5711 trace_ext4_insert_range(inode, offset, len);
5712
5713 offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5714 len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5715
5716 /* Call ext4_force_commit to flush all data in case of data=journal */
5717 if (ext4_should_journal_data(inode)) {
5718 ret = ext4_force_commit(inode->i_sb);
5719 if (ret)
5720 return ret;
5721 }
5722
5723 inode_lock(inode);
5724 /* Currently just for extent based files */
5725 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5726 ret = -EOPNOTSUPP;
5727 goto out_mutex;
5728 }
5729
5730	/* Check that i_size + len neither wraps around nor exceeds s_maxbytes */
5731 if (inode->i_size + len > inode->i_sb->s_maxbytes) {
5732 ret = -EFBIG;
5733 goto out_mutex;
5734 }
5735
5736 /* Offset should be less than i_size */
5737 if (offset >= i_size_read(inode)) {
5738 ret = -EINVAL;
5739 goto out_mutex;
5740 }
5741
5742 /* Wait for existing dio to complete */
5743 inode_dio_wait(inode);
5744
5745 /*
5746 * Prevent page faults from reinstantiating pages we have released from
5747 * page cache.
5748 */
5749 down_write(&EXT4_I(inode)->i_mmap_sem);
5750
5751 ret = ext4_break_layouts(inode);
5752 if (ret)
5753 goto out_mmap;
5754
5755 /*
5756	 * We need to round the start offset down to a page size boundary to
5757	 * handle the case where the page size is larger than the block size.
5758 */
5759 ioffset = round_down(offset, PAGE_SIZE);
5760 /* Write out all dirty pages */
5761 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5762 LLONG_MAX);
5763 if (ret)
5764 goto out_mmap;
5765 truncate_pagecache(inode, ioffset);
5766
5767 credits = ext4_writepage_trans_blocks(inode);
5768 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5769 if (IS_ERR(handle)) {
5770 ret = PTR_ERR(handle);
5771 goto out_mmap;
5772 }
5773
5774	/* Expand file to avoid data loss if there is an error while shifting */
5775 inode->i_size += len;
5776 EXT4_I(inode)->i_disksize += len;
5777 inode->i_mtime = inode->i_ctime = current_time(inode);
5778 ret = ext4_mark_inode_dirty(handle, inode);
5779 if (ret)
5780 goto out_stop;
5781
5782 down_write(&EXT4_I(inode)->i_data_sem);
5783 ext4_discard_preallocations(inode);
5784
5785 path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5786 if (IS_ERR(path)) {
5787 up_write(&EXT4_I(inode)->i_data_sem);
5788 ret = PTR_ERR(path);
5789 goto out_stop;
5790 }
5791
5792 depth = ext_depth(inode);
5793 extent = path[depth].p_ext;
5794 if (extent) {
5795 ee_start_lblk = le32_to_cpu(extent->ee_block);
5796 ee_len = ext4_ext_get_actual_len(extent);
5797
5798 /*
5799		 * If offset_lblk is not the starting block of the extent, split
5800		 * the extent at @offset_lblk.
5801 */
5802 if ((offset_lblk > ee_start_lblk) &&
5803 (offset_lblk < (ee_start_lblk + ee_len))) {
5804 if (ext4_ext_is_unwritten(extent))
5805 split_flag = EXT4_EXT_MARK_UNWRIT1 |
5806 EXT4_EXT_MARK_UNWRIT2;
5807 ret = ext4_split_extent_at(handle, inode, &path,
5808 offset_lblk, split_flag,
5809 EXT4_EX_NOCACHE |
5810 EXT4_GET_BLOCKS_PRE_IO |
5811 EXT4_GET_BLOCKS_METADATA_NOFAIL);
5812 }
5813
5814 ext4_ext_drop_refs(path);
5815 kfree(path);
5816 if (ret < 0) {
5817 up_write(&EXT4_I(inode)->i_data_sem);
5818 goto out_stop;
5819 }
5820 } else {
5821 ext4_ext_drop_refs(path);
5822 kfree(path);
5823 }
5824
5825 ret = ext4_es_remove_extent(inode, offset_lblk,
5826 EXT_MAX_BLOCKS - offset_lblk);
5827 if (ret) {
5828 up_write(&EXT4_I(inode)->i_data_sem);
5829 goto out_stop;
5830 }
5831
5832 /*
5833	 * If offset_lblk lies in a hole at the start of the file, use
5834	 * ee_start_lblk as the starting point for shifting extents.
5835 */
5836 ret = ext4_ext_shift_extents(inode, handle,
5837 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5838 len_lblk, SHIFT_RIGHT);
5839
5840 up_write(&EXT4_I(inode)->i_data_sem);
5841 if (IS_SYNC(inode))
5842 ext4_handle_sync(handle);
5843 if (ret >= 0)
5844 ext4_update_inode_fsync_trans(handle, inode, 1);
5845
5846out_stop:
5847 ext4_journal_stop(handle);
5848out_mmap:
5849 up_write(&EXT4_I(inode)->i_mmap_sem);
5850out_mutex:
5851 inode_unlock(inode);
5852 return ret;
5853}
5854
5855/**
5856 * ext4_swap_extents() - Swap extents between two inodes
5857 * @handle: handle for this transaction
5858 * @inode1: First inode
5859 * @inode2: Second inode
5860 * @lblk1: Start block for first inode
5861 * @lblk2: Start block for second inode
5862 * @count: Number of blocks to swap
5863 * @unwritten: Mark second inode's extents as unwritten after swap
5864 * @erp: Pointer to save error value
5865 *
5866 * This helper routine does exactly what its name promises: it swaps extents.
5867 * Everything else, such as page-cache locking consistency, bh mapping
5868 * consistency or copying the extents' data, must be handled by the caller.
5869 * Locking:
5870 * i_mutex is held for both inodes
5871 * i_data_sem is locked for write for both inodes
5872 * Assumptions:
5873 * All pages from requested range are locked for both inodes
5874 */
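/*
 * Sketch of the expected calling pattern (illustrative only; the real
 * in-tree user is the online defrag code in fs/ext4/move_extent.c, which
 * also takes care of page locking and of starting the journal handle):
 *
 *	int err = 0;
 *	int replaced;
 *
 *	lock_two_nondirectories(inode1, inode2);
 *	ext4_double_down_write_data_sem(inode1, inode2);
 *	replaced = ext4_swap_extents(handle, inode1, inode2,
 *				     lblk1, lblk2, count, 1, &err);
 *	ext4_double_up_write_data_sem(inode1, inode2);
 *	unlock_two_nondirectories(inode1, inode2);
 */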
5875int
5876ext4_swap_extents(handle_t *handle, struct inode *inode1,
5877 struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5878 ext4_lblk_t count, int unwritten, int *erp)
5879{
5880 struct ext4_ext_path *path1 = NULL;
5881 struct ext4_ext_path *path2 = NULL;
5882 int replaced_count = 0;
5883
5884 BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5885 BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5886 BUG_ON(!inode_is_locked(inode1));
5887 BUG_ON(!inode_is_locked(inode2));
5888
5889 *erp = ext4_es_remove_extent(inode1, lblk1, count);
5890 if (unlikely(*erp))
5891 return 0;
5892 *erp = ext4_es_remove_extent(inode2, lblk2, count);
5893 if (unlikely(*erp))
5894 return 0;
5895
5896 while (count) {
5897 struct ext4_extent *ex1, *ex2, tmp_ex;
5898 ext4_lblk_t e1_blk, e2_blk;
5899 int e1_len, e2_len, len;
5900 int split = 0;
5901
5902 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5903 if (IS_ERR(path1)) {
5904 *erp = PTR_ERR(path1);
5905 path1 = NULL;
5906 finish:
5907 count = 0;
5908 goto repeat;
5909 }
5910 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5911 if (IS_ERR(path2)) {
5912 *erp = PTR_ERR(path2);
5913 path2 = NULL;
5914 goto finish;
5915 }
5916 ex1 = path1[path1->p_depth].p_ext;
5917 ex2 = path2[path2->p_depth].p_ext;
5918		/* Do we have something to swap? */
5919 if (unlikely(!ex2 || !ex1))
5920 goto finish;
5921
5922 e1_blk = le32_to_cpu(ex1->ee_block);
5923 e2_blk = le32_to_cpu(ex2->ee_block);
5924 e1_len = ext4_ext_get_actual_len(ex1);
5925 e2_len = ext4_ext_get_actual_len(ex2);
5926
5927 /* Hole handling */
5928 if (!in_range(lblk1, e1_blk, e1_len) ||
5929 !in_range(lblk2, e2_blk, e2_len)) {
5930 ext4_lblk_t next1, next2;
5931
5932 /* if hole after extent, then go to next extent */
5933 next1 = ext4_ext_next_allocated_block(path1);
5934 next2 = ext4_ext_next_allocated_block(path2);
5935 /* If hole before extent, then shift to that extent */
5936 if (e1_blk > lblk1)
5937 next1 = e1_blk;
5938 if (e2_blk > lblk2)
5939 next2 = e2_blk;
5940 /* Do we have something to swap */
5941 if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5942 goto finish;
5943			/* Move to the rightmost boundary */
5944 len = next1 - lblk1;
5945 if (len < next2 - lblk2)
5946 len = next2 - lblk2;
5947 if (len > count)
5948 len = count;
5949 lblk1 += len;
5950 lblk2 += len;
5951 count -= len;
5952 goto repeat;
5953 }
5954
5955 /* Prepare left boundary */
5956 if (e1_blk < lblk1) {
5957 split = 1;
5958 *erp = ext4_force_split_extent_at(handle, inode1,
5959 &path1, lblk1, 0);
5960 if (unlikely(*erp))
5961 goto finish;
5962 }
5963 if (e2_blk < lblk2) {
5964 split = 1;
5965 *erp = ext4_force_split_extent_at(handle, inode2,
5966 &path2, lblk2, 0);
5967 if (unlikely(*erp))
5968 goto finish;
5969 }
5970		/* ext4_split_extent_at() may result in a leaf extent split,
5971		 * so the path must be revalidated. */
5972 if (split)
5973 goto repeat;
5974
5975 /* Prepare right boundary */
5976 len = count;
5977 if (len > e1_blk + e1_len - lblk1)
5978 len = e1_blk + e1_len - lblk1;
5979 if (len > e2_blk + e2_len - lblk2)
5980 len = e2_blk + e2_len - lblk2;
5981
5982 if (len != e1_len) {
5983 split = 1;
5984 *erp = ext4_force_split_extent_at(handle, inode1,
5985 &path1, lblk1 + len, 0);
5986 if (unlikely(*erp))
5987 goto finish;
5988 }
5989 if (len != e2_len) {
5990 split = 1;
5991 *erp = ext4_force_split_extent_at(handle, inode2,
5992 &path2, lblk2 + len, 0);
5993 if (*erp)
5994 goto finish;
5995 }
5996		/* ext4_split_extent_at() may result in a leaf extent split,
5997		 * so the path must be revalidated. */
5998 if (split)
5999 goto repeat;
6000
6001 BUG_ON(e2_len != e1_len);
6002 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
6003 if (unlikely(*erp))
6004 goto finish;
6005 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
6006 if (unlikely(*erp))
6007 goto finish;
6008
6009		/* Both extents are fully inside the boundaries. Swap them now. */
6010 tmp_ex = *ex1;
6011 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
6012 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
6013 ex1->ee_len = cpu_to_le16(e2_len);
6014 ex2->ee_len = cpu_to_le16(e1_len);
6015 if (unwritten)
6016 ext4_ext_mark_unwritten(ex2);
6017 if (ext4_ext_is_unwritten(&tmp_ex))
6018 ext4_ext_mark_unwritten(ex1);
6019
6020 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
6021 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
6022 *erp = ext4_ext_dirty(handle, inode2, path2 +
6023 path2->p_depth);
6024 if (unlikely(*erp))
6025 goto finish;
6026 *erp = ext4_ext_dirty(handle, inode1, path1 +
6027 path1->p_depth);
6028 /*
6029		 * Looks scary, doesn't it?  The second inode already points to the
6030		 * new blocks and was successfully dirtied.  Luckily, an error here
6031		 * can only be caused by a journal error, so the full transaction
6032		 * will be aborted anyway.
6033 */
6034 if (unlikely(*erp))
6035 goto finish;
6036 lblk1 += len;
6037 lblk2 += len;
6038 replaced_count += len;
6039 count -= len;
6040
6041 repeat:
6042 ext4_ext_drop_refs(path1);
6043 kfree(path1);
6044 ext4_ext_drop_refs(path2);
6045 kfree(path2);
6046 path1 = path2 = NULL;
6047 }
6048 return replaced_count;
6049}
6050
6051/*
6052 * ext4_clu_mapped - determine whether any block in a logical cluster has
6053 * been mapped to a physical cluster
6054 *
6055 * @inode - file containing the logical cluster
6056 * @lclu - logical cluster of interest
6057 *
6058 * Returns 1 if any block in the logical cluster is mapped, signifying
6059 * that a physical cluster has been allocated for it. Otherwise,
6060 * returns 0. Can also return negative error codes. Derived from
6061 * ext4_ext_map_blocks().
6062 */
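/*
 * Worked example (hypothetical bigalloc geometry, 16 blocks per cluster):
 * for lclu == 5, EXT4_C2B(sbi, 5) == 80, so the lookup below starts at
 * logical block 80 and the function returns 1 if any extent covers a
 * block in the cluster's range [80, 95].
 */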
6063int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
6064{
6065 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
6066 struct ext4_ext_path *path;
6067 int depth, mapped = 0, err = 0;
6068 struct ext4_extent *extent;
6069 ext4_lblk_t first_lblk, first_lclu, last_lclu;
6070
6071 /*
6072 * if data can be stored inline, the logical cluster isn't
6073 * mapped - no physical clusters have been allocated, and the
6074 * file has no extents
6075 */
6076 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
6077 ext4_has_inline_data(inode))
6078 return 0;
6079
6080 /* search for the extent closest to the first block in the cluster */
6081 path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
6082 if (IS_ERR(path)) {
6083 err = PTR_ERR(path);
6084 path = NULL;
6085 goto out;
6086 }
6087
6088 depth = ext_depth(inode);
6089
6090 /*
6091 * A consistent leaf must not be empty. This situation is possible,
6092 * though, _during_ tree modification, and it's why an assert can't
6093 * be put in ext4_find_extent().
6094 */
6095 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
6096 EXT4_ERROR_INODE(inode,
6097 "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
6098 (unsigned long) EXT4_C2B(sbi, lclu),
6099 depth, path[depth].p_block);
6100 err = -EFSCORRUPTED;
6101 goto out;
6102 }
6103
6104 extent = path[depth].p_ext;
6105
6106 /* can't be mapped if the extent tree is empty */
6107 if (extent == NULL)
6108 goto out;
6109
6110 first_lblk = le32_to_cpu(extent->ee_block);
6111 first_lclu = EXT4_B2C(sbi, first_lblk);
6112
6113 /*
6114 * Three possible outcomes at this point - found extent spanning
6115 * the target cluster, to the left of the target cluster, or to the
6116 * right of the target cluster. The first two cases are handled here.
6117 * The last case indicates the target cluster is not mapped.
6118 */
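	/*
	 * Continuing the hypothetical 16-blocks-per-cluster example: an
	 * extent covering blocks 70..90 gives first_lclu == 4 and
	 * last_lclu == 5, so a target lclu of 5 is reported as mapped,
	 * while lclu 6 lies to the right and is instead compared against
	 * the cluster of the next allocated block.
	 */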
6119 if (lclu >= first_lclu) {
6120 last_lclu = EXT4_B2C(sbi, first_lblk +
6121 ext4_ext_get_actual_len(extent) - 1);
6122 if (lclu <= last_lclu) {
6123 mapped = 1;
6124 } else {
6125 first_lblk = ext4_ext_next_allocated_block(path);
6126 first_lclu = EXT4_B2C(sbi, first_lblk);
6127 if (lclu == first_lclu)
6128 mapped = 1;
6129 }
6130 }
6131
6132out:
6133 ext4_ext_drop_refs(path);
6134 kfree(path);
6135
6136 return err ? err : mapped;
6137}