Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c  1148
1 file changed, 426 insertions, 722 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..e60aca791d3f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,8 +48,6 @@ #include <trace/events/ext4.h> -#define MPAGE_DA_EXTENT_TAIL 0x01 - static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -164,39 +162,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. - */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - -/* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; - int extra_credits = 3; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -252,8 +229,12 @@ void ext4_evict_inode(struct inode *inode) if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3. + */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+extra_credits); + ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -288,6 +269,7 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { + ext4_set_errno(inode->i_sb, -err); ext4_error(inode->i_sb, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); @@ -419,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (IS_ENCRYPTED(inode)) + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -527,7 +509,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -695,7 +677,7 @@ found: * extent status tree. 
*/ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } @@ -827,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, #define DIO_MAX_BLOCKS 4096 /* - * Get blocks function for the cases that need to start a transaction - - * generally difference cases of direct IO and DAX IO. It also handles retries - * in case of ENOSPC. - */ -static int ext4_get_block_trans(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) -{ - int dio_credits; - handle_t *handle; - int retries = 0; - int ret; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) - bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; - dio_credits = ext4_chunk_trans_blocks(inode, - bh_result->b_size >> inode->i_blkbits); -retry: - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = _ext4_get_block(inode, iblock, bh_result, flags); - ext4_journal_stop(handle); - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; -} - -/* Get block function for DIO reads and writes to inodes without extents */ -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - if (!create) - return _ext4_get_block(inode, iblock, bh, 0); - return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); -} - -/* - * Get block function for AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete. - */ -static int ext4_dio_get_block_unwritten_async(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * When doing DIO using unwritten extents, we need io_end to convert - * unwritten extents to written on IO completion. We allocate io_end - * once we spot unwritten extent and store it in b_private. Generic - * DIO code keeps b_private set and furthermore passes the value to - * our completion callback in 'private' argument. - */ - if (!ret && buffer_unwritten(bh_result)) { - if (!bh_result->b_private) { - ext4_io_end_t *io_end; - - io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!io_end) - return -ENOMEM; - bh_result->b_private = io_end; - ext4_set_io_unwritten_flag(inode, io_end); - } - set_buffer_defer_completion(bh_result); - } - - return ret; -} - -/* - * Get block function for non-AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete by ext4_direct_IO_write(). - */ -static int ext4_dio_get_block_unwritten_sync(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * Mark inode as having pending DIO writes to unwritten extents. 
- * ext4_direct_IO_write() checks this flag and converts extents to - * written. - */ - if (!ret && buffer_unwritten(bh_result)) - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - - return ret; -} - -static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); - - return ret; -} - - -/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -1024,7 +876,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; - if (!bh || buffer_uptodate(bh)) + if (!bh || ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); @@ -1051,7 +903,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ - if (bhs[i] && !buffer_uptodate(bhs[i])) + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bhs[i]); @@ -1340,6 +1192,9 @@ retry_journal: } if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + unlock_page(page); /* * __block_write_begin may have instantiated a few blocks @@ -1349,11 +1204,11 @@ retry_journal: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); - if (pos + len > inode->i_size) { + if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might @@ -1406,6 +1261,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (inline_data) { @@ -1423,12 +1279,16 @@ static int ext4_write_end(struct file *file, /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. */ - i_size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily @@ -1439,7 +1299,7 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. 
So truncate them @@ -1450,7 +1310,7 @@ errout: if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1511,6 +1371,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1540,13 +1401,14 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed || inline_data) { @@ -1555,7 +1417,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1566,7 +1428,7 @@ errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1646,49 +1508,6 @@ void ext4_da_release_space(struct inode *inode, int to_free) dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } -static void ext4_da_page_release_reservation(struct page *page, - unsigned int offset, - unsigned int length) -{ - int contiguous_blks = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - unsigned int stop = offset + length; - ext4_fsblk_t lblk; - - BUG_ON(stop > PAGE_SIZE || stop < length); - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if (next_off > stop) - break; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - contiguous_blks++; - clear_buffer_delay(bh); - } else if (contiguous_blks) { - lblk = page->index << - (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - contiguous_blks = 0; - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - if (contiguous_blks) { - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - } - -} - /* * Delayed allocation stuff */ @@ -1868,7 +1687,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); @@ -2162,7 +1981,8 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + 
!ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2246,7 +2066,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2345,6 +2166,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; + if (ext4_verity_in_progress(inode)) + blocks = EXT_MAX_BLOCKS; + do { BUG_ON(buffer_locked(bh)); @@ -2369,6 +2193,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. 
+ */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + io_end_vec->size += io_end_size; + io_end_size = 0; + + err = mpage_process_page_bufs(mpd, head, bh, lblk); + if (err > 0) + err = 0; + if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) { + err = PTR_ERR(io_end_vec); + goto out; + } + io_end_vec->offset = mpd->map.m_lblk << blkbits; + } + *map_bh = true; + goto out; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + io_end_size += (1 << blkbits); + } while (lblk++, (bh = bh->b_this_page) != head); + + io_end_vec->size += io_end_size; + io_end_size = 0; + *map_bh = false; +out: + *m_lblk = lblk; + *m_pblk = pblock; + return err; +} + +/* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * @@ -2387,12 +2284,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; - struct buffer_head *head, *bh; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; - sector_t pblock; + ext4_fsblk_t pblock; int err; + bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; @@ -2408,50 +2305,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bh = head = page_buffers(page); - do { - if (lblk < mpd->map.m_lblk) - continue; - if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { - /* - * Buffer after end of mapped extent. - * Find next buffer in the page to map. - */ - mpd->map.m_len = 0; - mpd->map.m_flags = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ - err = mpage_process_page_bufs(mpd, head, - bh, lblk); - pagevec_release(&pvec); - if (err > 0) - err = 0; - return err; - } - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock++; - } - clear_buffer_unwritten(bh); - } while (lblk++, (bh = bh->b_this_page) != head); - + err = mpage_process_page(mpd, page, &lblk, &pblock, + &map_bh); /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. + * If map_bh is true, means page may require further bh + * mapping, or maybe the page was submitted for IO. + * So we return to call further extent mapping. */ - mpd->io_submit.io_end->size += PAGE_SIZE; + if (err < 0 || map_bh == true) + goto out; /* Page fully mapped - let IO run! 
*/ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2459,6 +2325,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) @@ -2538,9 +2407,13 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec; - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) + return PTR_ERR(io_end_vec); + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -2604,10 +2477,12 @@ update_disksize: EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); - if (err2) + if (err2) { + ext4_set_errno(inode->i_sb, -err2); ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", inode->i_ino); + } if (!err) err = err2; } @@ -2785,15 +2660,6 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; } - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in - * the page and we may dirty the inode. - */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); - } - /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so @@ -2812,6 +2678,15 @@ static int ext4_writepages(struct address_space *mapping, ext4_journal_stop(handle); } + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -2992,7 +2867,7 @@ static int ext4_dax_writepages(struct address_space *mapping, percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); percpu_up_read(&sbi->s_journal_flag_rwsem); @@ -3061,8 +2936,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || - S_ISLNK(inode->i_mode)) { + if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || + ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3227,24 +3102,6 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset, length); - -out: - ext4_invalidatepage(page, offset, length); - - return; -} - /* * Force all delayed allocation blocks to be allocated for a given inode. 
*/ @@ -3452,471 +3309,256 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } -static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap) +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int blkbits = inode->i_blkbits; - unsigned long first_block, last_block; - struct ext4_map_blocks map; - bool delalloc = false; - int ret; - - if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) - return -EINVAL; - first_block = offset >> blkbits; - last_block = min_t(loff_t, (offset + length - 1) >> blkbits, - EXT4_MAX_LOGICAL_BLOCK); - - if (flags & IOMAP_REPORT) { - if (ext4_has_inline_data(inode)) { - ret = ext4_inline_data_iomap(inode, iomap); - if (ret != -EAGAIN) { - if (ret == 0 && offset >= iomap->length) - ret = -ENOENT; - return ret; - } - } - } else { - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; - } - - map.m_lblk = first_block; - map.m_len = last_block - first_block + 1; - - if (flags & IOMAP_REPORT) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - - if (ret == 0) { - ext4_lblk_t end = map.m_lblk + map.m_len - 1; - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map.m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) { - /* entire range is a hole */ - } else if (es.es_lblk > map.m_lblk) { - /* range starts with a hole */ - map.m_len = es.es_lblk - map.m_lblk; - } else { - ext4_lblk_t offs = 0; - - if (es.es_lblk < map.m_lblk) - offs = map.m_lblk - es.es_lblk; - map.m_lblk = es.es_lblk + offs; - map.m_len = es.es_len - offs; - delalloc = true; - } - } - } else if (flags & IOMAP_WRITE) { - int dio_credits; - handle_t *handle; - int retries = 0; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); -retry: - /* - * Either we allocate blocks and then we don't get unwritten - * extent so we have reserved enough credits, or the blocks - * are already allocated and unwritten and in that case - * extent conversion fits in the credits as well. - */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) { - ext4_journal_stop(handle); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; - } - - /* - * If we added blocks beyond i_size, we need to make sure they - * will get truncated if we crash before updating i_size in - * ext4_iomap_end(). For faults we don't need to do that (and - * even cannot because for orphan list operations inode_lock is - * required) - if we happen to instantiate block beyond i_size, - * it is because we race with truncate which has already added - * the inode to the orphan list. 
- */ - if (!(flags & IOMAP_FAULT) && first_block + map.m_len > - (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { - int err; - - err = ext4_orphan_add(handle, inode); - if (err < 0) { - ext4_journal_stop(handle); - return err; - } - } - ext4_journal_stop(handle); - } else { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - } + u8 blkbits = inode->i_blkbits; + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purpose of O_DSYNC, even if + * there is no other metadata changes being made or are pending. + */ iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode)) + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; + + if (map->m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = sbi->s_daxdev; - iomap->offset = (u64)first_block << blkbits; - iomap->length = (u64)map.m_len << blkbits; + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + iomap->offset = (u64) map->m_lblk << blkbits; + iomap->length = (u64) map->m_len << blkbits; - if (ret == 0) { - iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; + /* + * Flags passed to ext4_map_blocks() for direct I/O writes can result + * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits + * set. In order for any allocated unwritten extents to be converted + * into written extents correctly within the ->end_io() handler, we + * need to ensure that the iomap->type is set appropriately. Hence, the + * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has + * been set first. + */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { - iomap->type = IOMAP_UNWRITTEN; - } else { - WARN_ON_ONCE(1); - return -EIO; - } - iomap->addr = (u64)map.m_pblk << blkbits; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } - - if (map.m_flags & EXT4_MAP_NEW) - iomap->flags |= IOMAP_F_NEW; - - return 0; } -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) { - int ret = 0; handle_t *handle; - int blkbits = inode->i_blkbits; - bool truncate = false; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; - if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) - return 0; + /* + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. + */ + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto orphan_del; - } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); +retry: /* - * We may need to truncate allocated but not written blocks beyond EOF. + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. 
Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. */ - if (iomap->offset + iomap->length > - ALIGN(inode->i_size, 1 << blkbits)) { - ext4_lblk_t written_blk, end_blk; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); - written_blk = (offset + written) >> blkbits; - end_blk = (offset + length) >> blkbits; - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - } /* - * Remove inode from orphan list if we were extending a inode and - * everything went fine. + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. */ - if (!truncate && inode->i_nlink && - !list_empty(&EXT4_I(inode)->i_orphan)) - ext4_orphan_del(handle, inode); + WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); + if (IS_DAX(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; + + ret = ext4_map_blocks(handle, inode, map, m_flags); + + /* + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. + */ + if (!m_flags && !ret) + ret = -ENOTBLK; + ext4_journal_stop(handle); - if (truncate) { - ext4_truncate_failed_write(inode); -orphan_del: - /* - * If truncate failed early the inode might still be on the - * orphan list; we need to make sure the inode is removed from - * the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return ret; } -const struct iomap_ops ext4_iomap_ops = { - .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, -}; -static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { - ext4_io_end_t *io_end = private; + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; - /* if not async direct IO just return */ - if (!io_end) - return 0; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - io_end, io_end->inode->i_ino, iocb, offset, size); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; /* - * Error during AIO DIO. We cannot convert unwritten extents as the - * data was not written. Just clear the unwritten flag and drop io_end. + * Calculate the first and last logical blocks respectively. 
*/ - if (size <= 0) { - ext4_clear_io_unwritten_flag(io_end); - size = 0; - } - io_end->offset = offset; - io_end->size = size; - ext4_put_io_end(io_end); + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + + if (flags & IOMAP_WRITE) + ret = ext4_iomap_alloc(inode, &map, flags); + else + ret = ext4_map_blocks(NULL, inode, &map, 0); + + if (ret < 0) + return ret; + + ext4_set_iomap(inode, iomap, &map, offset, length); return 0; } -/* - * Handling of direct IO writes. - * - * For ext4 extent files, ext4 will do direct-io write even to holes, - * preallocated extents, and those write extend the file, no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks, mark them as unwritten - * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as unwritten. - * - * The unwritten extents will be converted to written when DIO is completed. - * For async direct IO, since the IO may still pending when return, we - * set up an end_io call back function, which will do the conversion - * when async direct IO completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - */ -static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - ssize_t ret; - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(iter); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - int orphan = 0; - handle_t *handle; - - if (final_size > inode->i_size || final_size > ei->i_disksize) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ext4_update_i_disksize(inode, inode->i_size); - ext4_journal_stop(handle); - } - - BUG_ON(iocb->private == NULL); + int ret; /* - * Make all waiters for direct IO properly wait also for extent - * conversion. This also disallows race between truncate() and - * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. */ - inode_dio_begin(inode); - - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); - - if (overwrite) - inode_unlock(inode); + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + return ret; +} +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ /* - * For extent mapped files we could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as unwritten to prevent - * parallel buffered read to expose the stale data before DIO complete - * the data IO. 
- * - * As to previously fallocated extents, ext4 get_block will just simply - * mark the buffer mapped but still keep the extents unwritten. - * - * For non AIO case, we will convert those unwritten extents to written - * after return back from blockdev_direct_IO. That way we save us from - * allocating io_end structure and also the overhead of offloading - * the extent convertion to a workqueue. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. + * Check to see whether an error occurred while writing out the data to + * the allocated blocks. If so, return the magic error code so that we + * fallback to buffered I/O and attempt to complete the remainder of + * the I/O. Any blocks that may have been allocated in preparation for + * the direct I/O will be reused during buffered I/O. */ - iocb->private = NULL; - if (overwrite) - get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, i_blocksize(inode)) >= inode->i_size) { - get_block_func = ext4_dio_get_block; - dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else if (is_sync_kiocb(iocb)) { - get_block_func = ext4_dio_get_block_unwritten_sync; - dio_flags = DIO_LOCKING; - } else { - get_block_func = ext4_dio_get_block_unwritten_async; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block_func, ext4_end_io_dio, NULL, - dio_flags); + if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + return -ENOTBLK; - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(NULL, inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + return 0; +} - inode_dio_end(inode); - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) - inode_lock(inode); +const struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; - if (ret < 0 && final_size > inode->i_size) - ext4_truncate_failed_write(inode); +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, + .iomap_end = ext4_iomap_end, +}; - /* Handle extending of i_size after direct IO write */ - if (orphan) { - int err; +static bool ext4_iomap_is_delalloc(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* - * We wrote the data but cannot extend - * i_size. Bail out. In async io case, we do - * not return error here because we have - * already submmitted the corresponding - * bio. Returning error here makes the caller - * think that this IO is done and failed - * resulting in race with bio's completion - * handler. 
- */ - if (!ret) - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map->m_lblk, end, &es); - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size || end > ei->i_disksize) { - ext4_update_i_disksize(inode, end); - if (end > inode->i_size) - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; + if (!es.es_len || es.es_lblk > end) + return false; + + if (es.es_lblk > map->m_lblk) { + map->m_len = es.es_lblk - map->m_lblk; + return false; } -out: - return ret; -} -static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; + offset = map->m_lblk - es.es_lblk; + map->m_len = es.es_len - offset; - /* - * Shared inode_lock is enough for us - it protects against concurrent - * writes & truncates and since we take care of writing back page cache, - * we are protected against page writeback as well. - */ - inode_lock_shared(inode); - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, NULL, NULL, 0); -out_unlock: - inode_unlock_shared(inode); - return ret; + return true; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; + int ret; + bool delalloc = false; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; -#ifdef CONFIG_FS_ENCRYPTION - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return 0; -#endif + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } /* - * If we are doing data journalling we don't support O_DIRECT + * Calculate the first and last logical block respectively. */ - if (ext4_should_journal_data(inode)) - return 0; + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - /* Let buffer I/O handle the inline data case. 
*/ - if (ext4_has_inline_data(inode)) - return 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) + delalloc = ext4_iomap_is_delalloc(inode, &map); - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == READ) - ret = ext4_direct_IO_read(iocb, iter); - else - ret = ext4_direct_IO_write(iocb, iter); - trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; + ext4_set_iomap(inode, iomap, &map, offset, length); + if (delalloc && iomap->type == IOMAP_HOLE) + iomap->type = IOMAP_DELALLOC; + + return 0; } +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do @@ -3954,7 +3596,7 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -3971,7 +3613,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3985,9 +3627,9 @@ static const struct address_space_operations ext4_da_aops = { .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, + .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -4081,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks( - page, blocksize, bh_offset(bh))); + err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } } } if (ext4_should_journal_data(inode)) { @@ -4292,11 +3938,17 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) unsigned int credits; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length, 0); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + if (ext4_has_inline_data(inode)) { + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_convert_inline_data(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + return ret; + } + /* * Write out all dirty pages to avoid race conditions * Then release them. 
@@ -4611,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; + if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) + goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4709,6 +4363,8 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + simulate_eio: + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); brelse(bh); @@ -4739,6 +4395,8 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; return true; } @@ -4763,9 +4421,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| - S_ENCRYPTED|S_CASEFOLD); + S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -4918,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + if (!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; @@ -4982,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + /* + * If dir_index is not enabled but there's dir with INDEX flag set, + * we'd normally treat htree data as empty space. But with metadata + * checksumming that corrupts checksums so forbid that. + */ + if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + ext4_error_inode(inode, function, line, 0, + "iget: Dir with htree data on filesystem without dir_index feature."); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -5119,6 +4793,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + ext4_error_inode(inode, function, line, 0, + "casefold flag without casefold feature"); brelse(iloc.bh); unlock_new_inode(inode); @@ -5454,6 +5131,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, "IO error syncing inode"); err = -EIO; @@ -5478,11 +5156,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_SIZE == - * blocksize case + * If the page is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidatepage() may + * strip all buffers from the page but keep the page dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * buffers). 
Also we don't need to wait for any commit if all buffers in + * the page remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. */ - if (offset > PAGE_SIZE - i_blocksize(inode)) + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5555,6 +5237,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) @@ -5724,7 +5410,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat, struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; @@ -5741,12 +5428,15 @@ int ext4_getattr(const struct path *path, struct kstat *stat, stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + if (flags & EXT4_VERITY_FL) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | - STATX_ATTR_NODUMP); + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); generic_fillattr(inode, stat); return 0; @@ -5936,8 +5626,23 @@ static int __ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); int error; + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5989,9 +5694,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if (ext4_handle_valid(handle) && - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) @@ -6031,7 +5735,7 @@ int ext4_expand_extra_isize(struct inode *inode, error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); - goto out_stop; + goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, @@ -6041,8 +5745,8 @@ int ext4_expand_extra_isize(struct inode *inode, if (!error) error = rc; +out_unlock: ext4_write_unlock_xattr(inode, &no_expand); -out_stop: ext4_journal_stop(handle); return error; } |