diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2005-12-15 14:31:24 -0800 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2006-01-03 11:45:47 -0800 |
commit | ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch) | |
tree | c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae /fs/ocfs2/alloc.c | |
parent | 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff) | |
download | blackbird-op-linux-ccd979bdbce9fba8412beb3f1de68a9d0171b12c.tar.gz blackbird-op-linux-ccd979bdbce9fba8412beb3f1de68a9d0171b12c.zip |
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Diffstat (limited to 'fs/ocfs2/alloc.c')
-rw-r--r-- | fs/ocfs2/alloc.c | 2040 |
1 files changed, 2040 insertions, 0 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 000000000000..465f797451ee --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,2040 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno); + +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]); + +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac); + +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh); + +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters); + +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh); + +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + unsigned int new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb); + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + return blkno == (le64_to_cpu(ext->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(ext->e_clusters))); +} + +/* + * How many free extents have we got before we need more meta data? + */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe) +{ + int retval; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + retval = -EIO; + goto bail; + } + + if (fe->i_last_eb_blk) { + retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &eb_bh, OCFS2_BH_CACHED, inode); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + if (eb_bh) + brelse(eb_bh); + + mlog_exit(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_extent_block *eb; + + mlog_entry_void(); + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(osb, + handle, + meta_ac, + wanted - count, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + + status = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + /* we always use slot zero's suballocator */ + eb->h_suballoc_slot = 0; +#else + eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); +#endif + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + status = ocfs2_journal_dirty(handle, bhs[i]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + if (bhs[i]) + brelse(bhs[i]); + bhs[i] = NULL; + } + } + mlog_exit(status); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the dinode + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with e_clusters == 0. + */ +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + BUG_ON(!last_eb_bh); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + /* we never add a branch to a leaf. */ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. */ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + eb_el = &eb->h_list; + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + eb_el->l_recs[0].e_cpos = fe->i_clusters; + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the fe, or the extent block passed in. */ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) + mlog_errno(status); + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (eb_bh) { + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + if (new_eb_bhs[i]) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. + */ +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *eb_el; + + mlog_entry_void(); + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + + eb_el = &eb->h_list; + fe = (struct ocfs2_dinode *) fe_bh->b_data; + fe_el = &fe->id2.i_list; + + status = ocfs2_journal_access(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the fe data into the new extent block */ + eb_el->l_tree_depth = fe_el->l_tree_depth; + eb_el->l_next_free_rec = fe_el->l_next_free_rec; + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; + eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; + eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; + } + + status = ocfs2_journal_dirty(handle, new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* update fe now */ + le16_add_cpu(&fe_el->l_tree_depth, 1); + fe_el->l_recs[0].e_cpos = 0; + fe_el->l_recs[0].e_blkno = eb->h_blkno; + fe_el->l_recs[0].e_clusters = fe->i_clusters; + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + fe_el->l_recs[i].e_cpos = 0; + fe_el->l_recs[i].e_clusters = 0; + fe_el->l_recs[i].e_blkno = 0; + } + fe_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (fe_el->l_tree_depth == cpu_to_le16(1)) + fe->i_last_eb_blk = eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + if (new_eb_bh) + brelse(new_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Expects the tree to already have room in the rightmost leaf for the + * extent. Updates all the extent blocks (and the dinode) on the way + * down. + */ +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters) +{ + int status, i, num_bhs = 0; + u64 next_blkno; + u16 next_free; + struct buffer_head **eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + if (el->l_tree_depth) { + /* This is another operation where we want to be + * careful about our tree updates. An error here means + * none of the previous changes we made should roll + * forward. As a result, we have to record the buffers + * for this part of the tree in an array and reserve a + * journal write to them before making any changes. */ + num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); + eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + i = 0; + while(el->l_tree_depth) { + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); + + BUG_ON(i >= num_bhs); + status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, inode, eb_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + el = &eb->h_list; + i++; + /* When we leave this loop, eb_bhs[num_bhs - 1] will + * hold the bottom-most leaf extent block. */ + } + BUG_ON(el->l_tree_depth); + + el = &fe->id2.i_list; + /* If we have tree depth, then the fe update is + * trivial, and we want to switch el out for the + * bottom-most leaf in order to update it with the + * actual extent data below. */ + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + /* (num_bhs - 1) to avoid the leaf */ + for(i = 0; i < (num_bhs - 1); i++) { + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + el = &eb->h_list; + + /* finally, make our actual change to the + * intermediate extent blocks. */ + next_free = le16_to_cpu(el->l_next_free_rec); + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + + status = ocfs2_journal_dirty(handle, eb_bhs[i]); + if (status < 0) + mlog_errno(status); + } + BUG_ON(i != (num_bhs - 1)); + /* note that the leaf block wasn't touched in + * the loop above */ + eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; + el = &eb->h_list; + BUG_ON(el->l_tree_depth); + } + + /* yay, we can finally add the actual extent now! */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) && + ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { + le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); + } else if (le16_to_cpu(el->l_next_free_rec) && + (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { + /* having an empty extent at eof is legal. */ + if (el->l_recs[i].e_cpos != fe->i_clusters) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" trailing extent is bad: " + "cpos (%u) != number of clusters (%u)", + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(fe->i_clusters)); + status = -EIO; + goto bail; + } + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + } else { + /* No contiguous record, or no empty record at eof, so + * we add a new one. */ + + BUG_ON(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count)); + i = le16_to_cpu(el->l_next_free_rec); + + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + el->l_recs[i].e_cpos = fe->i_clusters; + le16_add_cpu(&el->l_next_free_rec, 1); + } + + /* + * extent_map errors are not fatal, so they are ignored outside + * of flushing the thing. + */ + status = ocfs2_extent_map_append(inode, &el->l_recs[i], + new_clusters); + if (status) { + mlog_errno(status); + ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); + } + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (fe->id2.i_list.l_tree_depth) { + status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (eb_bhs) { + for (i = 0; i < num_bhs; i++) + if (eb_bhs[i]) + brelse(eb_bhs[i]); + kfree(eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the dinode has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the dinode is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. + */ +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + mlog_entry_void(); + + *target_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " + "extent list (next_free_rec == 0)", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " + "list where extent # %d has no physical " + "block start", + OCFS2_I(inode)->ip_blkno, i); + status = -EIO; + goto bail; + } + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + if (lowest_bh) + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + if (!lowest_bh + && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* the caller needs to update fe->i_clusters */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, i, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + mlog(0, "add %u clusters starting at block %"MLFu64" to " + "inode %"MLFu64"\n", + new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + if (el->l_tree_depth) { + /* jump to end of tree */ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_exit(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } + + /* Can we allocate without adding/shifting tree bits? */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) == 0 + || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) + || le32_to_cpu(el->l_recs[i].e_clusters) == 0 + || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) + goto out_add; + + mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " + "tree now.\n"); + + shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); + if (shift < 0) { + status = shift; + mlog_errno(status); + goto bail; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + /* if we hit a leaf, we'd better be empty :) */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) != + le16_to_cpu(el->l_count)); + BUG_ON(bh); + mlog(0, "ocfs2_allocate_extent: need to shift tree depth " + "(current = %u)\n", + le16_to_cpu(fe->id2.i_list.l_tree_depth)); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, + meta_ac, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* Special case: we have room now if we shifted from + * tree_depth 0 */ + if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + goto out_add; + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, + meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +out_add: + /* Finally, we can add clusters. */ + status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, + start_blk, new_clusters); + if (status < 0) + mlog_errno(status); + +bail: + if (bh) + brelse(bh); + + if (last_eb_bh) + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +static int ocfs2_truncate_log_append(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, + num_clusters); + + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%"MLFu64" invalid (" + "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Log truncate of %u clusters starting at cluster %u to " + "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, + OCFS2_I(tl_inode)->ip_blkno, index); + + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. + * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + mlog_entry_void(); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. */ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + mlog(0, "free record %d, start = %u, clusters = %u\n", + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + +bail: + mlog_exit(status); + return status; +} + +/* Expects you to already be holding tl_inode->i_sem */ +static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + num_to_flush = le16_to_cpu(tl->tl_used); + mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", + num_to_flush, OCFS2_I(tl_inode)->ip_blkno); + if (!num_to_flush) { + status = 0; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + + ocfs2_handle_add_inode(handle, data_alloc_inode); + status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_alloc_inode) + iput(data_alloc_inode); + + if (data_alloc_bh) + brelse(data_alloc_bh); + + mlog_exit(status); + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + down(&tl_inode->i_sem); + status = __ocfs2_flush_truncate_log(osb); + up(&tl_inode->i_sem); + + return status; +} + +static void ocfs2_truncate_log_worker(void *data) +{ + int status; + struct ocfs2_super *osb = data; + + mlog_entry_void(); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. */ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + mlog_exit(status); + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. */ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + mlog(0, "recover truncate log from slot %d\n", slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); + status = -EIO; + goto bail; + } + + if (le16_to_cpu(tl->tl_used)) { + mlog(0, "We'll have %u logs to recover\n", + le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. */ + tl->tl_used = 0; + + status = ocfs2_write_block(osb, tl_bh, tl_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + if (tl_bh) + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + } + + mlog_exit(status); + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + struct ocfs2_journal_handle *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, + tl_copy->i_blkno); + + down(&tl_inode->i_sem); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + up(&tl_inode->i_sem); + + mlog_exit(status); + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + if (tl_inode) { + cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } + + mlog_exit_void(); +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + + mlog_entry_void(); + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + mlog_exit(status); + return status; +} + +/* This function will figure out whether the currently last extent + * block will be deleted, and if it will, what the new last extent + * block will be so we can update his h_next_leaf_blk field, as well + * as the dinodes i_last_eb_blk */ +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + u32 new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb) +{ + int i, status = 0; + u64 block = 0; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + + *new_last_eb = NULL; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + /* we have no tree, so of course, no last_eb. */ + if (!fe->id2.i_list.l_tree_depth) + goto bail; + + /* trunc to zero special case - this makes tree_depth = 0 + * regardless of what it is. */ + if (!new_i_clusters) + goto bail; + + eb = (struct ocfs2_extent_block *) old_last_eb->b_data; + el = &(eb->h_list); + BUG_ON(!el->l_next_free_rec); + + /* Make sure that this guy will actually be empty after we + * clear away the data. */ + if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) + goto bail; + + /* Ok, at this point, we know that last_eb will definitely + * change, so lets traverse the tree and find the second to + * last extent block. */ + el = &(fe->id2.i_list); + /* go down the tree, */ + do { + for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { + if (le32_to_cpu(el->l_recs[i].e_cpos) < + new_i_clusters) { + block = le64_to_cpu(el->l_recs[i].e_blkno); + break; + } + } + BUG_ON(i < 0); + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + } while (el->l_tree_depth); + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); +bail: + if (bh) + brelse(bh); + + return status; +} + +static int ocfs2_do_truncate(struct ocfs2_super *osb, + unsigned int clusters_to_del, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *old_last_eb_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_truncate_context *tc) +{ + int status, i, depth; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *last_eb = NULL; + struct ocfs2_extent_list *el; + struct buffer_head *eb_bh = NULL; + struct buffer_head *last_eb_bh = NULL; + u64 next_eb = 0; + u64 delete_blk = 0; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + status = ocfs2_find_new_last_ext_blk(osb, + inode, + fe, + le32_to_cpu(fe->i_clusters) - + clusters_to_del, + old_last_eb_bh, + &last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (last_eb_bh) + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + el = &(fe->id2.i_list); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - + clusters_to_del; + spin_unlock(&OCFS2_I(inode)->ip_lock); + le32_add_cpu(&fe->i_clusters, -clusters_to_del); + fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + /* tree depth zero, we can just delete the clusters, otherwise + * we need to record the offset of the next level extent block + * as we may overwrite it. */ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + else + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + + if (!el->l_recs[i].e_clusters) { + /* if we deleted the whole extent record, then clear + * out the other fields and update the extent + * list. For depth > 0 trees, we've already recorded + * the extent block in 'next_eb' */ + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + + depth = le16_to_cpu(el->l_tree_depth); + if (!fe->i_clusters) { + /* trunc to zero is a special case. */ + el->l_tree_depth = 0; + fe->i_last_eb_blk = 0; + } else if (last_eb) + fe->i_last_eb_blk = last_eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (last_eb) { + /* If there will be a new last extent block, then by + * definition, there cannot be any leaves to the right of + * him. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + last_eb->h_next_leaf_blk = 0; + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* if our tree depth > 0, update all the tree blocks below us. */ + while (depth) { + mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", + depth, next_eb); + status = ocfs2_read_block(osb, next_eb, &eb_bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + mlog(0, "extent block %"MLFu64", before: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + /* bottom-most block requires us to delete data.*/ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + if (!el->l_recs[i].e_clusters) { + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + mlog(0, "extent block %"MLFu64", after: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!el->l_next_free_rec) { + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, eb_bh); + + BUG_ON(eb->h_suballoc_slot); + BUG_ON(el->l_recs[0].e_clusters); + BUG_ON(el->l_recs[0].e_cpos); + BUG_ON(el->l_recs[0].e_blkno); + status = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + brelse(eb_bh); + eb_bh = NULL; + depth--; + } + + BUG_ON(!delete_blk); + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (!status) + ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); + else + ocfs2_extent_map_drop(inode, 0); + mlog_exit(status); + return status; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. + * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc) +{ + int status, i, credits, tl_sem = 0; + u32 clusters_to_del, target_i_clusters; + u64 last_eb = 0; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + last_eb_bh = tc->tc_last_eb_bh; + tc->tc_last_eb_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (fe->id2.i_list.l_tree_depth) { + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + last_eb = le64_to_cpu(fe->i_last_eb_blk); +start: + mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " + "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " + "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", + le32_to_cpu(fe->i_clusters), last_eb, + le64_to_cpu(fe->i_last_eb_blk), + le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); + + if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { + mlog(0, "last_eb changed!\n"); + BUG_ON(!fe->id2.i_list.l_tree_depth); + last_eb = le64_to_cpu(fe->i_last_eb_blk); + /* i_last_eb_blk may have changed, read it if + * necessary. We don't have to worry about the + * truncate to zero case here (where there becomes no + * last_eb) because we never loop back after our work + * is done. */ + if (last_eb_bh) { + brelse(last_eb_bh); + last_eb_bh = NULL; + } + + status = ocfs2_read_block(osb, last_eb, + &last_eb_bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + } + + /* by now, el will point to the extent list on the bottom most + * portion of this tree. */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) + clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); + else + clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + le32_to_cpu(el->l_recs[i].e_cpos)) - + target_i_clusters; + + mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + + down(&tl_inode->i_sem); + tl_sem = 1; + /* ocfs2_truncate_log_needs_flush guarantees us at least one + * record is free for use. If there isn't any, we flush to get + * an empty truncate log. */ + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + fe, el); + handle = ocfs2_start_trans(osb, NULL, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) + mlog_errno(status); + + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, + last_eb_bh, handle, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + up(&tl_inode->i_sem); + tl_sem = 0; + + ocfs2_commit_trans(handle); + handle = NULL; + + BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); + if (le32_to_cpu(fe->i_clusters) > target_i_clusters) + goto start; +bail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_schedule_truncate_log_flush(osb, 1); + + if (tl_sem) + up(&tl_inode->i_sem); + + if (handle) + ocfs2_commit_trans(handle); + + if (last_eb_bh) + brelse(last_eb_bh); + + /* This will drop the ext_alloc cluster lock for us */ + ocfs2_free_truncate_context(tc); + + mlog_exit(status); + return status; +} + + +/* + * Expects the inode to already be locked. This will figure out which + * inodes need to be locked and will put them on the returned truncate + * context. + */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc) +{ + int status, metadata_delete; + unsigned int new_i_clusters; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh = NULL; + struct inode *ext_alloc_inode = NULL; + struct buffer_head *ext_alloc_bh = NULL; + + mlog_entry_void(); + + *tc = NULL; + + new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" + "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size); + + if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " + "%u and size %"MLFu64" whereas struct inode has " + "cluster count %u and size %llu which caused an " + "invalid truncate to %u clusters.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->i_clusters), + le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, i_size_read(inode), + new_i_clusters); + mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); + status = -EIO; + goto bail; + } + + *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + if (!(*tc)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + metadata_delete = 0; + if (fe->id2.i_list.l_tree_depth) { + /* If we have a tree, then the truncate may result in + * metadata deletes. Figure this out from the + * rightmost leaf block.*/ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + + brelse(last_eb_bh); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + metadata_delete = 1; + } + + (*tc)->tc_last_eb_bh = last_eb_bh; + + if (metadata_delete) { + mlog(0, "Will have to delete metadata for this trunc. " + "locking allocator.\n"); + ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); + if (!ext_alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + down(&ext_alloc_inode->i_sem); + (*tc)->tc_ext_alloc_inode = ext_alloc_inode; + + status = ocfs2_meta_lock(ext_alloc_inode, + NULL, + &ext_alloc_bh, + 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + (*tc)->tc_ext_alloc_bh = ext_alloc_bh; + (*tc)->tc_ext_alloc_locked = 1; + } + + status = 0; +bail: + if (status < 0) { + if (*tc) + ocfs2_free_truncate_context(*tc); + *tc = NULL; + } + mlog_exit_void(); + return status; +} + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +{ + if (tc->tc_ext_alloc_inode) { + if (tc->tc_ext_alloc_locked) + ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); + + up(&tc->tc_ext_alloc_inode->i_sem); + iput(tc->tc_ext_alloc_inode); + } + + if (tc->tc_ext_alloc_bh) + brelse(tc->tc_ext_alloc_bh); + + if (tc->tc_last_eb_bh) + brelse(tc->tc_last_eb_bh); + + kfree(tc); +} |