diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/reiserfs | |
download | talos-op-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz talos-op-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/reiserfs')
-rw-r--r-- | fs/reiserfs/Makefile | 36 | ||||
-rw-r--r-- | fs/reiserfs/README | 161 | ||||
-rw-r--r-- | fs/reiserfs/bitmap.c | 1169 | ||||
-rw-r--r-- | fs/reiserfs/dir.c | 275 | ||||
-rw-r--r-- | fs/reiserfs/do_balan.c | 1597 | ||||
-rw-r--r-- | fs/reiserfs/file.c | 1408 | ||||
-rw-r--r-- | fs/reiserfs/fix_node.c | 2518 | ||||
-rw-r--r-- | fs/reiserfs/hashes.c | 209 | ||||
-rw-r--r-- | fs/reiserfs/ibalance.c | 1058 | ||||
-rw-r--r-- | fs/reiserfs/inode.c | 2846 | ||||
-rw-r--r-- | fs/reiserfs/ioctl.c | 151 | ||||
-rw-r--r-- | fs/reiserfs/item_ops.c | 788 | ||||
-rw-r--r-- | fs/reiserfs/journal.c | 3876 | ||||
-rw-r--r-- | fs/reiserfs/lbalance.c | 1222 | ||||
-rw-r--r-- | fs/reiserfs/namei.c | 1491 | ||||
-rw-r--r-- | fs/reiserfs/objectid.c | 206 | ||||
-rw-r--r-- | fs/reiserfs/prints.c | 727 | ||||
-rw-r--r-- | fs/reiserfs/procfs.c | 664 | ||||
-rw-r--r-- | fs/reiserfs/resize.c | 182 | ||||
-rw-r--r-- | fs/reiserfs/stree.c | 2073 | ||||
-rw-r--r-- | fs/reiserfs/super.c | 2148 | ||||
-rw-r--r-- | fs/reiserfs/tail_conversion.c | 276 | ||||
-rw-r--r-- | fs/reiserfs/xattr.c | 1450 | ||||
-rw-r--r-- | fs/reiserfs/xattr_acl.c | 571 | ||||
-rw-r--r-- | fs/reiserfs/xattr_security.c | 69 | ||||
-rw-r--r-- | fs/reiserfs/xattr_trusted.c | 81 | ||||
-rw-r--r-- | fs/reiserfs/xattr_user.c | 99 |
27 files changed, 27351 insertions, 0 deletions
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile new file mode 100644 index 000000000000..3a59309f3ca9 --- /dev/null +++ b/fs/reiserfs/Makefile @@ -0,0 +1,36 @@ +# +# Makefile for the linux reiser-filesystem routines. +# + +obj-$(CONFIG_REISERFS_FS) += reiserfs.o + +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ + hashes.o tail_conversion.o journal.o resize.o \ + item_ops.o ioctl.o procfs.o + +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + +# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline +# functions are used. This causes the compiler to advance the stack +# pointer out of the available stack space, corrupting kernel space, +# and causing a panic. Since this behavior only affects ppc32, this ifeq +# will work around it. If any other architecture displays this behavior, +# add it here. +ifeq ($(CONFIG_PPC32),y) +EXTRA_CFLAGS := -O1 +endif + +TAGS: + etags *.c + diff --git a/fs/reiserfs/README b/fs/reiserfs/README new file mode 100644 index 000000000000..90e1670e4e6f --- /dev/null +++ b/fs/reiserfs/README @@ -0,0 +1,161 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. 
If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. +All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) 
+ +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at http://devlinux.com/namesys. + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if you forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. + +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. 
+ +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. (Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. 
+ +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. + +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. + +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably +the endian safe patches which allow ReiserFS to run on any platform +supported by the Linux kernel. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... 
+ diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c new file mode 100644 index 000000000000..a4e2ed544bbe --- /dev/null +++ b/fs/reiserfs/bitmap.c @@ -0,0 +1,1169 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +/* Reiserfs block (de)allocator, bitmap-based. */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/reiserfs_fs.h> +#include <linux/errno.h> +#include <linux/buffer_head.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/reiserfs_fs_sb.h> +#include <linux/reiserfs_fs_i.h> +#include <linux/quotaops.h> + +#define PREALLOCATION_SIZE 9 + +/* different reiserfs block allocator options */ + +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) + +#define _ALLOC_concentrating_formatted_nodes 0 +#define _ALLOC_displacing_large_files 1 +#define _ALLOC_displacing_new_packing_localities 2 +#define _ALLOC_old_hashed_relocation 3 +#define _ALLOC_new_hashed_relocation 4 +#define _ALLOC_skip_busy 5 +#define _ALLOC_displace_based_on_dirid 6 +#define _ALLOC_hashed_formatted_nodes 7 +#define _ALLOC_old_way 8 +#define _ALLOC_hundredth_slices 9 +#define _ALLOC_dirid_groups 10 +#define _ALLOC_oid_groups 11 +#define _ALLOC_packing_groups 12 + +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) + +#define SET_OPTION(optname) \ + do { \ + reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \ + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ + } while(0) +#define TEST_OPTION(optname, s) \ + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) + +static inline void get_bit_address (struct super_block * s, + b_blocknr_t block, int * bmap_nr, int * offset) +{ + /* It is in the bitmap block number equal to the 
block + * number divided by the number of bits in a block. */ + *bmap_nr = block / (s->s_blocksize << 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1 ); + return; +} + +#ifdef CONFIG_REISERFS_CHECK +int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value) +{ + int i, j; + + if (block == 0 || block >= SB_BLOCK_COUNT (s)) { + reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT (s)); + return 0; + } + + /* it can't be one of the bitmap blocks */ + for (i = 0; i < SB_BMAP_NR (s); i ++) + if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) { + reiserfs_warning (s, "vs: 4020: is_reusable: " + "bitmap block %lu(%u) can't be freed or reused", + block, SB_BMAP_NR (s)); + return 0; + } + + get_bit_address (s, block, &i, &j); + + if (i >= SB_BMAP_NR (s)) { + reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: " + "block=%lu, bitmap_nr=%d", block, i); + return 0; + } + + if ((bit_value == 0 && + reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) || + (bit_value == 1 && + reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) { + reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not " + "match required value (i==%d, j==%d) test_bit==%d", + block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data)); + + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) { + reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK (s)); + return 0; + } + + return 1; +} +#endif /* CONFIG_REISERFS_CHECK */ + +/* searches in journal structures for a given block number (bmap, off). If block + is found in reiserfs journal it suggests next free block candidate to test. 
*/ +static inline int is_block_in_journal (struct super_block * s, int bmap, int +off, int *next) +{ + b_blocknr_t tmp; + + if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC( s, scan_bitmap.in_journal_hint ); + } else { + (*next) = off + 1; /* inc offset to avoid looping. */ + PROC_INFO_INC( s, scan_bitmap.in_journal_nohint ); + } + PROC_INFO_INC( s, scan_bitmap.retry ); + return 1; + } + return 0; +} + +/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap + * block; */ +static int scan_bitmap_block (struct reiserfs_transaction_handle *th, + int bmap_n, int *beg, int boundary, int min, int max, int unfm) +{ + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n]; + int end, next; + int org = *beg; + + BUG_ON (!th->t_trans_id); + + RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1); + PROC_INFO_INC( s, scan_bitmap.bmap ); +/* this is unclear and lacks comments, explain how journal bitmaps + work here for the reader. Convey a sense of the design here. What + is a window? */ +/* - I mean `a window of zero bits' as in description of this function - Zam. 
*/ + + if ( !bi ) { + reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n); + return 0; + } + if (buffer_locked (bi->bh)) { + PROC_INFO_INC( s, scan_bitmap.wait ); + __wait_on_buffer (bi->bh); + } + + while (1) { + cont: + if (bi->free_count < min) + return 0; // No free blocks in this bitmap + + /* search for a first zero bit -- beggining of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long*)(bi->bh->b_data), boundary, *beg); + + if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size */ + return 0; + } + + if (unfm && is_block_in_journal(s,bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end ++) { + if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) { + next = end; + break; + } + /* finding the other end of zero bit window requires looking into journal structures (in + * case of searching for free blocks for unformatted nodes) */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } + + /* now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end */ + if (end - *beg >= min) { /* it seems we have found window of proper size */ + int i; + reiserfs_prepare_for_journal (s, bi->bh, 1); + /* try to set all blocks used checking are they still free */ + for (i = *beg; i < end; i++) { + /* It seems that we should not check in journal again. */ + if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) { + /* bit was set by another process + * while we slept in prepare_for_journal() */ + PROC_INFO_INC( s, scan_bitmap.stolen ); + if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, + * if length of this set is more or equal to `min' */ + end = i; + break; + } + /* otherwise we clear all bit were set ... 
*/ + while (--i >= *beg) + reiserfs_test_and_clear_le_bit (i, bi->bh->b_data); + reiserfs_restore_prepared_buffer (s, bi->bh); + *beg = org; + /* ... and search again in current block from beginning */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty (th, s, bi->bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } + } +} + +static int bmap_hash_id(struct super_block *s, u32 id) { + char * hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % SB_BMAP_NR(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= SB_BMAP_NR(s)) + bm = 0; + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) { + int bm; + bm = bmap_hash_id(s, id); + if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) { + return 0; + } + return 1; +} + +/* + * the packing is returned in disk byte order + */ +u32 reiserfs_choose_packing(struct inode *dir) { + u32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); + /* + * some versions of reiserfsck expect packing locality 1 to be + * special + */ + if (parent_dir == 1 || block_group_used(dir->i_sb,parent_dir)) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; +} + +/* Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. 
*/ +static int scan_bitmap (struct reiserfs_transaction_handle *th, + b_blocknr_t *start, b_blocknr_t finish, + int min, int max, int unfm, unsigned long file_block) +{ + int nr_allocated=0; + struct super_block * s = th->t_super; + /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr + * - Hans, it is not a block number - Zam. */ + + int bm, off; + int end_bm, end_off; + int off_max = s->s_blocksize << 3; + + BUG_ON (!th->t_trans_id); + + PROC_INFO_INC( s, scan_bitmap.call ); + if ( SB_FREE_BLOCKS(s) <= 0) + return 0; // No point in looking for more free blocks + + get_bit_address (s, *start, &bm, &off); + get_bit_address (s, finish, &end_bm, &end_off); + if (bm > SB_BMAP_NR(s)) + return 0; + if (end_bm > SB_BMAP_NR(s)) + end_bm = SB_BMAP_NR(s); + + /* When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. 
+ */ + if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) { + for (;bm < end_bm; bm++, off = 0) { + if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 ) + nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + /* we know from above that start is a reasonable number */ + get_bit_address (s, *start, &bm, &off); + } + + for (;bm < end_bm; bm++, off = 0) { + nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + + nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); + + ret: + *start = bm * off_max + off; + return nr_allocated; + +} + +static void _reiserfs_free_block (struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs; + struct buffer_head * sbh; + struct reiserfs_bitmap_info *apbi; + int nr, offset; + + BUG_ON (!th->t_trans_id); + + PROC_INFO_INC( s, free_block ); + + rs = SB_DISK_SUPER_BLOCK (s); + sbh = SB_BUFFER_WITH_SB (s); + apbi = SB_AP_BITMAP(s); + + get_bit_address (s, block, &nr, &offset); + + if (nr >= sb_bmap_nr (rs)) { + reiserfs_warning (s, "vs-4075: reiserfs_free_block: " + "block %lu is out of range on %s", + block, reiserfs_bdevname (s)); + return; + } + + reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ; + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) { + reiserfs_warning (s, "vs-4080: reiserfs_free_block: " + "free_block (%s:%lu)[dev:blocknr]: bit already cleared", + reiserfs_bdevname (s), block); + } + apbi[nr].free_count ++; + journal_mark_dirty (th, s, apbi[nr].bh); + + reiserfs_prepare_for_journal(s, sbh, 1) ; + /* update super block */ + set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 ); + + journal_mark_dirty (th, s, sbh); + if 
(for_unformatted) + DQUOT_FREE_BLOCK_NODIRTY(inode, 1); +} + +void reiserfs_free_block (struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block * s = th->t_super; + + BUG_ON (!th->t_trans_id); + + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block"); + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block) ; + _reiserfs_free_block(th, inode, block, for_unformatted) ; +} + +/* preallocated blocks don't need to be run through journal_mark_freed */ +static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) { + RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); + RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block"); + BUG_ON (!th->t_trans_id); + _reiserfs_free_block(th, inode, block, 1) ; +} + +static void __discard_prealloc (struct reiserfs_transaction_handle * th, + struct reiserfs_inode_info *ei) +{ + unsigned long save = ei->i_prealloc_block ; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + BUG_ON (!th->t_trans_id); +#ifdef CONFIG_REISERFS_CHECK + if (ei->i_prealloc_count < 0) + reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ ); +#endif + while (ei->i_prealloc_count > 0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count --; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&(ei->i_prealloc_list)); +} + +/* FIXME: It should be inline function */ +void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON (!th->t_trans_id); + if (ei->i_prealloc_count) + 
__discard_prealloc(th, ei); +} + +void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) +{ + struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + + BUG_ON (!th->t_trans_id); + + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!ei->i_prealloc_count) { + reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__); + } +#endif + __discard_prealloc(th, ei); + } +} + +void reiserfs_init_alloc_options (struct super_block *s) +{ + set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); +} + +/* block allocator related options are parsed here */ +int reiserfs_parse_alloc_options(struct super_block * s, char * options) +{ + char * this_char, * value; + + REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + + while ( (this_char = strsep (&options, ":")) != NULL ) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value && *value) ? 
simple_strtoul (value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + }; + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } + + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } + + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } + + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } + + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } + + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value && *value) ? simple_strtoul (value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value && *value) ? 
simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE; + continue; + } + + reiserfs_warning (s, "zam-4001: %s : unknown option - %s", + __FUNCTION__ , this_char); + return 1; + } + + reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; +} + +static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint) +{ + char * hash_in; + if (hint->formatted_node) { + hash_in = (char*)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + //hint->search_start = hint->beg; + hash_in = (char*)&hint->key.k_dir_id; + } else + if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + } + + hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a seperate policy covers them + */ +static void +dirid_groups (reiserfs_blocknr_hint_t *hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize/2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. 
Formatted nodes are unaffected, a seperate policy covers them + */ +static void +oid_groups (reiserfs_blocknr_hint_t *hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* keep the root dir and its first set of subdirs close to + * the start of the disk: dirid <= 2 starts the search at block number (blocksize << 3), i.e. bm == 1 in the formula used below + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* seeds hint->search_start from the tree path; returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 (search_start is then the block number of the path's last buffer, or left untouched when no path is supplied) + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t *hint) +{ + struct path * path; + struct buffer_head * bh; + struct item_head * ih; + int pos_in_item; + __u32 * item; + int ret = 0; + + if (!hint->path) /* reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start */ + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = get_ih(path); + pos_in_item = path->pos_in_item; + item = get_item (path); + + hint->search_start = bh->b_blocknr; + + if (!hint->formatted_node && is_indirect_le_ih (ih)) { + /* for indirect item: go to left and look for the first non-hole entry + in the indirect item (an entry whose block number is zero is skipped as a hole) */ + if (pos_in_item == I_UNFM_NUM (ih)) + pos_in_item--; +// pos_in_item = I_UNFM_NUM (ih) - 1; + while (pos_in_item >= 0) { + int t=get_block_num(item,pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item --; + } + } + + /* does result value fit into specified region? NOTE(review): nothing in this function checks search_start against hint->beg / hint->end -- presumably the caller does; confirm */ + return ret; +} + +/* should be, if formatted node, then try to put on first part of the device + specified as number of percent with mount option device, else try to put + on last of device. 
This is not to say it is good code to do so, + but the effect should be measured. */ +static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint) +{ + b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; +} + +static inline void displace_large_file(reiserfs_blocknr_hint_t *hint) +{ + if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg); + else + hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg); +} + +static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint) +{ + char * hash_in; + + if (!hint->inode) + hash_in = (char*)&hint->key.k_dir_id; + else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + + hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint) +{ + return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; +} + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint) +{ + struct reiserfs_key * key = &hint->key; + + hint->th->displace_new_blocks = 0; + hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid),4) % (hint->end - hint->beg); +} + #endif + +static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + u32 hash_in; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = hint->beg + (u32) 
keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; + + return 1; + } + +static inline int old_way (reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint) +{ + struct reiserfs_key * key = &hint->key; + b_blocknr_t slice_start; + + slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100); + if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } +} + +static void determine_search_start(reiserfs_blocknr_hint_t *hint, + int amount_needed) +{ + struct super_block *s = hint->th->t_super; + int unfm_hint; + + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; + + /* This is former border algorithm. Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* whenever we create a new directory, we displace it. 
At first we will + hash for location, later we might look for a moderately empty place for + it */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* we do not continue determine_search_start, + * if new packing locality is being displaced */ + return; + } +#endif + + /* all persons should feel encouraged to add more special cases here and + * test them */ + + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } + + /* if none of our special cases is relevant, use the left neighbor in the + tree order of the new node we are allocating for */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) { + hash_formatted_node(hint); + return; + } + + unfm_hint = get_left_neighbor(hint); + + /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, + new blocks are displaced based on directory ID. Also, if suggested search_start + is less than last preallocated block, we start searching from it, assuming that + HDD dataflow is faster in forward direction */ + if ( TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if ( !reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block; + } + return; + } + + /* This is an approach proposed by Hans */ + if ( TEST_OPTION(hundredth_slices, s) && ! 
(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) + { + old_hashed_relocation(hint); + } + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) + { + new_hashed_relocation(hint); + } + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s)) + { + dirid_groups(hint); + } + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups,s)) + { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s)) + { + oid_groups(hint); + } + return; +} + +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) +{ + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1; + } + return CARRY_ON; +} + +/* XXX I know it could be merged with upper-level function; + but may be result function would be too complex. 
*/ +static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, b_blocknr_t finish, + int min, + int amount_needed, int prealloc_size) +{ + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap (hint->th, &start, finish, min, + rest + prealloc_size, !hint->formatted_node, + hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + * new_blocknrs ++ = start ++; + rest --; nr_allocated --; + } + + /* do we have something to fill prealloc. array also ? */ + if (nr_allocated > 0) { + /* it means prealloc_size was greater that 0 and we do preallocation */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)->j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated; + break; + } + } + + return (amount_needed - rest); +} + +static inline int blocknrs_and_prealloc_arrays_from_search_start + (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed) +{ + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + int bigalloc = 0; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid); +#endif + quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); + if (quota_ret) /* Quota exceeded? 
*/ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size ) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size); + if (quota_ret) + hint->preallocate=hint->prealloc_size=0; + } + /* for unformatted nodes, force large allocations */ + bigalloc = amount_needed; + } + + do { + /* in bigalloc mode, nr_allocated should stay zero until + * the entire allocation is filled + */ + if (unlikely(bigalloc && nr_allocated)) { + reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n", + bigalloc, nr_allocated); + /* reset things to a sane value */ + bigalloc = amount_needed - nr_allocated; + } + /* + * try pass 0 and pass 1 looking for a nice big + * contiguous allocation. Then reset and look + * for anything you can find. + */ + if (passno == 2 && bigalloc) { + passno = 0; + bigalloc = 0; + } + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ + } + while (nr_allocated --) + reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while 
((nr_allocated += allocate_without_wrapping_disk (hint, + new_blocknrs + nr_allocated, start, finish, + bigalloc ? bigalloc : 1, + amount_needed - nr_allocated, + hint->prealloc_size)) + < amount_needed); + if ( !hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count); + } + + return CARRY_ON; +} + +/* grab new blocknrs from preallocated list */ +/* return amount still needed after using them */ +static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint, + b_blocknr_t *new_blocknrs, int amount_needed) +{ + struct inode * inode = hint->inode; + + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { + + *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++; + REISERFS_I(inode)->i_prealloc_count --; + + amount_needed --; + + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } + } + /* return amount still needed after using preallocated blocks */ + return amount_needed; +} + +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, + b_blocknr_t * new_blocknrs, int amount_needed, + int reserved_by_us /* Amount of blocks we have + already reserved */) +{ + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* 
should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? - Zam */ + /* hint->formatted_node cannot be removed because we try to access + inode information here, and there is often no inode assotiated with + metadata allocations - green */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + if (amount_needed == 0) /* all blocknrs we need we got from + prealloc. list */ + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start + (hint, new_blocknrs, amount_needed); + + /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we + * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second + * variant) */ + + if (ret != CARRY_ON) { + while (amount_needed ++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); + } + } + return ret; +} + +/* These 2 functions are here to provide blocks reservation to the rest of kernel */ +/* Reserve @blocks amount of blocks in fs pointed by @sb. Caller must make sure + there are actually this much blocks on the FS available */ +void reiserfs_claim_blocks_to_be_allocated( + struct super_block *sb, /* super block of + filesystem where + blocks should be + reserved */ + int blocks /* How much to reserve */ + ) +{ + + /* Fast case, if reservation is zero - exit immediately. 
*/ + if ( !blocks ) + return; + + spin_lock(&REISERFS_SB(sb)->bitmap_lock); + REISERFS_SB(sb)->reserved_blocks += blocks; + spin_unlock(&REISERFS_SB(sb)->bitmap_lock); +} + +/* Unreserve @blocks amount of blocks in fs pointed by @sb */ +void reiserfs_release_claimed_blocks( + struct super_block *sb, /* super block of + filesystem where + blocks should be + reserved */ + int blocks /* How much to unreserve */ + ) +{ + + /* Fast case, if unreservation is zero - exit immediately. */ + if ( !blocks ) + return; + + spin_lock(&REISERFS_SB(sb)->bitmap_lock); + REISERFS_SB(sb)->reserved_blocks -= blocks; + spin_unlock(&REISERFS_SB(sb)->bitmap_lock); + RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?"); +} + +/* This function estimates how much pages we will be able to write to FS + used for reiserfs_file_write() purposes for now. */ +int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem + to estimate space */ ) +{ + int space; + + spin_lock(&REISERFS_SB(sb)->bitmap_lock); + space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + spin_unlock(&REISERFS_SB(sb)->bitmap_lock); + + return space>0?space:0; +} diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c new file mode 100644 index 000000000000..d1514a9b0514 --- /dev/null +++ b/fs/reiserfs/dir.c @@ -0,0 +1,275 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/config.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/reiserfs_fs.h> +#include <linux/stat.h> +#include <linux/smp_lock.h> +#include <linux/buffer_head.h> +#include <asm/uaccess.h> + +extern struct reiserfs_key MIN_KEY; + +static int reiserfs_readdir (struct file *, void *, filldir_t); +static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ; + +struct file_operations reiserfs_dir_operations = { + .read = 
generic_read_dir, + .readdir = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .ioctl = reiserfs_ioctl, +}; + +static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) { + struct inode *inode = dentry->d_inode; + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode) ; + reiserfs_write_unlock(inode->i_sb) ; + if (err < 0) + return err; + return 0; +} + + +#define store_ih(where,what) copy_item_head (where, what) + +// +static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH (path_to_entry); + struct buffer_head * bh; + int item_num, entry_num; + const struct reiserfs_key * rkey; + struct item_head * ih, tmp_ih; + int search_res; + char * local_buf; + loff_t next_pos; + char small_buf[32] ; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir") ; + + /* form key for search the next directory entry using f_pos field of + file structure */ + make_cpu_key (&pos_key, inode, (filp->f_pos) ? 
(filp->f_pos) : DOT_OFFSET, + TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset (&pos_key); + + /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos);*/ + + path_to_entry.reada = PATH_READA; + while (1) { + research: + /* search the directory item, containing entry with specified key */ + search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + ret = -EIO; + goto out; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih (&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE( item_num > B_NR_ITEMS (bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS (bh)); + + /* and entry must be not more than number of entries in the item */ + RFALSE( I_ENTRY_COUNT (ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, I_ENTRY_COUNT (ih)); + + if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) { + /* go through all entries in the directory item beginning from the entry, that has been found */ + struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num; + + for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) { + int d_reclen; + char * d_name; + off_t d_off; + ino_t d_ino; + + if (!de_visible (deh)) + /* it is hidden entry */ + continue; + d_reclen = entry_length (bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); + if (!d_name[d_reclen - 1]) + d_reclen = strlen (d_name); + + if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ + /* too big to send back to VFS */ + continue ; + } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + 
filp->f_dentry == inode->i_sb->s_root && + REISERFS_SB(inode->i_sb)->priv_root && + REISERFS_SB(inode->i_sb)->priv_root->d_inode && + deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) { + continue; + } + + d_off = deh_offset (deh); + filp->f_pos = d_off ; + d_ino = deh_objectid (deh); + if (d_reclen <= 32) { + local_buf = small_buf ; + } else { + local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; + if (!local_buf) { + pathrelse (&path_to_entry); + ret = -ENOMEM ; + goto out; + } + if (item_moved (&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + goto research; + } + } + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy (local_buf, d_name, d_reclen); + if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + + // next entry should be looked for with such offset + next_pos = deh_offset (deh) + 1; + + if (item_moved (&tmp_ih, &path_to_entry)) { + goto research; + } + } /* for */ + } + + if (item_num != B_NR_ITEMS (bh) - 1) + // end of directory has been reached + goto end; + + /* item we went through is last item of node. Using right + delimiting key check is it directory end */ + rkey = get_rkey (&path_to_entry, inode->i_sb); + if (! 
comp_le_keys (rkey, &MIN_KEY)) { + /* set pos_key to key, that is the smallest and greater + that key of the last entry in the item */ + set_cpu_key_k_offset (&pos_key, next_pos); + continue; + } + + if ( COMP_SHORT_KEYS (rkey, &pos_key)) { + // end of directory has been reached + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey)); + + } /* while */ + + + end: + filp->f_pos = next_pos; + pathrelse (&path_to_entry); + reiserfs_check_path(&path_to_entry) ; + out: + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +/* compose directory item containing "." and ".." entries (entries are + not aligned to 4 byte boundary) */ +/* the last four params are LE */ +void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid, + __u32 par_dirid, __u32 par_objid) +{ + struct reiserfs_de_head * deh; + + memset (body, 0, EMPTY_DIR_SIZE_V1); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset( &(deh[0]), DOT_OFFSET ); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location( &(deh[0]), EMPTY_DIR_SIZE_V1 - strlen( "." )); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset( &(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location( &(deh[1]), deh_location( &(deh[0]) ) - strlen( ".." ) ); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy (body + deh_location( &(deh[0]) ), ".", 1); + memcpy (body + deh_location( &(deh[1]) ), "..", 2); +} + +/* compose directory item containing "." and ".." 
entries */ +void make_empty_dir_item (char * body, __u32 dirid, __u32 objid, + __u32 par_dirid, __u32 par_objid) +{ + struct reiserfs_de_head * deh; + + memset (body, 0, EMPTY_DIR_SIZE); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset( &(deh[0]), DOT_OFFSET ); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location( &(deh[0]), EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) ); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset( &(deh[1]), DOT_DOT_OFFSET ); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location( &(deh[1]), deh_location( &(deh[0])) - ROUND_UP( strlen( ".." ) ) ); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy (body + deh_location( &(deh[0]) ), ".", 1); + memcpy (body + deh_location( &(deh[1]) ), "..", 2); +} diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c new file mode 100644 index 000000000000..2118db2896c7 --- /dev/null +++ b/fs/reiserfs/do_balan.c @@ -0,0 +1,1597 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* Now we have all buffers that must be used in balancing of the tree */ +/* Further calculations can not cause schedule(), and thus the buffer */ +/* tree will be stable until the balancing will be finished */ +/* balance the tree according to the analysis made before, */ +/* and using buffers obtained after all above. 
*/ + + +/** + ** balance_leaf_when_delete + ** balance_leaf + ** do_balance + ** + **/ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <linux/time.h> +#include <linux/reiserfs_fs.h> +#include <linux/buffer_head.h> + +#ifdef CONFIG_REISERFS_CHECK + +struct tree_balance * cur_tb = NULL; /* detects whether more than one + copy of tb exists as a means + of checking whether schedule + is interrupting do_balance */ +#endif + +inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, + struct buffer_head * bh, int flag) +{ + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh) ; +} + +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + + +/* summary: + if deleting something ( tb->insert_size[0] < 0 ) + return(balance_leaf_when_delete()); (flag d handled here) + else + if lnum is larger than 0 we put items into the left node + if rnum is larger than 0 we put items into the right node + if snum1 is larger than 0 we put items into the new node s1 + if snum2 is larger than 0 we put items into the new node s2 +Note that all *num* count new items being created. + +It would be easier to read balance_leaf() if each of these summary +lines was a separate procedure rather than being inlined. I think +that there are many passages here and in balance_leaf_when_delete() in +which two calls to one procedure can replace two passages, and it +might save cache space and improve software maintenance costs to do so. + +Vladimir made the perceptive comment that we should offload most of +the decision making in this function into fix_nodes/check_balance, and +then create some sort of structure in tb that says what actions should +be performed by do_balance. 
+ +-Hans */ + + + +/* Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items to the neighbor + */ +static int balance_leaf_when_delete (struct tree_balance * tb, int flag) +{ + struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + int item_pos = PATH_LAST_POSITION (tb->tb_path); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + int n; + struct item_head * ih; + + RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE( tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); + + ih = B_N_PITEM_HEAD (tbS0, item_pos); + + /* Delete or truncate the item */ + + switch (flag) { + case M_DELETE: /* delete item in S[0] */ + + RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size [0], ih); + + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_delete_items (&bi, 0, item_pos, 1, -1); + + if ( ! item_pos && tb->CFL[0] ) { + if ( B_NR_ITEMS(tbS0) ) { + replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); + } + else { + if ( ! PATH_H_POSITION (tb->tb_path, 1) ) + replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0); + } + } + + RFALSE( ! 
item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]); + + break; + + case M_CUT: { /* cut item in S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + if (is_direntry_le_ih (ih)) { + + /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ + /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); + + RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) { + replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); + } + } else { + leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); + + RFALSE( ! ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic length of item"); + } + break; + } + + default: + print_cur_tb ("12040"); + reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", + (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? 
"INSERT" : "UNKNOWN"), flag); + } + + /* the rule is that no shifting occurs unless by shifting a node can be freed */ + n = B_NR_ITEMS(tbS0); + if ( tb->lnum[0] ) /* L[0] takes part in balancing */ + { + if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */ + { + if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */ + { + if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) ) + { + /* all contents of all the 3 buffers will be in L[0] */ + if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) ) + replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1); + + leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL); + leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL); + + reiserfs_invalidate_buffer (tb, tbS0); + reiserfs_invalidate_buffer (tb, tb->R[0]); + + return 0; + } + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL); + leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + + reiserfs_invalidate_buffer (tb, tbS0); + reiserfs_invalidate_buffer (tb, tb->L[0]); + + return -1; + } + + RFALSE( tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer (tb, tbS0); + + return 0; + } + /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ + + RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) || + ( tb->lnum[0] + tb->rnum[0] > n+1 ), + "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) && + 
(tb->lbytes < 1 || tb->rbytes != -1),
+            "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
+            tb->rbytes, tb->lbytes);
+
+    leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
+    leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+    reiserfs_invalidate_buffer (tb, tbS0);
+
+    return 0;
+    }
+
+    if ( tb->rnum[0] == -1 ) {
+        /* all contents of R[0] and S[0] will be in R[0] */
+        leaf_shift_right(tb, n, -1);
+        reiserfs_invalidate_buffer (tb, tbS0);
+        return 0;
+    }
+
+    RFALSE( tb->rnum[0],
+            "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
+    return 0;
+}
+
+/* balance_leaf - leaf level step of balancing for insert and paste.
+ *
+ * When tb->insert_size[0] is negative the work is handed to
+ * balance_leaf_when_delete() and its result is returned.  Otherwise the
+ * function proceeds in this order:
+ *   1. if tb->lnum[0] > 0, items are shifted from S[0] to L[0]; when the
+ *      affected item (or a part of it) lands in L[0], the new data is
+ *      inserted/pasted there;
+ *   2. if tb->rnum[0] > 0, the same is done towards R[0];
+ *   3. if tb->blknum[0] == 0, S[0] is invalidated and 0 is returned;
+ *   4. for i = tb->blknum[0] - 2 down to 0 a node S_new[i] is obtained with
+ *      get_FEB() and filled; its first key is copied to insert_key[i] and
+ *      the node is stored in insert_ptr[i];
+ *   5. if 0 <= item_pos < tb->s0num, what is left of the new data is
+ *      inserted/pasted into S[0].
+ * The switches in steps 1, 2 and 4 handle only M_INSERT and M_PASTE; any
+ * other flag reaching them ends in reiserfs_panic().  The insert/paste path
+ * returns 0.
+ *
+ * NOTE(review): the "part of new item falls into S_new[i]" branch declares
+ * old_key_comp and old_len as int, while the matching R[0] branch uses
+ * loff_t - confirm the key offset cannot be truncated there.
+ * NOTE(review): the PAP-12155 format string names ih_item_len first but the
+ * arguments pass pos_in_item first - confirm the intended order.
+ */
+static int balance_leaf (struct tree_balance * tb,
+             struct item_head * ih,  /* item header of inserted item (this is on little endian) */
+             const char * body,      /* body of inserted item or bytes to paste */
+             int flag,               /* i - insert, d - delete, c - cut, p - paste
+                                        (see comment to do_balance) */
+             struct item_head * insert_key,  /* in our processing of one level we sometimes determine what
+                                                must be inserted into the next higher level. This insertion
+                                                consists of a key or two keys and their corresponding
+                                                pointers */
+             struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */
+    )
+{
+    struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
+    int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0]
+                                                        of the affected item */
+    struct buffer_info bi;
+    struct buffer_head *S_new[2];  /* new nodes allocated to hold what could not fit into S */
+    int snum[2];            /* number of items that will be placed
+                               into S_new (includes partially shifted
+                               items) */
+    int sbytes[2];          /* if an item is partially shifted into S_new then
+                               if it is a directory item
+                               it is the number of entries from the item that are shifted into S_new
+                               else
+                               it is the number of bytes from the item that are shifted into S_new
+                            */
+    int n, i;
+    int ret_val;
+    int pos_in_item;
+    int zeros_num;
+
+    PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] );
+
+    /* Make balance in case insert_size[0] < 0 */
+    if ( tb->insert_size[0] < 0 )
+        return balance_leaf_when_delete (tb, flag);
+
+    zeros_num = 0;
+    if (flag == M_INSERT && body == 0)
+        zeros_num = ih_item_len( ih ); /* NOTE(review): presumably the count of bytes to zero-fill when no body is supplied - confirm against leaf_insert_into_buf */
+
+    pos_in_item = tb->tb_path->pos_in_item;
+    /* for indirect item pos_in_item is measured in unformatted node
+       pointers. Recalculate to bytes */
+    if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos)))
+        pos_in_item *= UNFM_P_SIZE;
+
+    if ( tb->lnum[0] > 0 ) {
+        /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+        if ( item_pos < tb->lnum[0] ) {
+            /* new item or it part falls to L[0], shift it too */
+            n = B_NR_ITEMS(tb->L[0]);
+
+            switch (flag) {
+            case M_INSERT:   /* insert item into L[0] */
+
+                if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
+                    /* part of new item falls into L[0] */
+                    int new_item_len;
+                    int version;
+
+                    ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1);
+
+                    /* Calculate item length to insert to S[0] */
+                    new_item_len = ih_item_len(ih) - tb->lbytes;
+                    /* Calculate and check item length to insert to L[0] */
+                    put_ih_item_len(ih, ih_item_len(ih) - new_item_len );
+
+                    RFALSE( ih_item_len(ih) <= 0,
+                            "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
+                            ih_item_len(ih));
+
+                    /* Insert new item into L[0] */
+                    bi.tb = tb;
+                    bi.bi_bh = tb->L[0];
+                    bi.bi_parent = tb->FL[0];
+                    bi.bi_position = get_left_neighbor_position (tb, 0);
+                    leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body,
+                                          zeros_num > ih_item_len(ih) ?
ih_item_len(ih) : zeros_num); + + version = ih_version (ih); + + /* Calculate key component, item length and body to insert into S[0] */ + set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); + + put_ih_item_len( ih, new_item_len ); + if ( tb->lbytes > zeros_num ) { + body += (tb->lbytes - zeros_num); + zeros_num = 0; + } + else + zeros_num -= tb->lbytes; + + RFALSE( ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", + ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes); + /* Insert new item into L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + + case M_PASTE: /* append item in L[0] */ + + if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { + /* we must shift the part of the appended item */ + if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) { + + RFALSE( zeros_num, + "PAP-12090: invalid parameter in case of a directory"); + /* directory item */ + if ( tb->lbytes > pos_in_item ) { + /* new directory entry falls into L[0] */ + struct item_head * pasted; + int l_pos_in_item = pos_in_item; + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ + ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); + if ( ret_val && ! 
item_pos ) { + pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1); + l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1); + } + + /* Append given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item, + tb->insert_size[0], body, zeros_num); + + /* previous string prepared space for pasting new entry, following string pastes this entry */ + + /* when we have merge directory item, pos_in_item has been changed too */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ + leaf_shift_left (tb, tb->lnum[0], tb->lbytes); + } + /* Calculate new position to append in item body */ + pos_in_item -= tb->lbytes; + } + else { + /* regular object */ + RFALSE( tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", + tb->lbytes); + RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)), + "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", + ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item); + + if ( tb->lbytes >= pos_in_item ) { + /* appended item will be in L[0] in whole */ + int l_n; + + /* this bytes number must be appended to the last item of L[h] */ + l_n = tb->lbytes - pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + + RFALSE( tb->insert_size[0] <= 0, + "PAP-12105: there is nothing to paste into L[0]. 
insert_size=%d", + tb->insert_size[0]); + ret_val = leaf_shift_left(tb,tb->lnum[0], + ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos))); + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer( + &bi,n + item_pos - ret_val, + ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)), + l_n,body, zeros_num > l_n ? l_n : zeros_num + ); + /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/ + { + int version; + int temp_l = l_n; + + RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)), + "PAP-12106: item length must be 0"); + RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0), + B_N_PKEY (tb->L[0], + n + item_pos - ret_val)), + "PAP-12107: items must be of the same file"); + if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0], + n + item_pos - ret_val))) { + temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT); + } + /* update key of first item in S0 */ + version = ih_version (B_N_PITEM_HEAD (tbS0, 0)); + set_le_key_k_offset (version, B_N_PKEY (tbS0, 0), + le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l); + /* update left delimiting key */ + set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), + le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l); + } + + /* Calculate new body, position in item and insert_size[0] */ + if ( l_n > zeros_num ) { + body += (l_n - zeros_num); + zeros_num = 0; + } + else + zeros_num -= l_n; + pos_in_item = 0; + + RFALSE( comp_short_le_keys + (B_N_PKEY(tbS0,0), + B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) || + + !op_is_left_mergeable + (B_N_PKEY (tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable + (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), + tbS0->b_size), + "PAP-12120: item must be merge-able with left neighboring item"); + } + else /* only part of the appended item will be in L[0] */ + { + /* Calculate position in item for append in S[0] */ + pos_in_item -= tb->lbytes; + + 
RFALSE( pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item); + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + leaf_shift_left(tb,tb->lnum[0],tb->lbytes); + } + } + } + else /* appended item will be in L[0] in whole */ + { + struct item_head * pasted; + + if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) ) + { /* if we paste into first item of S[0] and it is left mergable */ + /* then increment pos_in_item by the size of the last item in L[0] */ + pasted = B_N_PITEM_HEAD(tb->L[0],n-1); + if ( is_direntry_le_ih (pasted) ) + pos_in_item += ih_entry_count(pasted); + else + pos_in_item += ih_item_len(pasted); + } + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes); + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0], + body, zeros_num); + + /* if appended item is directory, paste entry */ + pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val); + if (is_direntry_le_ih (pasted)) + leaf_paste_entries ( + bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + /* if appended item is indirect item, put unformatted node into un list */ + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, 0); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" : "UNKNOWN"), flag); + } + } else { + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb,tb->lnum[0],tb->lbytes); + } + } /* tb->lnum[0] > 0 */ + + /* Calculate new item position */ + item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0)); + + if ( tb->rnum[0] > 0 ) { + /* shift rnum[0] items from S[0] to the right neighbor R[0] */ + n = B_NR_ITEMS(tbS0); + switch ( flag ) { + + case M_INSERT: /* insert item */ + if ( n - tb->rnum[0] < item_pos ) + { /* new item or its part falls to R[0] */ + if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 ) + { /* part of new item falls into R[0] */ + loff_t old_key_comp, old_len, r_zeros_number; + const char * r_body; + int version; + loff_t offset; + + leaf_shift_right(tb,tb->rnum[0]-1,-1); + + version = ih_version(ih); + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset( ih ); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into R[0] */ + offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)); + set_le_ih_k_offset( ih, offset ); + put_ih_item_len( ih, tb->rbytes); + /* Insert part of the item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + if ( (old_len - tb->rbytes) > zeros_num ) { + r_zeros_number = 0; + r_body = body + (old_len - tb->rbytes) - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - (old_len - tb->rbytes); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + + /* Calculate key component and item length to insert into S[0] */ + set_le_ih_k_offset( ih, old_key_comp ); + put_ih_item_len( ih, old_len - tb->rbytes ); + + tb->insert_size[0] -= tb->rbytes; + + } + else /* whole new item falls 
into R[0] */ + { + /* Shift rnum[0]-1 items to R[0] */ + ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes); + /* Insert new item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num); + + if ( item_pos - n + tb->rnum[0] - 1 == 0 ) { + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + + } + zeros_num = tb->insert_size[0] = 0; + } + } + else /* new item or part of it doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); + } + break; + + case M_PASTE: /* append item */ + + if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */ + { + if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 ) + { /* we must shift the part of the appended item */ + if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos))) + { /* we append to directory item */ + int entry_count; + + RFALSE( zeros_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos)); + if ( entry_count - tb->rbytes < pos_in_item ) + /* new directory entry falls into R[0] */ + { + int paste_entry_position; + + RFALSE( tb->rbytes - 1 >= entry_count || + ! tb->insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", + tb->rbytes, entry_count); + /* Shift rnum[0]-1 items in whole. 
Shift rbytes-1 directory entries from directory item number rnum[0] */ + leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1); + /* Paste given directory entry to directory item */ + paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, 0, paste_entry_position, + tb->insert_size[0],body,zeros_num); + /* paste entry */ + leaf_paste_entries ( + bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + + if ( paste_entry_position == 0 ) { + /* change delimiting keys */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + } + + tb->insert_size[0] = 0; + pos_in_item++; + } + else /* new directory entry doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); + } + } + else /* regular object */ + { + int n_shift, n_rem, r_zeros_number; + const char * r_body; + + /* Calculate number of bytes which must be shifted from appended item */ + if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 ) + n_shift = 0; + + RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)), + "PAP-12155: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", + pos_in_item, ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos))); + + leaf_shift_right(tb,tb->rnum[0],n_shift); + /* Calculate number of bytes which must remain in body after appending to R[0] */ + if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 ) + n_rem = 0; + + { + int version; + unsigned long temp_rem = n_rem; + + version = ih_version (B_N_PITEM_HEAD (tb->R[0],0)); + if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){ + temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - + UNFM_P_SHIFT); + } + set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0), + le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem); + set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]), + le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem); + } +/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; + k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ + do_balance_mark_internal_dirty (tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + if ( n_rem > zeros_num ) { + r_zeros_number = 0; + r_body = body + n_rem - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number); + + if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) { +#if 0 + RFALSE( n_rem, + "PAP-12160: paste more than one unformatted node pointer"); +#endif + set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0); + } + tb->insert_size[0] = n_rem; + if ( ! 
n_rem ) + pos_in_item ++; + } + } + else /* pasted item in whole falls into R[0] */ + { + struct item_head * pasted; + + ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes); + /* append item in R[0] */ + if ( pos_in_item >= 0 ) { + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item, + tb->insert_size[0],body, zeros_num); + } + + /* paste new entry, if item is directory item */ + pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) { + leaf_paste_entries ( + bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + if ( ! pos_in_item ) { + + RFALSE( item_pos - n + tb->rnum[0], + "PAP-12165: directory item must be first item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + } + } + + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } + else /* new item doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); + } + + } /* tb->rnum[0] > 0 */ + + + RFALSE( tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); + RFALSE( tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); + + /* if while adding to a node we discover that it is possible to split + it in two, and merge the left part into the left neighbor and the + right part into the right neighbor, eliminating the node */ + if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */ + + RFALSE( ! 
tb->lnum[0] || ! tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized"); + copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty (tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb,tbS0); + return 0; + } + + + /* Fill new nodes that appear in place of S[0] */ + + /* I am told that this copying is because we need an array to enable + the looping code. -Hans */ + snum[0] = tb->s1num, + snum[1] = tb->s2num; + sbytes[0] = tb->s1bytes; + sbytes[1] = tb->s2bytes; + for( i = tb->blknum[0] - 2; i >= 0; i-- ) { + + RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]); + + /* here we shift from S to S_new nodes */ + + S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL ); + + + n = B_NR_ITEMS(tbS0); + + switch (flag) { + case M_INSERT: /* insert item */ + + if ( n - snum[i] < item_pos ) + { /* new item or it's part falls to first new node S_new[i]*/ + if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 ) + { /* part of new item falls into S_new[i] */ + int old_key_comp, old_len, r_zeros_number; + const char * r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]); + /* Remember key component and item length */ + version = ih_version (ih); + old_key_comp = le_ih_k_offset( ih ); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into S_new[i] */ + set_le_ih_k_offset( ih, + le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); + + put_ih_item_len( 
ih, sbytes[i] ); + + /* Insert part of the item into S_new[i] before 0-th item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + + if ( (old_len - sbytes[i]) > zeros_num ) { + r_zeros_number = 0; + r_body = body + (old_len - sbytes[i]) - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - (old_len - sbytes[i]); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); + + /* Calculate key component and item length to insert into S[i] */ + set_le_ih_k_offset( ih, old_key_comp ); + put_ih_item_len( ih, old_len - sbytes[i] ); + tb->insert_size[0] -= sbytes[i]; + } + else /* whole new item falls into S_new[i] */ + { + /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]); + + /* Insert new item into S_new[i] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num); + + zeros_num = tb->insert_size[0] = 0; + } + } + + else /* new item or it part don't falls into S_new[i] */ + { + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + } + break; + + case M_PASTE: /* append item */ + + if ( n - snum[i] <= item_pos ) /* pasted item or part if it falls to S_new[i] */ + { + if ( item_pos == n - snum[i] && sbytes[i] != -1 ) + { /* we must shift part of the appended item */ + struct item_head * aux_ih; + + RFALSE( ih, "PAP-12210: ih must be 0"); + + if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) { + /* we append to directory item */ + + int entry_count; + + entry_count = ih_entry_count(aux_ih); + + if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) { + /* new directory entry falls into S_new[i] */ + + RFALSE( ! 
tb->insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE( sbytes[i] - 1 >= entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + sbytes[i] - 1, entry_count); + + /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]); + /* Paste given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, + tb->insert_size[0], body,zeros_num); + /* paste new directory entry */ + leaf_paste_entries ( + bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1, + 1, (struct reiserfs_de_head *)body, body + DEH_SIZE, + tb->insert_size[0] + ); + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + } + } + else /* regular object */ + { + int n_shift, n_rem, r_zeros_number; + const char * r_body; + + RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) || + tb->insert_size[0] <= 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* Calculate number of bytes which must be shifted from appended item */ + n_shift = sbytes[i] - tb->insert_size[0]; + if ( n_shift < 0 ) + n_shift = 0; + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); + + /* Calculate number of bytes which must remain in body after append to S_new[i] */ + n_rem = tb->insert_size[0] - sbytes[i]; + if ( n_rem < 0 ) + n_rem = 0; + /* Append part of body into S_new[0] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + + if ( n_rem > zeros_num ) { + r_zeros_number = 0; + r_body = body + n_rem - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 
0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number); + { + struct item_head * tmp; + + tmp = B_N_PITEM_HEAD(S_new[i],0); + if (is_indirect_le_ih (tmp)) { + set_ih_free_space (tmp, 0); + set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT))); + } else { + set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + + n_rem ); + } + } + + tb->insert_size[0] = n_rem; + if ( ! n_rem ) + pos_in_item++; + } + } + else + /* item falls wholly into S_new[i] */ + { + int ret_val; + struct item_head * pasted; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos); + + if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) || + tb->insert_size[0] <= 0) ) + reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); +#endif /* CONFIG_REISERFS_CHECK */ + + ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + + RFALSE( ret_val, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + ret_val); + + /* paste into item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num); + + pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); + if (is_direntry_le_ih (pasted)) + { + leaf_paste_entries ( + bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + } + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } + + else /* pasted item doesn't fall into S_new[i] */ + { + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? 
"DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); + } + + memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE); + insert_ptr[i] = S_new[i]; + + RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) || + buffer_dirty (S_new [i]), + "PAP-12247: S_new[%d] : (%b)", i, S_new[i]); + } + + /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the + affected item which remains in S */ + if ( 0 <= item_pos && item_pos < tb->s0num ) + { /* if we must insert or append into buffer S[0] */ + + switch (flag) + { + case M_INSERT: /* insert item into S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num); + + /* If we insert the first key change the delimiting key */ + if( item_pos == 0 ) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); + + } + break; + + case M_PASTE: { /* append item in S[0] */ + struct item_head * pasted; + + pasted = B_N_PITEM_HEAD (tbS0, item_pos); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih (pasted)) { + if ( pos_in_item >= 0 && + pos_in_item <= ih_entry_count(pasted) ) { + + RFALSE( ! tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); + + /* paste entry */ + leaf_paste_entries ( + bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + if ( ! item_pos && ! 
pos_in_item ) {
+                        RFALSE( !tb->CFL[0] || !tb->L[0],
+                                "PAP-12270: CFL[0]/L[0] must be specified");
+                        if (tb->CFL[0]) {
+                            replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
+
+                        }
+                    }
+                    tb->insert_size[0] = 0;
+                }
+            } else { /* regular object */
+                if ( pos_in_item == ih_item_len(pasted) ) {
+
+                    RFALSE( tb->insert_size[0] <= 0,
+                            "PAP-12275: insert size must not be %d",
+                            tb->insert_size[0]);
+                    bi.tb = tb;
+                    bi.bi_bh = tbS0;
+                    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+                    bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+                    leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
+
+                    if (is_indirect_le_ih (pasted)) {
+#if 0
+                        RFALSE( tb->insert_size[0] != UNFM_P_SIZE,
+                                "PAP-12280: insert_size for indirect item must be %d, not %d",
+                                UNFM_P_SIZE, tb->insert_size[0]);
+#endif
+                        set_ih_free_space (pasted, 0);
+                    }
+                    tb->insert_size[0] = 0;
+                }
+
+#ifdef CONFIG_REISERFS_CHECK
+                else {
+                    if ( tb->insert_size[0] ) {
+                        print_cur_tb ("12285");
+                        reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]);
+                    }
+                }
+#endif /* CONFIG_REISERFS_CHECK */
+
+            }
+        } /* case M_PASTE: */
+        }
+    }
+
+#ifdef CONFIG_REISERFS_CHECK
+    if ( flag == M_PASTE && tb->insert_size[0] ) {
+        print_cur_tb ("12290");
+        reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]);
+    }
+#endif /* CONFIG_REISERFS_CHECK */
+
+    return 0;
+} /* Leaf level of the tree is balanced (end of balance_leaf) */
+
+
+
+/* Make empty node
+ *
+ * Resets the block head of bi->bi_bh: the item count becomes 0 and the free
+ * space becomes MAX_CHILD_SIZE(bi->bi_bh).  When bi->bi_parent is set, the
+ * dc_size of the parent's child entry at bi->bi_position is zeroed as well.
+ * bi->bi_bh must not be NULL (checked by RFALSE PAP-12295).
+ */
+void make_empty_node (struct buffer_info * bi)
+{
+    struct block_head * blkh;
+
+    RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
+
+    blkh = B_BLK_HEAD(bi->bi_bh);
+    set_blkh_nr_item( blkh, 0 );
+    set_blkh_free_space( blkh, MAX_CHILD_SIZE(bi->bi_bh) );
+
+    if (bi->bi_parent)
+        B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
+}
+
+
+/* Get first empty buffer
+ *
+ * Takes the first non-NULL entry of tb->FEB[], resets it with
+ * make_empty_node() (no parent, position 0), marks it uptodate, moves the
+ * pointer from tb->FEB[i] to tb->used[i] and returns it.  Calls
+ * reiserfs_panic() when every tb->FEB[] slot is empty.
+ */
+struct buffer_head * get_FEB (struct
tree_balance * tb)
+{
+    int i;
+    struct buffer_head * first_b;
+    struct buffer_info bi;
+
+    for (i = 0; i < MAX_FEB_SIZE; i ++)
+        if (tb->FEB[i] != 0)
+            break;
+
+    if (i == MAX_FEB_SIZE)
+        reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty");
+
+    bi.tb = tb;
+    bi.bi_bh = first_b = tb->FEB[i];
+    bi.bi_parent = NULL;
+    bi.bi_position = 0;
+    make_empty_node (&bi);
+    set_buffer_uptodate(first_b);
+    tb->FEB[i] = NULL;
+    tb->used[i] = first_b;
+
+    return(first_b);
+}
+
+
+/* store_thrown: record bh in the first free slot of tb->thrown[] so that
+ * free_thrown() can release its block later.
+ *
+ * This is now used because reiserfs_free_block has to be able to
+ * schedule.
+ *
+ * A reference is taken with get_bh() for the stored pointer.  A warning is
+ * printed if bh is dirty.  If no slot is free only a warning is printed and
+ * bh is not recorded.
+ */
+static void store_thrown (struct tree_balance * tb, struct buffer_head * bh)
+{
+    int i;
+
+    if (buffer_dirty (bh))
+        reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer");
+    for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++)
+        if (!tb->thrown[i]) {
+            tb->thrown[i] = bh;
+            get_bh(bh) ; /* free_thrown puts this */
+            return;
+        }
+    reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers");
+}
+/* free_thrown: for every non-NULL tb->thrown[] entry, warn if the buffer is
+ * dirty, drop the reference taken in store_thrown() with brelse() and free
+ * the block via reiserfs_free_block().
+ * NOTE(review): the tb->thrown[] slots are not reset to NULL here, and
+ * blocknr (b_blocknr_t) is printed with %d - confirm both are intended.
+ */
+static void free_thrown(struct tree_balance *tb) {
+    int i ;
+    b_blocknr_t blocknr ;
+    for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) {
+        if (tb->thrown[i]) {
+            blocknr = tb->thrown[i]->b_blocknr ;
+            if (buffer_dirty (tb->thrown[i]))
+                reiserfs_warning (tb->tb_sb,
+                                  "free_thrown deals with dirty buffer %d",
+                                  blocknr);
+            brelse(tb->thrown[i]) ; /* incremented in store_thrown */
+            reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
+        }
+    }
+}
+/* reiserfs_invalidate_buffer: stamp bh's block head with FREE_LEVEL and an
+ * item count of 0, clear the buffer's dirty bit and pass it to
+ * store_thrown().
+ */
+void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh)
+{
+    struct block_head *blkh;
+    blkh = B_BLK_HEAD(bh);
+    set_blkh_level( blkh, FREE_LEVEL );
+    set_blkh_nr_item( blkh, 0 );
+
+    clear_buffer_dirty(bh);
+    store_thrown (tb, bh);
+}
+
+/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.
+ *
+ * When src is at the items level the key is read from the start of its
+ * n_src'th item head, otherwise from its n_src'th delimiting key; KEY_SIZE
+ * bytes are copied.  dest is then passed to do_balance_mark_internal_dirty().
+ * NOTE(review): assertion vs-12310 requires B_IS_KEYS_LEVEL(dest) while its
+ * message says "dest must be leaf" - confirm which is meant.
+ */
+void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest,
+                  struct buffer_head * src, int n_src)
+{
+
+    RFALSE( dest == NULL || src == NULL,
+            "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
+            src, dest);
+    RFALSE( ! B_IS_KEYS_LEVEL (dest),
+            "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
+            dest);
+    RFALSE( n_dest < 0 || n_src < 0,
+            "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
+    RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
+            "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
+            n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
+
+    if (B_IS_ITEMS_LEVEL (src))
+        /* source buffer contains leaf node: copy the first KEY_SIZE bytes of the item head */
+        memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE);
+    else
+        memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE);
+
+    do_balance_mark_internal_dirty (tb, dest, 0);
+}
+
+/* get_left_neighbor_position: with Sh_position = PATH_H_POSITION(tb->tb_path, h + 1),
+ * returns B_NR_ITEMS(tb->FL[h]) when Sh_position is 0, otherwise
+ * Sh_position - 1.  Callers in this file use the result as the position of
+ * L[h] inside FL[h] (bi_position paired with bi_parent = tb->FL[0]).
+ */
+int get_left_neighbor_position (
+                struct tree_balance * tb,
+                int h
+                )
+{
+    int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+    RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0,
+            "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
+            h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h));
+
+    if (Sh_position == 0)
+        return B_NR_ITEMS (tb->FL[h]);
+    else
+        return Sh_position - 1;
+}
+
+/* get_right_neighbor_position: with Sh_position = PATH_H_POSITION(tb->tb_path, h + 1),
+ * returns 0 when Sh_position equals B_NR_ITEMS of the parent on the path,
+ * otherwise Sh_position + 1.  Callers in this file use the result as the
+ * position of R[h] inside FR[h] (bi_position paired with bi_parent = tb->FR[0]).
+ */
+int get_right_neighbor_position (struct tree_balance * tb, int h)
+{
+    int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+    RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0,
+            "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
+            h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]);
+
+    if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h)))
+        return 0;
+    else
+        return Sh_position + 1;
+}
+
+
+#ifdef CONFIG_REISERFS_CHECK
+/* Checks below are compiled only when CONFIG_REISERFS_CHECK is defined.
+ *
+ * check_internal_node: returns at once when bh is NULL or not in the tree.
+ * Otherwise walks the B_NR_ITEMS(bh) + 1 disk_child entries starting at
+ * B_N_CHILD(bh, 0) and calls reiserfs_panic() (after print_cur_tb(mes)) for
+ * the first one whose block number is_reusable(s, ..., 1) rejects.
+ */
+int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
+static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes)
+{
+    struct disk_child * dc;
+    int i;
+
+    RFALSE( !bh, "PAP-12336: bh == 0");
+
+    if (!bh || !B_IS_IN_TREE (bh))
+        return;
+
+    RFALSE( !buffer_dirty (bh) &&
+            !(buffer_journaled(bh) ||
buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD (bh, 0); + + for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) { + if (!is_reusable (s, dc_block_number(dc), 1) ) { + print_cur_tb (mes); + reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh); + } + } +} + + +static int locked_or_not_in_tree (struct buffer_head * bh, char * which) +{ + if ( (!buffer_journal_prepared (bh) && buffer_locked (bh)) || + !B_IS_IN_TREE (bh) ) { + reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)", + which, bh); + return 1; + } + return 0; +} + + +static int check_before_balancing (struct tree_balance * tb) +{ + int retval = 0; + + if ( cur_tb ) { + reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: " + "suspect that schedule occurred based on cur_tb not being null at this point in code. " + "do_balance cannot properly handle schedule occurring while it runs."); + } + + /* double check that buffers that we will modify are unlocked. (fix_nodes should already have + prepped all of these for us). 
*/ + if ( tb->lnum[0] ) { + retval |= locked_or_not_in_tree (tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]"); + check_leaf (tb->L[0]); + } + if ( tb->rnum[0] ) { + retval |= locked_or_not_in_tree (tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]"); + check_leaf (tb->R[0]); + } + retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]"); + check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); + + return retval; +} + + +static void check_after_balance_leaf (struct tree_balance * tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE (tb->L[0]) != + MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) { + print_cur_tb ("12221"); + reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE (tb->R[0]) != + MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) { + print_cur_tb ("12222"); + reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path,1) && + (B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) != + (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - + dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), + PATH_H_POSITION (tb->tb_path, 1)))) )) { + int left = B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)); + int right = (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - + dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), + PATH_H_POSITION (tb->tb_path, 1)))); + print_cur_tb ("12223"); + reiserfs_warning (tb->tb_sb, + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)), + PATH_H_PBUFFER(tb->tb_path,1), + PATH_H_POSITION (tb->tb_path, 1), + 
dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), PATH_H_POSITION (tb->tb_path, 1 )) ), + right ); + reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect"); + } +} + + +static void check_leaf_level (struct tree_balance * tb) +{ + check_leaf (tb->L[0]); + check_leaf (tb->R[0]); + check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); +} + +static void check_internal_levels (struct tree_balance * tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h ++) { + check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node (tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node (tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + + + + + + +/* Now we have all of the buffers that must be used in balancing of + the tree. We rely on the assumption that schedule() will not occur + while do_balance works. ( Only interrupt handlers are acceptable.) + We balance the tree according to the analysis made before this, + using buffers already obtained. For SMP support it will someday be + necessary to add ordered locking of tb. */ + +/* Some interesting rules of balancing: + + we delete a maximum of two nodes per level per balancing: we never + delete R, when we delete two of three nodes L, S, R then we move + them into R. + + we only delete L if we are deleting two nodes, if we delete only + one node we delete S + + if we shift leaves then we shift as much as we can: this is a + deliberate policy of extremism in node packing which results in + higher average utilization after repeated random balance operations + at the cost of more memory copies and more balancing as a result of + small insertions to full nodes. + + if we shift internal nodes we try to evenly balance the node + utilization, with consequent less balancing at the cost of lower + utilization. 
+ + one could argue that the policy for directories in leaves should be + that of internal nodes, but we will wait until another day to + evaluate this.... It would be nice to someday measure and prove + these assumptions as to what is optimal.... + +*/ + +static inline void do_balance_starts (struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct + tree_balance */ + + /* store_print_tb (tb); */ + + /* do not delete, just comment it out */ +/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, + "check");*/ + RFALSE( check_before_balancing (tb), "PAP-12340: locked buffers in TB"); +#ifdef CONFIG_REISERFS_CHECK + cur_tb = tb; +#endif +} + + +static inline void do_balance_completed (struct tree_balance * tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level (tb); + check_internal_levels (tb); + cur_tb = NULL; +#endif + + /* reiserfs_free_block is no longer schedule safe. So, we need to + ** put the buffers we want freed on the thrown list during do_balance, + ** and then free them now + */ + + REISERFS_SB(tb->tb_sb)->s_do_balance ++; + + + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb) ; +} + + + + + +void do_balance (struct tree_balance * tb, /* tree_balance structure */ + struct item_head * ih, /* item header of inserted item */ + const char * body, /* body of inserted item or bytes to paste */ + int flag) /* i - insert, d - delete + c - cut, p - paste + + Cut means delete part of an item + (includes removing an entry from a + directory). + + Delete means delete whole item. + + Insert means add a new item into the + tree. + + Paste means to append to the end of an + existing file or to insert a directory + entry. 
*/ +{ + int child_pos, /* position of a child node in its parent */ + h; /* level of the tree being processed */ + struct item_head insert_key[2]; /* in our processing of one level + we sometimes determine what + must be inserted into the next + higher level. This insertion + consists of a key or two keys + and their corresponding + pointers */ + struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next + level */ + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ; + } + /* if we have no real work to do */ + if ( ! tb->insert_size[0] ) { + reiserfs_warning (tb->tb_sb, + "PAP-12350: do_balance: insert_size == 0, mode == %c", + flag); + unfix_nodes(tb); + return; + } + + atomic_inc (&(fs_generation (tb->tb_sb))); + do_balance_starts (tb); + + /* balance leaf returns 0 except if combining L R and S into + one node. see balance_internal() for explanation of this + line of code.*/ + child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) + + balance_leaf (tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf (tb); +#endif + + /* Balance internal level of the tree. 
*/ + for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ ) + child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr); + + + do_balance_completed (tb); + +} diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c new file mode 100644 index 000000000000..26950113af8c --- /dev/null +++ b/fs/reiserfs/file.c @@ -0,0 +1,1408 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + + +#include <linux/time.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_acl.h> +#include <linux/reiserfs_xattr.h> +#include <linux/smp_lock.h> +#include <asm/uaccess.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/quotaops.h> + +/* +** We pack the tails of files on file close, not at the time they are written. +** This implies an unnecessary copy of the tail and an unnecessary indirect item +** insertion/balancing, for files that are written in one write. +** It avoids unnecessary tail packings (balances) for files that are written in +** multiple writes and are small enough to have tails. +** +** file_release is called by the VFS layer when the file is closed. If +** this is the last open file descriptor, and the file +** small enough to have a tail, and the tail is currently in an +** unformatted node, the tail is converted back into a direct item. +** +** We use reiserfs_truncate_file to pack the tail, since it already has +** all the conditions coded. 
+*/ +static int reiserfs_file_release (struct inode * inode, struct file * filp) +{ + + struct reiserfs_transaction_handle th ; + int err; + int jbegin_failure = 0; + + if (!S_ISREG (inode->i_mode)) + BUG (); + + /* fast out for when nothing needs to be done */ + if ((atomic_read(&inode->i_count) > 1 || + !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + return 0; + } + + reiserfs_write_lock(inode->i_sb); + down (&inode->i_sem); + /* freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + /* uh oh, we can't allow the inode to go away while there + * is still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb, 1); + + if (err) { + /* hmpf, our choices here aren't good. We can pin the inode + * which will disallow unmount from every happening, we can + * do nothing, which will corrupt random memory on unmount, + * or we can forcibly remove the file from the preallocation + * list, which will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on. 
+ */ + igrab(inode); + reiserfs_warning(inode->i_sb, "pinning inode %lu because the " + "preallocation can't be freed"); + goto out; + } + } + reiserfs_update_inode_transaction(inode) ; + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc (&th, inode); +#endif + err = journal_end(&th, inode->i_sb, 1); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && atomic_read(&inode->i_count) <= 1 && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed (inode)) { + /* if regular file is released by last holder and it has been + appended (we append by unformatted node only) or its direct + item(s) had to be converted, then it may have to be + indirect2direct converted */ + err = reiserfs_truncate_file(inode, 0) ; + } +out: + up (&inode->i_sem); + reiserfs_write_unlock(inode->i_sb); + return err; +} + +static void reiserfs_vfs_truncate_file(struct inode *inode) { + reiserfs_truncate_file(inode, 1) ; +} + +/* Sync a reiserfs file. */ + +/* + * FIXME: sync_mapping_buffers() never has anything to sync. Can + * be removed... + */ + +static int reiserfs_sync_file( + struct file * p_s_filp, + struct dentry * p_s_dentry, + int datasync + ) { + struct inode * p_s_inode = p_s_dentry->d_inode; + int n_err; + int barrier_done; + + if (!S_ISREG(p_s_inode->i_mode)) + BUG (); + n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); + reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); + if (barrier_done < 0) + return barrier_done; + return ( n_err < 0 ) ? -EIO : 0; +} + +/* I really do not want to play with memory shortage right now, so + to simplify the code, we are not going to write more than this much pages at + a time. This still should considerably improve performance compared to 4k + at a time case. This is 32 pages of 4k size. 
*/ +#define REISERFS_WRITE_PAGES_AT_A_TIME (128 * 1024) / PAGE_CACHE_SIZE + +/* Allocates blocks for a file to fulfil write request. + Maps all unmapped but prepared pages from the list. + Updates metadata with newly allocated blocknumbers as needed */ +static int reiserfs_allocate_blocks_for_region( + struct reiserfs_transaction_handle *th, + struct inode *inode, /* Inode we work with */ + loff_t pos, /* Writing position */ + int num_pages, /* number of pages write going + to touch */ + int write_bytes, /* amount of bytes to write */ + struct page **prepared_pages, /* array of + prepared pages + */ + int blocks_to_allocate /* Amount of blocks we + need to allocate to + fit the data into file + */ + ) +{ + struct cpu_key key; // cpu key of item that we are going to deal with + struct item_head *ih; // pointer to item head that we are going to deal with + struct buffer_head *bh; // Buffer head that contains items that we are going to deal with + __u32 * item; // pointer to item we are going to deal with + INITIALIZE_PATH(path); // path to item, that we are going to deal with. + b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored. + reiserfs_blocknr_hint_t hint; // hint structure for block allocator. + size_t res; // return value of various functions that we call. + int curr_block; // current block used to keep track of unmapped blocks. + int i; // loop counter + int itempos; // position in item + unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in + // first page + unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */ + __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created. 
+ int modifying_this_item = 0; // Flag for items traversal code to keep track + // of the fact that we already prepared + // current block for journal + int will_prealloc = 0; + RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?"); + + /* only preallocate if this is a small write */ + if (REISERFS_I(inode)->i_prealloc_count || + (!(write_bytes & (inode->i_sb->s_blocksize -1)) && + blocks_to_allocate < + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)) + will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize; + + allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * + sizeof(b_blocknr_t), GFP_NOFS); + + /* First we compose a key to point at the writing position, we want to do + that outside of any locking region. */ + make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/); + + /* If we came here, it means we absolutely need to open a transaction, + since we need to allocate some blocks */ + reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. + res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I know if this number enough + if (res) + goto error_exit; + reiserfs_update_inode_transaction(inode) ; + + /* Look for the in-tree position of our write, need path for block allocator */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR ) { + res = -EIO; + goto error_exit; + } + + /* Allocate blocks */ + /* First fill in "hint" structure for block allocator */ + hint.th = th; // transaction handle. + hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. + hint.inode = inode; // Inode is needed by block allocator too. + hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. + hint.key = key.on_disk_key; // on disk key of file. 
+ hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already. + hint.formatted_node = 0; // We are allocating blocks for unformatted node. + hint.preallocate = will_prealloc; + + /* Call block allocator to allocate blocks */ + res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); + if ( res != CARRY_ON ) { + if ( res == NO_DISK_SPACE ) { + /* We flush the transaction in case of no space. This way some + blocks might become free */ + SB_JOURNAL(inode->i_sb)->j_must_wait = 1; + res = restart_transaction(th, inode, &path); + if (res) + goto error_exit; + + /* We might have scheduled, so search again */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR ) { + res = -EIO; + goto error_exit; + } + + /* update changed info for hint structure. */ + res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); + if ( res != CARRY_ON ) { + res = -ENOSPC; + pathrelse(&path); + goto error_exit; + } + } else { + res = -ENOSPC; + pathrelse(&path); + goto error_exit; + } + } + +#ifdef __BIG_ENDIAN + // Too bad, I have not found any way to convert a given region from + // cpu format to little endian format + { + int i; + for ( i = 0; i < blocks_to_allocate ; i++) + allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]); + } +#endif + + /* Blocks allocating well might have scheduled and tree might have changed, + let's search the tree again */ + /* find where in the tree our write should go */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR ) { + res = -EIO; + goto error_exit_free_blocks; + } + + bh = get_last_bh( &path ); // Get a bufferhead for last element in path. + ih = get_ih( &path ); // Get a pointer to last item head in path. 
+ item = get_item( &path ); // Get a pointer to last item in path + + /* Let's see what we have found */ + if ( res != POSITION_FOUND ) { /* position not found, this means that we + might need to append file with holes + first */ + // Since we are writing past the file's end, we need to find out if + // there is a hole that needs to be inserted before our writing + // position, and how many blocks it is going to cover (we need to + // populate pointers to file blocks representing the hole with zeros) + + { + int item_offset = 1; + /* + * if ih is stat data, its offset is 0 and we don't want to + * add 1 to pos in the hole_size calculation + */ + if (is_statdata_le_ih(ih)) + item_offset = 0; + hole_size = (pos + item_offset - + (le_key_k_offset( get_inode_item_key_version(inode), + &(ih->ih_key)) + + op_bytes_number(ih, inode->i_sb->s_blocksize))) >> + inode->i_sb->s_blocksize_bits; + } + + if ( hole_size > 0 ) { + int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time. + /* area filled with zeroes, to supply as list of zero blocknumbers + We allocate it outside of loop just in case loop would spin for + several iterations. */ + char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway. + if ( !zeros ) { + res = -ENOMEM; + goto error_exit_free_blocks; + } + memset ( zeros, 0, to_paste*UNFM_P_SIZE); + do { + to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); + if ( is_indirect_le_ih(ih) ) { + /* Ok, there is existing indirect item already. 
Need to append it */ + /* Calculate position past inserted item */ + make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste); + if ( res ) { + kfree(zeros); + goto error_exit_free_blocks; + } + } else if ( is_statdata_le_ih(ih) ) { + /* No existing item, create it */ + /* item head for new item */ + struct item_head ins_ih; + + /* create a key for our new item */ + make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); + + /* Create new item head for our new item */ + make_le_item_head (&ins_ih, &key, key.version, 1, + TYPE_INDIRECT, to_paste*UNFM_P_SIZE, + 0 /* free space */); + + /* Find where such item should live in the tree */ + res = search_item (inode->i_sb, &key, &path); + if ( res != ITEM_NOT_FOUND ) { + /* item should not exist, otherwise we have error */ + if ( res != -ENOSPC ) { + reiserfs_warning (inode->i_sb, + "green-9008: search_by_key (%K) returned %d", + &key, res); + } + res = -EIO; + kfree(zeros); + goto error_exit_free_blocks; + } + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros); + } else { + reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key); + } + if ( res ) { + kfree(zeros); + goto error_exit_free_blocks; + } + /* Now we want to check if transaction is too full, and if it is + we restart it. This will also free the path. 
*/ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + res = restart_transaction(th, inode, &path); + if (res) { + pathrelse (&path); + kfree(zeros); + goto error_exit; + } + } + + /* Well, need to recalculate path and stuff */ + set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits)); + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR ) { + res = -EIO; + kfree(zeros); + goto error_exit_free_blocks; + } + bh=get_last_bh(&path); + ih=get_ih(&path); + item = get_item(&path); + hole_size -= to_paste; + } while ( hole_size ); + kfree(zeros); + } + } + + // Go through existing indirect items first + // replace all zeroes with blocknumbers from list + // Note that if no corresponding item was found, by previous search, + // it means there are no existing in-tree representation for file area + // we are going to overwrite, so there is nothing to scan through for holes. + for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) { +retry: + + if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) { + /* We run out of data in this indirect item, let's look for another + one. */ + /* First if we are already modifying current item, log it */ + if ( modifying_this_item ) { + journal_mark_dirty (th, inode->i_sb, bh); + modifying_this_item = 0; + } + /* Then set the key to look for a new indirect item (offset of old + item is added to old item length */ + set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize)); + /* Search ofor position of new key in the tree. */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR) { + res = -EIO; + goto error_exit_free_blocks; + } + bh=get_last_bh(&path); + ih=get_ih(&path); + item = get_item(&path); + itempos = path.pos_in_item; + continue; // loop to check all kinds of conditions and so on. 
+ } + /* Ok, we have correct position in item now, so let's see if it is + representing file hole (blocknumber is zero) and fill it if needed */ + if ( !item[itempos] ) { + /* Ok, a hole. Now we need to check if we already prepared this + block to be journaled */ + while ( !modifying_this_item ) { // loop until succeed + /* Well, this item is not journaled yet, so we must prepare + it for journal first, before we can change it */ + struct item_head tmp_ih; // We copy item head of found item, + // here to detect if fs changed under + // us while we were preparing for + // journal. + int fs_gen; // We store fs generation here to find if someone + // changes fs under our feet + + copy_item_head (&tmp_ih, ih); // Remember itemhead + fs_gen = get_generation (inode->i_sb); // remember fs generation + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing. + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + // Sigh, fs was changed under us, we need to look for new + // location of item we are working with + + /* unmark prepaerd area as journaled and search for it's + new position */ + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR) { + res = -EIO; + goto error_exit_free_blocks; + } + bh=get_last_bh(&path); + ih=get_ih(&path); + item = get_item(&path); + itempos = path.pos_in_item; + goto retry; + } + modifying_this_item = 1; + } + item[itempos] = allocated_blocks[curr_block]; // Assign new block + curr_block++; + } + itempos++; + } + + if ( modifying_this_item ) { // We need to log last-accessed block, if it + // was modified, but not logged yet. 
+ journal_mark_dirty (th, inode->i_sb, bh); + } + + if ( curr_block < blocks_to_allocate ) { + // Oh, well need to append to indirect item, or to create indirect item + // if there weren't any + if ( is_indirect_le_ih(ih) ) { + // Existing indirect item - append. First calculate key for append + // position. We do not need to recalculate path as it should + // already point to correct place. + make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); + if ( res ) { + goto error_exit_free_blocks; + } + } else if (is_statdata_le_ih(ih) ) { + // Last found item was statdata. That means we need to create indirect item. + struct item_head ins_ih; /* itemhead for new item */ + + /* create a key for our new item */ + make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one, + // because that's + // where first + // indirect item + // begins + /* Create new item head for our new item */ + make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT, + (blocks_to_allocate-curr_block)*UNFM_P_SIZE, + 0 /* free space */); + /* Find where such item should live in the tree */ + res = search_item (inode->i_sb, &key, &path); + if ( res != ITEM_NOT_FOUND ) { + /* Well, if we have found such item already, or some error + occured, we need to warn user and return error */ + if ( res != -ENOSPC ) { + reiserfs_warning (inode->i_sb, + "green-9009: search_by_key (%K) " + "returned %d", &key, res); + } + res = -EIO; + goto error_exit_free_blocks; + } + /* Insert item into the tree with the data as its body */ + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block)); + } else { + reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key); + } + } + + // the caller is 
responsible for closing the transaction + // unless we return an error, they are also responsible for logging + // the inode. + // + pathrelse(&path); + /* + * cleanup prellocation from previous writes + * if this is a partial block write + */ + if (write_bytes & (inode->i_sb->s_blocksize -1)) + reiserfs_discard_prealloc(th, inode); + reiserfs_write_unlock(inode->i_sb); + + // go through all the pages/buffers and map the buffers to newly allocated + // blocks (so that system knows where to write these pages later). + curr_block = 0; + for ( i = 0; i < num_pages ; i++ ) { + struct page *page=prepared_pages[i]; //current page + struct buffer_head *head = page_buffers(page);// first buffer for a page + int block_start, block_end; // in-page offsets for buffers. + + if (!page_buffers(page)) + reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???"); + + /* For each buffer in page */ + for(bh = head, block_start = 0; bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + if (!bh) + reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?"); + block_end = block_start+inode->i_sb->s_blocksize; + if (i == 0 && block_end <= from ) + /* if this buffer is before requested data to map, skip it */ + continue; + if (i == num_pages - 1 && block_start >= to) + /* If this buffer is after requested data to map, abort + processing of current page */ + break; + + if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it + map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block])); + curr_block++; + set_buffer_new(bh); + } + } + } + + RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird"); + + kfree(allocated_blocks); + return 0; + +// Need to deal with transaction here. 
+error_exit_free_blocks: + pathrelse(&path); + // free blocks + for( i = 0; i < blocks_to_allocate; i++ ) + reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1); + +error_exit: + if (th->t_trans_id) { + int err; + // update any changes we made to blk count + reiserfs_update_sd(th, inode); + err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); + if (err) + res = err; + } + reiserfs_write_unlock(inode->i_sb); + kfree(allocated_blocks); + + return res; +} + +/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */ +static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */ + size_t num_pages /* amount of pages */) { + int i; // loop counter + + for (i=0; i < num_pages ; i++) { + struct page *page = prepared_pages[i]; + + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + } +} + +/* This function will copy data from userspace to specified pages within + supplied byte range */ +static int reiserfs_copy_from_user_to_file_region( + loff_t pos, /* In-file position */ + int num_pages, /* Number of pages affected */ + int write_bytes, /* Amount of bytes to write */ + struct page **prepared_pages, /* pointer to + array to + prepared pages + */ + const char __user *buf /* Pointer to user-supplied + data*/ + ) +{ + long page_fault=0; // status of copy_from_user. + int i; // loop counter. + int offset; // offset in page + + for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { + size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page + struct page *page=prepared_pages[i]; // Current page we process. + + fault_in_pages_readable( buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data. 
+ /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf+=count; + write_bytes-=count; + + if (page_fault) + break; // Was there a fault? abort. + } + + return page_fault?-EFAULT:0; +} + +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) + { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + } else if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + if (logit) { + ret = journal_end(&th, s, bh_per_page + 1); +drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). 
Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; +} + + +/* Submit pages for write. This was separated from actual file copying + because we might want to allocate block numbers in-between. + This function assumes that caller will adjust file size to correct value. */ +static int reiserfs_submit_file_region_for_write( + struct reiserfs_transaction_handle *th, + struct inode *inode, + loff_t pos, /* Writing position offset */ + size_t num_pages, /* Number of pages to write */ + size_t write_bytes, /* number of bytes to write */ + struct page **prepared_pages /* list of pages */ + ) +{ + int status; // return status of block_commit_write. + int retval = 0; // Return value we are going to return. + int i; // loop counter + int offset; // Writing offset in page. + int orig_write_bytes = write_bytes; + int sd_update = 0; + + for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { + int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page + struct page *page=prepared_pages[i]; // Current page we process. + + status = reiserfs_commit_page(inode, page, offset, offset+count); + if ( status ) + retval = status; // To not overcomplicate matters We are going to + // submit all the pages even if there was error. + // we only remember error status to report it on + // exit. 
+ write_bytes-=count; + } + /* now that we've gotten all the ordered buffers marked dirty, + * we can safely update i_size and close any running transaction + */ + if ( pos + orig_write_bytes > inode->i_size) { + inode->i_size = pos + orig_write_bytes; // Set new size + /* If the file have grown so much that tail packing is no + * longer possible, reset "need to pack" flag */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size > i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size > i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + else if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_sd(th, inode); // And update on-disk metadata + reiserfs_write_unlock(inode->i_sb); + } else + inode->i_sb->s_op->dirty_inode(inode); + + sd_update = 1; + } + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + if (!sd_update) + reiserfs_update_sd(th, inode); + status = journal_end(th, th->t_super, th->t_blocks_allocated); + if (status) + retval = status; + reiserfs_write_unlock(inode->i_sb); + } + th->t_trans_id = 0; + + /* + * we have to unlock the pages after updating i_size, otherwise + * we race with writepage + */ + for ( i = 0; i < num_pages ; i++) { + struct page *page=prepared_pages[i]; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + return retval; +} + +/* Look if passed writing region is going to touch file's tail + (if it is present). 
And if it is, convert the tail to unformatted node */ +static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */ + loff_t pos, /* Writing position */ + int write_bytes /* amount of bytes to write */ + ) +{ + INITIALIZE_PATH(path); // needed for search_for_position + struct cpu_key key; // Key that would represent last touched writing byte. + struct item_head *ih; // item header of found block; + int res; // Return value of various functions we call. + int cont_expand_offset; // We will put offset for generic_cont_expand here + // This can be int just because tails are created + // only for small files. + +/* this embodies a dependency on a particular tail policy */ + if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) { + /* such a big files do not have tails, so we won't bother ourselves + to look for tails, simply return */ + return 0; + } + + reiserfs_write_lock(inode->i_sb); + /* find the item containing the last byte to be written, or if + * writing past the end of the file then the last item of the + * file (and then we check its type). */ + make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/); + res = search_for_position_by_key(inode->i_sb, &key, &path); + if ( res == IO_ERROR ) { + reiserfs_write_unlock(inode->i_sb); + return -EIO; + } + ih = get_ih(&path); + res = 0; + if ( is_direct_le_ih(ih) ) { + /* Ok, closest item is file tail (tails are stored in "direct" + * items), so we need to unpack it. */ + /* To not overcomplicate matters, we just call generic_cont_expand + which will in turn call other stuff and finally will boil down to + reiserfs_get_block() that would do necessary conversion. 
*/ + cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); + pathrelse(&path); + res = generic_cont_expand( inode, cont_expand_offset); + } else + pathrelse(&path); + + reiserfs_write_unlock(inode->i_sb); + return res; +} + +/* This function locks pages starting from @pos for @inode. + @num_pages pages are locked and stored in + @prepared_pages array. Also buffers are allocated for these pages. + First and last page of the region is read if it is overwritten only + partially. If last page did not exist before write (file hole or file + append), it is zeroed, then. + Returns number of unallocated blocks that should be allocated to cover + new file data.*/ +static int reiserfs_prepare_file_region_for_write( + struct inode *inode /* Inode of the file */, + loff_t pos, /* position in the file */ + size_t num_pages, /* number of pages to + prepare */ + size_t write_bytes, /* Amount of bytes to be + overwritten from + @pos */ + struct page **prepared_pages /* pointer to array + where to store + prepared pages */ + ) +{ + int res=0; // Return values of different functions we call. + unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages. + int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page + int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; + /* offset of last modified byte in last + page */ + struct address_space *mapping = inode->i_mapping; // Pages are mapped here. + int i; // Simple counter + int blocks = 0; /* Return value (blocks that should be allocated) */ + struct buffer_head *bh, *head; // Current bufferhead and first bufferhead + // of a page. + unsigned block_start, block_end; // Starting and ending offsets of current + // buffer in the page. + struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if + // Page appeared to be not up + // to date. 
Note how we have + // at most 2 buffers, this is + // because we at most may + // partially overwrite two + // buffers for one page. One at // the beginning of write area + // and one at the end. + // Everything inthe middle gets // overwritten totally. + + struct cpu_key key; // cpu key of item that we are going to deal with + struct item_head *ih = NULL; // pointer to item head that we are going to deal with + struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with + INITIALIZE_PATH(path); // path to item, that we are going to deal with. + __u32 * item=NULL; // pointer to item we are going to deal with + int item_pos=-1; /* Position in indirect item */ + + + if ( num_pages < 1 ) { + reiserfs_warning (inode->i_sb, + "green-9001: reiserfs_prepare_file_region_for_write " + "called with zero number of pages to process"); + return -EFAULT; + } + + /* We have 2 loops for pages. In first loop we grab and lock the pages, so + that nobody would touch these until we release the pages. Then + we'd start to deal with mapping buffers to blocks. */ + for ( i = 0; i < num_pages; i++) { + prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page + if ( !prepared_pages[i]) { + res = -ENOMEM; + goto failed_page_grabbing; + } + if (!page_has_buffers(prepared_pages[i])) + create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0); + } + + /* Let's count amount of blocks for a case where all the blocks + overwritten are new (we will substract already allocated blocks later)*/ + if ( num_pages > 2 ) + /* These are full-overwritten pages so we count all the blocks in + these pages are counted as needed to be allocated */ + blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + /* count blocks needed for first page (possibly partially written) */ + blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + + !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */ + + /* Now we account for last page. 
If last page == first page (we + overwrite only one page), we substract all the blocks past the + last writing position in a page out of already calculated number + of blocks */ + blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) - + ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits); + /* Note how we do not roundup here since partial blocks still + should be allocated */ + + /* Now if all the write area lies past the file end, no point in + maping blocks, since there is none, so we just zero out remaining + parts of first and last pages in write area (if needed) */ + if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) { + if ( from != 0 ) {/* First page needs to be partially zeroed */ + char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); + memset(kaddr, 0, from); + kunmap_atomic( kaddr, KM_USER0); + } + if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */ + char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); + memset(kaddr+to, 0, PAGE_CACHE_SIZE - to); + kunmap_atomic( kaddr, KM_USER0); + } + + /* Since all blocks are new - use already calculated value */ + return blocks; + } + + /* Well, since we write somewhere into the middle of a file, there is + possibility we are writing over some already allocated blocks, so + let's map these blocks and substract number of such blocks out of blocks + we need to allocate (calculated above) */ + /* Mask write position to start on blocksize, we do it out of the + loop for performance reasons */ + pos &= ~((loff_t) inode->i_sb->s_blocksize - 1); + /* Set cpu key to the starting position in a file (on left block boundary)*/ + make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/); + + reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key() + for ( i = 0; i < num_pages ; i++ ) { + + head = page_buffers(prepared_pages[i]); + /* For each buffer in the page */ + for(bh = head, block_start = 0; 
bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + if (!bh) + reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start+inode->i_sb->s_blocksize; + if (i == 0 && block_end <= from ) + /* if this buffer is before requested data to map, skip it*/ + continue; + + if (i == num_pages - 1 && block_start >= to) { + /* If this buffer is after requested data to map, abort + processing of current page */ + break; + } + + if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) { + /* This is optimisation for a case where buffer is mapped + and have blocknumber assigned. In case significant amount + of such buffers are present, we may avoid some amount + of search_by_key calls. + Probably it would be possible to move parts of this code + out of BKL, but I afraid that would overcomplicate code + without any noticeable benefit. + */ + item_pos++; + /* Update the key */ + set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); + blocks--; // Decrease the amount of blocks that need to be + // allocated + continue; // Go to the next buffer + } + + if ( !itembuf || /* if first iteration */ + item_pos >= ih_item_len(ih)/UNFM_P_SIZE) + { /* or if we progressed past the + current unformatted_item */ + /* Try to find next item */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + /* Abort if no more items */ + if ( res != POSITION_FOUND ) { + /* make sure later loops don't use this item */ + itembuf = NULL; + item = NULL; + break; + } + + /* Update information about current indirect item */ + itembuf = get_last_bh( &path ); + ih = get_ih( &path ); + item = get_item( &path ); + item_pos = path.pos_in_item; + + RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected"); + } + + /* See if there is some block associated with the file + at that position, map the buffer to this block */ + if ( get_block_num(item,item_pos) ) { + map_bh(bh, 
inode->i_sb, get_block_num(item,item_pos)); + blocks--; // Decrease the amount of blocks that need to be + // allocated + } + item_pos++; + /* Update the key */ + set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); + } + } + pathrelse(&path); // Free the path + reiserfs_write_unlock(inode->i_sb); + + /* Now zero out unmappend buffers for the first and last pages of + write area or issue read requests if page is mapped. */ + /* First page, see if it is not uptodate */ + if ( !PageUptodate(prepared_pages[0]) ) { + head = page_buffers(prepared_pages[0]); + + /* For each buffer in page */ + for(bh = head, block_start = 0; bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + + if (!bh) + reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start+inode->i_sb->s_blocksize; + if ( block_end <= from ) + /* if this buffer is before requested data to map, skip it*/ + continue; + if ( block_start < from ) { /* Aha, our partial buffer */ + if ( buffer_mapped(bh) ) { /* If it is mapped, we need to + issue READ request for it to + not loose data */ + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } else { /* Not mapped, zero it */ + char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); + memset(kaddr+block_start, 0, from-block_start); + kunmap_atomic( kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + } + } + } + + /* Last page, see if it is not uptodate, or if the last page is past the end of the file. 
*/ + if ( !PageUptodate(prepared_pages[num_pages-1]) || + ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) { + head = page_buffers(prepared_pages[num_pages-1]); + + /* for each buffer in page */ + for(bh = head, block_start = 0; bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + + if (!bh) + reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start+inode->i_sb->s_blocksize; + if ( block_start >= to ) + /* if this buffer is after requested data to map, skip it*/ + break; + if ( block_end > to ) { /* Aha, our partial buffer */ + if ( buffer_mapped(bh) ) { /* If it is mapped, we need to + issue READ request for it to + not loose data */ + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } else { /* Not mapped, zero it */ + char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); + memset(kaddr+to, 0, block_end-to); + kunmap_atomic( kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + } + } + } + + /* Wait for read requests we made to happen, if necessary */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) { + res = -EIO; + goto failed_read; + } + } + + return blocks; +failed_page_grabbing: + num_pages = i; +failed_read: + reiserfs_unprepare_pages(prepared_pages, num_pages); + return res; +} + +/* Write @count bytes at position @ppos in a file indicated by @file + from the buffer @buf. + + generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want + something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was + written for (ext2/3). This is for several reasons: + + * It has no understanding of any filesystem specific optimizations. + + * It enters the filesystem repeatedly for each page that is written. 
+ + * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key + * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time + * to reiserfs which allows for fewer tree traversals. + + * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. + + * Asking the block allocation code for blocks one at a time is slightly less efficient. + + All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to + use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make + things right finally. + + Future Features: providing search_by_key with hints. + +*/ +static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */ + const char __user *buf, /* pointer to user supplied data +(in userspace) */ + size_t count, /* amount of bytes to write */ + loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to + * new current position before returning. */ ) +{ + size_t already_written = 0; // Number of bytes already written to the file. + loff_t pos; // Current position in the file. + ssize_t res; // return value of various functions that we call. + int err = 0; + struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. + /* To simplify coding at this time, we store + locked pages in array for now */ + struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment + ssize_t result, after_file_end = 0; + if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) { + /* If we are appending a file, we need to put this savelink in here. 
+ If we will crash while doing direct io, finish_unfinished will + cut the garbage from the file end. */ + reiserfs_write_lock(inode->i_sb); + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); + if (err) { + reiserfs_write_unlock (inode->i_sb); + return err; + } + reiserfs_update_inode_transaction(inode); + add_save_link (&th, inode, 1 /* Truncate */); + after_file_end = 1; + err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); + reiserfs_write_unlock(inode->i_sb); + if (err) + return err; + } + result = generic_file_write(file, buf, count, ppos); + + if ( after_file_end ) { /* Now update i_size and remove the savelink */ + struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + reiserfs_write_unlock (inode->i_sb); + return err; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th, inode->i_sb, 1); + if (err) { + reiserfs_write_unlock (inode->i_sb); + return err; + } + err = remove_save_link (inode, 1/* truncate */); + reiserfs_write_unlock(inode->i_sb); + if (err) + return err; + } + + return result; + } + + if ( unlikely((ssize_t) count < 0 )) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, buf, count))) + return -EFAULT; + + down(&inode->i_sem); // locks the entire file for just us + + pos = *ppos; + + /* Check if we can write to specified region of file, file + is not overly big and this kind of stuff. Adjust pos and + count, if needed */ + res = generic_write_checks(file, &pos, &count, 0); + if (res) + goto out; + + if ( count == 0 ) + goto out; + + res = remove_suid(file->f_dentry); + if (res) + goto out; + + inode_update_time(inode, 1); /* Both mtime and ctime */ + + // Ok, we are done with all the checks. 
+ + // Now we should start real work + + /* If we are going to write past the file's packed tail or if we are going + to overwrite part of the tail, we need that tail to be converted into + unformatted node */ + res = reiserfs_check_for_tail_and_convert( inode, pos, count); + if (res) + goto out; + + while ( count > 0) { + /* This is the main loop in which we running until some error occures + or until we write all of the data. */ + size_t num_pages;/* amount of pages we are going to write this iteration */ + size_t write_bytes; /* amount of bytes to write during this iteration */ + size_t blocks_to_allocate; /* how much blocks we need to allocate for this iteration */ + + /* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/ + num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial + pages */ + ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT); + /* convert size to amount of + pages */ + reiserfs_write_lock(inode->i_sb); + if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME + || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) { + /* If we were asked to write more data than we want to or if there + is not that much space, then we shorten amount of data to write + for this iteration. 
*/ + num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb)); + /* Also we should not forget to set size in bytes accordingly */ + write_bytes = (num_pages << PAGE_CACHE_SHIFT) - + (pos & (PAGE_CACHE_SIZE-1)); + /* If position is not on the + start of the page, we need + to substract the offset + within page */ + } else + write_bytes = count; + + /* reserve the blocks to be allocated later, so that later on + we still have the space to write the blocks to */ + reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); + reiserfs_write_unlock(inode->i_sb); + + if ( !num_pages ) { /* If we do not have enough space even for */ + res = -ENOSPC; /* single page, return -ENOSPC */ + if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1))) + break; // In case we are writing past the file end, break. + // Otherwise we are possibly overwriting the file, so + // let's set write size to be equal or less than blocksize. + // This way we get it correctly for file holes. + // But overwriting files on absolutelly full volumes would not + // be very efficient. Well, people are not supposed to fill + // 100% of disk space anyway. + write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1))); + num_pages = 1; + // No blocks were claimed before, so do it now. + reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)); + } + + /* Prepare for writing into the region, read in all the + partially overwritten pages, if needed. And lock the pages, + so that nobody else can access these until we are done. 
+ We get number of actual blocks needed as a result.*/ + blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages); + if ( blocks_to_allocate < 0 ) { + res = blocks_to_allocate; + reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); + break; + } + + /* First we correct our estimate of how many blocks we need */ + reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate ); + + if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/ + /* Fill in all the possible holes and append the file if needed */ + res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); + } + + /* well, we have allocated the blocks, so it is time to free + the reservation we made earlier. */ + reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate); + if ( res ) { + reiserfs_unprepare_pages(prepared_pages, num_pages); + break; + } + +/* NOTE that allocating blocks and filling blocks can be done in reverse order + and probably we would do that just to get rid of garbage in files after a + crash */ + + /* Copy data from user-supplied buffer to file's pages */ + res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf); + if ( res ) { + reiserfs_unprepare_pages(prepared_pages, num_pages); + break; + } + + /* Send the pages to disk and unlock them. 
*/ + res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages, + write_bytes,prepared_pages); + if ( res ) + break; + + already_written += write_bytes; + buf += write_bytes; + *ppos = pos += write_bytes; + count -= write_bytes; + balance_dirty_pages_ratelimited(inode->i_mapping); + } + + /* this is only true on error */ + if (th.t_trans_id) { + reiserfs_write_lock(inode->i_sb); + err = journal_end(&th, th.t_super, th.t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + if (err) { + res = err; + goto out; + } + } + + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); + + up(&inode->i_sem); + reiserfs_async_progress_wait(inode->i_sb); + return (already_written != 0)?already_written:res; + +out: + up(&inode->i_sem); // unlock the file on exit. + return res; +} + +static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + return generic_file_aio_write(iocb, buf, count, pos); +} + + + +struct file_operations reiserfs_file_operations = { + .read = generic_file_read, + .write = reiserfs_file_write, + .ioctl = reiserfs_ioctl, + .mmap = generic_file_mmap, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .sendfile = generic_file_sendfile, + .aio_read = generic_file_aio_read, + .aio_write = reiserfs_aio_write, +}; + + +struct inode_operations reiserfs_file_inode_operations = { + .truncate = reiserfs_vfs_truncate_file, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + + diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c new file mode 100644 index 000000000000..e4f64be9e15b --- /dev/null +++ b/fs/reiserfs/fix_node.c @@ -0,0 +1,2518 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/** + ** old_item_num 
+ ** old_entry_num + ** set_entry_sizes + ** create_virtual_node + ** check_left + ** check_right + ** directory_part_size + ** get_num_ver + ** set_parameters + ** is_leaf_removable + ** are_leaves_removable + ** get_empty_nodes + ** get_lfree + ** get_rfree + ** is_left_neighbor_in_cache + ** decrement_key + ** get_far_parent + ** get_parents + ** can_node_be_removed + ** ip_check_balance + ** dc_check_balance_internal + ** dc_check_balance_leaf + ** dc_check_balance + ** check_balance + ** get_direct_parent + ** get_neighbors + ** fix_nodes + ** + ** + **/ + + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/string.h> +#include <linux/reiserfs_fs.h> +#include <linux/buffer_head.h> + + +/* To make any changes in the tree we find a node, that contains item + to be changed/deleted or position in the node we insert a new item + to. We call this node S. To do balancing we need to decide what we + will shift to left/right neighbor, or to a new node, where new item + will be etc. To make this analysis simpler we build virtual + node. Virtual node is an array of items, that will replace items of + node S. (For instance if we are going to delete an item, virtual + node does not contain it). Virtual node keeps information about + item sizes and types, mergeability of first and last items, sizes + of all entries in directory item. We use this array of items when + calculating what we can shift to neighbors and how many nodes we + have to have if we do not any shiftings, if we shift to left/right + neighbor or to both. 
*/ + + +/* taking item number in virtual node, returns number of item, that it has in source buffer */ +static inline int old_item_num (int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + + RFALSE( new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); + + return new_num - 1; + } + + RFALSE( mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode); + /* delete mode */ + return new_num + 1; +} + +static void create_virtual_node (struct tree_balance * tb, int h) +{ + struct item_head * ih; + struct virtual_node * vn = tb->tb_vn; + int new_num; + struct buffer_head * Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h]; + + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 
1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item); + + + /* first item in the node */ + ih = B_N_PITEM_HEAD (Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) { + int j; + struct virtual_item * vi = vn->vn_vi + new_num; + int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1); + + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = B_I_PITEM (Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + // FIXME: there is no check, that item operation did not + // consume too much memory + vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: " + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + } + } + + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE( vn->vn_ins_ih == 0, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len 
= tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]); + } + + /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + if (tb->CFR[0]) { + struct reiserfs_key * key; + + key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE || + vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1)) + vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable (key, Sh->b_size) && + !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) { + /* we delete last item and it could be merged with right neighbor's first item */ + if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) && + I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) { + /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + print_block (Sh, 0, -1, -1); + reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", + key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); + } else + /* we can delete directory item, that has only one directory entry in it */ + ; + } +#endif + + } +} + + +/* using virtual node check, how many items can be shifted to left + neighbor */ +static void check_left (struct tree_balance * tb, int h, int cur_free) +{ + int i; + struct virtual_node * vn = tb->tb_vn; + struct virtual_item * vi; + int d_size, ih_size; + + RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + + RFALSE( !PATH_H_PPARENT 
(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); + + vi = vn->vn_vi; + if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + + RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0] ++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* check whether L[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { + /* cannot shift even a part of the current item */ + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left (vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0] ++; + + break; + } + + return; +} + + +/* using virtual node check, how many items can be shifted to right + neighbor */ +static void check_right (struct tree_balance * tb, int h, int cur_free) +{ + int i; + struct virtual_node * vn = tb->tb_vn; + struct virtual_item * vi; + int d_size, ih_size; + + RFALSE( cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + + RFALSE( !PATH_H_PPARENT (tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if 
((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0] ++; + continue; + } + + /* check whether R[0] can hold ih and at least one byte of the item body */ + if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */ + tb->rbytes = -1; + return; + } + + /* R[0] can hold the header of the item and at least one byte of its body */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right (vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0] ++; + + break; + } + + return; +} + + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ +static int get_num_ver (int mode, struct tree_balance * tb, int h, + int from, int from_bytes, + int to, int to_bytes, + short * snum012, int flow + ) +{ + int i; + int cur_free; + // int bytes; + int units; + struct virtual_node * vn = tb->tb_vn; + // struct virtual_item * vi; + + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + int 
start_item, /* position of item we start filling node from */ + end_item, /* position of item we finish filling node by */ + start_bytes,/* number of first bytes (entries for directory) of start_item-th item + we do not include into node that is being filled */ + end_bytes; /* number of last bytes (entries for directory) of end_item-th item + we do node include into node that is being filled */ + int split_item_positions[2]; /* these are positions in virtual item of + items, that are split between S[0] and + S1new and S1new and S2new */ + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* We only create additional nodes if we are in insert or paste mode + or we are in replace mode at the internal level. If h is 0 and + the mode is M_REPLACE then in fix_nodes we change the mode to + paste or insert before we get here in the code. */ + RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h)); + + /* snum012 [0-2] - number of items, that lay + to S[0], first new node and second new node */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + // start from 'from'-th item + start_item = from; + // skip its first 'start_bytes' units + start_bytes = ((from_bytes != -1) ? from_bytes : 0); + + // last included item is the 'end_item'-th one + end_item = vn->vn_nr_item - to - 1; + // do not count last 'end_bytes' units of 'end_item'-th item + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* go through all item beginning from the start_item-th item and ending by + the end_item-th item. 
Do not count first 'start_bytes' units of + 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ + + for (i = start_item; i <= end_item; i ++) { + struct virtual_item * vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + + RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* do not take in calculation head part (from_bytes) of from-th item */ + current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1] ++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + if (current_item_size > max_node_size) { + /* virtual item length is longer, than max size of item in + a node. It is impossible for direct item */ + RFALSE( is_direct_le_ih (vi->vi_ih), + "vs-8110: " + "direct item length is %d. 
It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + if (!flow) { + /* as we do not split items, take new node and continue */ + needed_nodes ++; i --; total_node_size = 0; + continue; + } + + // calculate number of item units which fit into node being + // filled + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = op_check_left (vi, free_space, start_bytes, skip_from_end); + if (units == -1) { + /* nothing fits into current node, take new node and continue */ + needed_nodes ++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + //if (snum012[3] != -1 || needed_nodes != 1) + // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); + //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: " + "split_item_position is out of boundary"); + snum012[needed_nodes - 1] ++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes ++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i --; + total_node_size = 0; + } + + // sum012[4] (if it is not -1) contains number of units of which + // are to be in S1new, snum012[3] - to be in S0. They are supposed + // to be S1bytes and S2bytes correspondingly, so recalculate + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? 
snum012[3] : 0); + + // s2bytes + snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not " + "directory or indirect item"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); + + // s1bytes + snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + + +/* Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. 
+ * lbytes number of bytes which flow to the left neighbor from the item that is not + * not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the item that is not + * not shifted entirely + * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters (struct tree_balance * tb, int h, int lnum, + int rnum, int blk_num, short * s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + if (h == 0) + { /* only for leaf level */ + if (s012 != NULL) + { + tb->s0num = * s012 ++, + tb->s1num = * s012 ++, + tb->s2num = * s012 ++; + tb->s1bytes = * s012 ++; + tb->s2bytes = * s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } + PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum ); + PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum ); + + PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb ); + PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb ); +} + + + +/* check, does node disappear if we shift tb->lnum[0] items to left + neighbor and tb->rnum[0] to the right one. */ +static int is_leaf_removable (struct tree_balance * tb) +{ + struct virtual_node * vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* number of items, that will be shifted to left (right) neighbor + entirely */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + if (remain_items < 1) { + /* all content of node can be shifted to neighbors */ + set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); + return 1; + } + + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + /* S[0] is not removable */ + return 0; + + /* check, whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num (&(vn->vn_vi[to_left])); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1); + return 1; + } + + return 0; +} + + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree) +{ + struct virtual_node * vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER (tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head * ih; + + RFALSE( B_NR_ITEMS (S0) != 1, + "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); + + ih = B_N_PITEM_HEAD (S0, 0); + if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]))) + if (is_direntry_le_ih (ih)) { + /* Directory must be in correct state here: that is + somewhere at the left side should exist first directory + item. But the item being deleted can not be that first + one because its right neighbor is item of the same + directory. (But first item always gets deleted in last + turn). 
So, neighbors of deleted item can be merged, so + we can save ih_size */ + ih_size = IH_SIZE; + + /* we might check that left neighbor exists and is of the + same directory */ + RFALSE(le_ih_k_offset (ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + + } + + if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC( tb -> tb_sb, leaves_removable ); + return 1; + } + return 0; + +} + + + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + + +static void free_buffers_in_tb ( + struct tree_balance * p_s_tb + ) { + int n_counter; + + decrement_counters_in_path(p_s_tb->tb_path); + + for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) { + decrement_bcount(p_s_tb->L[n_counter]); + p_s_tb->L[n_counter] = NULL; + decrement_bcount(p_s_tb->R[n_counter]); + p_s_tb->R[n_counter] = NULL; + decrement_bcount(p_s_tb->FL[n_counter]); + p_s_tb->FL[n_counter] = NULL; + decrement_bcount(p_s_tb->FR[n_counter]); + p_s_tb->FR[n_counter] = NULL; + 
decrement_bcount(p_s_tb->CFL[n_counter]); + p_s_tb->CFL[n_counter] = NULL; + decrement_bcount(p_s_tb->CFR[n_counter]); + p_s_tb->CFR[n_counter] = NULL; + } +} + + +/* Get new buffers for storing new nodes that are created while balancing. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_empty_nodes( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_new_bh, + * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h); + b_blocknr_t * p_n_blocknr, + a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, }; + int n_counter, + n_number_of_freeblk, + n_amount_needed,/* number of needed empty blocks */ + n_retval = CARRY_ON; + struct super_block * p_s_sb = p_s_tb->tb_sb; + + + /* number_of_freeblk is the number of empty blocks which have been + acquired for use by the balancing algorithm minus the number of + empty blocks used in the previous levels of the analysis, + number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs + after empty blocks are acquired, and the balancing analysis is + then restarted, amount_needed is the number needed by this level + (n_h) of the balancing analysis. + + Note that for systems with many processes writing, it would be + more layout optimal to calculate the total number needed by all + levels and then to run reiserfs_new_blocks to get all of them at once. */ + + /* Initiate number_of_freeblk to the amount acquired prior to the restart of + the analysis or 0 if not restarted, then subtract the amount needed + by all of the levels of the tree below n_h. */ + /* blknum includes S[n_h], so we subtract 1 in this calculation */ + for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ ) + n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? 
(p_s_tb->blknum[n_counter] - 1) : 0; + + /* Allocate missing empty blocks. */ + /* if p_s_Sh == 0 then we are getting a new root */ + n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1; + /* Amount_needed = the amount that we need more than the amount that we have. */ + if ( n_amount_needed > n_number_of_freeblk ) + n_amount_needed -= n_number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, + n_amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed; + p_n_blocknr++, n_counter++ ) { + + RFALSE( ! *p_n_blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); + RFALSE (buffer_dirty (p_s_new_bh) || + buffer_journaled (p_s_new_bh) || + buffer_journal_dirty (p_s_new_bh), + "PAP-8140: journlaled or dirty buffer %b for the new block", + p_s_new_bh); + + /* Put empty buffers into the array. */ + RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new (p_s_new_bh); + p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; + } + + if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) ) + n_retval = REPEAT_SEARCH ; + + return n_retval; +} + + +/* Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. 
*/ +static int get_lfree (struct tree_balance * tb, int h) +{ + struct buffer_head * l, * f; + int order; + + if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS (l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order))); +} + + +/* Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. + */ +static int get_rfree (struct tree_balance * tb, int h) +{ + struct buffer_head * r, * f; + int order; + + if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order))); + +} + + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_father, * left; + struct super_block * p_s_sb = p_s_tb->tb_sb; + b_blocknr_t n_left_neighbor_blocknr; + int n_left_neighbor_position; + + if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */ + return 0; + + /* Calculate father of the node to be balanced. */ + p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); + + RFALSE( ! p_s_father || + ! B_IS_IN_TREE (p_s_father) || + ! B_IS_IN_TREE (p_s_tb->FL[n_h]) || + ! buffer_uptodate (p_s_father) || + ! buffer_uptodate (p_s_tb->FL[n_h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + p_s_father, p_s_tb->FL[n_h]); + + + /* Get position of the pointer to the left neighbor into the left father. */ + n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ? + p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); + /* Get left neighbor block number. */ + n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); + /* Look for the left neighbor in the cache. 
*/ + if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) { + + RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", left, left); + put_bh(left) ; + return 1; + } + + return 0; +} + + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + + +static void decrement_key (struct cpu_key * p_s_key) +{ + // call item specific function for this key + item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key); +} + + + + +/* Calculate far left/right parent of the left/right neighbor of the current node, that + * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct; + SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_far_parent (struct tree_balance * p_s_tb, + int n_h, + struct buffer_head ** pp_s_father, + struct buffer_head ** pp_s_com_father, + char c_lr_par) +{ + struct buffer_head * p_s_parent; + INITIALIZE_PATH (s_path_to_neighbor_father); + struct path * p_s_path = p_s_tb->tb_path; + struct cpu_key s_lr_father_key; + int n_counter, + n_position = INT_MAX, + n_first_last_position = 0, + n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); + + /* Starting from F[n_h] go upwards in the tree, and look for the common + ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ + + n_counter = n_path_offset; + + RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + + for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) { + /* Check whether parent of the current buffer in the path is really parent in the tree. */ + if ( ! 
B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) ) + return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) ) + return REPEAT_SEARCH; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr ) + return REPEAT_SEARCH; + /* Return delimiting key if position in the parent is not equal to first/last one. */ + if ( c_lr_par == RIGHT_PARENTS ) + n_first_last_position = B_NR_ITEMS (p_s_parent); + if ( n_position != n_first_last_position ) { + *pp_s_com_father = p_s_parent; + get_bh(*pp_s_com_father) ; + /*(*pp_s_com_father = p_s_parent)->b_count++;*/ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) { + /* Check whether first buffer in the path is the root of the tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { + *pp_s_father = *pp_s_com_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + + RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pp_s_com_father, *pp_s_com_father); + + /* Check whether the common parent is locked. */ + + if ( buffer_locked (*pp_s_com_father) ) { + __wait_on_buffer(*pp_s_com_father); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } + } + + /* So, we got common parent of the current node and its left/right neighbor. + Now we are geting the parent of the left/right neighbor. */ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ? 
+ (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position))); + + + if ( c_lr_par == LEFT_PARENTS ) + decrement_key(&s_lr_father_key); + + if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) + // path is released + return IO_ERROR; + + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_counters_in_path(&s_path_to_neighbor_father); + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } + + *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + + RFALSE( B_LEVEL (*pp_s_father) != n_h + 1, + "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); + RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8192: path length is too small"); + + s_path_to_neighbor_father.path_length--; + decrement_counters_in_path(&s_path_to_neighbor_father); + return CARRY_ON; +} + + +/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of + * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset], + * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset]. + * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_parents (struct tree_balance * p_s_tb, int n_h) +{ + struct path * p_s_path = p_s_tb->tb_path; + int n_position, + n_ret_value, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + struct buffer_head * p_s_curf, + * p_s_curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { + /* The root can not have parents. + Release nodes which previously were obtained as parents of the current node neighbors. 
*/ + decrement_bcount(p_s_tb->FL[n_h]); + decrement_bcount(p_s_tb->CFL[n_h]); + decrement_bcount(p_s_tb->FR[n_h]); + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[n_path_offset] of L[n_path_offset]. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) { + /* Current node is not the first child of its parent. */ + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ + p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + get_bh(p_s_curf) ; + get_bh(p_s_curf) ; + p_s_tb->lkey[n_h] = n_position - 1; + } + else { + /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. + Calculate current common parent of L[n_path_offset] and the current node. Note that + CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. + Calculate lkey[n_path_offset]. */ + if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, + &p_s_curcf, LEFT_PARENTS)) != CARRY_ON ) + return n_ret_value; + } + + decrement_bcount(p_s_tb->FL[n_h]); + p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ + decrement_bcount(p_s_tb->CFL[n_h]); + p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ + + RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || + (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); + +/* Get parent FR[n_h] of R[n_h]. */ + +/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ + if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) { +/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. + Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] + not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. 
*/ + if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON ) + return n_ret_value; + } + else { +/* Current node is not the last child of its parent F[n_h]. */ + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ + p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + get_bh(p_s_curf) ; + get_bh(p_s_curf) ; + p_s_tb->rkey[n_h] = n_position; + } + + decrement_bcount(p_s_tb->FR[n_h]); + p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ + + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ + + RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || + (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); + + return CARRY_ON; +} + + +/* it is possible to remove node as result of shiftings to + neighbors even when we insert or paste item. */ +static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h) +{ + struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head * ih; + struct reiserfs_key * r_key = NULL; + + ih = B_N_PITEM_HEAD (Sh, 0); + if ( tb->CFR[h] ) + r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]); + + if ( + lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0) + - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0) + + (( h ) ? KEY_SIZE : 0)) + { + /* node can not be removed */ + if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */ + if ( ! h ) + tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 
1 : 0); + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + } + PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] ); + return !NO_BALANCING_NEEDED; +} + + + +/* Check whether current node S[h] is balanced when increasing its size by + * Inserting or Pasting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +/* ip means Inserting or Pasting */ +static int ip_check_balance (struct tree_balance * tb, int h) +{ + struct virtual_node * vn = tb->tb_vn; + int levbytes, /* Number of bytes that must be inserted into (value + is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is + that the attempted change in node space used level + is levbytes bytes. */ + n_ret_value; + + int lfree, sfree, rfree /* free space in L, S and R */; + + /* nver is short for number of vertixes, and lnver is the number if + we shift to the left, rnver is the number if we shift to the + right, and lrnver is the number if we shift in both directions. + The goal is to minimize first the number of vertixes, and second, + the number of vertixes whose contents are changed by shifting, + and third the number of uncached vertixes whose contents are + changed by shifting and must be read from disk. */ + int nver, lnver, rnver, lrnver; + + /* used at leaf level only, S0 = S[0] is the node being balanced, + sInum [ I = 0,1,2 ] is the number of items that will + remain in node SI after balancing. S1 and S2 are new + nodes that might be created. */ + + /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 
+ where 4th parameter is s1bytes and 5th - s2bytes + */ + short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases + 0,1 - do not shift and do not shift but bottle + 2 - shift only whole item to left + 3 - shift to left and bottle as much as possible + 4,5 - shift to right (whole items and as much as possible + 6,7 - shift to both directions (whole items and as much as possible) + */ + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head * Sh; + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if ( ! Sh ) { + if ( ! h ) + reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0"); + switch ( n_ret_value = get_empty_nodes (tb, h) ) { + case CARRY_ON: + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return n_ret_value; + default: + reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); + } + } + + if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. 
*/ + return n_ret_value; + + sfree = B_FREE_SPACE (Sh); + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) + /* and new item fits into node S[h] without any shifting */ + return NO_BALANCING_NEEDED; + + create_virtual_node (tb, h); + + /* + determine maximal number of items we can shift to the left neighbor (in tb structure) + and the maximal number of bytes that can flow to the left neighbor + from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_left (tb, h, lfree); + + /* + determine maximal number of items we can shift to the right neighbor (in tb structure) + and the maximal number of bytes that can flow to the right neighbor + from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_right (tb, h, rfree); + + + /* all contents of internal node S[h] can be moved into its + neighbors, S[h] will be removed after balancing */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* Since we are working on internal nodes, and our internal + nodes have fixed size entries, then we can balance by the + number of items rather than the space they consume. In this + routine we set the left node equal to the right node, + allowing a difference of less than or equal to 1 child + pointer. */ + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + RFALSE( h && + ( tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE( ! 
h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ), + "vs-8225: tree is not balanced on leaf level"); + + /* all contents of S[0] can be moved into its neighbors + S[0] will be removed after balancing. */ + if (!h && is_leaf_removable (tb)) + return CARRY_ON; + + + /* why do we perform this check here rather than earlier?? + Answer: we can win 1 node in some cases above. Moreover we + checked it above, when we checked, that S[0] is not removable + in principle */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if ( ! h ) + tb->s0num = vn->vn_nr_item; + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + + { + int lpar, rpar, nset, lset, rset, lrset; + /* + * regular overflowing of the node + */ + + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) + nset, lset, rset, lrset - shows, whether flowing items give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + + /* calculate number of blocks S[h] must be split into when + nothing is shifted to the neighbors, + as well as number of items in each part of the split node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver (vn->vn_mode, tb, h, + 0, -1, h?vn->vn_nr_item:0, -1, + snum012, NO_FLOW); + + if (!h) + { + int nver1; + + /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + nver1 = get_num_ver 
(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + + /* calculate number of blocks S[h] must be split into when + l_shift_num first items and l_shift_bytes of the right most + liquid item to be shifted are shifted to the left neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver (vn->vn_mode, tb, h, + lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) + { + int lnver1; + + lnver1 = get_num_ver (vn->vn_mode, tb, h, + lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } + + + /* calculate number of blocks S[h] must be split into when + r_shift_num first items and r_shift_bytes of the left most + liquid item to be shifted are shifted to the right neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver (vn->vn_mode, tb, h, + 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) + { + int rnver1; + + rnver1 = get_num_ver (vn->vn_mode, tb, h, + 0, -1, (rpar - ((tb->rbytes != -1) ? 
1 : 0)), tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } + + + /* calculate number of blocks S[h] must be split into when + items are shifted in both directions, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver (vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) + { + int lrnver1; + + lrnver1 = get_num_ver (vn->vn_mode, tb, h, + lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } + + + + /* Our general shifting strategy is: + 1) to minimized number of new nodes; + 2) to minimized number of neighbors involved in shifting; + 3) to minimized number of disk reads; */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) + { + RFALSE( h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 || h != 1), + "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - ((tb->rbytes == -1) ? 
0 : 1), lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + + /* if shifting doesn't lead to better packing then don't shift */ + if (nver == lrnver) + { + set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1); + return CARRY_ON; + } + + + /* now we know that for better packing shifting in only one + direction either to the left or to the right is required */ + + /* if shifting to the left is better than shifting to the right */ + if (lnver < rnver) + { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* if shifting to the right is better than shifting to the left */ + if (lnver > rnver) + { + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + + + /* now shifting in either direction gives the same number + of nodes and we can make use of the cached neighbors */ + if (is_left_neighbor_in_cache (tb,h)) + { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* shift to the right independently on whether the right neighbor in cache or not */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } +} + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting for INTERNAL node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + * + * Note: Items of internal nodes have fixed size, so the balance condition for + * the internal part of S+tree is as for the B-trees. + */ +static int dc_check_balance_internal (struct tree_balance * tb, int h) +{ + struct virtual_node * vn = tb->tb_vn; + + /* Sh is the node whose balance is currently being checked, + and Fh is its father. 
*/ + struct buffer_head * Sh, * Fh; + int maxsize, + n_ret_value; + int lfree, rfree /* free space in L and R */; + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + Fh = PATH_H_PPARENT (tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + +/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ +/* new_nr_item = number of items node would have if operation is */ +/* performed without balancing (new_nr_item); */ + create_virtual_node (tb, h); + + if ( ! Fh ) + { /* S[h] is the root. */ + if ( vn->vn_nr_item > 0 ) + { + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + } + /* new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. */ + set_parameters (tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) + return n_ret_value; + + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left (tb, h, lfree); + check_right (tb, h, rfree); + + + if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) ) + { /* Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. */ + if ( vn->vn_nr_item == MIN_NR_KEY(Sh) ) + { /* Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. */ + if ( tb->lnum[h] >= vn->vn_nr_item + 1 ) + { + /* All contents of S[h] can be moved to L[h]. */ + int n; + int order_L; + + order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ( tb->rnum[h] >= vn->vn_nr_item + 1 ) + { + /* All contents of S[h] can be moved to R[h]. 
*/ + int n; + int order_R; + + order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); + return CARRY_ON; + } + } + + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) + { + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + int to_r; + + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* Current node contain insufficient number of items. Balancing is required. */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) + { + int n; + int order_L; + + order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) + { + int n; + int order_R; + + order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). 
*/ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) + { + int to_r; + + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* For internal nodes try to borrow item from a neighbor */ + RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h]) + { + int from_l; + + from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; + } + + set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1, + NULL, -1, -1); + return CARRY_ON; +} + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Truncating for LEAF node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance_leaf (struct tree_balance * tb, int h) +{ + struct virtual_node * vn = tb->tb_vn; + + /* Number of bytes that must be deleted from + (value is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is that the + attempted change in node space used level is levbytes bytes. */ + int levbytes; + /* the maximal item size */ + int maxsize, + n_ret_value; + /* S0 is the node whose balance is currently being checked, + and F0 is its father. 
*/ + struct buffer_head * S0, * F0; + int lfree, rfree /* free space in L and R */; + + S0 = PATH_H_PBUFFER (tb->tb_path, 0); + F0 = PATH_H_PPARENT (tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if ( ! F0 ) + { /* S[0] is the root now. */ + + RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) + return n_ret_value; + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + create_virtual_node (tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable (tb, lfree, rfree)) + return CARRY_ON; + + /* determine maximal number of items we can shift to the left/right neighbor + and the maximal number of bytes that can flow to the left/right neighbor + from the left/right most liquid item that cannot be shifted from S[0] entirely + */ + check_left (tb, h, lfree); + check_right (tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache (tb,h) || + ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters (tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters (tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). 
Set parameters and return */ + if (is_leaf_removable (tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} + + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance (struct tree_balance * tb, int h) +{ + RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized"); + + if ( h ) + return dc_check_balance_internal (tb, h); + else + return dc_check_balance_leaf (tb, h); +} + + + +/* Check whether current node S[h] is balanced. + * Calculate parameters for balancing for current level h. + * Parameters: + * + * tb tree_balance structure: + * + * tb is a large structure that must be read about in the header file + * at the same time as this procedure if the reader is to successfully + * understand this procedure + * + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste, d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. 
+ */
static int check_balance (int mode,
			  struct tree_balance * tb,
			  int h,
			  int inum,
			  int pos_in_item,
			  struct item_head * ins_ih,
			  const void * data)
{
    struct virtual_node * vn;

    /* the virtual node header is laid over the start of tb->vn_buf ... */
    tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
    vn = tb->tb_vn;
    /* ... and the free area begins immediately after that header */
    vn->vn_free_ptr = (char *)(vn + 1);

    /* record the description of the pending operation in the virtual node */
    vn->vn_mode = mode;
    vn->vn_affected_item_num = inum;
    vn->vn_pos_in_item = pos_in_item;
    vn->vn_ins_ih = ins_ih;
    vn->vn_data = data;

    RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
	    "vs-8255: ins_ih can not be 0 in insert mode");

    if ( tb->insert_size[h] <= 0 ) {
	/* Calculate balance parameters when size of node is decreasing. */
	return dc_check_balance (tb, h);
    }

    /* Calculate balance parameters when size of node is increasing. */
    return ip_check_balance (tb, h);
}
+ + + +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_bh; + struct path * p_s_path = p_s_tb->tb_path; + int n_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + + /* We are in the root or in the new root. */ + if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { + + RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, + "PAP-8260: invalid offset in the path"); + + if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { + /* Root is not changed. */ + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; + PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; + return CARRY_ON; + } + return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + } + + if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) ) + return REPEAT_SEARCH; /* Parent in the path is not in the tree. 
*/ + + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) ) + return REPEAT_SEARCH; + + if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr ) + /* Parent in the path is not parent of the current node in the tree. */ + return REPEAT_SEARCH; + + if ( buffer_locked(p_s_bh) ) { + __wait_on_buffer(p_s_bh); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) + return REPEAT_SEARCH; + } + + return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ +} + + +/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors + * of S[n_h] we + * need in order to balance S[n_h], and get them if necessary. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_neighbors( + struct tree_balance * p_s_tb, + int n_h + ) { + int n_child_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); + unsigned long n_son_number; + struct super_block * p_s_sb = p_s_tb->tb_sb; + struct buffer_head * p_s_bh; + + + PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] ); + + if ( p_s_tb->lnum[n_h] ) { + /* We need left neighbor to balance S[n_h]. */ + PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] ); + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + + RFALSE( p_s_bh == p_s_tb->FL[n_h] && + ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), + "PAP-8270: invalid position in the parent"); + + n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); + n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); + p_s_bh = sb_bread(p_s_sb, n_son_number); + if (!p_s_bh) + return IO_ERROR; + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(p_s_bh); + PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); + return REPEAT_SEARCH; + } + + RFALSE( ! 
B_IS_IN_TREE(p_s_tb->FL[n_h]) || + n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || + B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != + p_s_bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); + RFALSE( ! n_h && + B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)), + "PAP-8290: invalid child size of left neighbor"); + + decrement_bcount(p_s_tb->L[n_h]); + p_s_tb->L[n_h] = p_s_bh; + } + + + if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */ + PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] ); + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + + RFALSE( p_s_bh == p_s_tb->FR[n_h] && + PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh), + "PAP-8295: invalid position in the parent"); + + n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0; + n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); + p_s_bh = sb_bread(p_s_sb, n_son_number); + if (!p_s_bh) + return IO_ERROR; + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(p_s_bh); + PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); + return REPEAT_SEARCH; + } + decrement_bcount(p_s_tb->R[n_h]); + p_s_tb->R[n_h] = p_s_bh; + + RFALSE( ! 
n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)), + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh), + dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position))); + + } + return CARRY_ON; +} + +#ifdef CONFIG_REISERFS_CHECK +void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s) +{ + void * vp; + static size_t malloced; + + + vp = kmalloc (size, flags); + if (vp) { + REISERFS_SB(s)->s_kmallocs += size; + if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { + reiserfs_warning (s, + "vs-8301: reiserfs_kmalloc: allocated memory %d", + REISERFS_SB(s)->s_kmallocs); + malloced = REISERFS_SB(s)->s_kmallocs; + } + } + return vp; +} + +void reiserfs_kfree (const void * vp, size_t size, struct super_block * s) +{ + kfree (vp); + + REISERFS_SB(s)->s_kmallocs -= size; + if (REISERFS_SB(s)->s_kmallocs < 0) + reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d", + REISERFS_SB(s)->s_kmallocs); + +} +#endif + + +static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh) +{ + int max_num_of_items; + int max_num_of_entries; + unsigned long blocksize = sb->s_blocksize; + +#define MIN_NAME_LEN 1 + + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / + (DEH_SIZE + MIN_NAME_LEN); + + return sizeof(struct virtual_node) + + max(max_num_of_items * sizeof (struct virtual_item), + sizeof (struct virtual_item) + sizeof(struct direntry_uarea) + + (max_num_of_entries - 1) * sizeof (__u16)); +} + + + +/* maybe we should fail balancing we are going to perform when kmalloc + fails several times. 
But now it will loop until kmalloc gets + required memory */ +static int get_mem_for_virtual_node (struct tree_balance * tb) +{ + int check_fs = 0; + int size; + char * buf; + + size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path)); + + if (size > tb->vn_buf_size) { + /* we have to allocate more memory for virtual node */ + if (tb->vn_buf) { + /* free memory allocated before */ + reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } + + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb); + if ( ! buf ) { + /* getting memory with GFP_KERNEL priority may involve + balancing now (due to indirect_to_direct conversion on + dcache shrinking). So, release path and collected + resources here */ + free_buffers_in_tb (tb); + buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); + if ( !buf ) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning (tb->tb_sb, + "vs-8345: get_mem_for_virtual_node: " + "kmalloc failed. reiserfs kmalloced %d bytes", + REISERFS_SB(tb->tb_sb)->s_kmallocs); +#endif + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule() ; + return REPEAT_SEARCH; + } + + tb->vn_buf = buf; + } + + if ( check_fs && FILESYSTEM_CHANGED_TB (tb) ) + return REPEAT_SEARCH; + + return CARRY_ON; +} + + +#ifdef CONFIG_REISERFS_CHECK +static void tb_buffer_sanity_check (struct super_block * p_s_sb, + struct buffer_head * p_s_bh, + const char *descr, int level) { + if (p_s_bh) { + if (atomic_read (&(p_s_bh->b_count)) <= 0) { + + reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if ( ! buffer_uptodate (p_s_bh) ) { + reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if ( ! 
B_IS_IN_TREE (p_s_bh) ) { + reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if (p_s_bh->b_bdev != p_s_sb->s_bdev) { + reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if (p_s_bh->b_size != p_s_sb->s_blocksize) { + reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { + reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh); + } + } +} +#else +static void tb_buffer_sanity_check (struct super_block * p_s_sb, + struct buffer_head * p_s_bh, + const char *descr, int level) +{;} +#endif + +static int clear_all_dirty_bits(struct super_block *s, + struct buffer_head *bh) { + return reiserfs_prepare_for_journal(s, bh, 0) ; +} + +static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) +{ + struct buffer_head * locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; + + do { + + locked = NULL; + + for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) { + if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) { + /* if I understand correctly, we can only be sure the last buffer + ** in the path is in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == + PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { + tb_buffer_sanity_check (p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i), + "S", + p_s_tb->tb_path->path_length - i); + } +#endif + if (!clear_all_dirty_bits(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i))) + { + locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); + } + } + } + + for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) { + + if (p_s_tb->lnum[i] ) { + + if ( 
p_s_tb->L[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i])) + locked = p_s_tb->L[i]; + } + + if ( !locked && p_s_tb->FL[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) + locked = p_s_tb->FL[i]; + } + + if ( !locked && p_s_tb->CFL[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) + locked = p_s_tb->CFL[i]; + } + + } + + if ( !locked && (p_s_tb->rnum[i]) ) { + + if ( p_s_tb->R[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) + locked = p_s_tb->R[i]; + } + + + if ( !locked && p_s_tb->FR[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) + locked = p_s_tb->FR[i]; + } + + if ( !locked && p_s_tb->CFR[i] ) { + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) + locked = p_s_tb->CFR[i]; + } + } + } + /* as far as I can tell, this is not required. The FEB list seems + ** to be full of newly allocated nodes, which will never be locked, + ** dirty, or anything else. + ** To be safe, I'm putting in the checks and waits in. For the moment, + ** they are needed to keep the code in journal.c from complaining + ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. 
+ ** --clm + */ + for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { + if ( p_s_tb->FEB[i] ) { + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) + locked = p_s_tb->FEB[i] ; + } + } + + if (locked) { +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ( (repeat_counter % 10000) == 0) { + reiserfs_warning (p_s_tb->tb_sb, + "wait_tb_buffers_until_released(): too many " + "iterations waiting for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON; + } +#endif + __wait_on_buffer (locked); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + return REPEAT_SEARCH; + } + } + + } while (locked); + + return CARRY_ON; +} + + +/* Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources will be collected at a time. + * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks out of order + * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers are needed + * and then getting as many of them in parallel as possible? 
-Hans
 *
 * Parameters:
 *   n_op_mode    i - insert, d - delete, c - cut (truncate), p - paste (append)
 *   p_s_tb       tree_balance structure;
 *   p_s_ins_ih   item head of the item being inserted (used when inserting);
 *   data         inserted item or data to be pasted.
 *
 * The item number in S[0] and the position inside the item are not passed in;
 * they are read from p_s_tb->tb_path.
 *
 * Returns, as far as this function itself shows:
 *   CARRY_ON      - everything needed was gathered, balancing may start;
 *   REPEAT_SEARCH - the filesystem changed while we were working (or
 *                   get_mem_for_virtual_node asked for a repeat), so the
 *                   caller has to search again;
 *   otherwise the non-CARRY_ON value produced by the failing helper
 *   (get_direct_parent, check_balance, get_neighbors, get_empty_nodes or
 *   wait_tb_buffers_until_unlocked) is passed through unchanged, e.g. when
 *   there is no disk space.
 * NOTE(review): the historical description here read "1 - schedule occurred,
 * 0 - schedule didn't occur, -1 - no_disk_space"; presumably those numbers map
 * onto the constants above - confirm against their definitions.
 */


int fix_nodes (int n_op_mode,
               struct tree_balance * p_s_tb,
               struct item_head * p_s_ins_ih, // item head of item being inserted
               const void * data // inserted item or data to be pasted
    ) {
    int n_ret_value,
        n_h,
        n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
    int n_pos_in_item;

    /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
    ** during wait_tb_buffers_run
    */
    int wait_tb_buffers_run = 0 ;
    /* S[0]: the last buffer on the search path */
    struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);

    /* count this call in the per-superblock s_fix_nodes counter */
    ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;

    n_pos_in_item = p_s_tb->tb_path->pos_in_item;


    /* remember the generation seen at entry; presumably this is what
    ** FILESYSTEM_CHANGED_TB compares against - confirm in its definition
    */
    p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);

    /* we prepare and log the super here so it will already be in the
    ** transaction when do_balance needs to change it.
    ** This way do_balance won't have to schedule when trying to prepare
    ** the super for logging
    */
    reiserfs_prepare_for_journal(p_s_tb->tb_sb,
                                 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
    journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
                       SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
    if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
        return REPEAT_SEARCH;

    /* S[0] may be locked; this is possible in indirect_to_direct conversion */
    if (buffer_locked (p_s_tbS0)) {
        __wait_on_buffer (p_s_tbS0);
        if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
            return REPEAT_SEARCH;
    }

#ifdef CONFIG_REISERFS_CHECK
    /* debug-only sanity checks: no balance in flight, S[0] valid, and the
    ** item number consistent with the requested mode
    */
    if ( cur_tb ) {
        print_cur_tb ("fix_nodes");
        reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
    }

    if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
        reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
                        "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
    }

    /* Check parameters. */
    switch (n_op_mode) {
    case M_INSERT:
        if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
            reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
                           n_item_num, B_NR_ITEMS(p_s_tbS0));
        break;
    case M_PASTE:
    case M_DELETE:
    case M_CUT:
        if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
            print_block (p_s_tbS0, 0, -1, -1);
            reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]);
        }
        break;
    default:
        reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
    }
#endif

    if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
        // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
        return REPEAT_SEARCH;


    /* Starting from the leaf level; for all levels n_h of the tree.
    ** The loop stops at the first level whose insert_size is zero.
    */
    for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
        if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
            goto repeat;
        }

        if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
                                           n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
            if ( n_ret_value == NO_BALANCING_NEEDED ) {
                /* No balancing for higher levels needed. */
                if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
                    goto repeat;
                }
                if ( n_h != MAX_HEIGHT - 1 )
                    p_s_tb->insert_size[n_h + 1] = 0;
                /* ok, analysis and resource gathering are complete */
                break;
            }
            goto repeat;
        }

        if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
            goto repeat;
        }

        if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
            goto repeat; /* No disk space, or schedule occurred and
                            analysis may be invalid and needs to be redone. */
        }

        /* decide how much has to be inserted on the next level up */
        if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
            /* We have a positive insert size but no nodes exist on this
               level, this means that we are creating a new root. */

            RFALSE( p_s_tb->blknum[n_h] != 1,
                    "PAP-8350: creating new empty root");

            if ( n_h < MAX_HEIGHT - 1 )
                p_s_tb->insert_size[n_h + 1] = 0;
        }
        else
            if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
                if ( p_s_tb->blknum[n_h] > 1 ) {
                    /* The tree needs to be grown, so this node S[n_h]
                       which is the root node is split into two nodes,
                       and a new node (S[n_h+1]) will be created to
                       become the root node. */

                    RFALSE( n_h == MAX_HEIGHT - 1,
                            "PAP-8355: attempt to create too high of a tree");

                    p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
                }
                else
                    if ( n_h < MAX_HEIGHT - 1 )
                        p_s_tb->insert_size[n_h + 1] = 0;
            }
            else
                /* NOTE(review): unlike the branches above, this store is not
                   guarded by n_h < MAX_HEIGHT - 1 - confirm that a parent
                   buffer at n_h + 1 implies the index is in range */
                p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
    }

    /* From here on the cleanup path must restore prepared buffers, hence
    ** wait_tb_buffers_run is set on every way to "repeat" below.
    */
    if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
        if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
            wait_tb_buffers_run = 1 ;
            n_ret_value = REPEAT_SEARCH ;
            goto repeat;
        } else {
            return CARRY_ON;
        }
    } else {
        wait_tb_buffers_run = 1 ;
        goto repeat;
    }

 repeat:
    // fix_nodes was unable to perform its calculation due to
    // filesystem got changed under us, lack of free disk space or i/o
    // failure. If the first is the case - the search will be
    // repeated. For now - free all resources acquired so far except
    // for the new allocated nodes
    {
        int i;

        /* Release path buffers. */
        if (wait_tb_buffers_run) {
            pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
        } else {
            pathrelse (p_s_tb->tb_path);
        }
        /* brelse all resources collected for balancing */
        for ( i = 0; i < MAX_HEIGHT; i++ ) {
            if (wait_tb_buffers_run) {
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
                reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
            }

            /* drop the reference and forget the pointer so a later pass
               starts from a clean tree_balance */
            brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL;
            brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL;
            brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL;
            brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL;
            brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL;
            brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL;
        }

        /* the newly allocated nodes (FEB) are restored but kept */
        if (wait_tb_buffers_run) {
            for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
                if ( p_s_tb->FEB[i] ) {
                    reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
                                                     p_s_tb->FEB[i]) ;
                }
            }
        }
        return n_ret_value;
    }

}


/* Anatoly will probably forgive me renaming p_s_tb to tb. I just
   wanted to make lines shorter */
/* Release everything a tree_balance holds: the path, the
   neighbor/father/common-father buffers of every level, the FEB[] and used[]
   node lists, and the virtual node buffer. */
void unfix_nodes (struct tree_balance * tb)
{
    int i;

    /* Release path buffers. */
    pathrelse_and_restore (tb->tb_sb, tb->tb_path);

    /* brelse all resources collected for balancing */
    for ( i = 0; i < MAX_HEIGHT; i++ ) {
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
        reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);

        brelse (tb->L[i]);
        brelse (tb->R[i]);
        brelse (tb->FL[i]);
        brelse (tb->FR[i]);
        brelse (tb->CFL[i]);
        brelse (tb->CFR[i]);
    }

    /* deal with list of allocated (used and unused) nodes */
    for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
        if ( tb->FEB[i] ) {
            /* read the block number before the buffer reference is dropped */
            b_blocknr_t blocknr = tb->FEB[i]->b_blocknr ;
            /* de-allocated block which was not used by balancing and
               bforget about buffer for it */
            brelse (tb->FEB[i]);
            reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
        }
        if (tb->used[i]) {
            /* release used as new nodes including a new root */
            brelse (tb->used[i]);
        }
    }

    if (tb->vn_buf)
        reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);

}



diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
new file mode 100644
index 000000000000..08d0508c2d39
--- /dev/null
+++ b/fs/reiserfs/hashes.c
@@ -0,0 +1,209 @@

/*
 * Keyed 32-bit hash function using TEA in a Davies-Meyer function
 *   H0 = Key
 *   Hi = E Mi(Hi-1) + Hi-1
 *
 * (see Applied Cryptography, 2nd edition, p448).
 *
 * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
 *
 * Jeremy has agreed to the contents of reiserfs/README.
-Hans
 * Yura's function is added (04/07/2000)
 */

//
// keyed_hash
// yura_hash
// r5_hash
//

#include <linux/kernel.h>
#include <asm/types.h>
#include <asm/bug.h>


#define DELTA 0x9E3779B9
#define FULLROUNDS 10   /* 32 is overkill, 16 is strong crypto */
#define PARTROUNDS 6    /* 6 gets complete mixing */

/* a, b, c, d - data; h0, h1 - accumulated hash */
/* TEACORE runs `rounds` iterations over a copy of (h0, h1), using a, b, c, d
 * as the round key material, and then adds the result back into h0 and h1.
 * It expects h0, h1, a, b, c and d to be variables of the calling function.
 * `rounds` must be >= 1 because the inner loop is a do/while. */
#define TEACORE(rounds)                                                 \
    do {                                                                \
        u32 sum = 0;                                                    \
        int n = rounds;                                                 \
        u32 b0, b1;                                                     \
                                                                        \
        b0 = h0;                                                        \
        b1 = h1;                                                        \
                                                                        \
        do                                                              \
        {                                                               \
            sum += DELTA;                                               \
            b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);             \
            b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);             \
        } while(--n);                                                   \
                                                                        \
        h0 += b0;                                                       \
        h1 += b1;                                                       \
    } while(0)


/*
 * Hash `len` bytes at `msg` with the TEA-based function described at the top
 * of this file and return h0 ^ h1.
 *
 * Full 16-byte blocks are packed little-endian into a, b, c, d and mixed with
 * PARTROUNDS rounds each.  The remaining 0..15 bytes are packed the same way,
 * the unused words are filled with `pad`, and a final FULLROUNDS pass is run.
 *
 * NOTE(review): msg is `const signed char *`, so bytes >= 0x80 are
 * sign-extended by the (u32) casts and by `x |= msg[i]`.  The produced values
 * are presumably stored on disk, so this must not be "fixed" without checking
 * compatibility.
 */
u32 keyed_hash(const signed char *msg, int len)
{
    /* only k[0] and k[1] are read below */
    u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3};

    u32 h0 = k[0], h1 = k[1];
    u32 a, b, c, d;
    u32 pad;
    int i;

    // assert(len >= 0 && len < 256);

    /* for len < 256 this replicates len into all four bytes of pad */
    pad = (u32)len | ((u32)len << 8);
    pad |= pad << 16;

    while(len >= 16)
    {
        a = (u32)msg[ 0] |
            (u32)msg[ 1] << 8 |
            (u32)msg[ 2] << 16|
            (u32)msg[ 3] << 24;
        b = (u32)msg[ 4] |
            (u32)msg[ 5] << 8 |
            (u32)msg[ 6] << 16|
            (u32)msg[ 7] << 24;
        c = (u32)msg[ 8] |
            (u32)msg[ 9] << 8 |
            (u32)msg[10] << 16|
            (u32)msg[11] << 24;
        d = (u32)msg[12] |
            (u32)msg[13] << 8 |
            (u32)msg[14] << 16|
            (u32)msg[15] << 24;

        TEACORE(PARTROUNDS);

        len -= 16;
        msg += 16;
    }

    /* tail: 0..15 bytes left; the first incomplete word starts as pad and has
       the leftover bytes shifted in, the words after it are plain pad */
    if (len >= 12)
    {
        a = (u32)msg[ 0] |
            (u32)msg[ 1] << 8 |
            (u32)msg[ 2] << 16|
            (u32)msg[ 3] << 24;
        b = (u32)msg[ 4] |
            (u32)msg[ 5] << 8 |
            (u32)msg[ 6] << 16|
            (u32)msg[ 7] << 24;
        c = (u32)msg[ 8] |
            (u32)msg[ 9] << 8 |
            (u32)msg[10] << 16|
            (u32)msg[11] << 24;

        d = pad;
        for(i = 12; i < len; i++)
        {
            d <<= 8;
            d |= msg[i];
        }
    }
    else if (len >= 8)
    {
        a = (u32)msg[ 0] |
            (u32)msg[ 1] << 8 |
            (u32)msg[ 2] << 16|
            (u32)msg[ 3] << 24;
        b = (u32)msg[ 4] |
            (u32)msg[ 5] << 8 |
            (u32)msg[ 6] << 16|
            (u32)msg[ 7] << 24;

        c = d = pad;
        for(i = 8; i < len; i++)
        {
            c <<= 8;
            c |= msg[i];
        }
    }
    else if (len >= 4)
    {
        a = (u32)msg[ 0] |
            (u32)msg[ 1] << 8 |
            (u32)msg[ 2] << 16|
            (u32)msg[ 3] << 24;

        b = c = d = pad;
        for(i = 4; i < len; i++)
        {
            b <<= 8;
            b |= msg[i];
        }
    }
    else
    {
        a = b = c = d = pad;
        for(i = 0; i < len; i++)
        {
            a <<= 8;
            a |= msg[i];
        }
    }

    TEACORE(FULLROUNDS);

/*  return 0;*/
    return h0^h1;
}

/* What follows in this file is copyright 2000 by Hans Reiser, and the
 * licensing of what follows is governed by reiserfs/README */

/*
 * "Yura's" hash.  The first `len` characters are treated as decimal digits
 * (48 is ASCII '0') and weighted by powers of ten; after that every index i
 * from max(len, 40) up to 255 adds i itself; the sum is shifted left by 7.
 *
 * NOTE(review): msg[0] is read even when len <= 0.
 * NOTE(review): `pow` is a signed int multiplied by 10 up to len - 1 times, so
 * for len >= 11 the multiplication overflows int.  The result then depends on
 * the compiler wrapping signed arithmetic; since the value is presumably
 * persisted, verify before changing the type.
 */
u32 yura_hash (const signed char *msg, int len)
{
    int j, pow;
    u32 a, c;
    int i;

    /* pow = 10^(len-1), the weight of the first character */
    for (pow=1,i=1; i < len; i++) pow = pow * 10;

    if (len == 1)
        a = msg[0]-48;
    else
        a = (msg[0] - 48) * pow;

    /* remaining characters, each weighted by 10^(len-1-i) */
    for (i=1; i < len; i++) {
        c = msg[i] - 48;
        for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
        a = a + c * pow;
    }

    /* here i >= len, so the inner loops below never run and pow stays 1;
       c is 0 in this loop, so it leaves a unchanged */
    for (; i < 40; i++) {
        c = '0' - 48;
        for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
        a = a + c * pow;
    }

    /* adds the index i itself for every remaining i up to 255 */
    for (; i < 256; i++) {
        c = i;
        for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
        a = a + c * pow;
    }

    a = a << 7;
    return a;
}

/*
 * r5 hash: for every byte adds (byte << 4) and (byte >> 4) to the accumulator
 * and multiplies the accumulator by 11.
 *
 * NOTE(review): `len` is not used; the loop runs until a NUL byte, so msg
 * must be NUL-terminated.  *msg is a signed char, so both shifts operate on a
 * possibly negative int.
 */
u32 r5_hash (const signed char *msg, int len)
{
    u32 a=0;
    while(*msg) {
        a += *msg << 4;
        a += *msg >> 4;
        a *= 11;
        msg++;
    }
    return a;
}
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
new file mode 100644
index 000000000000..a362125da0d8
--- /dev/null
+++ b/fs/reiserfs/ibalance.c
@@ -0,0 +1,1058 @@
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/buffer_head.h>

/* this is one and only function that is used outside (do_balance.c) */
int balance_internal (
    struct tree_balance * ,
    int,
    int,
    struct item_head * ,
    struct buffer_head **
    );

/* modes of internal_shift_left, internal_shift_right and internal_insert_childs
*/ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4 +#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos ( + int shift_mode, + struct tree_balance * tb, + int h, + struct buffer_info * dest_bi, + struct buffer_info * src_bi, + int * d_key, + struct buffer_head ** cf + ) +{ + memset (dest_bi, 0, sizeof (struct buffer_info)); + memset (src_bi, 0, sizeof (struct buffer_info)); + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position (tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position (tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position (tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + 
*cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position (tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position (tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position (tb, h); + break; + + default: + reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode); + } +} + + + +/* Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively. 
 */
/* Parameters:
 *   cur_bi    description of the internal node that receives the children;
 *   to        the new pointers go in front of pointer slot to + 1 and the new
 *             keys in front of key slot `to` (slot 0 when to == -1);
 *   count     number of children to insert; the function returns at once when
 *             count <= 0 and the debug check rejects count > 2;
 *   inserted  array of item heads; only the first KEY_SIZE bytes of each
 *             element are copied as the delimiting key;
 *   bh        the child buffers, one per inserted pointer.
 * Updates the item count and free space of the node and, when the node has a
 * parent, the size stored in the parent's disk_child entry for it.
 */
static void internal_insert_childs (struct buffer_info * cur_bi,
                                    int to, int count,
                                    struct item_head * inserted,
                                    struct buffer_head ** bh
    )
{
    struct buffer_head * cur = cur_bi->bi_bh;
    struct block_head * blkh;
    int nr;
    struct reiserfs_key * ih;
    struct disk_child new_dc[2];
    struct disk_child * dc;
    int i;

    if (count <= 0)
        return;

    blkh = B_BLK_HEAD(cur);
    nr = blkh_nr_item(blkh);

    RFALSE( count > 2,
            "too many children (%d) are to be inserted", count);
    RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE),
            "no enough free space (%d), needed %d bytes",
            B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE));

    /* prepare space for count disk_child */
    dc = B_N_CHILD(cur,to+1);

    /* slide the nr + 1 - (to + 1) pointers that follow the insertion point */
    memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE);

    /* copy to_be_insert disk children */
    for (i = 0; i < count; i ++) {
        /* used size of the child = maximum child size minus its free space */
        put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
        put_dc_block_number( &(new_dc[i]), bh[i]->b_blocknr );
    }
    memcpy (dc, new_dc, DC_SIZE * count);


    /* prepare space for count items */
    ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to));

    /* one move shifts the trailing keys together with the whole pointer
       array (already grown by count), which presumably sits right behind the
       keys in the node - confirm against the node layout.
       NOTE(review): with to == -1 the key count term is nr + 1 although ih
       points at key 0 - confirm this is intended */
    memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);

    /* copy item headers (keys) */
    memcpy (ih, inserted, KEY_SIZE);
    if ( count > 1 )
        /* ih + 1 advances by one key, inserted + 1 by one struct item_head */
        memcpy (ih + 1, inserted + 1, KEY_SIZE);

    /* sizes, item number */
    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count );
    set_blkh_free_space( blkh,
                         blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) );

    do_balance_mark_internal_dirty (cur_bi->tb, cur,0);

    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
    check_internal (cur);
    /*&&&&&&&&&&&&&&&&&&&&&&&&*/

    if (cur_bi->bi_parent) {
        /* the parent's entry for this node records its used size */
        struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position);
        put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
        do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0);

        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
        check_internal (cur_bi->bi_parent);
        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
    }

}


/* Delete del_num items and node pointers from buffer cur starting from *
 * the first_i'th item and first_p'th pointers respectively.
 */
/* Parameters:
 *   cur_bi   description of the internal node to delete from;
 *   first_p  index of the first node pointer to delete;
 *   first_i  index of the first key to delete;
 *   del_num  how many pointers and keys to delete; nothing happens for 0.
 * When every pointer goes (first_p == 0 and del_num == nr + 1) the node is
 * emptied with make_empty_node instead.  Otherwise the item count and free
 * space of the node are updated, as is the size stored in the parent's
 * disk_child entry when the node has a parent.
 */
static void internal_delete_pointers_items (
    struct buffer_info * cur_bi,
    int first_p,
    int first_i,
    int del_num
    )
{
    struct buffer_head * cur = cur_bi->bi_bh;
    int nr;
    struct block_head * blkh;
    struct reiserfs_key * key;
    struct disk_child * dc;

    RFALSE( cur == NULL, "buffer is 0");
    RFALSE( del_num < 0,
            "negative number of items (%d) can not be deleted", del_num);
    RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0,
            "first pointer order (%d) < 0 or "
            "no so many pointers (%d), only (%d) or "
            "first key order %d < 0", first_p,
            first_p + del_num, B_NR_ITEMS (cur) + 1, first_i);
    if ( del_num == 0 )
        return;

    blkh = B_BLK_HEAD(cur);
    nr = blkh_nr_item(blkh);

    /* a node with nr keys has nr + 1 pointers: all of them are going away */
    if ( first_p == 0 && del_num == nr + 1 ) {
        RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i);
        make_empty_node (cur_bi);
        return;
    }

    RFALSE( first_i + del_num > B_NR_ITEMS (cur),
            "first_i = %d del_num = %d "
            "no so many keys (%d) in the node (%b)(%z)",
            first_i, del_num, first_i + del_num, cur, cur);


    /* deleting */
    dc = B_N_CHILD (cur, first_p);

    /* close the gap in the pointer array first ... */
    memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
    key = B_N_PDELIM_KEY (cur, first_i);
    /* ... then pull the trailing keys and the (already shrunk) pointer array
       down over the deleted keys in one move */
    memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE);


    /* sizes, item number */
    set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
    set_blkh_free_space( blkh,
                         blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) );

    do_balance_mark_internal_dirty (cur_bi->tb, cur, 0);
    /*&&&&&&&&&&&&&&&&&&&&&&&*/
    check_internal (cur);
    /*&&&&&&&&&&&&&&&&&&&&&&&*/

    if (cur_bi->bi_parent) {
        struct disk_child *t_dc;
        t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position);
        put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) );

        do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0);
        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
        check_internal (cur_bi->bi_parent);
        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
    }
}


/* delete n node pointers and items starting from given position */
static void internal_delete_childs (struct buffer_info * cur_bi,
                                    int from, int n)
{
    int i_from;

    /* the first deleted key is the one left of pointer `from`, except for
       pointer 0, which has no key to its left */
    i_from = (from == 0) ? from : from - 1;

    /* delete n pointers starting from `from' position in CUR;
       delete n keys starting from 'i_from' position in CUR;
     */
    internal_delete_pointers_items (cur_bi, from, i_from, n);
}


/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
 * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
 * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
 *
 * src itself is not modified.  The item count and free space of dest are
 * updated, as is the size stored in the parent's disk_child entry when dest
 * has a parent.  Nothing happens for cpy_num == 0.
 */
static void internal_copy_pointers_items (
    struct buffer_info * dest_bi,
    struct buffer_head * src,
    int last_first, int cpy_num
    )
{
    /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST *
     * as delimiting key have already inserted to buffer dest.*/
    struct buffer_head * dest = dest_bi->bi_bh;
    int nr_dest, nr_src;
    int dest_order, src_order;
    struct block_head * blkh;
    struct reiserfs_key * key;
    struct disk_child * dc;

    /* NOTE(review): src is dereferenced here, before the NULL assertion
       right below gets a chance to fire */
    nr_src = B_NR_ITEMS (src);

    RFALSE( dest == NULL || src == NULL,
            "src (%p) or dest (%p) buffer is 0", src, dest);
    RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
            "invalid last_first parameter (%d)", last_first);
    RFALSE( nr_src < cpy_num - 1,
            "no so many items (%d) in src (%d)", cpy_num, nr_src);
    RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
    RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
            "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
            cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));

    if ( cpy_num == 0 )
        return;

    /* copying */
    blkh = B_BLK_HEAD(dest);
    nr_dest = blkh_nr_item(blkh);

    /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/
    /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/
    /* LAST_TO_FIRST: take the last cpy_num pointers of src and put them at
       the head of dest; otherwise take the first ones and append them */
    (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) :
        (dest_order = nr_dest, src_order = 0);

    /* prepare space for cpy_num pointers */
    dc = B_N_CHILD (dest, dest_order);

    memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);

    /* insert pointers */
    memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num);


    /* prepare space for cpy_num - 1 item headers */
    key = B_N_PDELIM_KEY(dest, dest_order);
    /* one move shifts the trailing keys and the grown pointer array */
    memmove (key + cpy_num - 1, key,
             KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num));


    /* insert headers */
    memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1));

    /* sizes, item number */
    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) );
    set_blkh_free_space( blkh,
                         blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) );

    do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);

    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
    check_internal (dest);
    /*&&&&&&&&&&&&&&&&&&&&&&&&*/

    if (dest_bi->bi_parent) {
        struct disk_child *t_dc;
        t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
        put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) );

        do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
        check_internal (dest_bi->bi_parent);
        /*&&&&&&&&&&&&&&&&&&&&&&&&*/
    }

}


/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
 * Delete cpy_num - del_par items and node pointers from buffer src.
 * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
 * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
 */
/* The copy is done by internal_copy_pointers_items and the deletion by
 * internal_delete_pointers_items; del_par is the number of copied pointers
 * that are NOT removed from src (0 or 1 in the callers visible in this file).
 */
static void internal_move_pointers_items (struct buffer_info * dest_bi,
                                          struct buffer_info * src_bi,
                                          int last_first, int cpy_num, int del_par)
{
    int first_pointer;
    int first_item;

    internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num);

    if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
        first_pointer = 0;
        first_item = 0;
        /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer,
           for key - with first_item */
        internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par);
    } else { /* shift_right occurs */
        int i, j;

        /* j = number of keys in src.  i = index of the first key to delete:
           0 when every pointer of src goes away, otherwise the key that
           starts the deleted tail */
        i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par;

        /* the deleted pointers are the last cpy_num - del_par of the j + 1 */
        internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par);
    }
}

/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
/* Only one key is copied; no node pointer is added.  The item count and free
 * space of dest are updated, as is the size stored in the parent's disk_child
 * entry when dest has a parent. */
static void internal_insert_key (struct buffer_info * dest_bi,
                                 int dest_position_before, /* insert key before key with n_dest number */
                                 struct buffer_head * src,
                                 int src_position)
{
    struct buffer_head * dest = dest_bi->bi_bh;
    int nr;
    struct block_head * blkh;
    struct reiserfs_key * key;

    RFALSE( dest == NULL || src == NULL,
            "source(%p) or dest(%p) buffer is 0", src, dest);
    RFALSE( dest_position_before < 0 || src_position < 0,
            "source(%d) or dest(%d) key number less than 0",
            src_position, dest_position_before);
    RFALSE( dest_position_before > B_NR_ITEMS (dest) ||
            src_position >= B_NR_ITEMS(src),
            "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
            dest_position_before, B_NR_ITEMS (dest),
            src_position, B_NR_ITEMS(src));
    RFALSE( B_FREE_SPACE (dest) < KEY_SIZE,
            "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest));

    blkh = B_BLK_HEAD(dest);
    nr = blkh_nr_item(blkh);

    /* prepare space for inserting key */
    key = B_N_PDELIM_KEY (dest, dest_position_before);
    /* shifts the trailing keys and the nr + 1 node pointers by one key */
    memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);

    /* insert key */
    memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);

    /* Change dirt, free space, item number fields. */

    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
    set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE );

    do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);

    if (dest_bi->bi_parent) {
        struct disk_child *t_dc;
        t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
        put_dc_size( t_dc, dc_size(t_dc) + KEY_SIZE );

        do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
    }
}



/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
 * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
 * Replace d_key'th key in buffer cfl.
 * Delete pointer_amount items and node pointers from buffer src.
 */
/* this can be invoked both to shift from S to L and from R to S */
static void internal_shift_left (
    int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
    struct tree_balance * tb,
    int h,
    int pointer_amount
    )
{
    struct buffer_info dest_bi, src_bi;
    struct buffer_head * cf;
    int d_key_position;

    internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);

    /*printk("pointer_amount = %d\n",pointer_amount);*/

    /* NOTE(review): the sibling shift functions test pointer_amount > 0 */
    if (pointer_amount) {
        /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */
        internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);

        if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
            /* every pointer of src is shifted, so src has no key left to
               become the new delimiting key; when src is the first child of
               its parent, key 0 of that parent is used instead */
            if (src_bi.bi_position/*src->b_item_order*/ == 0)
                replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0);
        } else
            /* the key following the shifted pointers becomes the new
               delimiting key */
            replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1);
    }
    /* last parameter is del_parameter */
    internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0);

}

/*
Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left ( + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + + internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + if ( pointer_amount > 0 ) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); + /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/ + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); + /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/ +} + + +/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src. 
+ */ +static void internal_shift_right ( + int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + int nr; + + + internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + nr = B_NR_ITEMS (src_bi.bi_bh); + + if (pointer_amount > 0) { + /* insert delimiting key from common father of dest and src to dest node into position 0 */ + internal_insert_key (&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h)); + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]); + } else + replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0); +} + +/* Insert delimiting key to R[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h]. 
+ */ +/* it always shift from S[h] to R[h] */ +static void internal_shift1_right ( + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + + internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + internal_insert_key (&dest_bi, 0, cf, d_key_position); + /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/ + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); + /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/ +} + + +/* Delete insert_num node pointers together with their left items + * and balance current node.*/ +static void balance_internal_when_delete (struct tree_balance * tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + + internal_delete_childs (&bi, child_pos, -insert_num); + + RFALSE( tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) { + if ( tb->blknum[h] == 0 ) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent); + + /* choose a new root */ + if ( ! tb->L[h-1] || ! 
B_NR_ITEMS(tb->L[h-1]) ) + new_root = tb->R[h-1]; + else + new_root = tb->L[h-1]; + /* switch super block's tree root block number to the new value */ + PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr ); + //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 ); + + do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + /*&&&&&&&&&&&&&&&&&&&&&&*/ + if (h > 1) + /* use check_internal if new root is an internal node */ + check_internal (new_root); + /*&&&&&&&&&&&&&&&&&&&&&&*/ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */ + + RFALSE( tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */ + RFALSE( tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb,tbSh); + return; + } + + if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */ + RFALSE( tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]); + /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); + return; + } + + if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */ + RFALSE( tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/ + return; + } + + if ( 
tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */ + RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); + + reiserfs_invalidate_buffer (tb, tbSh); + + return; + } + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + + +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ +static void replace_lkey ( + struct tree_balance * tb, + int h, + struct item_head * key + ) +{ + RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty (tb, tb->CFL[h],0); +} + + +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ +static void replace_rkey ( + struct tree_balance * tb, + int h, + struct item_head * key + ) +{ + RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE( B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); + + memcpy (B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty (tb, tb->CFR[h], 0); +} + + +int balance_internal (struct tree_balance * tb, /* tree_balance structure */ + int h, /* level of the tree */ + int child_pos, + struct item_head * insert_key, /* key for insertion on higher level */ + struct buffer_head ** insert_ptr /* node for 
insertion on higher level*/ + ) + /* if inserting/pasting + { + child_pos is the position of the node-pointer in S[h] that * + pointed to S[h-1] before balancing of the h-1 level; * + this means that new pointers and items must be inserted AFTER * + child_pos + } + else + { + it is the position of the leftmost pointer that must be deleted (together with + its corresponding key to the left of the pointer) + as a result of the previous level's balancing. + } +*/ +{ + struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); + struct buffer_info bi; + int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + int insert_num, n, k; + struct buffer_head * S_new; + struct item_head new_insert_key; + struct buffer_head * new_insert_ptr = NULL; + struct item_head * new_insert_key_addr = insert_key; + + RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] ); + + order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0; + + /* Using insert_size[h] calculate the number insert_num of items + that must be inserted to or deleted from S[h]. */ + insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper **/ + RFALSE( insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE( h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if ( insert_num < 0 ) { + balance_internal_when_delete (tb, h, child_pos); + return order; + } + + k = 0; + if ( tb->lnum[h] > 0 ) { + /* shift lnum[h] items from S[h] to the left neighbor L[h]. 
+ check how many of new items fall into L[h] or CFL[h] after + shifting */ + n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */ + if ( tb->lnum[h] <= child_pos ) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); + /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/ + child_pos -= tb->lnum[h]; + } else if ( tb->lnum[h] > child_pos + insert_num ) { + /* all new items fall into L[h] */ + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); + /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, + tb->lnum[h]-insert_num); + */ + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position (tb, h); + internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1, + insert_num,insert_key,insert_ptr); + + insert_num = 0; + } else { + struct disk_child * dc; + + /* some items fall into L[h] or CFL[h], but some don't fall */ + internal_shift1_left(tb,h,child_pos+1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position (tb, h); + internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k, + insert_key,insert_ptr); + + replace_lkey(tb,h,insert_key + k); + + /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k])); + put_dc_block_number( dc, insert_ptr[k]->b_blocknr ); + + do_balance_mark_internal_dirty (tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } /* tb->lnum[h] > 0 */ + + if ( tb->rnum[h] > 0 ) { + /*shift rnum[h] items from S[h] to the right neighbor R[h]*/ + /* check 
how many of new items fall into R or CFR after shifting */ + n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ + if ( n - tb->rnum[h] >= child_pos ) + /* new items fall into S[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); + else + if ( n + insert_num - tb->rnum[h] < child_pos ) + { + /* all new items fall into R[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], + tb->rnum[h] - insert_num);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position (tb, h); + internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1, + insert_num,insert_key,insert_ptr); + insert_num = 0; + } + else + { + struct disk_child * dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb,h,n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position (tb, h); + internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1); + + replace_rkey(tb,h,insert_key + insert_num - k - 1); + + /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - + B_FREE_SPACE (insert_ptr[insert_num-k-1])); + put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); + + do_balance_mark_internal_dirty (tb, tb->R[h],0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ + RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + 
RFALSE( tb->blknum[h] < 0, "blknum can not be < 0"); + + if ( ! tb->blknum[h] ) + { /* node S[h] is empty now */ + RFALSE( ! tbSh, "S[h] is equal NULL"); + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb,tbSh); + return order; + } + + if ( ! tbSh ) { + /* create new root */ + struct disk_child * dc; + struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1); + struct block_head * blkh; + + + if ( tb->blknum[h] != 1 ) + reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB (tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level( blkh, h + 1 ); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number( dc, tbSh_1->b_blocknr ); + put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE ); + + do_balance_mark_internal_dirty (tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh; + + /* Change root in structure super block. 
*/ + PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); + PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); + do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + } + + if ( tb->blknum[h] == 2 ) { + int snum; + struct buffer_info dest_bi, src_bi; + + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level( B_BLK_HEAD(S_new), h + 1 ); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + + n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1)/2; + if ( n - snum >= child_pos ) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); + /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/ + } else if ( n + insert_num - snum < child_pos ) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); + /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/ + + /* insert insert_num keys and node-pointers into S_new */ + internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1, + insert_num,insert_key,insert_ptr); + + insert_num = 0; + } else { + struct disk_child * dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par 
*/ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); + /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/ + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key,insert_key + insert_num - k - 1, + KEY_SIZE); + /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + + dc = B_N_CHILD(S_new,0); + put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - + B_FREE_SPACE(insert_ptr[insert_num-k-1])) ); + put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); + + do_balance_mark_internal_dirty (tb, S_new,0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) || + buffer_dirty (S_new), + "cm-00001: bad S_new (%b)", S_new); + + // S_new is released in unfix_nodes + } + + n = B_NR_ITEMS (tbSh); /*number of items in S[h] */ + + if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + internal_insert_childs ( + &bi,/*tbSh,*/ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/ + child_pos,insert_num,insert_key,insert_ptr + ); + } + + + memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; + } + + + diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c new file mode 100644 index 000000000000..7543031396f4 --- /dev/null +++ b/fs/reiserfs/inode.c @@ -0,0 +1,2846 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_acl.h> +#include <linux/reiserfs_xattr.h> +#include <linux/smp_lock.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/quotaops.h> + +extern int reiserfs_default_io_size; /* default io size devuned in super.c */ + +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); +static int reiserfs_prepare_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +void reiserfs_delete_inode (struct inode * inode) +{ + /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS; + struct reiserfs_transaction_handle th ; + + reiserfs_write_lock(inode->i_sb); + + /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. 
*/ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + down (&inode->i_sem); + + reiserfs_delete_xattrs (inode); + + if (journal_begin(&th, inode->i_sb, jbegin_count)) { + up (&inode->i_sem); + goto out; + } + reiserfs_update_inode_transaction(inode) ; + + if (reiserfs_delete_object (&th, inode)) { + up (&inode->i_sem); + goto out; + } + + /* Do quota update inside a transaction for journaled quotas. We must do that + * after delete_object so that quota updates go into the same transaction as + * stat data deletion */ + DQUOT_FREE_INODE(inode); + + if (journal_end(&th, inode->i_sb, jbegin_count)) { + up (&inode->i_sem); + goto out; + } + + up (&inode->i_sem); + + /* all items of file are deleted, so we can remove "save" link */ + remove_save_link (inode, 0/* not truncate */); /* we can't do anything + * about an error here */ + } else { + /* no object items are in the tree */ + ; + } +out: + clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */ + inode->i_blocks = 0; + reiserfs_write_unlock(inode->i_sb); +} + +static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, + loff_t offset, int type, int length ) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset (key, offset); + set_cpu_key_k_type (key, type); + key->key_length = length; +} + + +/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set + offset and type of key */ +void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset, + int type, int length ) +{ + _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id), + le32_to_cpu (INODE_PKEY (inode)->k_objectid), + offset, type, length); +} + + +// +// when key is 0, do not set version and short key +// +inline void make_le_item_head (struct item_head * 
ih, const struct cpu_key * key, + int version, + loff_t offset, int type, int length, + int entry_count/*or ih_free_space*/) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid); + } + put_ih_version( ih, version ); + set_le_ih_k_offset (ih, offset); + set_le_ih_k_type (ih, type); + put_ih_item_len( ih, length ); + /* set_ih_free_space (ih, 0);*/ + // for directory items it is entry count, for directs and stat + // datas - 0xffff, for indirects - 0 + put_ih_entry_count( ih, entry_count ); +} + +// +// FIXME: we might cache recently accessed indirect item + +// Ugh. Not too eager for that.... +// I cut the code until such time as I see a convincing argument (benchmark). +// I don't want a bloated inode struct..., and I don't like code complexity.... + +/* cutting the code is fine, since it really isn't in use yet and is easy +** to add back in. But, Vladimir has a really good idea here. Think +** about what happens for reading a file. For each page, +** The VFS layer calls reiserfs_readpage, who searches the tree to find +** an indirect item. This indirect item has X number of pointers, where +** X is a big number if we've done the block allocation right. But, +** we only use one or two of these pointers during each call to readpage, +** needlessly researching again later on. +** +** The size of the cache could be dynamic based on the size of the file. +** +** I'd also like to see us cache the location the stat data item, since +** we are needlessly researching for that frequently. +** +** --chris +*/ + +/* If this page has a file tail in it, and +** it was read in by get_block_create_0, the page data is valid, +** but tail is still sitting in a direct item, and we can't write to +** it. So, look through this page, and check all the mapped buffers +** to make sure they have valid block numbers. 
Any that don't need +** to be unmapped, so that block_prepare_write will correctly call +** reiserfs_get_block to convert the tail into an unformatted node +*/ +static inline void fix_tail_page_for_writing(struct page *page) { + struct buffer_head *head, *next, *bh ; + + if (page && page_has_buffers(page)) { + head = page_buffers(page) ; + bh = head ; + do { + next = bh->b_this_page ; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh) ; + } + bh = next ; + } while (bh != head) ; + } +} + +/* reiserfs_get_block does not need to allocate a block only if it has been + done already or non-hole position has been found in the indirect item */ +static inline int allocation_needed (int retval, b_blocknr_t allocated, + struct item_head * ih, + __u32 * item, int pos_in_item) +{ + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih (ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; +} + +static inline int indirect_item_found (int retval, struct item_head * ih) +{ + return (retval == POSITION_FOUND) && is_indirect_le_ih (ih); +} + + +static inline void set_block_dev_mapped (struct buffer_head * bh, + b_blocknr_t block, struct inode * inode) +{ + map_bh(bh, inode->i_sb, block); +} + + +// +// files which were created in the earlier version can not be longer, +// than 2 gb +// +static int file_capable (struct inode * inode, long block) +{ + if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file. 
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + return 1; + + return 0; +} + +/*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, + struct inode *inode, struct path *path) { + struct super_block *s = th->t_super ; + int len = th->t_blocks_allocated ; + int err; + + BUG_ON (!th->t_trans_id); + BUG_ON (!th->t_refcount); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0 ; + } + pathrelse(path) ; + reiserfs_update_sd(th, inode) ; + err = journal_end(th, s, len) ; + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ; + if (!err) + reiserfs_update_inode_transaction(inode) ; + } + return err; +} + +// it is called by get_block when create == 0. Returns block number +// for 'block'-th logical block of file. When it hits direct item it +// returns 0 (being called from bmap) or read direct item into piece +// of page (bh_result) + +// Please improve the english/clarity in the comment above, as it is +// hard to understand. + +static int _get_block_create_0 (struct inode * inode, long block, + struct buffer_head * bh_result, + int args) +{ + INITIALIZE_PATH (path); + struct cpu_key key; + struct buffer_head * bh; + struct item_head * ih, tmp_ih; + int fs_gen ; + int blocknr; + char * p = NULL; + int chars; + int ret ; + int done = 0 ; + unsigned long offset ; + + // prepare the key to look for the 'block'-th block of file + make_cpu_key (&key, inode, + (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); + +research: + if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) { + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. 
+ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { + return -ENOENT ; + } + return 0 ; + } + + // + bh = get_last_bh (&path); + ih = get_ih (&path); + if (is_indirect_le_ih (ih)) { + __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih); + + /* FIXME: here we could cache indirect item or part of it in + the inode to avoid search_by_key in case of subsequent + access to file */ + blocknr = get_block_num(ind_item, path.pos_in_item) ; + ret = 0 ; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { + ret = -ENOENT ; + } + + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + return ret ; + } + + // requested data are in direct item(s) + if (!(args & GET_BLOCK_READ_DIRECT)) { + // we are called by bmap. FIXME: we can not map block of file + // when it is stored in direct item(s) + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + return -ENOENT; + } + + /* if we've got a direct item, and the buffer or page was uptodate, + ** we don't want to pull data off disk again. skip to the + ** end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished ; + } else + /* + ** grab_tail_page can trigger calls to reiserfs_get_block on up to date + ** pages without any buffers. If the page is up to date, we don't want + ** read old data off disk. 
Set the up to date bit on the buffer instead + ** and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + set_buffer_uptodate(bh_result); + goto finished ; + } + + // read file tail into part of page + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ; + fs_gen = get_generation(inode->i_sb) ; + copy_item_head (&tmp_ih, ih); + + /* we only want to kmap if we are reading the tail into the page. + ** this is not the common case, so we don't kmap until we are + ** sure we need to. But, this means the item might move if + ** kmap schedules + */ + if (!p) { + p = (char *)kmap(bh_result->b_page) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + goto research; + } + } + p += offset ; + memset (p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih (ih)) { + BUG (); + } + /* make sure we don't read more bytes than actually exist in + ** the file. This can happen in odd cases where i_size isn't + ** correct, and when direct item padding results in a few + ** extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break ; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item; + done = 1 ; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars); + + if (done) + break ; + + p += chars; + + if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1)) + // we done, if read direct item is not the last item of + // node FIXME: we could try to check right delimiting key + // to see whether direct item continues in the right + // neighbor or rely on i_size + break; + + // update key to look for the next piece + set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars); + if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) + // we read something from tail, even if now we got 
IO_ERROR + break; + bh = get_last_bh (&path); + ih = get_ih (&path); + } while (1); + + flush_dcache_page(bh_result->b_page) ; + kunmap(bh_result->b_page) ; + +finished: + pathrelse (&path); + /* this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate (bh_result); + return 0; +} + + +// this is called to create file map. So, _get_block_create_0 will not +// read direct item +static int reiserfs_bmap (struct inode * inode, sector_t block, + struct buffer_head * bh_result, int create) +{ + if (!file_capable (inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0 (inode, block, bh_result, 0) ; + reiserfs_write_unlock(inode->i_sb); + return 0; +} + +/* special version of get_block that is only used by grab_tail_page right +** now. It is sent to block_prepare_write, and when you try to get a +** block past the end of the file (or a block from a hole) it returns +** -ENOENT instead of a valid buffer. block_prepare_write expects to +** be able to do i/o on the buffers returned, unless an error value +** is also returned. +** +** So, this allows block_prepare_write to be used for reading a single block +** in a page. Where it does not produce a valid page for holes, or past the +** end of the file. This turns out to be exactly what we need for reading +** tails for conversion. +** +** The point of the wrapper is forcing a certain value for create, even +** though the VFS layer is calling this function with create==1. If you +** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, +** don't use this function. 
+*/ +static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block, + struct buffer_head * bh_result, int create) { + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; +} + +/* This is special helper for reiserfs_get_block in case we are executing + direct_IO request. */ +static int reiserfs_get_blocks_direct_io(struct inode *inode, + sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, + int create) +{ + int ret ; + + bh_result->b_page = NULL; + + /* We set the b_size before reiserfs_get_block call since it is + referenced in convert_tail_for_hole() that may be called from + reiserfs_get_block() */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE) ; + if (ret) + goto out; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL ; + } + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + lock_kernel(); + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + unlock_kernel(); + if (err < 0) + ret = err; + } +out: + return ret ; +} + + +/* +** helper function for when reiserfs_get_block is called for a hole +** but the file tail is still in a direct item +** bh_result is the buffer head for the hole +** tail_offset is the offset of the start of the tail in the file +** +** This calls prepare_write, which will start a new transaction +** you should not be in a transaction, or have any paths held when you +** call this. 
+*/ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) { + unsigned long index ; + unsigned long tail_end ; + unsigned long tail_start ; + struct page * tail_page ; + struct page * hole_page = bh_result->b_page ; + int retval = 0 ; + + /* tail_offset is a 1-based key offset: anything that is not the first byte of a block is rejected */ if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO ; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ; + tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; + + index = tail_offset >> PAGE_CACHE_SHIFT ; + /* hole_page can be NULL in case of direct_io, we are sure + that we cannot get here if we write with O_DIRECT into + tail page */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index) ; + retval = -ENOMEM; + if (!tail_page) { + goto out ; + } + } else { + tail_page = hole_page ; + } + + /* we don't have to make sure the conversion did not happen while + ** we were locking the page because anyone that could convert + ** must first take i_sem. + ** + ** We must fix the tail page for writing because it might have buffers + ** that are mapped, but have a block number of 0. This indicates tail + ** data that has been read directly into the page, and block_prepare_write + ** won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page) ; + retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); + if (retval) + goto unlock ; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page) ; + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ; + +unlock: + /* only drop the page if we grabbed it ourselves above */ if (tail_page != hole_page) { + unlock_page(tail_page) ; + page_cache_release(tail_page) ; + } +out: + return retval ; +} + /* Allocate one block for an unformatted node of 'inode'. When REISERFS_PREALLOCATE is defined and GET_BLOCK_NO_ISEM is clear in 'flags' the request goes through reiserfs_new_unf_blocknrs2(), otherwise through reiserfs_new_unf_blocknrs(). The handle must belong to a running transaction (BUG_ON otherwise). */ +static inline int _allocate_block(struct reiserfs_transaction_handle *th, + long block, + struct inode *inode, + b_blocknr_t *allocated_block_nr, + struct path * path, + int flags) { + BUG_ON (!th->t_trans_id); + +#ifdef REISERFS_PREALLOCATE + if (!(flags & GET_BLOCK_NO_ISEM)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block); + } +#endif + return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block); +} + /* Map logical block 'block' of 'inode' into bh_result. Without GET_BLOCK_CREATE in 'create' this is a pure lookup done by _get_block_create_0(). With it, a block is allocated when needed and the tree is updated: a zero pointer in an indirect item is plugged, an indirect item is inserted or appended (padding with holes), or a direct-item tail is converted. Returns 0 on success or a negative errno (-EIO, -EFBIG, -ENOMEM, -ENOSPC, -EDQUOT, -EEXIST, or whatever the helpers return). */ +int reiserfs_get_block (struct inode * inode, sector_t block, + struct buffer_head * bh_result, int create) +{ + int repeat, retval = 0; + b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head * bh, * unbh = NULL; + struct item_head * ih, tmp_ih; + __u32 * item; + int done; + int fs_gen; + struct reiserfs_transaction_handle *th = NULL; + /* space reserved in transaction batch: + . 3 balancings in direct->indirect conversion + . 1 block involved into reiserfs_update_sd() + XXX in practically impossible worst case direct2indirect() + can incur (much) more than 3 balancings. + quota update for user, group */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; + int version; + int dangle = 1; + loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; + + /* bad.... the write lock is taken here and held until every return path below 
*/ + reiserfs_write_lock(inode->i_sb); + version = get_inode_item_key_version (inode); + + /* NOTE(review): if sector_t is an unsigned type this check can never fire -- confirm */ if (block < 0) { + reiserfs_write_unlock(inode->i_sb); + return -EIO; + } + + if (!file_capable (inode, block)) { + reiserfs_write_unlock(inode->i_sb); + return -EFBIG; + } + + /* if !create, we aren't changing the FS, so we don't need to + ** log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret ; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0 (inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT) ; + reiserfs_write_unlock(inode->i_sb); + return ret; + } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key (&key, inode, new_offset, + TYPE_ANY, 3/*key length*/); + /* the block reaches past i_size, so the file will grow: open the transaction up front */ if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { +start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } + reiserfs_update_inode_transaction(inode) ; + } + research: + + retval = search_for_position_by_key (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_last_bh (&path); + ih = get_ih (&path); + item = get_item (&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation (inode->i_sb); + copy_item_head (&tmp_ih, ih); + + if (allocation_needed (retval, 
allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path) ; + goto start_trans; + } + + repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create); + + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + /* restart the transaction to give the journal a chance to free + ** some blocks. releases the path, so we have to go back to + ** research if we succeed on the second try + */ + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path) ; + if (retval) + goto failure; + repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research ; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + /* the tree may have changed while we were allocating; if our item moved, search again */ if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found (retval, ih)) { + b_blocknr_t unfm_ptr; + /* 'block'-th block is in the file already (there is + corresponding cell in some indirect item). 
But it may be + zero unformatted node pointer (hole) */ + unfm_ptr = get_block_num (item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr) ; + unfm_ptr = allocated_block_nr; + journal_mark_dirty (th, inode->i_sb, bh); + reiserfs_update_sd(th, inode) ; + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse (&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock(inode->i_sb); + + /* the item was found, so new blocks were not added to the file + ** there is no need to make sure the inode is updated with this + ** transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path) ; + goto start_trans; + } + + /* desired position is not found or is in the direct item. We have + to append file with holes up to 'block'-th block converting + direct items to indirect one if necessary */ + done = 0; + do { + if (is_statdata_le_ih (ih)) { + __u32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT, + UNFM_P_SIZE, 0/* free_space */); + + if (cpu_key_k_offset (&key) == 1) { + /* we are going to add 'block'-th block to the file. 
Use + allocated block for that */ + unp = cpu_to_le32 (allocated_block_nr); + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; // ;) + set_cpu_key_k_offset (&tmp_key, 1); + PATH_LAST_POSITION(&path) ++; + + retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp); + if (retval) { + reiserfs_free_block (th, inode, allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + } + //mark_tail_converted (inode); + } else if (is_direct_le_ih (ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + if (tail_offset == cpu_key_k_offset (&key)) { + /* direct item we just found fits into block we have + to map. Convert it into unformatted node: use + bh_result for the conversion */ + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* we have to pad file tail stored in direct item(s) + up to block size and convert it to unformatted + node. 
FIXME: this should also get into page cache */ + + pathrelse(&path) ; + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON (!th->t_refcount); + if (th->t_refcount == 1) { + retval = reiserfs_end_persistent_transaction(th); + th = NULL; + if (retval) + goto failure; + } + + retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; + if (retval) { + if ( retval != -ENOSPC ) + reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ; + if (allocated_block_nr) { + /* the bitmap, the super, and the stat data == 3 */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb,3); + if (th) + reiserfs_free_block (th,inode,allocated_block_nr,1); + } + goto failure ; + } + goto research ; + } + retval = direct2indirect (th, inode, &path, unbh, tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block (th, inode, allocated_block_nr, 1); + goto failure; + } + /* it is important the set_buffer_uptodate is done after + ** the direct2indirect. The buffer might contain valid + ** data newer than the data on disk (read by readpage, changed, + ** and then sent here by writepage). direct2indirect needs + ** to know if unbh was already up to date, so it can decide + ** if the data in unbh needs to be replaced with data from + ** the disk + */ + set_buffer_uptodate (unbh); + + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + the inode's tail list */ + if ( unbh->b_page ) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh) ; + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
+ * It's still atomic, but it sets the page dirty too, + * which makes it eligible for writeback at any time by the + * VM (which was also the case with __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh) ; + } + } else { + /* append indirect item with holes if needed, when appending + pointer to 'block'-th block use block, which is already + allocated */ + struct cpu_key tmp_key; + unp_t unf_single=0; // We use this in case we need to allocate only + // one block which is a fastpath + unp_t *un; + __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* indirect item has to be appended, set up key of that position */ + make_cpu_key (&tmp_key, inode, + le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize), + //pos_in_item * inode->i_sb->s_blocksize, + TYPE_INDIRECT, 3);// key type is unimportant + + blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits); + /* NOTE(review): blocks_needed is declared __u64, so this condition can never be true */ RFALSE( blocks_needed < 0, "green-805: invalid offset"); + + if ( blocks_needed == 1 ) { + un = &unf_single; + } else { + un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE, + GFP_ATOMIC); // We need to avoid scheduling. + if ( !un) { + /* allocation failed: fall back to pasting a single pointer from the stack */ un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } else + memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert)); + } + if ( blocks_needed <= max_to_insert) { + /* we are going to add target block to the file. 
Use allocated + block for that */ + un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr); + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* If kmalloc failed, max_to_insert becomes zero and it means we + only have space for one block */ + blocks_needed=max_to_insert?max_to_insert:1; + } + retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed); + + /* blocks_needed == 1 means 'un' points at unf_single on the stack, which must not be freed */ + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block (th, inode, allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* We need to mark new file size in case this function will be + interrupted/aborted later on. And we may do this only for + holes. */ + inode->i_size += inode->i_sb->s_blocksize * blocks_needed; + } + } + + if (done == 1) + break; + + /* this loop could log more blocks than we had originally asked + ** for. So, we have to allow the transaction to end if it is + ** too big or too full. Update the inode so things are + ** consistent if we crash before the function returns + ** + ** release the path so that anybody waiting on the path before + ** ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path) ; + if (retval) + goto failure; + } + /* inserting indirect pointers for a hole can take a + ** long time. 
reschedule if needed + */ + cond_resched(); + + retval = search_for_position_by_key (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: " + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block (th, inode, allocated_block_nr, 1); + pathrelse(&path) ; + goto failure; + } + bh = get_last_bh (&path); + ih = get_ih (&path); + item = get_item (&path); + pos_in_item = path.pos_in_item; + } while (1); + + + retval = 0; + + failure: + /* common exit: close the transaction unless the caller asked for it to be left open (dangle); a handle whose t_trans_id is 0 after an error is closed regardless */ if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock(inode->i_sb); + reiserfs_check_path(&path) ; + return retval; +} + /* readpages address-space operation: hands the page list to mpage_readpages() with reiserfs_get_block as the mapping callback */ +static int +reiserfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); +} + +/* Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize ; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size ; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. 
When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size; + return bytes ; +} + /* Convert an on-disk block count (512-byte units, hence the << 9) into the byte count handed to inode_set_bytes(): symlinks and directories use i_size plus the stat data size, everything else uses blocks*512 plus real_space_diff() */ +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ; + } + return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode) ; + loff_t real_space = real_space_diff(inode, sd_size) ; + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t)511 ; + } + + /* files from before the quota patch might have i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0 ; + return (bytes - real_space) >> 9; +} + +// +// BAD: new directories have stat data of new type and all other items +// of old type. 
Version stored in the inode says about body items, so +// in update_stat_data we can not rely on inode, but have to check +// item version directly +// + +// called by read_locked_inode +// Fills the in-core inode from the stat data item that 'path' points at, +// releases the path, and installs the inode/file/address-space operations +// that match the file type. +static void init_inode (struct inode * inode, struct path * path) +{ + struct buffer_head * bh; + struct item_head * ih; + __u32 rdev; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER (path); + ih = PATH_PITEM_HEAD (path); + + + copy_key (INODE_PKEY (inode), &(ih->ih_key)); + inode->i_blksize = reiserfs_default_io_size; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list )); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); + + if (stat_data_v1 (ih)) { + struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); + unsigned long blocks; + + set_inode_item_key_version (inode, KEY_FORMAT_3_5); + set_inode_sd_version (inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + inode->i_nlink = sd_v1_nlink(sd); + inode->i_uid = sd_v1_uid(sd); + inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9); + if (inode->i_blocks > blocks) { + // there was a bug in <=3.5.23 when i_blocks could take negative + // values. Starting from 3.5.17 this value could even be stored in + // stat data. For such files we set i_blocks based on file + // size. 
Just 2 notes: this can be wrong for sparse files. On-disk value will be + // only updated if file's inode will ever change + inode->i_blocks = blocks; + } + + rdev = sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++ ; + } + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* nopack is initially zero for v1 objects. For v2 objects, + nopack is initialised from sd_attrs */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + // new stat data found, but object may have old items + // (directories and symlinks) + struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih); + + inode->i_mode = sd_v2_mode(sd); + inode->i_nlink = sd_v2_nlink(sd); + inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); + inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + /* device nodes take their generation from the key's dir id; other objects read it from the stat data (inode2sd() writes either rdev or generation, never both) */ if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) ) + inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + + if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode)) + set_inode_item_key_version (inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version (inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version (inode, STAT_DATA_V2); + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* read persistent inode attributes from sd and initialise + generic inode flags from them */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd ); + sd_attrs_to_i_attrs( 
sd_v2_attrs( sd ), inode ); + } + + /* bh/ih are not touched past this point, so the path can be dropped */ pathrelse (path); + if (S_ISREG (inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations ; + } else if (S_ISDIR (inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK (inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} + + +// update new stat data with inode fields +// ('size' is stored as the file size instead of inode->i_size) +static void inode2sd (void * sd, struct inode * inode, loff_t size) +{ + struct stat_data * sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode ); + set_sd_v2_nlink(sd_v2, inode->i_nlink ); + set_sd_v2_uid(sd_v2, inode->i_uid ); + set_sd_v2_size(sd_v2, size ); + set_sd_v2_gid(sd_v2, inode->i_gid ); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec ); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec ); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec ); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs( inode, &flags ); + set_sd_v2_attrs( sd_v2, flags ); +} + + +// used to copy inode's fields to old stat data +// ('size' is stored as the file size instead of inode->i_size) +static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size) +{ + struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode ); + set_sd_v1_uid(sd_v1, inode->i_uid ); + set_sd_v1_gid(sd_v1, inode->i_gid ); + set_sd_v1_nlink(sd_v1, inode->i_nlink ); + set_sd_v1_size(sd_v1, size ); + set_sd_v1_atime(sd_v1, 
inode->i_atime.tv_sec ); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec ); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec ); + + /* only one of rdev / blocks is written -- presumably the two share storage in the v1 stat data; TODO confirm against struct stat_data_v1 */ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + + // Sigh. i_first_direct_byte is back + set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); +} + + +/* NOTE, you must prepare the buffer head before sending it here, +** and then log it after the call +** +** Writes the inode fields into the stat data item 'path' points at, +** picking the v1 or v2 layout from the item head. Panics if the item +** is not a stat data item. +*/ +static void update_stat_data (struct path * path, struct inode * inode, + loff_t size) +{ + struct buffer_head * bh; + struct item_head * ih; + + bh = PATH_PLAST_BUFFER (path); + ih = PATH_PITEM_HEAD (path); + + if (!is_statdata_le_ih (ih)) + reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h", + INODE_PKEY (inode), ih); + + if (stat_data_v1 (ih)) { + // path points to old stat data + inode2sd_v1 (B_I_PITEM (bh, ih), inode, size); + } else { + inode2sd (B_I_PITEM (bh, ih), inode, size); + } + + return; +} + + /* Look up the stat data item of 'inode', copy the in-core fields into it (recording 'size' as the file size) and log the buffer in transaction 'th'. Failures are only reported through reiserfs_warning(); nothing is returned to the caller. Requires a running transaction (BUG_ON otherwise). */ +void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, + struct inode * inode, loff_t size) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh ; + int fs_gen ; + struct item_head *ih, tmp_ih ; + int retval; + + BUG_ON (!th->t_trans_id); + + make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant + + for(;;) { + int pos; + /* look for the object's stat data */ + retval = search_item (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: " + "i/o failure occurred trying to update %K stat data", + &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION (&path); + pathrelse(&path) ; + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/ + return; + } + 
reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: " + "stat data of object %k (nlink == %d) not found (pos %d)", + INODE_PKEY (inode), inode->i_nlink, pos); + reiserfs_check_path(&path) ; + return; + } + + /* sigh, prepare_for_journal might schedule. When it schedules the + ** FS might change. We have to detect that, and loop back to the + ** search if the stat data item has moved + */ + bh = get_last_bh(&path) ; + ih = get_ih(&path) ; + copy_item_head (&tmp_ih, ih); + fs_gen = get_generation (inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + continue ; /* Stat_data item has been moved after scheduling. */ + } + break; + } + update_stat_data (&path, inode, size); + journal_mark_dirty(th, th->t_super, bh) ; + pathrelse (&path); + return; +} + +/* reiserfs_read_locked_inode is called to read the inode off disk, and it +** does a make_bad_inode when things go wrong. But, we need to make sure +** and clear the key in the private portion of the inode, otherwise a +** corresponding iput might try to delete whatever object the inode last +** represented. 
+*/ +static void reiserfs_make_bad_inode(struct inode *inode) { + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); +} + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// + +int reiserfs_init_locked_inode (struct inode * inode, void *p) +{ + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; +} + +/* looks for stat data in the tree, and fills up the fields of in-core + inode stat data fields */ +void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args) +{ + INITIALIZE_PATH (path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid ; + + /* set version 1, version 2 could be used too, because stat data + key is the same in both versions */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET; + key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS; + + /* look for the object's stat data */ + retval = search_item (inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: " + "i/o failure occurred trying to find stat data of %K", + &key); + reiserfs_make_bad_inode(inode) ; + return; + } + if (retval != ITEM_FOUND) { + /* a stale NFS handle can trigger this without it being an error */ + pathrelse (&path_to_sd); + reiserfs_make_bad_inode(inode) ; + inode->i_nlink = 0; + return; + } + + init_inode (inode, &path_to_sd); + + /* It is possible that knfsd is trying to access inode of a file + that is being removed from the disk by some other thread. As we + update sd on unlink all that is required is to check for nlink + here. 
This bug was first found by Sizif when debugging + SquidNG/Butterfly, forgotten, and found again after Philippe + Gramoulle <philippe.gramoulle@mmania.com> reproduced it. + + More logical fix would require changes in fs/inode.c:iput() to + remove inode from hash-table _after_ fs cleaned disk stuff up and + in iget() to return NULL if I_FREEING inode is found in + hash-table. */ + /* Currently there is one place where it's ok to meet inode with + nlink==0: processing of open-unlinked and half-truncated files + during mount (fs/reiserfs/super.c:finish_unfinished()). */ + if( ( inode -> i_nlink == 0 ) && + ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) { + reiserfs_warning (inode->i_sb, + "vs-13075: reiserfs_read_locked_inode: " + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key ); + reiserfs_make_bad_inode( inode ); + } + + reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */ + +} + +/** + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). + * + * @inode: inode from hash table to check + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. + * + * This function is called by iget5_locked() to distinguish reiserfs inodes + * having the same inode numbers. Such inodes can only exist due to some + * error condition. One of them should be bad. Inodes with identical + * inode numbers (objectids) are distinguished by parent directory ids. 
+ * + */ +int reiserfs_find_actor( struct inode *inode, void *opaque ) +{ + struct reiserfs_iget_args *args; + + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); +} + +struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key) +{ + struct inode * inode; + struct reiserfs_iget_args args ; + + args.objectid = key->on_disk_key.k_objectid ; + args.dirid = key->on_disk_key.k_dir_id ; + inode = iget5_locked (s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); + if (!inode) + return ERR_PTR(-ENOMEM) ; + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) { + /* either due to i/o error or a stale NFS handle */ + iput (inode); + inode = NULL; + } + return inode; +} + +struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) +{ + __u32 *data = vobjp; + struct cpu_key key ; + struct dentry *result; + struct inode *inode; + + key.on_disk_key.k_objectid = data[0] ; + key.on_disk_key.k_dir_id = data[1] ; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key) ; + if (inode && !IS_ERR(inode) && data[2] != 0 && + data[2] != inode->i_generation) { + iput(inode) ; + inode = NULL ; + } + reiserfs_write_unlock(sb); + if (!inode) + inode = ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data, + int len, int fhtype, + int (*acceptable)(void *contect, struct dentry *de), + void *context) { + __u32 obj[3], parent[3]; + + /* fhtype happens to reflect the number of u32s encoded. 
+ * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. + * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fhtype > len) { + if (fhtype != 6 || len != 5) + reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fhtype, len); + fhtype = 5; + } + + obj[0] = data[0]; + obj[1] = data[1]; + if (fhtype == 3 || fhtype >= 5) + obj[2] = data[2]; + else obj[2] = 0; /* generation number */ + + if (fhtype >= 4) { + parent[0] = data[fhtype>=5?3:2] ; + parent[1] = data[fhtype>=5?4:3] ; + if (fhtype == 6) + parent[2] = data[5]; + else parent[2] = 0; + } + return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent, + acceptable, context); +} + +int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) { + struct inode *inode = dentry->d_inode ; + int maxlen = *lenp; + + if (maxlen < 3) + return 255 ; + + data[0] = inode->i_ino ; + data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; + data[2] = inode->i_generation ; + *lenp = 3 ; + /* no room for directory info? return what we've stored so far */ + if (maxlen < 5 || ! need_parent) + return 3 ; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode ; + data[3] = inode->i_ino ; + data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; + *lenp = 5 ; + if (maxlen >= 6) { + data[5] = inode->i_generation ; + *lenp = 6 ; + } + spin_unlock(&dentry->d_lock); + return *lenp ; +} + + +/* looks for stat data, then copies fields to it, marks the buffer + containing stat data as dirty */ +/* reiserfs inodes are never really dirty, since the dirty inode call +** always logs them. 
This call allows the VFS inode marking routines +** to properly mark inodes for datasync and such, but only actually +** does something when called for a synchronous update. +*/ +int reiserfs_write_inode (struct inode * inode, int do_sync) { + struct reiserfs_transaction_handle th ; + int jbegin_count = 1 ; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* memory pressure can sometimes initiate write_inode calls with sync == 1, + ** these cases are just when the system needs ram, not when the + ** inode needs to reach disk for safety, and they can safely be + ** ignored because the altered inode has already been logged. + */ + if (do_sync && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd (&th, inode); + journal_end_sync(&th, inode->i_sb, jbegin_count) ; + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; +} + +/* stat data of new object is inserted already, this inserts the item + containing "." and ".." entries */ +static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head * ih, struct path * path, + struct inode * dir) +{ + struct super_block * sb = th->t_super; + char empty_dir [EMPTY_DIR_SIZE]; + char * body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON (!th->t_trans_id); + + _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id), + le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/); + + /* compose item head for new item. Directories consist of items of + old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it + is done by reiserfs_new_inode */ + if (old_format_only (sb)) { + make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, + INODE_PKEY (dir)->k_dir_id, + INODE_PKEY (dir)->k_objectid ); + } else { + make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, + INODE_PKEY (dir)->k_dir_id, + INODE_PKEY (dir)->k_objectid ); + } + + /* look for place in the tree for new item */ + retval = search_item (sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: " + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse (path); + reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: " + "object with this key exists (%k)", &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item (th, path, &key, ih, inode, body); +} + + +/* stat data of object has been inserted, this inserts the item + containing the body of symlink */ +static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, + struct inode *inode, /* Inode of symlink */ + struct item_head * ih, + struct path * path, const char * symname, int item_len) +{ + struct super_block * sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON (!th->t_trans_id); + + _make_cpu_key (&key, KEY_FORMAT_3_5, + le32_to_cpu (ih->ih_key.k_dir_id), + le32_to_cpu (ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3/*key length*/); + + make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/); + + /* look for place in the tree for new item */ + retval = search_item (sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: " + "i/o 
failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse (path); + reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: " + "object with this key exists (%k)", &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is body of symlink */ + return reiserfs_insert_item (th, path, &key, ih, inode, symname); +} + + +/* inserts the stat data into the tree, and then calls + reiserfs_new_directory (to insert ".", ".." item if new object is + directory) or reiserfs_new_symlink (to insert symlink body if new + object is symlink) or nothing (if new object is regular file) + + NOTE! uid and gid must already be set in the inode. If we return + non-zero due to an error, we have to drop the quota previously allocated + for the fresh inode. This can only be done outside a transaction, so + if we return non-zero, we also end the transaction. */ +int reiserfs_new_inode (struct reiserfs_transaction_handle *th, + struct inode * dir, int mode, + const char * symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks)*/ + loff_t i_size, struct dentry *dentry, + struct inode *inode) +{ + struct super_block * sb; + INITIALIZE_PATH (path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + + BUG_ON (!th->t_trans_id); + + if (DQUOT_ALLOC_INODE(inode)) { + err = -EDQUOT; + goto out_end_trans; + } + if (!dir || !dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + sb = dir->i_sb; + + /* item head of new item */ + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode ; + } + if (old_format_only (sb)) + /* not a perfect generation count, as object ids can be reused, but + ** this is as good as reiserfs can do right now. + ** note that the private part of inode isn't filled in yet, we have + ** to use the directory. 
+ */ + inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid); + else +#if defined( USE_INODE_GENERATION_COUNTER ) + inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); +#else + inode->i_generation = ++event; +#endif + + /* fill stat data */ + inode->i_nlink = (S_ISDIR (mode) ? 2 : 1); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if( S_ISLNK( inode -> i_mode ) ) + inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND ); + + inode->i_mtime = inode->i_atime = inode->i_ctime = + CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : + U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list )); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode ); + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); + + if (old_format_only (sb)) + make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + + /* key to search for correct place for new stat data */ + _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id), + le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + + /* find proper place for inserting of stat data */ + retval = search_item (sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = -EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse (&path_to_key); + 
err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only (sb)) { + if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { + pathrelse (&path_to_key); + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1 (&sd, inode, inode->i_size); + } else { + inode2sd (&sd, inode, inode->i_size); + } + // these do not go to on-disk stat data + inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); + inode->i_blksize = reiserfs_default_io_size; + + // store in in-core inode the key of stat data and version all + // object items will have (directory items will have old offset + // format, other new objects will consist of new items) + memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE); + if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version (inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version (inode, KEY_FORMAT_3_6); + if (old_format_only (sb)) + set_inode_sd_version (inode, STAT_DATA_V1); + else + set_inode_sd_version (inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; +#endif + retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key) ; + goto out_bad_inode; + } + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; +#endif + if (S_ISDIR(mode)) { + /* insert item with "." and ".." 
*/ + retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only (sb)) + i_size = ROUND_UP(i_size); + retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key) ; + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + + /* XXX CHECK THIS */ + if (reiserfs_posixacl (inode->i_sb)) { + retval = reiserfs_inherit_default_acl (dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key) ; + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (is_reiserfs_priv_object (dir)) { + reiserfs_mark_inode_private (inode); + } + + insert_inode_hash (inode); + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key) ; + + return 0; + +/* it looks like you can easily compress these two goto targets into + * one. Keeping it like this doesn't actually hurt anything, and they + * are place holders for what the quota code actually needs. + */ +out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + DQUOT_FREE_INODE(inode); + +out_end_trans: + journal_end(th, th->t_super, th->t_blocks_allocated) ; + /* Drop can be outside and it needs more credits so it's better to have it outside */ + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + +out_inserted_sd: + inode->i_nlink = 0; + th->t_trans_id = 0; /* so the caller can't use this handle later */ + iput(inode); + return err; +} + +/* +** finds the tail page in the page cache, +** reads the last block in. 
+** +** On success, page_result is set to a locked, pinned page, and bh_result +** is set to an up to date buffer for the last block in the file. returns 0. +** +** tail conversion is not done, so bh_result might not be valid for writing +** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before +** trying to write the block. +** +** on failure, nonzero is returned, page_result and bh_result are untouched. +*/ +static int grab_tail_page(struct inode *p_s_inode, + struct page **page_result, + struct buffer_head **bh_result) { + + /* we want the page with the last byte in the file, + ** not the page that will hold the next byte for appending + */ + unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ; + unsigned long pos = 0 ; + unsigned long start = 0 ; + unsigned long blocksize = p_s_inode->i_sb->s_blocksize ; + unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ; + struct buffer_head *bh ; + struct buffer_head *head ; + struct page * page ; + int error ; + + /* we know that we are only called with inode->i_size > 0. + ** we also know that a file tail can never be as big as a block + ** If i_size % blocksize == 0, our file is currently block aligned + ** and it won't need converting or zeroing after a truncate. + */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT ; + } + page = grab_cache_page(p_s_inode->i_mapping, index) ; + error = -ENOMEM ; + if (!page) { + goto out ; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize ; + + error = block_prepare_write(page, start, offset, + reiserfs_get_block_create_0) ; + if (error) + goto unlock ; + + head = page_buffers(page) ; + bh = head; + do { + if (pos >= start) { + break ; + } + bh = bh->b_this_page ; + pos += blocksize ; + } while(bh != head) ; + + if (!buffer_uptodate(bh)) { + /* note, this should never happen, prepare_write should + ** be taking care of this for us. 
If the buffer isn't up to date, + ** I've screwed up the code to find the buffer, or the code to + ** call prepare_write + */ + reiserfs_warning (p_s_inode->i_sb, + "clm-6000: error reading block %lu on dev %s", + bh->b_blocknr, + reiserfs_bdevname (p_s_inode->i_sb)) ; + error = -EIO ; + goto unlock ; + } + *bh_result = bh ; + *page_result = page ; + +out: + return error ; + +unlock: + unlock_page(page) ; + page_cache_release(page) ; + return error ; +} + +/* +** vfs version of truncate file. Must NOT be called with +** a transaction already started. +** +** some code taken from block_truncate_page +*/ +int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { + struct reiserfs_transaction_handle th ; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; + unsigned blocksize = p_s_inode->i_sb->s_blocksize ; + unsigned length ; + struct page *page = NULL ; + int error ; + struct buffer_head *bh = NULL ; + + reiserfs_write_lock(p_s_inode->i_sb); + + if (p_s_inode->i_size > 0) { + if ((error = grab_tail_page(p_s_inode, &page, &bh))) { + // -ENOENT means we truncated past the end of the file, + // and get_block_create_0 could not find a block to read in, + // which is ok. + if (error != -ENOENT) + reiserfs_warning (p_s_inode->i_sb, + "clm-6001: grab_tail_page failed %d", + error); + page = NULL ; + bh = NULL ; + } + } + + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required on disk. We zero after the truncate, + ** because the truncate might pack the item anyway + ** (it will unmap bh if it packs). + */ + /* it is enough to reserve space in transaction for 2 balancings: + one for "save" link adding and another for the first + cut_from_item. 
1 is for update_sd */ + error = journal_begin (&th, p_s_inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(p_s_inode) ; + if (update_timestamps) + /* we are doing real truncate: if the system crashes before the last + transaction of truncating gets committed - on reboot the file + either appears truncated properly or not truncated at all */ + add_save_link (&th, p_s_inode, 1); + error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; + if (error) + goto out; + error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + + if (update_timestamps) { + error = remove_save_link (p_s_inode, 1/* truncate */); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1) ; + /* if we are not on a block boundary */ + if (length) { + char *kaddr; + + length = blocksize - length ; + kaddr = kmap_atomic(page, KM_USER0) ; + memset(kaddr + offset, 0, length) ; + flush_dcache_page(page) ; + kunmap_atomic(kaddr, KM_USER0) ; + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + mark_buffer_dirty(bh) ; + } + } + unlock_page(page) ; + page_cache_release(page) ; + } + + reiserfs_write_unlock(p_s_inode->i_sb); + return 0; +out: + if (page) { + unlock_page (page); + page_cache_release (page); + } + reiserfs_write_unlock(p_s_inode->i_sb); + return error; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) { + struct reiserfs_transaction_handle th ; + int fs_gen ; + struct item_head tmp_ih ; + struct item_head *ih ; + struct buffer_head *bh ; + __u32 *item ; + struct cpu_key key ; + INITIALIZE_PATH(path) ; + int pos_in_item ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT ; + loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; + int retval ; + int use_get_block = 0 ; + int bytes_copied = 0 ; + int copy_size ; + int trans_running = 0; + + /* catch places below that try to log 
something without starting a trans */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page) ; +start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ; + +research: + retval = search_for_position_by_key(inode->i_sb, &key, &path) ; + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out ; + } + + bh = get_last_bh(&path) ; + ih = get_ih(&path) ; + item = get_item(&path) ; + pos_in_item = path.pos_in_item ; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d", + bytes_copied) ; + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out ; + } + set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode); + } else if (is_direct_le_ih(ih)) { + char *p ; + p = page_address(bh_result->b_page) ; + p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = get_generation(inode->i_sb) ; + copy_item_head(&tmp_ih, ih) ; + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count) ; + if (retval) + goto out; + reiserfs_update_inode_transaction(inode) ; + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + goto research; + } + + memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ; + + journal_mark_dirty(&th, inode->i_sb, bh) ; + bytes_copied += copy_size ; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? 
*/ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ; + goto research ; + } + } else { + reiserfs_warning (inode->i_sb, + "clm-6003: bad item inode %lu, device %s", + inode->i_ino, reiserfs_bdevname (inode->i_sb)) ; + retval = -EIO ; + goto out ; + } + retval = 0 ; + +out: + pathrelse(&path) ; + if (trans_running) { + int err = journal_end(&th, inode->i_sb, jbegin_count) ; + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM | + GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0 ; + goto start_over ; + } + } + } + kunmap(bh_result->b_page) ; + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval ; +} + +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io + * start/recovery path as __block_write_full_page, along with special + * code to handle reiserfs tails. 
+ */ +static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host ; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; + int error = 0; + unsigned long block ; + struct buffer_head *head, *bh; + int partial = 0 ; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page) ; + + /* last page in the file, zero out any contents past the + ** last byte in the file + */ + if (page->index >= end_index) { + char *kaddr; + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ; + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ; + flush_dcache_page(page) ; + kunmap_atomic(kaddr, KM_USER0) ; + } + bh = head ; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ; + /* first map all the buffers, logging any direct items we find */ + do { + if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || + (buffer_mapped(bh) && bh->b_blocknr == 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail ; + } + } + bh = bh->b_this_page; + block++; + } while(bh != head) ; + + /* + * we 
start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). + * starting it here, we can make a reliable estimate for how many + * blocks we're going to log + */ + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); + } + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + continue; + } + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th, s, bh_per_page + 1); + reiserfs_write_unlock(s); + if (error) + goto fail; + } + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of writeback. + * be careful with the buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while(bh != head); + + error = 0; +done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. 
Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head ; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while(bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; + +fail: + /* catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from getting + * attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while(bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while(bh != head); + goto done; +} + + +static int reiserfs_readpage (struct file *f, struct page * page) +{ + return block_read_full_page (page, reiserfs_get_block); +} + + +static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host ; + reiserfs_wait_on_write_block(inode->i_sb) ; + return reiserfs_write_full_page(page, wbc) ; +} + +static int reiserfs_prepare_write(struct file *f, struct page *page, + unsigned from, unsigned to) { + struct inode *inode = page->mapping->host ; + int ret; + int old_ref = 0; + + reiserfs_wait_on_write_block(inode->i_sb) ; + fix_tail_page_for_writing(page) ; + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current->journal_info; + BUG_ON (!th->t_refcount); + BUG_ON 
(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = block_prepare_write(page, from, to, reiserfs_get_block) ; + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; + +} + + +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) { + return generic_block_bmap(as, block, reiserfs_bmap) ; +} + +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) { + struct inode *inode = page->mapping->host ; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + + reiserfs_wait_on_write_block(inode->i_sb) ; + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); + + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. 
+ */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth ; + reiserfs_write_lock(inode->i_sb); + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + + ret = journal_begin(&myth, inode->i_sb, 1) ; + if (ret) { + reiserfs_write_unlock(inode->i_sb); + goto journal_error; + } + reiserfs_update_inode_transaction(inode) ; + inode->i_size = pos ; + reiserfs_update_sd(&myth, inode) ; + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1) ; + reiserfs_write_unlock(inode->i_sb); + if (ret) + goto journal_error; + } + if (th) { + reiserfs_write_lock(inode->i_sb); + if (!update_sd) + reiserfs_update_sd(th, inode) ; + ret = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (ret) + goto out; + } + + /* we test for O_SYNC here so we can commit the transaction + ** for any packed tails the file might have had + */ + if (f && (f->f_flags & O_SYNC)) { + reiserfs_write_lock(inode->i_sb); + ret = reiserfs_commit_for_inode(inode) ; + reiserfs_write_unlock(inode->i_sb); + } +out: + return ret ; + +journal_error: + if (th) { + reiserfs_write_lock(inode->i_sb); + if (!update_sd) + reiserfs_update_sd(th, inode) ; + ret = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + } + + return ret; +} + +void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode ) +{ + if( reiserfs_attrs( inode -> i_sb ) ) { + if( sd_attrs & REISERFS_SYNC_FL ) + inode -> i_flags |= S_SYNC; + else + inode -> i_flags &= ~S_SYNC; + if( sd_attrs & REISERFS_IMMUTABLE_FL ) + inode -> i_flags |= S_IMMUTABLE; + else + inode -> i_flags &= ~S_IMMUTABLE; + if( sd_attrs & REISERFS_APPEND_FL ) + inode -> i_flags |= S_APPEND; + else + inode -> i_flags &= ~S_APPEND; 
+ if( sd_attrs & REISERFS_NOATIME_FL ) + inode -> i_flags |= S_NOATIME; + else + inode -> i_flags &= ~S_NOATIME; + if( sd_attrs & REISERFS_NOTAIL_FL ) + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } +} + +void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs ) +{ + if( reiserfs_attrs( inode -> i_sb ) ) { + if( inode -> i_flags & S_IMMUTABLE ) + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; + if( inode -> i_flags & S_SYNC ) + *sd_attrs |= REISERFS_SYNC_FL; + else + *sd_attrs &= ~REISERFS_SYNC_FL; + if( inode -> i_flags & S_NOATIME ) + *sd_attrs |= REISERFS_NOATIME_FL; + else + *sd_attrs &= ~REISERFS_NOATIME_FL; + if( REISERFS_I(inode)->i_flags & i_nopack_mask ) + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; + } +} + +/* decide if this buffer needs to stay around for data logging or ordered +** write purposes +*/ +static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) +{ + int ret = 1 ; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; + + spin_lock(&j->j_dirty_buffers_lock) ; + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* the page is locked, and the only places that log a data buffer + * also lock the page. + */ + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0 ; + } + } else + if (buffer_dirty(bh) || buffer_locked(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. 
+ * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. + * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } +free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock) ; + return ret ; +} + +/* clm -- taken from fs/buffer.c:block_invalidate_page */ +static int reiserfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + int ret = 1; + + BUG_ON(!PageLocked(page)); + + if (offset == 0) + ClearPageChecked(page); + + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!offset && ret) + ret = try_to_release_page(page, 0); +out: + return ret; +} + +static int reiserfs_set_page_dirty(struct page *page) { + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * Returns 1 if the page's buffers were dropped. The page is locked. 
+ * + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads + * in the buffers at page_buffers(page). + * + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. + */ +static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) +{ + struct inode *inode = page->mapping->host ; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; + struct buffer_head *head ; + struct buffer_head *bh ; + int ret = 1 ; + + WARN_ON(PageChecked(page)); + spin_lock(&j->j_dirty_buffers_lock) ; + head = page_buffers(page) ; + bh = head ; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0 ; + break ; + } + } + bh = bh->b_this_page ; + } while (bh != head) ; + if (ret) + ret = try_to_free_buffers(page) ; + spin_unlock(&j->j_dirty_buffers_lock) ; + return ret ; +} + +/* We thank Mingming Cao for helping us understand in great detail what + to do in this section of the code. */ +static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); +} + +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { + struct inode *inode = dentry->d_inode ; + int error ; + unsigned int ia_valid = attr->ia_valid; + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG ; + goto out; + } + /* fill in hole pointers in the expanding truncate case. 
*/ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand(inode, attr->ia_size) ; + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th ; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4) ; + if (!err) { + reiserfs_discard_prealloc (&th, inode); + err = journal_end(&th, inode->i_sb, 4) ; + } + if (err) + error = err; + } + if (error) + goto out; + } + } + + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version (inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + error = inode_change_ok(inode, attr) ; + if (!error) { + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = reiserfs_chown_xattrs (inode, attr); + + if (!error) { + struct reiserfs_transaction_handle th; + + /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); + error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; + if (error) { + journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); + goto out; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + mark_inode_dirty(inode); + journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); + } + } + if (!error) + error = inode_setattr(inode, attr) ; + } + + + if (!error && reiserfs_posixacl (inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod (inode); + } + +out: + reiserfs_write_unlock(inode->i_sb); + return error ; +} + + + +struct address_space_operations reiserfs_address_space_operations = { + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .sync_page = block_sync_page, + .prepare_write = reiserfs_prepare_write, + .commit_write = reiserfs_commit_write, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +} ; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c new file mode 100644 index 000000000000..94dc42475a04 --- /dev/null +++ b/fs/reiserfs/ioctl.c @@ -0,0 +1,151 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/fs.h> +#include <linux/reiserfs_fs.h> +#include <linux/time.h> +#include <asm/uaccess.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> + +static int reiserfs_unpack (struct inode * inode, struct file * filp); + +/* +** reiserfs_ioctl - handler for ioctl for inode +** supported commands: +** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect +** and prevent packing file (argument arg has to be non-zero) +** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION +** 3) That's all for a while ... 
+*/ +int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + unsigned int flags; + + switch (cmd) { + case REISERFS_IOC_UNPACK: + if( S_ISREG( inode -> i_mode ) ) { + if (arg) + return reiserfs_unpack (inode, filp); + else + return 0; + } else + return -ENOTTY; + /* following two cases are taken from fs/ext2/ioctl.c by Remy + Card (card@masi.ibp.fr) */ + case REISERFS_IOC_GETFLAGS: + flags = REISERFS_I(inode) -> i_attrs; + i_attrs_to_sd_attrs( inode, ( __u16 * ) &flags ); + return put_user(flags, (int __user *) arg); + case REISERFS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) && + !capable( CAP_LINUX_IMMUTABLE ) ) + return -EPERM; + + if( ( flags & REISERFS_NOTAIL_FL ) && + S_ISREG( inode -> i_mode ) ) { + int result; + + result = reiserfs_unpack( inode, filp ); + if( result ) + return result; + } + sd_attrs_to_i_attrs( flags, inode ); + REISERFS_I(inode) -> i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return 0; + } + case REISERFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *) arg); + case REISERFS_IOC_SETVERSION: + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(inode->i_generation, (int __user *) arg)) + return -EFAULT; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return 0; + default: + return -ENOTTY; + } +} + +/* +** reiserfs_unpack +** Function try to convert tail from direct item into indirect. 
+** It set up nopack attribute in the REISERFS_I(inode)->nopack +*/ +static int reiserfs_unpack (struct inode * inode, struct file * filp) +{ + int retval = 0; + int index ; + struct page *page ; + struct address_space *mapping ; + unsigned long write_from ; + unsigned long blocksize = inode->i_sb->s_blocksize ; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0 ; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0 ; + } + reiserfs_write_lock(inode->i_sb); + + /* we need to make sure nobody is changing the file size beneath + ** us + */ + down(&inode->i_sem) ; + + write_from = inode->i_size & (blocksize - 1) ; + /* if we are on a block boundary, we are already unpacked. */ + if ( write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out ; + } + + /* we unpack by finding the page with the tail, and calling + ** reiserfs_prepare_write on that page. This will force a + ** reiserfs_get_block to unpack the tail for us. 
+ */ + index = inode->i_size >> PAGE_CACHE_SHIFT ; + mapping = inode->i_mapping ; + page = grab_cache_page(mapping, index) ; + retval = -ENOMEM; + if (!page) { + goto out ; + } + retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ; + if (retval) + goto out_unlock ; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page) ; + retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ; + REISERFS_I(inode)->i_flags |= i_nopack_mask; + +out_unlock: + unlock_page(page) ; + page_cache_release(page) ; + +out: + up(&inode->i_sem) ; + reiserfs_write_unlock(inode->i_sb); + return retval; +} diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c new file mode 100644 index 000000000000..9cf7c13b120d --- /dev/null +++ b/fs/reiserfs/item_ops.c @@ -0,0 +1,788 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/time.h> +#include <linux/reiserfs_fs.h> + +// this contains item handlers for old item types: sd, direct, +// indirect, directory + +/* and where are the comments? how about saying where we can find an + explanation of each item handler method? 
/*
 * Format @t as a decimal number.
 *
 * The result lives in a single static buffer: every call overwrites the
 * previous result and returns the same pointer.
 */
static char *print_time(time_t t)
{
	static char stamp[256];
	char *out = stamp;

	sprintf(out, "%ld", t);
	return out;
}
/*
 * Dump the body of a direct item to the kernel log: the ih_item_len(ih)
 * bytes at @item are printed one character at a time between double
 * quotes, followed by a newline.
 */
static void direct_print_item(struct item_head *ih, char *item)
{
	int pos;

	printk("\"");
	for (pos = 0; pos < ih_item_len(ih); pos++)
		printk("%c", item[pos]);
	printk("\"\n");
}
/*
 * How many bytes of a direct item fit into @free bytes of space.
 *
 * The answer is @free rounded towards zero to a multiple of 8; when that
 * is 0 the function returns -1 instead.  @vi, @start_skip and @end_skip
 * are not looked at.
 */
static int direct_check_left(struct virtual_item *vi, int free,
			     int start_skip, int end_skip)
{
	int aligned = free - free % 8;

	if (aligned == 0)
		return -1;
	return aligned;
}
unsigned long bsize) +{ + int version = le_key_version (key); + return (le_key_k_offset (version, key) != 1); +} + + +// printing of indirect item +static void start_new_sequence (__u32 * start, int * len, __u32 new) +{ + *start = new; + *len = 1; +} + + +static int sequence_finished (__u32 start, int * len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len) ++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len) ++; + return 0; + } + return 1; +} + +static void print_sequence (__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk (" %d", start); + else + printk (" %d(%d)", start, len); +} + + +static void indirect_print_item (struct item_head * ih, char * item) +{ + int j; + __u32 * unp, prev = INT_MAX; + int num; + + unp = (__u32 *)item; + + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning (NULL, "indirect_print_item: invalid item len"); + + printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih)); + for (j = 0; j < I_UNFM_NUM (ih); j ++) { + if (sequence_finished (prev, &num, get_block_num(unp, j))) { + print_sequence (prev, num); + start_new_sequence (&prev, &num, get_block_num(unp, j)); + } + } + print_sequence (prev, num); + printk ("]\n"); +} + +static void indirect_check_item (struct item_head * ih, char * item) +{ + // FIXME: type something here! +} + + +static int indirect_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + //vi->vi_type |= VI_TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ?: -1; +} + + +static int indirect_check_right (struct virtual_item * vi, int free) +{ + return indirect_check_left (vi, free, 0, 0); +} + + + +// return size in bytes of 'units' units. 
If first == 0 - calculate from the head (left), otherwise - from tail (right) +static int indirect_part_size (struct virtual_item * vi, int first, int units) +{ + // unit of indirect item is byte (yet) + return units; +} + +static int indirect_unit_num (struct virtual_item * vi) +{ + // unit of indirect item is byte (yet) + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi (struct virtual_item * vi) +{ + reiserfs_warning (NULL, "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi +}; + + +////////////////////////////////////////////////////////////////////////////// +// direntry functions +// + + +static int direntry_bytes_number (struct item_head * ih, int block_size) +{ + reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: " + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key (struct cpu_key * key) +{ + cpu_key_k_offset_dec (key); + if (cpu_key_k_offset (key) == 0) + set_cpu_key_k_type (key, TYPE_STAT_DATA); +} + + +static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +{ + if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + + +static void direntry_print_item (struct item_head * ih, char * item) +{ + int i; + int namelen; + struct reiserfs_de_head * deh; + char * name; + static char namebuf [80]; + + + printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct 
reiserfs_de_head *)item; + + for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { + namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); + name = item + deh_location(deh); + if (name[namelen-1] == 0) + namelen = strlen (name); + namebuf[0] = '"'; + if (namelen > sizeof (namebuf) - 3) { + strncpy (namebuf + 1, name, sizeof (namebuf) - 3); + namebuf[sizeof (namebuf) - 2] = '"'; + namebuf[sizeof (namebuf) - 1] = 0; + } else { + memcpy (namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))), + (de_hidden (deh)) ? "HIDDEN" : "VISIBLE"); + } +} + + +static void direntry_check_item (struct item_head * ih, char * item) +{ + int i; + struct reiserfs_de_head * deh; + + // FIXME: type something here! + deh = (struct reiserfs_de_head *)item; + for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { + ; + } +} + + + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node */ +static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode) +{ + if ( mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + + RFALSE( mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode); + + return virtual_entry_num - 1; +} + + + + +/* Create an array of sizes of directory entries for virtual + item. Return space used by an item. 
FIXME: no control over + consuming of space used by this item handler */ +static int direntry_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + struct direntry_uarea * dir_u = vi->vi_uarea; + int i, j; + int size = sizeof (struct direntry_uarea); + struct reiserfs_de_head * deh; + + vi->vi_index = TYPE_DIRENTRY; + + if (!(vi->vi_ih) || !vi->vi_item) + BUG (); + + + dir_u->flags = 0; + if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count (vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i ++) { + j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode); + dir_u->entry_sizes[i] = (j ? deh_location( &(deh[j - 1]) ) : + ih_item_len (vi->vi_ih)) - + deh_location( &(deh[j])) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof (short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k ++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? 
insert_size : 0) ) { + reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + + +} + + +// +// return number of entries which may fit into specified amount of +// free space, or -1 if free space is not enough even for 1 entry +// +static int direntry_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea * dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries ++; + } + + if (entries == dir_u->entry_count) { + reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2) + entries = 0; + + return entries ?: -1; +} + + +static int direntry_check_right (struct virtual_item * vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea * dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i --) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries ++; + } + if (entries == dir_u->entry_count) + BUG (); + + /* "." and ".." 
can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ?: -1; +} + + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size (struct virtual_item * vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea * dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i ++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num (struct virtual_item * vi) +{ + struct direntry_uarea * dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + + + +static void direntry_print_vi (struct virtual_item * vi) +{ + int i; + struct direntry_uarea * dir_u = vi->vi_uarea; + + reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk ("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i ++) + printk ("%d ", dir_u->entry_sizes[i]); + printk ("\n"); +} + +static struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi +}; + + +////////////////////////////////////////////////////////////////////////////// +// Error catching functions to catch errors caused by incorrect item types. 
+// +static int errcatch_bytes_number (struct item_head * ih, int block_size) +{ + reiserfs_warning (NULL, "green-16001: Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_decrement_key (struct cpu_key * key) +{ + reiserfs_warning (NULL, "green-16002: Invalid item type observed, run fsck ASAP"); +} + + +static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +{ + reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP"); + return 0; +} + + +static void errcatch_print_item (struct item_head * ih, char * item) +{ + reiserfs_warning (NULL, "green-16004: Invalid item type observed, run fsck ASAP"); +} + + +static void errcatch_check_item (struct item_head * ih, char * item) +{ + reiserfs_warning (NULL, "green-16005: Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + reiserfs_warning (NULL, "green-16006: Invalid item type observed, run fsck ASAP"); + return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where + // this operation is called from is of return type void. 
+} + +static int errcatch_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + reiserfs_warning (NULL, "green-16007: Invalid item type observed, run fsck ASAP"); + return -1; +} + + +static int errcatch_check_right (struct virtual_item * vi, int free) +{ + reiserfs_warning (NULL, "green-16008: Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_part_size (struct virtual_item * vi, int first, int count) +{ + reiserfs_warning (NULL, "green-16009: Invalid item type observed, run fsck ASAP"); + return 0; +} + +static int errcatch_unit_num (struct virtual_item * vi) +{ + reiserfs_warning (NULL, "green-16010: Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_vi (struct virtual_item * vi) +{ + reiserfs_warning (NULL, "green-16011: Invalid item type observed, run fsck ASAP"); +} + +static struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi +}; + + + +////////////////////////////////////////////////////////////////////////////// +// +// +#if ! 
(TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) + do not compile +#endif + +struct item_operations * item_ops [TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +}; + + + + diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c new file mode 100644 index 000000000000..c9ad3a7849f4 --- /dev/null +++ b/fs/reiserfs/journal.c @@ -0,0 +1,3876 @@ +/* +** Write ahead logging implementation copyright Chris Mason 2000 +** +** The background commits make this code very interelated, and +** overly complex. I need to rethink things a bit....The major players: +** +** journal_begin -- call with the number of blocks you expect to log. +** If the current transaction is too +** old, it will block until the current transaction is +** finished, and then start a new one. +** Usually, your transaction will get joined in with +** previous ones for speed. +** +** journal_join -- same as journal_begin, but won't block on the current +** transaction regardless of age. Don't ever call +** this. Ever. There are only two places it should be +** called from, and they are both inside this file. +** +** journal_mark_dirty -- adds blocks into this transaction. clears any flags +** that might make them get sent to disk +** and then marks them BH_JDirty. Puts the buffer head +** into the current transaction hash. +** +** journal_end -- if the current transaction is batchable, it does nothing +** otherwise, it could do an async/synchronous commit, or +** a full flush of all log and real blocks in the +** transaction. +** +** flush_old_commits -- if the current transaction is too old, it is ended and +** commit blocks are sent to disk. Forces commit blocks +** to disk for all backgrounded commits that have been +** around too long. 
+** -- Note, if you call this as an immediate flush from +** from within kupdate, it will ignore the immediate flag +*/ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/time.h> +#include <asm/semaphore.h> + +#include <linux/vmalloc.h> +#include <linux/reiserfs_fs.h> + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> + + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) + +/* the number of mounted filesystems. This is used to decide when to +** start and kill the commit workqueue +*/ +static int reiserfs_mounted_fs_count; + +static struct workqueue_struct *commit_wq; + +#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit + structs at 4k */ +#define BUFNR 64 /*read ahead */ + +/* cnode stat bits. Move these into reiserfs_fs.h */ + +#define BLOCK_FREED 2 /* this block was freed, and can't be written. 
*/ +#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ + +#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk*/ + +static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; +static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; +static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; +static int can_dirty(struct reiserfs_journal_cnode *cn) ; +static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks); +static int release_journal_dev( struct super_block *super, + struct reiserfs_journal *journal ); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(void *p); +static void queue_log_writer(struct super_block *s); + +/* values for join in do_journal_begin_r */ +enum { + JBEGIN_REG = 0, /* regular journal begin */ + JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ + JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ +}; + +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block * p_s_sb, + unsigned long nblocks,int join); + +static void init_journal_hash(struct super_block *p_s_sb) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; +} + +/* +** clears BH_Dirty and sticks 
the buffer on the clean list. Called because I can't allow refile_buffer to +** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for +** more details. +*/ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0 ; +} + +static void disable_barrier(struct super_block *s) +{ + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); +} + +static struct reiserfs_bitmap_node * +allocate_bitmap_node(struct super_block *p_s_sb) { + struct reiserfs_bitmap_node *bn ; + static int id; + + bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ; + if (!bn) { + return NULL ; + } + bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ; + if (!bn->data) { + reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; + return NULL ; + } + bn->id = id++ ; + memset(bn->data, 0, p_s_sb->s_blocksize) ; + INIT_LIST_HEAD(&bn->list) ; + return bn ; +} + +static struct reiserfs_bitmap_node * +get_bitmap_node(struct super_block *p_s_sb) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next ; + + journal->j_used_bitmap_nodes++ ; +repeat: + + if(entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list) ; + list_del(entry) ; + memset(bn->data, 0, p_s_sb->s_blocksize) ; + journal->j_free_bitmap_nodes-- ; + return bn ; + } + bn = allocate_bitmap_node(p_s_sb) ; + if (!bn) { + yield(); + goto repeat ; + } + return bn ; +} +static inline void free_bitmap_node(struct super_block *p_s_sb, + struct reiserfs_bitmap_node *bn) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + journal->j_used_bitmap_nodes-- ; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + 
reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ; + reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; + } else { + list_add(&bn->list, &journal->j_bitmap_nodes) ; + journal->j_free_bitmap_nodes++ ; + } +} + +static void allocate_bitmap_nodes(struct super_block *p_s_sb) { + int i ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_bitmap_node *bn = NULL ; + for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) { + bn = allocate_bitmap_node(p_s_sb) ; + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes) ; + journal->j_free_bitmap_nodes++ ; + } else { + break ; // this is ok, we'll try again when more are needed + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, + struct reiserfs_list_bitmap *jb) { + int bmap_nr = block / (p_s_sb->s_blocksize << 3) ; + int bit_nr = block % (p_s_sb->s_blocksize << 3) ; + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ; + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data) ; + return 0 ; +} + +static void cleanup_bitmap_list(struct super_block *p_s_sb, + struct reiserfs_list_bitmap *jb) { + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(p_s_sb, jb->bitmaps[i]) ; + jb->bitmaps[i] = NULL ; + } + } +} + +/* +** only call this on FS unmount. 
*/
/* drops every bitmap node held by each of the JOURNAL_NUM_BITMAPS entries
** of jb_array, then vfree()s each entry's slot array.  Always returns 0.
*/
static int free_list_bitmaps(struct super_block *p_s_sb,
                             struct reiserfs_list_bitmap *jb_array) {
    int i ;
    struct reiserfs_list_bitmap *jb ;
    for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
        jb = jb_array + i ;
        jb->journal_list = NULL ;
        cleanup_bitmap_list(p_s_sb, jb) ;
        vfree(jb->bitmaps) ;
        jb->bitmaps = NULL ;
    }
    return 0;
}

/* frees every node (and its data area) still cached on the journal's
** j_bitmap_nodes free list.  Always returns 0.
*/
static int free_bitmap_nodes(struct super_block *p_s_sb) {
    struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
    struct list_head *next = journal->j_bitmap_nodes.next ;
    struct reiserfs_bitmap_node *bn ;

    while(next != &journal->j_bitmap_nodes) {
        bn = list_entry(next, struct reiserfs_bitmap_node, list) ;
        list_del(next) ;
        reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
        reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
        /* the head's next pointer now names the following node */
        next = journal->j_bitmap_nodes.next ;
        journal->j_free_bitmap_nodes-- ;
    }

    return 0 ;
}

/*
** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
** jb_array is the array to be filled in.
**
** Each entry gets a zeroed array of bmap_nr node pointers.  Returns 0 on
** success, or -1 after releasing everything if any vmalloc fails.
**
** NOTE(review): on failure free_list_bitmaps() walks all
** JOURNAL_NUM_BITMAPS entries, including ones this loop never reached, so
** this presumably relies on the caller handing in a zeroed jb_array --
** confirm against the caller.
*/
int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
                                   struct reiserfs_list_bitmap *jb_array,
                                   int bmap_nr) {
    int i ;
    int failed = 0 ;
    struct reiserfs_list_bitmap *jb ;
    int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ;

    for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
        jb = jb_array + i ;
        jb->journal_list = NULL ;
        jb->bitmaps = vmalloc( mem ) ;
        if (!jb->bitmaps) {
            reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ;
            failed = 1;
            break ;
        }
        memset(jb->bitmaps, 0, mem) ;
    }
    if (failed) {
        free_list_bitmaps(p_s_sb, jb_array) ;
        return -1 ;
    }
    return 0 ;
}

/*
** find an available list bitmap.
If you can't find one, flush a commit list +** and try again +*/ +static struct reiserfs_list_bitmap * +get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { + int i,j ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_list_bitmap *jb = NULL ; + + for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) { + i = journal->j_list_bitmap_index ; + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ; + jb = journal->j_list_bitmap + i ; + if (journal->j_list_bitmap[i].journal_list) { + flush_commit_list(p_s_sb, journal->j_list_bitmap[i].journal_list, 1) ; + if (!journal->j_list_bitmap[i].journal_list) { + break ; + } + } else { + break ; + } + } + if (jb->journal_list) { /* double check to make sure if flushed correctly */ + return NULL ; + } + jb->journal_list = jl ; + return jb ; +} + +/* +** allocates a new chunk of X nodes, and links them all together as a list. +** Uses the cnode->next and cnode->prev pointers +** returns NULL on failure +*/ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { + struct reiserfs_journal_cnode *head ; + int i ; + if (num_cnodes <= 0) { + return NULL ; + } + head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; + if (!head) { + return NULL ; + } + memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; + head[0].prev = NULL ; + head[0].next = head + 1 ; + for (i = 1 ; i < num_cnodes; i++) { + head[i].prev = head + (i - 1) ; + head[i].next = head + (i + 1) ; /* if last one, overwrite it after the if */ + } + head[num_cnodes -1].next = NULL ; + return head ; +} + +/* +** pulls a cnode off the free list, or returns NULL on failure +*/ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) { + struct reiserfs_journal_cnode *cn ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + reiserfs_check_lock_depth(p_s_sb, "get_cnode") ; + + if (journal->j_cnode_free <= 0) { + return NULL ; + } + 
journal->j_cnode_used++ ; + journal->j_cnode_free-- ; + cn = journal->j_cnode_free_list ; + if (!cn) { + return cn ; + } + if (cn->next) { + cn->next->prev = NULL ; + } + journal->j_cnode_free_list = cn->next ; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; + return cn ; +} + +/* +** returns a cnode to the free list +*/ +static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + reiserfs_check_lock_depth(p_s_sb, "free_cnode") ; + + journal->j_cnode_used-- ; + journal->j_cnode_free++ ; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list ; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn ; + } + cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */ + journal->j_cnode_free_list = cn ; +} + +static void clear_prepared_bits(struct buffer_head *bh) { + clear_buffer_journal_prepared (bh); + clear_buffer_journal_restore_dirty (bh); +} + +/* utility function to force a BUG if it is called without the big +** kernel lock held. caller is the string printed just before calling BUG() +*/ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { +#ifdef CONFIG_SMP + if (current->lock_depth < 0) { + reiserfs_panic (sb, "%s called without kernel lock held", caller) ; + } +#else + ; +#endif +} + +/* return a cnode with same dev, block number and size in table, or null if not found */ +static inline struct reiserfs_journal_cnode * +get_journal_hash_dev(struct super_block *sb, + struct reiserfs_journal_cnode **table, + long bl) +{ + struct reiserfs_journal_cnode *cn ; + cn = journal_hash(table, sb, bl) ; + while(cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn ; + cn = cn->hnext ; + } + return (struct reiserfs_journal_cnode *)0 ; +} + +/* +** this actually means 'can this block be reallocated yet?'. 
If you set search_all, a block can only be allocated +** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever +** being overwritten by a replay after crashing. +** +** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting +** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make +** sure you never write the block without logging it. +** +** next_zero_bit is a suggestion about the next block to try for find_forward. +** when bl is rejected because it is set in a journal list bitmap, we search +** for the next zero bit in the bitmap that rejected bl. Then, we return that +** through next_zero_bit for find_forward to try. +** +** Just because we return something in next_zero_bit does not mean we won't +** reject it on the next call to reiserfs_in_journal +** +*/ +int reiserfs_in_journal(struct super_block *p_s_sb, + int bmap_nr, int bit_nr, int search_all, + b_blocknr_t *next_zero_bit) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_cnode *cn ; + struct reiserfs_list_bitmap *jb ; + int i ; + unsigned long bl; + + *next_zero_bit = 0 ; /* always start this at zero. */ + + PROC_INFO_INC( p_s_sb, journal.in_journal ); + /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. 
+ ** if we crash before the transaction that freed it commits, this transaction won't + ** have committed either, and the block will never be written + */ + if (search_all) { + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap ); + jb = journal->j_list_bitmap + i ; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data)) { + *next_zero_bit = find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]->data), + p_s_sb->s_blocksize << 3, bit_nr+1) ; + return 1 ; + } + } + } + + bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all && (cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. This should never happen */ + if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC( p_s_sb, journal.in_journal_reusable ); + /* safe for reuse */ + return 0 ; +} + +/* insert cn into table +*/ +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { + struct reiserfs_journal_cnode *cn_orig ; + + cn_orig = journal_hash(table, cn->sb, cn->blocknr) ; + cn->hnext = cn_orig ; + cn->hprev = NULL ; + if (cn_orig) { + cn_orig->hprev = cn ; + } + journal_hash(table, cn->sb, cn->blocknr) = cn ; +} + +/* lock the current transaction */ +inline static void lock_journal(struct super_block *p_s_sb) { + PROC_INFO_INC( p_s_sb, journal.lock_journal ); + down(&SB_JOURNAL(p_s_sb)->j_lock); +} + +/* unlock the current transaction */ +inline static void unlock_journal(struct super_block *p_s_sb) { + up(&SB_JOURNAL(p_s_sb)->j_lock); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { 
+ reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id, + jl->j_refcount); + } + if (--jl->j_refcount == 0) + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); +} + +/* +** this used to be much more involved, and I'm keeping it just in case things get ugly again. +** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a +** transaction. +*/ +static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ; + if (jb) { + cleanup_bitmap_list(p_s_sb, jb) ; + } + jl->j_list_bitmap->journal_list = NULL ; + jl->j_list_bitmap = NULL ; +} + +static int journal_list_still_alive(struct super_block *s, + unsigned long trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL (s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk", + bh->b_blocknr, bdevname(bh->b_bdev, b)) ; + } + if (uptodate) + set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { + if (uptodate) + set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void submit_logged_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_buffer_io_sync ; + clear_buffer_journal_new (bh); + clear_buffer_dirty(bh) ; + if (!test_clear_buffer_journal_test (bh)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static void 
submit_ordered_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static int submit_barrier_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh) ; +} + +static void check_barrier_completion(struct super_block *s, + struct buffer_head *bh) { + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + disable_barrier(s); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_logged_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_ordered_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t *lock, + void (fn)(struct buffer_chunk *)) +{ + int ret = 0; + if (chunk->nr >= CHUNK_SIZE) + BUG(); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; +} + + +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) { + struct reiserfs_jh *jh; + while(1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) { + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + 
list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { +no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + if (bh->b_private) + BUG(); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t *lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) +{ + struct buffer_head *bh; + struct reiserfs_jh *jh; + int ret = j->j_errno; + struct buffer_chunk chunk; + struct list_head tmp; + INIT_LIST_HEAD(&tmp); + + chunk.nr = 0; + spin_lock(lock); + while(!list_empty(list)) { + jh = JH_ENTRY(list->next); + bh = jh->bh; + get_bh(bh); + if (test_set_buffer_locked(bh)) { + if (!buffer_dirty(bh)) { + list_del_init(&jh->list); + list_add(&jh->list, &tmp); + goto loop_next; + } + spin_unlock(lock); + if (chunk.nr) + write_ordered_chunk(&chunk); + wait_on_buffer(bh); + cond_resched(); + spin_lock(lock); + goto loop_next; + } + if (buffer_dirty(bh)) 
{ + list_del_init(&jh->list); + list_add(&jh->list, &tmp); + add_to_chunk(&chunk, bh, lock, write_ordered_chunk); + } else { + reiserfs_free_jh(bh); + unlock_buffer(bh); + } +loop_next: + put_bh(bh); + cond_resched_lock(lock); + } + if (chunk.nr) { + spin_unlock(lock); + write_ordered_chunk(&chunk); + spin_lock(lock); + } + while(!list_empty(&tmp)) { + jh = JH_ENTRY(tmp.prev); + bh = jh->bh; + get_bh(bh); + reiserfs_free_jh(bh); + + if (buffer_locked(bh)) { + spin_unlock(lock); + wait_on_buffer(bh); + spin_lock(lock); + } + if (!buffer_uptodate(bh)) { + ret = -EIO; + } + put_bh(bh); + cond_resched_lock(lock); + } + spin_unlock(lock); + return ret; +} + +static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal_list *first_jl; + struct list_head *entry; + unsigned long trans_id = jl->j_trans_id; + unsigned long other_trans_id; + unsigned long first_trans_id; + +find_first: + /* + * first we walk backwards to find the oldest uncommitted transation + */ + first_jl = jl; + entry = jl->j_list.prev; + while(1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + if (entry == &journal->j_journal_list || + atomic_read(&other_jl->j_older_commits_done)) + break; + + first_jl = other_jl; + entry = other_jl->j_list.prev; + } + + /* if we didn't find any older uncommitted transactions, return now */ + if (first_jl == jl) { + return 0; + } + + first_trans_id = first_jl->j_trans_id; + + entry = &first_jl->j_list; + while(1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + other_trans_id = other_jl->j_trans_id; + + if (other_trans_id < trans_id) { + if (atomic_read(&other_jl->j_commit_left) != 0) { + flush_commit_list(s, other_jl, 0); + + /* list we were called with is gone, return */ + if (!journal_list_still_alive(s, trans_id)) + return 1; + + /* the one we just flushed is gone, this means all + * older lists are also gone, so first_jl is no 
longer + * valid either. Go back to the beginning. + */ + if (!journal_list_still_alive(s, other_trans_id)) { + goto find_first; + } + } + entry = entry->next; + if (entry == &journal->j_journal_list) + return 0; + } else { + return 0; + } + } + return 0; +} +int reiserfs_async_progress_wait(struct super_block *s) { + DEFINE_WAIT(wait); + struct reiserfs_journal *j = SB_JOURNAL(s); + if (atomic_read(&j->j_async_throttle)) + blk_congestion_wait(WRITE, HZ/10); + return 0; +} + +/* +** if this journal list still has commit blocks unflushed, send them to disk. +** +** log areas must be flushed in order (transaction 2 can't commit before transaction 1) +** Before the commit block can by written, every other log block must be safely on disk +** +*/ +static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { + int i; + int bn ; + struct buffer_head *tbh = NULL ; + unsigned long trans_id = jl->j_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL (s); + int barrier = 0; + int retval = 0; + + reiserfs_check_lock_depth(s, "flush_commit_list") ; + + if (atomic_read(&jl->j_older_commits_done)) { + return 0 ; + } + + /* before we can put our commit blocks on disk, we have to make sure everyone older than + ** us is on disk too + */ + BUG_ON (jl->j_len <= 0); + BUG_ON (trans_id == journal->j_trans_id); + + get_journal_list(jl); + if (flushall) { + if (flush_older_commits(s, jl) == 1) { + /* list disappeared during flush_older_commits. 
return */ + goto put_jl; + } + } + + /* make sure nobody is trying to flush this one at the same time */ + down(&jl->j_commit_lock); + if (!journal_list_still_alive(s, trans_id)) { + up(&jl->j_commit_lock); + goto put_jl; + } + BUG_ON (jl->j_trans_id == 0); + + /* this commit is done, exit */ + if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1) ; + } + up(&jl->j_commit_lock); + goto put_jl; + } + + if (!list_empty(&jl->j_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + lock_kernel(); + } + BUG_ON (!list_empty(&jl->j_bh_list)); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk + */ + atomic_inc(&journal->j_async_throttle); + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % + SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn) ; + if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; + } + atomic_dec(&journal->j_async_throttle); + + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; + } + } + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; + tbh = journal_find_get_block(s, bn) ; + wait_on_buffer(tbh) ; + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. 
Double check here + // + if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ + sync_dirty_buffer(tbh); + if (unlikely (!buffer_uptodate(tbh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-601, buffer write failed") ; +#endif + retval = -EIO; + } + put_bh(tbh) ; /* once for journal_find_get_block */ + put_bh(tbh) ; /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)) ; + } + + BUG_ON (atomic_read(&(jl->j_commit_left)) != 1); + + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + sync_dirty_buffer(jl->j_commit_bh) ; + } else + wait_on_buffer(jl->j_commit_bh); + + check_barrier_completion(s, jl->j_commit_bh); + + /* If there was a write error in the journal - we can't commit this + * transaction - it will be invalid and, if successful, will just end + * up propogating the write error out to the filesystem. */ + if (unlikely (!buffer_uptodate(jl->j_commit_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-615: buffer write failed") ; +#endif + retval = -EIO; + } + bforget(jl->j_commit_bh) ; + if (journal->j_last_commit_id != 0 && + (jl->j_trans_id - journal->j_last_commit_id) != 1) { + reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", + journal->j_last_commit_id, + jl->j_trans_id); + } + journal->j_last_commit_id = jl->j_trans_id; + + /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + cleanup_freed_for_journal_list(s, jl) ; + + retval = retval ? 
retval : journal->j_errno; + + /* mark the metadata dirty */ + if (!retval) + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)) ; + + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1) ; + } + up(&jl->j_commit_lock); +put_jl: + put_journal_list(s, jl); + + if (retval) + reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); + return retval; +} + +/* +** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or +** returns NULL if it can't find anything +*/ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr ; + + cn = cn->hprev ; + while(cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist ; + } + cn = cn->hprev ; + } + return NULL ; +} + +static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, +struct reiserfs_journal_list *, unsigned long, int); + +/* +** once all the real blocks have been flushed, it is safe to remove them from the +** journal list for this transaction. Aside from freeing the cnode, this also allows the +** block to be reallocated for data blocks if it had been deleted. +*/ +static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_cnode *cn, *last ; + cn = jl->j_realblock ; + + /* which is better, to lock once around the whole loop, or + ** to lock for each call to remove_journal_hash? + */ + while(cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cn->blocknr, + cn->bh ? 
1: 0, cn->state) ; + } + cn->state = 0 ; + remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cn->blocknr, 1) ; + } + last = cn ; + cn = cn->next ; + free_cnode(p_s_sb, last) ; + } + jl->j_realblock = NULL ; +} + +/* +** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. +** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start +** releasing blocks in this transaction for reuse as data blocks. +** called by flush_journal_list, before it calls remove_all_from_journal_list +** +*/ +static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { + struct reiserfs_journal_header *jh ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + if (reiserfs_is_journal_aborted (journal)) + return -EIO; + + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { + wait_on_buffer((journal->j_header_bh)) ; + if (unlikely (!buffer_uptodate(journal->j_header_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ; +#endif + return -EIO; + } + } + journal->j_last_flush_trans_id = trans_id ; + journal->j_first_unflushed_offset = offset ; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; + jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; + jh->j_first_unflushed_offset = cpu_to_le32(offset) ; + jh->j_mount_id = cpu_to_le32(journal->j_mount_id) ; + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(journal->j_header_bh); + ret = submit_barrier_buffer(journal->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(journal->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(journal->j_header_bh); + check_barrier_completion(p_s_sb, journal->j_header_bh); + } else { +sync: + set_buffer_dirty(journal->j_header_bh) ; + 
sync_dirty_buffer(journal->j_header_bh) ; + } + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay"); + return -EIO ; + } + } + return 0 ; +} + +static int update_journal_header_block(struct super_block *p_s_sb, + unsigned long offset, + unsigned long trans_id) { + return _update_journal_header_block(p_s_sb, offset, trans_id); +} +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +*/ +static int flush_older_journal_lists(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + unsigned long trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra race + * protection is required. + */ +restart: + entry = journal->j_journal_list.next; + /* Did we wrap? */ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON (other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0) ; + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0 ; +} + +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } +} + +/* flush a journal list, both commit and real blocks +** +** always set flushall to 1, unless you are calling from inside +** flush_journal_list +** +** IMPORTANT. This can only be called while there are no journal writers, +** and the journal is locked. 
That means it can only be called from
** do_journal_end, or by journal_release
**
** Outline of the body below:
**   1. take j_flush_sem (flushall != 0) or assert that it is already held
**   2. flush_commit_list() for this list
**   3. walk the cnodes, submitting every buffer this list is responsible for
**   4. wait for the submitted buffers and drop the references taken on them
**   5. when flushall is set, flush older lists and update the journal header
**   6. unhash the list's cnodes and unlink the list from the journal
**
** Returns journal->j_errno as sampled after step 5, or the result of the
** header update when that update is attempted and fails.
*/
static int flush_journal_list(struct super_block *s,
                              struct reiserfs_journal_list *jl, int flushall) {
  struct reiserfs_journal_list *pjl ;
  struct reiserfs_journal_cnode *cn, *last ;
  int count ;
  int was_jwait = 0 ;
  int was_dirty = 0 ;
  struct buffer_head *saved_bh ;
  unsigned long j_len_saved = jl->j_len ;
  struct reiserfs_journal *journal = SB_JOURNAL (s);
  int err = 0;

  BUG_ON (j_len_saved <= 0);

  if (atomic_read(&journal->j_wcount) != 0) {
    reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d",
                      atomic_read(&journal->j_wcount)) ;
  }
  BUG_ON (jl->j_trans_id == 0);

  /* if flushall == 0, the lock is already held */
  if (flushall) {
      down(&journal->j_flush_sem);
  } else if (!down_trylock(&journal->j_flush_sem)) {
      /* the trylock succeeded, so the caller did not hold the semaphore */
      BUG();
  }

  count = 0 ;
  if (j_len_saved > journal->j_trans_max) {
    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
    return 0 ;
  }

  /* if all the work is already done, get out of here */
  if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
      atomic_read(&(jl->j_commit_left)) <= 0) {
    goto flush_older_and_return ;
  }

  /* start by putting the commit list on disk.  This will also flush
  ** the commit lists of any older transactions
  */
  flush_commit_list(s, jl, 1) ;

  /* after the commit the list must be LIST_DIRTY unless the journal aborted */
  if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal))
      BUG();

  /* are we done now? */
  if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
      atomic_read(&(jl->j_commit_left)) <= 0) {
    goto flush_older_and_return ;
  }

  /* loop through each cnode, see if we need to write it,
  ** or wait on a more recent transaction, or just ignore it
  */
  if (atomic_read(&(journal->j_wcount)) != 0) {
    reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ;
  }
  cn = jl->j_realblock ;
  while(cn) {
    was_jwait = 0 ;
    was_dirty = 0 ;
    saved_bh = NULL ;
    /* blocknr of 0 is no longer in the hash, ignore it */
    if (cn->blocknr == 0) {
      goto free_cnode ;
    }

    /* This transaction failed commit. Don't write out to the disk */
    if (!(jl->j_state & LIST_DIRTY))
        goto free_cnode;

    pjl = find_newer_jl_for_cn(cn) ;
    /* the order is important here.  We check pjl to make sure we
    ** don't clear BH_JDirty_wait if we aren't the one writing this
    ** block to disk
    */
    if (!pjl && cn->bh) {
      saved_bh = cn->bh ;

      /* we do this to make sure nobody releases the buffer while
      ** we are working with it
      */
      get_bh(saved_bh) ;

      if (buffer_journal_dirty(saved_bh)) {
        BUG_ON (!can_dirty (cn));
        was_jwait = 1 ;
        was_dirty = 1 ;
      } else if (can_dirty(cn)) {
        /* everything with !pjl && jwait should be writable */
        BUG();
      }
    }

    /* if someone has this block in a newer transaction, just make
    ** sure they are committed, and don't try writing it to disk
    */
    if (pjl) {
      if (atomic_read(&pjl->j_commit_left))
        flush_commit_list(s, pjl, 1) ;
      goto free_cnode ;
    }

    /* bh == NULL when the block got to disk on its own, OR,
    ** the block got freed in a future transaction
    */
    if (saved_bh == NULL) {
      goto free_cnode ;
    }

    /* this should never happen.  kupdate_one_transaction has this list
    ** locked while it works, so we should never see a buffer here that
    ** is not marked JDirty_wait
    */
    if ((!was_jwait) && !buffer_locked(saved_bh)) {
        reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, "
                          "not in a newer tranasction",
                          (unsigned long long)saved_bh->b_blocknr,
                          was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
    }
    if (was_dirty) {
      /* we inc again because saved_bh gets decremented at free_cnode */
      get_bh(saved_bh) ;
      /* remember this cnode for the wait loop that follows the scan */
      set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
      lock_buffer(saved_bh);
      BUG_ON (cn->blocknr != saved_bh->b_blocknr);
      if (buffer_dirty(saved_bh))
        submit_logged_buffer(saved_bh) ;
      else
        unlock_buffer(saved_bh);
      count++ ;
    } else {
      reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s",
                        (unsigned long long)saved_bh->b_blocknr, __FUNCTION__);
    }
free_cnode:
    last = cn ;
    cn = cn->next ;
    if (saved_bh) {
      /* we incremented this to keep others from taking the buffer head away */
      put_bh(saved_bh) ;
      if (atomic_read(&(saved_bh->b_count)) < 0) {
        reiserfs_warning (s, "journal-945: saved_bh->b_count < 0");
      }
    }
  }
  /* second pass: wait for every buffer tagged BLOCK_NEEDS_FLUSH above */
  if (count > 0) {
    cn = jl->j_realblock ;
    while(cn) {
      if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
        if (!cn->bh) {
          reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ;
        }
        wait_on_buffer(cn->bh) ;
        if (!cn->bh) {
          reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
        }
        if (unlikely (!buffer_uptodate(cn->bh))) {
#ifdef CONFIG_REISERFS_CHECK
          reiserfs_warning(s, "journal-949: buffer write failed\n") ;
#endif
          err = -EIO;
        }
        /* note, we must clear the JDirty_wait bit after the up to date
        ** check, otherwise we race against our flushpage routine
        */
        BUG_ON (!test_clear_buffer_journal_dirty (cn->bh));

        /* undo the inc from journal_mark_dirty */
        put_bh(cn->bh) ;
        /* drops one more reference; presumably the extra get_bh() taken
        ** in the submit loop above -- TODO confirm
        */
        brelse(cn->bh) ;
      }
      cn = cn->next ;
    }
  }

  if (err)
    reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__);
flush_older_and_return:


  /* before we can update the journal header block, we _must_ flush all
  ** real blocks from all older transactions to disk.  This is because
  ** once the header block is updated, this transaction will not be
  ** replayed after a crash
  */
  if (flushall) {
    flush_older_journal_lists(s, jl);
  }

  /* from here on err reflects the journal-wide error state */
  err = journal->j_errno;
  /* before we can remove everything from the hash tables for this
  ** transaction, we must make sure it can never be replayed
  **
  ** since we are only called from do_journal_end, we know for sure there
  ** are no allocations going on while we are flushing journal lists.  So,
  ** we only need to update the journal header block for the last list
  ** being flushed
  */
  if (!err && flushall) {
    err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
    if (err)
        reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__);
  }
  remove_all_from_journal_list(s, jl, 0) ;
  list_del_init(&jl->j_list);
  journal->j_num_lists--;
  del_from_work_list(s, jl);

  /* warn when lists are not being flushed in consecutive trans_id order */
  if (journal->j_last_flush_id != 0 &&
     (jl->j_trans_id - journal->j_last_flush_id) != 1) {
      reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
                       journal->j_last_flush_id,
                       jl->j_trans_id);
  }
  journal->j_last_flush_id = jl->j_trans_id;

  /* not strictly required since we are freeing the list, but it should
   * help find code using dead lists later on
   */
  jl->j_len = 0 ;
  atomic_set(&(jl->j_nonzerolen), 0) ;
  jl->j_start = 0 ;
  jl->j_realblock = NULL ;
  jl->j_commit_bh = NULL ;
  jl->j_trans_id = 0 ;
  jl->j_state = 0;
  put_journal_list(s, jl);
  if (flushall)
    up(&journal->j_flush_sem);
  return err ;
}

/*
** Queue the dirty, writable buffers of one journal list onto chunk.
**
** Marks the list LIST_TOUCHED and removes it from the work list first.
** Each candidate buffer is pinned and locked, then re-checked, before it is
** handed to add_to_chunk() with write_chunk as the callback.
**
** Returns the number of buffers added to the chunk (0 when the list is
** empty or has no remaining non-zero cnodes).
*/
static int write_one_transaction(struct super_block *s,
                                 struct reiserfs_journal_list *jl,
                                 struct buffer_chunk *chunk)
{
    struct reiserfs_journal_cnode *cn;
    int ret = 0 ;

    jl->j_state |= LIST_TOUCHED;
    del_from_work_list(s, jl);
    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
        return 0;
    }

    cn = jl->j_realblock ;
    while(cn) {
        /* if the blocknr == 0, this has been cleared from the hash,
        ** skip it
        */
        if (cn->blocknr == 0) {
            goto next ;
        }
        if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
            struct buffer_head *tmp_bh;
            /* we can race against journal_mark_freed when we try
             * to lock_buffer(cn->bh), so we have to inc the buffer
             * count, and recheck things after locking
             */
            tmp_bh = cn->bh;
            get_bh(tmp_bh);
            lock_buffer(tmp_bh);
            if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
                if (!buffer_journal_dirty(tmp_bh) ||
                    buffer_journal_prepared(tmp_bh))
                    BUG();
                /* the buffer is still locked when it is handed over */
                add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
                ret++;
            } else {
                /* note, cn->bh might be null now */
                unlock_buffer(tmp_bh);
            }
            put_bh(tmp_bh);
        }
next:
        cn = cn->next ;
        cond_resched();
    }
    return ret ;
}

/* used by flush_commit_list
**
** Sets LIST_DIRTY on the list and walks its cnodes.  For every journal-dirty
** buffer that no newer transaction has logged, it either defers the dirtying
** (buffer is prepared) or marks the buffer dirty now.
**
** ret is never modified, so this always returns 0.
*/
static int dirty_one_transaction(struct super_block *s,
                                 struct reiserfs_journal_list *jl)
{
    struct reiserfs_journal_cnode *cn;
    struct reiserfs_journal_list *pjl;
    int ret = 0 ;

    jl->j_state |= LIST_DIRTY;
    cn = jl->j_realblock ;
    while(cn) {
        /* look for a more recent transaction that logged this
        ** buffer.  Only the most recent transaction with a buffer in
        ** it is allowed to send that buffer to disk
        */
        pjl = find_newer_jl_for_cn(cn) ;
        if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
        {
            BUG_ON (!can_dirty(cn));
            /* if the buffer is prepared, it will either be logged
             * or restored.  If restored, we need to make sure
             * it actually gets marked dirty
             */
            clear_buffer_journal_new (cn->bh);
            if (buffer_journal_prepared (cn->bh)) {
                set_buffer_journal_restore_dirty (cn->bh);
            } else {
                set_buffer_journal_test (cn->bh);
                mark_buffer_dirty(cn->bh);
            }
        }
        cn = cn->next ;
    }
    return ret ;
}

/*
** Write out buffers from a run of journal lists, starting at jl, while
** holding j_flush_sem.
**
** When num_trans is non-zero the loop stops after num_trans lists; otherwise
** it stops once at least num_blocks buffers have been queued.  It also stops
** at a list that is empty, already LIST_TOUCHED, still committing or not
** LIST_DIRTY, at the end of the journal list, or at a list whose trans_id is
** not newer than the starting one.
**
** next_jl and next_trans_id are not referenced in this body.
**
** Returns the value of the last write_one_transaction() call (not the
** running total kept in written), or 0 when nothing was attempted.
*/
static int kupdate_transactions(struct super_block *s,
                                struct reiserfs_journal_list *jl,
                                struct reiserfs_journal_list **next_jl,
                                unsigned long *next_trans_id,
                                int num_blocks,
                                int num_trans) {
    int ret = 0;
    int written = 0 ;
    int transactions_flushed = 0;
    unsigned long orig_trans_id = jl->j_trans_id;
    struct buffer_chunk chunk;
    struct list_head *entry;
    struct reiserfs_journal *journal = SB_JOURNAL (s);
    chunk.nr = 0;

    down(&journal->j_flush_sem);
    if (!journal_list_still_alive(s, orig_trans_id)) {
        goto done;
    }

    /* we've got j_flush_sem held, nobody is going to delete any
     * of these lists out from underneath us
     */
    while((num_trans && transactions_flushed < num_trans) ||
          (!num_trans && written < num_blocks)) {

        if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
            atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY))
        {
            del_from_work_list(s, jl);
            break;
        }
        ret = write_one_transaction(s, jl, &chunk);

        if (ret < 0)
            goto done;
        transactions_flushed++;
        written += ret;
        entry = jl->j_list.next;

        /* did we wrap? */
        if (entry == &journal->j_journal_list) {
            break;
        }
        jl = JOURNAL_LIST_ENTRY(entry);

        /* don't bother with older transactions */
        if (jl->j_trans_id <= orig_trans_id)
            break;
    }
    /* push out whatever is still queued in the chunk */
    if (chunk.nr) {
        write_chunk(&chunk);
    }

done:
    up(&journal->j_flush_sem);
    return ret;
}

/* for o_sync and fsync heavy applications, they tend to use
** all the journal list slots with tiny transactions.  These
** trigger lots and lots of calls to update the header block, which
** adds seeks and slows things down.
+** +** This function tries to clear out a large chunk of the journal lists +** at once, which makes everything faster since only the newest journal +** list updates the header block +*/ +static int flush_used_journal_lists(struct super_block *s, + struct reiserfs_journal_list *jl) { + unsigned long len = 0; + unsigned long cur_len; + int ret; + int i; + int limit = 256; + struct reiserfs_journal_list *tjl; + struct reiserfs_journal_list *flush_jl; + unsigned long trans_id; + struct reiserfs_journal *journal = SB_JOURNAL (s); + + flush_jl = tjl = jl; + + /* in data logging mode, try harder to flush a lot of blocks */ + if (reiserfs_data_log(s)) + limit = 1024; + /* flush for 256 transactions or limit blocks, whichever comes first */ + for(i = 0 ; i < 256 && len < limit ; i++) { + if (atomic_read(&tjl->j_commit_left) || + tjl->j_trans_id < jl->j_trans_id) { + break; + } + cur_len = atomic_read(&tjl->j_nonzerolen); + if (cur_len > 0) { + tjl->j_state &= ~LIST_TOUCHED; + } + len += cur_len; + flush_jl = tjl; + if (tjl->j_list.next == &journal->j_journal_list) + break; + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); + } + /* try to find a group of blocks we can flush across all the + ** transactions, but only bother if we've actually spanned + ** across multiple lists + */ + if (flush_jl != jl) { + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + } + flush_journal_list(s, flush_jl, 1); + return 0; +} + +/* +** removes any nodes in table with name block and dev as bh. +** only touchs the hnext and hprev pointers. 
+*/ +void remove_journal_hash(struct super_block *sb, + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur ; + struct reiserfs_journal_cnode **head ; + + head= &(journal_hash(table, sb, block)) ; + if (!head) { + return ; + } + cur = *head ; + while(cur) { + if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) && + (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev ; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext ; + } else { + *head = cur->hnext ; + } + cur->blocknr = 0 ; + cur->sb = NULL ; + cur->state = 0 ; + if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ + atomic_dec(&(cur->jlist->j_nonzerolen)) ; + cur->bh = NULL ; + cur->jlist = NULL ; + } + cur = cur->hnext ; + } +} + +static void free_journal_ram(struct super_block *p_s_sb) { + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + reiserfs_kfree(journal->j_current_jl, + sizeof(struct reiserfs_journal_list), p_s_sb); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig) ; + free_list_bitmaps(p_s_sb, journal->j_list_bitmap) ; + free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh) ; + } + /* j_header_bh is on the journal dev, make sure not to release the journal + * dev until we brelse j_header_bh + */ + release_journal_dev(p_s_sb, journal); + vfree(journal) ; +} + +/* +** call on unmount. Only set error to 1 if you haven't made your way out +** of read_super() yet. Any other caller must keep error at 0. 
+*/ +static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) { + struct reiserfs_transaction_handle myth ; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + + /* we only want to flush out transactions if we were called with error == 0 + */ + if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON (!th->t_trans_id); + do_journal_end(th, p_s_sb,10, FLUSH_ALL) ; + + /* make sure something gets logged to force our way into the flush code */ + if (!journal_join(&myth, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ; + flushed = 1; + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ; + } + } + + reiserfs_mounted_fs_count-- ; + /* wait for all commits to finish */ + cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); + flush_workqueue(commit_wq); + if (!reiserfs_mounted_fs_count) { + destroy_workqueue(commit_wq); + commit_wq = NULL; + } + + free_journal_ram(p_s_sb) ; + + return 0 ; +} + +/* +** call on unmount. flush all journal trans, release all alloc'd ram +*/ +int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { + return do_journal_release(th, p_s_sb, 0) ; +} +/* +** only call from an error condition inside reiserfs_read_super! +*/ +int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { + return do_journal_release(th, p_s_sb, 1) ; +} + +/* compares description block with commit block. 
returns 1 if they differ, 0 if they are the same */ +static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) { + if (get_commit_trans_id (commit) != get_desc_trans_id (desc) || + get_commit_trans_len (commit) != get_desc_trans_len (desc) || + get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max || + get_commit_trans_len (commit) <= 0 + ) { + return 1 ; + } + return 0 ; +} +/* returns 0 if it did not find a description block +** returns -1 if it found a corrupt commit block +** returns 1 if both desc and commit were valid +*/ +static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) { + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + struct buffer_head *c_bh ; + unsigned long offset ; + + if (!d_bh) + return 0 ; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0 ; + } + if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", get_desc_mount_id (desc), + *newest_mount_id) ; + return -1 ; + } + if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) { + reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc)); + return -1 ; + } + offset = 
d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; + + /* ok, we have a journal description block, lets see if the transaction was valid */ + c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; + if (!c_bh) + return 0 ; + commit = (struct reiserfs_journal_commit *)c_bh->b_data ; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_commit_trans_id (commit), + get_commit_trans_len(commit)); + brelse(c_bh) ; + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = get_desc_trans_id(desc) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", get_desc_trans_id(desc)) ; + } + return -1; + } + brelse(c_bh) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_desc_trans_len(desc), get_desc_trans_id(desc)) ; + return 1 ; + } else { + return 0 ; + } +} + +static void brelse_array(struct buffer_head **heads, int num) { + int i ; + for (i = 0 ; i < num ; i++) { + brelse(heads[i]) ; + } +} + +/* +** given the start, and values for the oldest acceptable transactions, +** this either reads in a replays a transaction, or returns because the transaction +** is invalid, or too old. 
+*/ +static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, + unsigned long oldest_trans_id, unsigned long newest_mount_id) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + unsigned long trans_id = 0 ; + struct buffer_head *c_bh ; + struct buffer_head *d_bh ; + struct buffer_head **log_blocks = NULL ; + struct buffer_head **real_blocks = NULL ; + unsigned long trans_offset ; + int i; + int trans_half; + + d_bh = journal_bread(p_s_sb, cur_dblock) ; + if (!d_bh) + return 1 ; + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " + "journal_read_transaction, offset %llu, len %d mount_id %d", + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_desc_trans_len(desc), get_desc_mount_id(desc)) ; + if (get_desc_trans_id(desc) < oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " + "journal_read_trans skipping because %lu is too old", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; + brelse(d_bh) ; + return 1 ; + } + if (get_desc_mount_id(desc) != newest_mount_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " + "journal_read_trans skipping because %d is != " + "newest_mount_id %lu", get_desc_mount_id(desc), + newest_mount_id) ; + brelse(d_bh) ; + return 1 ; + } + c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((trans_offset + get_desc_trans_len(desc) + 1) % + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; + if (!c_bh) { + brelse(d_bh) ; + return 1 ; + } + commit = (struct reiserfs_journal_commit *)c_bh->b_data ; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " + "commit offset %llu had bad time %d or length %d", + 
c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_commit_trans_id(commit), get_commit_trans_len(commit)); + brelse(c_bh) ; + brelse(d_bh) ; + return 1; + } + trans_id = get_desc_trans_id(desc) ; + /* now we know we've got a good transaction, and it was inside the valid time ranges */ + log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; + real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; + if (!log_blocks || !real_blocks) { + brelse(c_bh) ; + brelse(d_bh) ; + reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ; + return -1 ; + } + /* get all the buffer heads */ + trans_half = journal_trans_half (p_s_sb->s_blocksize) ; + for(i = 0 ; i < get_desc_trans_len(desc) ; i++) { + log_blocks[i] = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (i < trans_half) { + real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ; + } else { + real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ; + } + if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) { + reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! 
Trying to replay onto a log block") ; +abort_replay: + brelse_array(log_blocks, i) ; + brelse_array(real_blocks, i) ; + brelse(c_bh) ; + brelse(d_bh) ; + reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + return -1 ; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ; + for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { + wait_on_buffer(log_blocks[i]) ; + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! buffer write failed") ; + brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ; + brelse_array(real_blocks, get_desc_trans_len(desc)) ; + brelse(c_bh) ; + brelse(d_bh) ; + reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + return -1 ; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ; + set_buffer_uptodate(real_blocks[i]) ; + brelse(log_blocks[i]) ; + } + /* flush out the real blocks */ + for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { + set_buffer_dirty(real_blocks[i]) ; + ll_rw_block(WRITE, 1, real_blocks + i) ; + } + for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { + wait_on_buffer(real_blocks[i]) ; + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! 
buffer write failed") ; + brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ; + brelse(c_bh) ; + brelse(d_bh) ; + reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; + return -1 ; + } + brelse(real_blocks[i]) ; + } + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " + "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; + + /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; + journal->j_last_flush_trans_id = trans_id ; + journal->j_trans_id = trans_id + 1; + brelse(c_bh) ; + brelse(d_bh) ; + reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; + reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; + return 0 ; +} + +/* This function reads blocks starting from block and to max_block of bufsize + size (but no more than BUFNR blocks at a time). This proved to improve + mounting speed on self-rebuilding raid5 arrays at least. + Right now it is only used from journal code. But later we might use it + from other places. + Note: Do not use journal_getblk/sb_getblk functions here! 
*/ +static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize, + unsigned int max_block) +{ + struct buffer_head * bhlist[BUFNR]; + unsigned int blocks = BUFNR; + struct buffer_head * bh; + int i, j; + + bh = __getblk (dev, block, bufsize ); + if (buffer_uptodate (bh)) + return (bh); + + if (block + BUFNR > max_block) { + blocks = max_block - block; + } + bhlist[0] = bh; + j = 1; + for (i = 1; i < blocks; i++) { + bh = __getblk (dev, block + i, bufsize); + if (buffer_uptodate (bh)) { + brelse (bh); + break; + } + else bhlist[j++] = bh; + } + ll_rw_block (READ, j, bhlist); + for(i = 1; i < j; i++) + brelse (bhlist[i]); + bh = bhlist[0]; + wait_on_buffer (bh); + if (buffer_uptodate (bh)) + return bh; + brelse (bh); + return NULL; +} + +/* +** read and replay the log +** on a clean unmount, the journal header's next unflushed pointer will be to an invalid +** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. +** +** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. +** +** On exit, it sets things up so the first transaction will work correctly. +*/ +static int journal_read(struct super_block *p_s_sb) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_desc *desc ; + unsigned long oldest_trans_id = 0; + unsigned long oldest_invalid_trans_id = 0 ; + time_t start ; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0 ; + unsigned long newest_mount_id = 9 ; + struct buffer_head *d_bh ; + struct reiserfs_journal_header *jh ; + int valid_journal_header = 0 ; + int replay_count = 0 ; + int continue_replay = 1 ; + int ret ; + char b[BDEVNAME_SIZE]; + + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; + reiserfs_info (p_s_sb, "checking transaction log (%s)\n", + bdevname(journal->j_dev_bd, b)); + start = get_seconds(); + + /* step 1, read in the journal header block. 
Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, + ** replay is done + */ + journal->j_header_bh = journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (!journal->j_header_bh) { + return 1 ; + } + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; + if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && + le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && + le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + le32_to_cpu(jh->j_first_unflushed_offset) ; + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + newest_mount_id = le32_to_cpu(jh->j_mount_id); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu", le32_to_cpu(jh->j_first_unflushed_offset), + le32_to_cpu(jh->j_last_flush_trans_id)) ; + valid_journal_header = 1 ; + + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read + ** through the whole log. + */ + d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ; + ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ; + if (!ret) { + continue_replay = 0 ; + } + brelse(d_bh) ; + goto start_log_replay; + } + + if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { + reiserfs_warning (p_s_sb, + "clm-2076: device is readonly, unable to replay log") ; + return -1 ; + } + + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. 
+ */ + while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { + /* Note that it is required for blocksize of primary fs device and journal + device to be the same */ + d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; + ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ; + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = get_desc_trans_id(desc) ; + oldest_start = d_bh->b_blocknr ; + newest_mount_id = get_desc_mount_id(desc) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " + "oldest_start to offset %llu, trans_id %lu", + oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + oldest_trans_id) ; + } else if (oldest_trans_id > get_desc_trans_id(desc)) { + /* one we just read was older */ + oldest_trans_id = get_desc_trans_id(desc) ; + oldest_start = d_bh->b_blocknr ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu", + oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + oldest_trans_id) ; + } + if (newest_mount_id < get_desc_mount_id(desc)) { + newest_mount_id = get_desc_mount_id(desc) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %d", get_desc_mount_id(desc)); + } + cur_dblock += get_desc_trans_len(desc) + 2 ; + } else { + cur_dblock++ ; + } + brelse(d_bh) ; + } + +start_log_replay: + cur_dblock = oldest_start ; + if (oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " + "from offset %llu, trans_id %lu", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + oldest_trans_id) ; + + } + replay_count = 0 ; + while(continue_replay && oldest_trans_id > 0) { + ret = 
journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ; + if (ret < 0) { + return ret ; + } else if (ret != 0) { + break ; + } + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ; + replay_count++ ; + if (cur_dblock == oldest_start) + break; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " + "transactions found") ; + } + /* j_start does not get set correctly if we don't replay any transactions. + ** if we had a valid journal_header, set j_start to the first unflushed transaction value, + ** copy the trans_id from the header + */ + if (valid_journal_header && replay_count == 0) { + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ; + journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ; + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + journal->j_mount_id = newest_mount_id + 1 ; + } + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %lu", journal->j_mount_id) ; + journal->j_first_unflushed_offset = journal->j_start ; + if (replay_count > 0) { + reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n", + replay_count, get_seconds() - start) ; + } + if (!bdev_read_only(p_s_sb->s_bdev) && + _update_journal_header_block(p_s_sb, journal->j_start, + journal->j_last_flush_trans_id)) + { + /* replay failed, caller must call free_journal_ram and abort + ** the mount + */ + return -1 ; + } + return 0 ; +} + +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; +retry: + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); + if (!jl) { + yield(); + goto retry; + } + memset(jl, 0, sizeof(*jl)); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + INIT_LIST_HEAD(&jl->j_tail_bh_list); + 
INIT_LIST_HEAD(&jl->j_bh_list); + sema_init(&jl->j_commit_lock, 1); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; +} + +static void journal_list_init(struct super_block *p_s_sb) { + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); +} + +static int release_journal_dev( struct super_block *super, + struct reiserfs_journal *journal ) +{ + int result; + + result = 0; + + if( journal -> j_dev_file != NULL ) { + result = filp_close( journal -> j_dev_file, NULL ); + journal -> j_dev_file = NULL; + journal -> j_dev_bd = NULL; + } else if( journal -> j_dev_bd != NULL ) { + result = blkdev_put( journal -> j_dev_bd ); + journal -> j_dev_bd = NULL; + } + + if( result != 0 ) { + reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", result ); + } + return result; +} + +static int journal_init_dev( struct super_block *super, + struct reiserfs_journal *journal, + const char *jdev_name ) +{ + int result; + dev_t jdev; + int blkdev_mode = FMODE_READ | FMODE_WRITE; + char b[BDEVNAME_SIZE]; + + result = 0; + + journal -> j_dev_bd = NULL; + journal -> j_dev_file = NULL; + jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ? 
+ new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; + + if (bdev_read_only(super->s_bdev)) + blkdev_mode = FMODE_READ; + + /* there is no "jdev" option and journal is on separate device */ + if( ( !jdev_name || !jdev_name[ 0 ] ) ) { + journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning (super, "sh-458: journal_init_dev: " + "cannot init journal device '%s': %i", + __bdevname(jdev, b), result ); + return result; + } else if (jdev != super->s_dev) + set_blocksize(journal->j_dev_bd, super->s_blocksize); + return 0; + } + + journal -> j_dev_file = filp_open( jdev_name, 0, 0 ); + if( !IS_ERR( journal -> j_dev_file ) ) { + struct inode *jdev_inode = journal->j_dev_file->f_mapping->host; + if( !S_ISBLK( jdev_inode -> i_mode ) ) { + reiserfs_warning (super, "journal_init_dev: '%s' is " + "not a block device", jdev_name ); + result = -ENOTBLK; + } else { + /* ok */ + journal->j_dev_bd = I_BDEV(jdev_inode); + set_blocksize(journal->j_dev_bd, super->s_blocksize); + } + } else { + result = PTR_ERR( journal -> j_dev_file ); + journal -> j_dev_file = NULL; + reiserfs_warning (super, + "journal_init_dev: Cannot open '%s': %i", + jdev_name, result ); + } + if( result != 0 ) { + release_journal_dev( super, journal ); + } + reiserfs_info(super, "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return result; +} + +/* +** must be called once on fs mount. 
calls journal_read for you +*/ +int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) { + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ; + struct buffer_head *bhjh; + struct reiserfs_super_block * rs; + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; + char b[BDEVNAME_SIZE]; + + journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; + if (!journal) { + reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ; + return 1 ; + } + memset(journal, 0, sizeof(struct reiserfs_journal)) ; + INIT_LIST_HEAD(&journal->j_bitmap_nodes) ; + INIT_LIST_HEAD (&journal->j_prealloc_list); + INIT_LIST_HEAD(&journal->j_working_list); + INIT_LIST_HEAD(&journal->j_journal_list); + journal->j_persistent_trans = 0; + if (reiserfs_allocate_list_bitmaps(p_s_sb, + journal->j_list_bitmap, + SB_BMAP_NR(p_s_sb))) + goto free_and_return ; + allocate_bitmap_nodes(p_s_sb) ; + + /* reserved for journal area support */ + SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? + REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + + SB_BMAP_NR(p_s_sb) + 1 : + REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2); + + /* Sanity check to see is the standard journal fitting withing first bitmap + (actual for small blocksizes) */ + if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) && + (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) { + reiserfs_warning (p_s_sb, "journal-1393: journal does not fit for area " + "addressed by first of bitmap blocks. It starts at " + "%u and its size is %u. 
Block size %ld", + SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize); + goto free_and_return; + } + + if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) { + reiserfs_warning (p_s_sb, "sh-462: unable to initialize jornal device"); + goto free_and_return; + } + + rs = SB_DISK_SUPER_BLOCK(p_s_sb); + + /* read journal header */ + bhjh = journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (!bhjh) { + reiserfs_warning (p_s_sb, "sh-459: unable to read journal header"); + goto free_and_return; + } + jh = (struct reiserfs_journal_header *)(bhjh->b_data); + + /* make sure that journal matches to the super block */ + if (is_reiserfs_jr(rs) && (jh->jh_journal.jp_journal_magic != sb_jp_journal_magic(rs))) { + reiserfs_warning (p_s_sb, "sh-460: journal header magic %x " + "(device %s) does not match to magic found in super " + "block %x", + jh->jh_journal.jp_journal_magic, + bdevname( journal->j_dev_bd, b), + sb_jp_journal_magic(rs)); + brelse (bhjh); + goto free_and_return; + } + + journal->j_trans_max = le32_to_cpu (jh->jh_journal.jp_journal_trans_max); + journal->j_max_batch = le32_to_cpu (jh->jh_journal.jp_journal_max_batch); + journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age); + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + + if (journal->j_trans_max) { + /* make sure these parameters are available, assign it if they are not */ + __u32 initial = journal->j_trans_max; + __u32 ratio = 1; + + if (p_s_sb->s_blocksize < 4096) + ratio = 4096 / p_s_sb->s_blocksize; + + if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO) + journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio; + if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) + journal->j_trans_max = 
JOURNAL_TRANS_MIN_DEFAULT / ratio; + + if (journal->j_trans_max != initial) + reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", + initial, journal->j_trans_max); + + journal->j_max_batch = journal->j_trans_max* + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT; + } + + if (!journal->j_trans_max) { + /*we have the file system was created by old version of mkreiserfs + so this field contains zero value */ + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT ; + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT ; + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ; + + /* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096 + trans max size is decreased proportionally */ + if (p_s_sb->s_blocksize < 4096) { + journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ; + journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ; + } + } + + journal->j_default_max_commit_age = journal->j_max_commit_age; + + if (commit_max_age != 0) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + + reiserfs_info (p_s_sb, "journal params: device %s, size %u, " + "journal first block %u, max trans len %u, max batch %u, " + "max commit age %u, max trans age %u\n", + bdevname( journal->j_dev_bd, b), + SB_ONDISK_JOURNAL_SIZE(p_s_sb), + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + journal->j_trans_max, + journal->j_max_batch, + journal->j_max_commit_age, + journal->j_max_trans_age); + + brelse (bhjh); + + journal->j_list_bitmap_index = 0 ; + journal_list_init(p_s_sb) ; + + memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; + + INIT_LIST_HEAD(&journal->j_dirty_buffers) ; + spin_lock_init(&journal->j_dirty_buffers_lock) ; + + journal->j_start = 0 ; + journal->j_len = 0 ; + journal->j_len_alloc = 0 ; + atomic_set(&(journal->j_wcount), 0) ; + atomic_set(&(journal->j_async_throttle), 0) ; + journal->j_bcount = 0 ; + 
journal->j_trans_start_time = 0 ; + journal->j_last = NULL ; + journal->j_first = NULL ; + init_waitqueue_head(&(journal->j_join_wait)) ; + sema_init(&journal->j_lock, 1); + sema_init(&journal->j_flush_sem, 1); + + journal->j_trans_id = 10 ; + journal->j_mount_id = 10 ; + journal->j_state = 0 ; + atomic_set(&(journal->j_jlock), 0) ; + journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ; + journal->j_cnode_free_orig = journal->j_cnode_free_list ; + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0 ; + journal->j_cnode_used = 0 ; + journal->j_must_wait = 0 ; + + init_journal_hash(p_s_sb) ; + jl = journal->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + if (!jl->j_list_bitmap) { + reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0") ; + goto free_and_return; + } + if (journal_read(p_s_sb) < 0) { + reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ; + goto free_and_return; + } + + reiserfs_mounted_fs_count++ ; + if (reiserfs_mounted_fs_count <= 1) + commit_wq = create_workqueue("reiserfs"); + + INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); + return 0 ; +free_and_return: + free_journal_ram(p_s_sb); + return 1; +} + +/* +** test for a polite end of the current transaction. 
Used by file_write, and should +** be used by delete to make sure they don't write more than can fit inside a single +** transaction +*/ +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { + struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); + time_t now = get_seconds() ; + /* cannot restart while nested */ + BUG_ON (!th->t_trans_id); + if (th->t_refcount > 1) + return 0 ; + if ( journal->j_must_wait > 0 || + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || + atomic_read(&(journal->j_jlock)) || + (now - journal->j_trans_start_time) > journal->j_max_trans_age || + journal->j_cnode_free < (journal->j_trans_max * 3)) { + return 1 ; + } + return 0 ; +} + +/* this must be called inside a transaction, and requires the +** kernel_lock to be held +*/ +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { + struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); + BUG_ON (!th->t_trans_id); + journal->j_must_wait = 1 ; + set_bit(J_WRITERS_BLOCKED, &journal->j_state) ; + return ; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_allow_writes(struct super_block *s) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + clear_bit(J_WRITERS_BLOCKED, &journal->j_state) ; + wake_up(&journal->j_join_wait) ; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_wait_on_write_block(struct super_block *s) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + wait_event(journal->j_join_wait, + !test_bit(J_WRITERS_BLOCKED, &journal->j_state)) ; +} + +static void queue_log_writer(struct super_block *s) { + wait_queue_t wait; + struct reiserfs_journal *journal = SB_JOURNAL (s); + set_bit(J_WRITERS_QUEUED, &journal->j_state); + + /* + * we don't want to use wait_event here because + * we only want to wait once. 
+ */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&journal->j_join_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(&journal->j_join_wait, &wait); +} + +static void wake_queued_writers(struct super_block *s) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) + wake_up(&journal->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, + unsigned long trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL (sb); + unsigned long bcount = journal->j_bcount; + while(1) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; + while ((atomic_read(&journal->j_wcount) > 0 || + atomic_read(&journal->j_jlock)) && + journal->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (journal->j_trans_id != trans_id) + break; + if (bcount == journal->j_bcount) + break; + bcount = journal->j_bcount; + } +} + +/* join == true if you must join an existing transaction. +** join == false if you can deal with waiting for others to finish +** +** this will block until the transaction is joinable. send the number of blocks you +** expect to use in nblocks. 
+*/ +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { + time_t now = get_seconds() ; + int old_trans_id ; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; + int retval; + + reiserfs_check_lock_depth(p_s_sb, "journal_begin") ; + + PROC_INFO_INC( p_s_sb, journal.journal_being ); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = p_s_sb ; + +relock: + lock_journal(p_s_sb) ; + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) { + unlock_journal (p_s_sb); + retval = journal->j_errno; + goto out_fail; + } + journal->j_bcount++; + + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { + unlock_journal(p_s_sb) ; + reiserfs_wait_on_write_block(p_s_sb) ; + PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); + goto relock ; + } + now = get_seconds(); + + /* if there is no room in the journal OR + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** we don't sleep if there aren't other writers + */ + + if ( (!join && journal->j_must_wait > 0) || + ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) || + (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) || + (!join && atomic_read(&journal->j_jlock)) || + (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { + + old_trans_id = journal->j_trans_id; + unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + journal->j_max_batch && + ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) + { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(p_s_sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have 
to do is + * wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } + goto relock; + } + retval = journal_join(&myth, p_s_sb, 1) ; + if (retval) + goto out_fail; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != journal->j_trans_id) { + retval = do_journal_end(&myth, p_s_sb, 1, 0) ; + } else { + retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; + } + + if (retval) + goto out_fail; + + PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); + goto relock ; + } + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); + } + atomic_inc(&(journal->j_wcount)) ; + journal->j_len_alloc += nblocks ; + th->t_blocks_logged = 0 ; + th->t_blocks_allocated = nblocks ; + th->t_trans_id = journal->j_trans_id ; + unlock_journal(p_s_sb) ; + INIT_LIST_HEAD (&th->t_list); + return 0 ; + +out_fail: + memset (th, 0, sizeof (*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = p_s_sb; + return retval; +} + +struct reiserfs_transaction_handle * +reiserfs_persistent_transaction(struct super_block *s, int nblocks) { + int ret ; + struct reiserfs_transaction_handle *th ; + + /* if we're nesting into an existing transaction. 
It will be + ** persistent on its own + */ + if (reiserfs_transaction_running(s)) { + th = current->journal_info ; + th->t_refcount++ ; + if (th->t_refcount < 2) { + BUG() ; + } + return th ; + } + th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ; + if (!th) + return NULL; + ret = journal_begin(th, s, nblocks) ; + if (ret) { + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; + return NULL; + } + + SB_JOURNAL(s)->j_persistent_trans++; + return th ; +} + +int +reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { + struct super_block *s = th->t_super; + int ret = 0; + if (th->t_trans_id) + ret = journal_end(th, th->t_super, th->t_blocks_allocated); + else + ret = -EIO; + if (th->t_refcount == 0) { + SB_JOURNAL(s)->j_persistent_trans--; + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; + } + return ret; +} + +static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th ; + if (cur_th && cur_th->t_refcount > 1) { + BUG() ; + } + return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ; +} + +int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th ; + if (cur_th && cur_th->t_refcount > 1) { + BUG() ; + } + return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ; +} + +int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + struct reiserfs_transaction_handle *cur_th = current->journal_info ; + int ret ; + + 
th->t_handle_save = NULL ; + if (cur_th) { + /* we are nesting into the current transaction */ + if (cur_th->t_super == p_s_sb) { + BUG_ON (!cur_th->t_refcount); + cur_th->t_refcount++ ; + memcpy(th, cur_th, sizeof(*th)); + if (th->t_refcount <= 1) + reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0"); + return 0; + } else { + /* we've ended up with a handle from a different filesystem. + ** save it and restore on journal_end. This should never + ** really happen... + */ + reiserfs_warning(p_s_sb, "clm-2100: nesting info a different FS") ; + th->t_handle_save = current->journal_info ; + current->journal_info = th; + } + } else { + current->journal_info = th; + } + ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ; + if (current->journal_info != th) + BUG() ; + + /* I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since journal_end + * won't be called to do it. */ + if (ret) + current->journal_info = th->t_handle_save; + else + BUG_ON (!th->t_refcount); + + return ret ; +} + +/* +** puts bh into the current transaction. If it was already there, reorders removes the +** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). +** +** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the +** transaction is committed. +** +** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. 
+*/ +int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0 ; + int prepared = 0 ; + BUG_ON (!th->t_trans_id); + + PROC_INFO_INC( p_s_sb, journal.mark_dirty ); + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", + th->t_trans_id, journal->j_trans_id); + } + + p_s_sb->s_dirt = 1; + + prepared = test_clear_buffer_journal_prepared (bh); + clear_buffer_journal_restore_dirty (bh); + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + PROC_INFO_INC( p_s_sb, journal.mark_dirty_already ); + return 0 ; + } + + /* this must be turned into a panic instead of a warning. We can't allow + ** a dirty or journal_dirty or locked buffer to be logged, as some changes + ** could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_dirty(bh)) { + reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state " + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", + (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!') ; + } + + if (atomic_read(&(journal->j_wcount)) <= 0) { + reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ; + return 1 ; + } + /* this error means I've screwed up, and we've overflowed the transaction. + ** Nothing can be done here, except make the FS readonly or panic. 
+ */ + if (journal->j_len >= journal->j_trans_max) { + reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ; + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1 ; + PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal ); + clear_buffer_journal_dirty (bh); + } + + if (journal->j_len > journal->j_len_alloc) { + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ; + } + + set_buffer_journaled (bh); + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(p_s_sb) ; + if (!cn) { + reiserfs_panic(p_s_sb, "get_cnode failed!\n"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ; + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ; + } + th->t_blocks_logged++ ; + journal->j_len++ ; + + cn->bh = bh ; + cn->blocknr = bh->b_blocknr ; + cn->sb = p_s_sb; + cn->jlist = NULL ; + insert_journal_hash(journal->j_hash_table, cn) ; + if (!count_already_incd) { + get_bh(bh) ; + } + } + cn->next = NULL ; + cn->prev = journal->j_last ; + cn->bh = bh ; + if (journal->j_last) { + journal->j_last->next = cn ; + journal->j_last = cn ; + } else { + journal->j_first = cn ; + journal->j_last = cn ; + } + return 0 ; +} + +int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + if (!current->journal_info && th->t_refcount > 1) + reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d", + th->t_refcount); + + if (!th->t_trans_id) { + WARN_ON (1); + return -EIO; + } + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = current->journal_info ; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + if (cur_th->t_super != th->t_super) + BUG() ; + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return 
do_journal_end(th, p_s_sb, nblocks, 0) ; + } +} + +/* removes from the current transaction, relsing and descrementing any counters. +** also files the removed buffer directly onto the clean list +** +** called by journal_mark_freed when a block has been deleted +** +** returns 1 if it cleaned and relsed the buffer. 0 otherwise +*/ +static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) { + struct buffer_head *bh ; + struct reiserfs_journal_cnode *cn ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + int ret = 0; + + cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ; + if (!cn || !cn->bh) { + return ret ; + } + bh = cn->bh ; + if (cn->prev) { + cn->prev->next = cn->next ; + } + if (cn->next) { + cn->next->prev = cn->prev ; + } + if (cn == journal->j_first) { + journal->j_first = cn->next ; + } + if (cn == journal->j_last) { + journal->j_last = cn->prev ; + } + if (bh) + remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ; + clear_buffer_journaled (bh); /* don't log this one */ + + if (!already_cleaned) { + clear_buffer_journal_dirty (bh); + clear_buffer_dirty(bh); + clear_buffer_journal_test (bh); + put_bh(bh) ; + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0"); + } + ret = 1 ; + } + journal->j_len-- ; + journal->j_len_alloc-- ; + free_cnode(p_s_sb, cn) ; + return ret ; +} + +/* +** for any cnode in a journal list, it can only be dirtied of all the +** transactions that include it are commited to disk. 
+** this checks through each transaction, and returns 1 if you are allowed to dirty, +** and 0 if you aren't +** +** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log +** blocks for a given transaction on disk +** +*/ +static int can_dirty(struct reiserfs_journal_cnode *cn) { + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr ; + struct reiserfs_journal_cnode *cur = cn->hprev ; + int can_dirty = 1 ; + + /* first test hprev. These are all newer than cn, so any node here + ** with the same block number and dev means this node can't be sent + ** to disk right now. + */ + while(cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && + cur->blocknr == blocknr) { + can_dirty = 0 ; + } + cur = cur->hprev ; + } + /* then test hnext. These are all older than cn. As long as they + ** are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext ; + while(cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { + can_dirty = 0 ; + } + cur = cur->hnext ; + } + return can_dirty ; +} + +/* syncs the commit blocks, but does not force the real buffers to disk +** will wait until the current transaction is done/commited before returning +*/ +int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + BUG_ON (!th->t_trans_id); + /* you can sync while nested, very, very bad */ + if (th->t_refcount > 1) { + BUG() ; + } + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + } + return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; +} + +/* +** writeback the pending async commits to disk +*/ +static void 
flush_async_commits(void *p) { + struct super_block *p_s_sb = p; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_list *jl; + struct list_head *entry; + + lock_kernel(); + if (!list_empty(&journal->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = journal->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(p_s_sb, jl, 1); + } + unlock_kernel(); + /* + * this is a little racey, but there's no harm in missing + * the filemap_fdata_write + */ + if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) { + atomic_inc(&journal->j_async_throttle); + filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); + atomic_dec(&journal->j_async_throttle); + } +} + +/* +** flushes any old transactions to disk +** ends the current transaction if it is too old +*/ +int reiserfs_flush_old_commits(struct super_block *p_s_sb) { + time_t now ; + struct reiserfs_transaction_handle th ; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&journal->j_journal_list)) { + return 0 ; + } + + /* check the current transaction. 
If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&journal->j_wcount) <= 0 && + journal->j_trans_start_time > 0 && + journal->j_len > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) + { + if (!journal_join(&th, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; + } + } + return p_s_sb->s_dirt; +} + +/* +** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit +** +** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all +** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just +** flushes the commit list and returns 0. +** +** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. +** +** Note, we can't allow the journal_end to proceed while there are still writers in the log. +*/ +static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, + unsigned long nblocks, int flags) { + + time_t now ; + int flush = flags & FLUSH_ALL ; + int commit_now = flags & COMMIT_NOW ; + int wait_on_commit = flags & WAIT ; + struct reiserfs_journal_list *jl; + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + + BUG_ON (!th->t_trans_id); + + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", + th->t_trans_id, journal->j_trans_id); + } + + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ; + if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. 
unmounting might not call begin */ + atomic_dec(&(journal->j_wcount)) ; + } + + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + ** will be dealt with by next transaction that actually writes something, but should be taken + ** care of in this trans + */ + if (journal->j_len == 0) { + BUG(); + } + /* if wcount > 0, and we are called to with flush or commit_now, + ** we wait on j_join_wait. We will wake up when the last writer has + ** finished the transaction, and started it on its way to the disk. + ** Then, we flush the commit or journal list, and just return 0 + ** because the rest of journal end was already done for this transaction. + */ + if (atomic_read(&(journal->j_wcount)) > 0) { + if (flush || commit_now) { + unsigned trans_id ; + + jl = journal->j_current_jl; + trans_id = jl->j_trans_id; + if (wait_on_commit) + jl->j_state |= LIST_COMMIT_PENDING; + atomic_set(&(journal->j_jlock), 1) ; + if (flush) { + journal->j_next_full_flush = 1 ; + } + unlock_journal(p_s_sb) ; + + /* sleep while the current transaction is still j_jlocked */ + while(journal->j_trans_id == trans_id) { + if (atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } else { + lock_journal(p_s_sb); + if (journal->j_trans_id == trans_id) { + atomic_set(&(journal->j_jlock), 1) ; + } + unlock_journal(p_s_sb); + } + } + if (journal->j_trans_id == trans_id) { + BUG(); + } + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && + wait_on_commit) + { + flush_commit_list(p_s_sb, jl, 1) ; + } + return 0 ; + } + unlock_journal(p_s_sb) ; + return 0 ; + } + + /* deal with old transactions where we are the last writers */ + now = get_seconds(); + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { + commit_now = 1 ; + journal->j_next_async_flush = 1 ; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(journal->j_must_wait > 0) 
&& !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now && + (journal->j_len < journal->j_max_batch) && + journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { + journal->j_bcount++ ; + unlock_journal(p_s_sb) ; + return 0 ; + } + + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ; + } + return 1 ; +} + +/* +** Does all the work that makes deleting blocks safe. +** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. +** +** otherwise: +** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes +** before this transaction has finished. +** +** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with +** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, +** the block can't be reallocated yet. +** +** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. 
+*/ +int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_cnode *cn = NULL ; + struct buffer_head *bh = NULL ; + struct reiserfs_list_bitmap *jb = NULL ; + int cleaned = 0 ; + BUG_ON (!th->t_trans_id); + + cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh ; + get_bh(bh) ; + } + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + clear_buffer_journal_new (bh); + clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; + } else { + /* set the bit for this block in the journal bitmap for this transaction */ + jb = journal->j_current_jl->j_list_bitmap; + if (!jb) { + reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; + } + set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ; + + /* Note, the entire while loop is not allowed to schedule. 
*/ + + if (bh) { + clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; + } + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; + + /* find all older transactions with this block, make sure they don't try to write it out */ + cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table, blocknr) ; + while (cn) { + if (p_s_sb == cn->sb && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state) ; + if (cn->bh) { + if (!cleaned) { + /* remove_from_transaction will brelse the buffer if it was + ** in the current trans + */ + clear_buffer_journal_dirty (cn->bh); + clear_buffer_dirty(cn->bh); + clear_buffer_journal_test(cn->bh); + cleaned = 1 ; + put_bh(cn->bh) ; + if (atomic_read(&(cn->bh->b_count)) < 0) { + reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0"); + } + } + if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ + atomic_dec(&(cn->jlist->j_nonzerolen)) ; + } + cn->bh = NULL ; + } + } + cn = cn->hnext ; + } + } + + if (bh) { + put_bh(bh) ; /* get_hash grabs the buffer */ + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0"); + } + } + return 0 ; +} + +void reiserfs_update_inode_transaction(struct inode *inode) { + struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb); + REISERFS_I(inode)->i_jl = journal->j_current_jl; + REISERFS_I(inode)->i_trans_id = journal->j_trans_id ; +} + +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th ; + struct super_block *sb = inode->i_sb ; + struct reiserfs_journal *journal = SB_JOURNAL (sb); + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? 
*/ + if (id == journal->j_trans_id) { + jl = journal->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (journal->j_trans_id != id) { + goto flush_commit_only; + } + + ret = journal_begin(&th, sb, 1) ; + if (ret) + return ret; + + /* someone might have ended this transaction while we joined */ + if (journal->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; + ret = journal_end(&th, sb, 1) ; + goto flush_commit_only; + } + + ret = journal_end_sync(&th, sb, 1) ; + if (!ret) + ret = 1; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ +flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1) ; + if (journal->j_errno) + ret = journal->j_errno; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; +} + +int reiserfs_commit_for_inode(struct inode *inode) { + unsigned long id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. 
More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode) ; + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); +} + +void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, + struct buffer_head *bh) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + PROC_INFO_INC( p_s_sb, journal.restore_prepared ); + if (!bh) { + return ; + } + if (test_clear_buffer_journal_restore_dirty (bh) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(p_s_sb, + journal->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_buffer_journal_test (bh); + mark_buffer_dirty(bh); + } + } + clear_buffer_journal_prepared (bh); +} + +extern struct tree_balance *cur_tb ; +/* +** before we can change a metadata block, we have to make sure it won't +** be written to disk while we are altering it. So, we must: +** clean it +** wait on it. 
+** +*/ +int reiserfs_prepare_for_journal(struct super_block *p_s_sb, + struct buffer_head *bh, int wait) { + PROC_INFO_INC( p_s_sb, journal.prepare ); + + if (test_set_buffer_locked(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_buffer_journal_prepared (bh); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_buffer_journal_test (bh); + set_buffer_journal_restore_dirty (bh); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) { + struct reiserfs_journal *journal = SB_JOURNAL (s); + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while(!list_empty(&journal->j_journal_list)) { + entry = journal->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_used_journal_lists(s, jl); + } else { + break; + } + } +} + +/* +** long and ugly. If flush, will not return until all commit +** blocks and all real buffers in the trans are on disk. +** If no_async, won't return until all commit blocks are on disk. +** +** keep reading, there are comments as you go along +** +** If the journal is aborted, we just clean up. Things like flushing +** journal lists, etc just won't happen. 
+*/ +static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks, + int flags) { + struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + struct buffer_head *c_bh ; /* commit bh */ + struct buffer_head *d_bh ; /* desc bh */ + int cur_write_start = 0 ; /* start index of current log write */ + int old_start ; + int i ; + int flush = flags & FLUSH_ALL ; + int wait_on_commit = flags & WAIT ; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned long commit_trans_id; + int trans_half; + + BUG_ON (th->t_refcount > 1); + BUG_ON (!th->t_trans_id); + + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth(p_s_sb, "journal end"); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + } + + lock_journal(p_s_sb) ; + if (journal->j_next_full_flush) { + flags |= FLUSH_ALL ; + flush = 1 ; + } + if (journal->j_next_async_flush) { + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; + } + + /* check_journal_end locks the journal, and unlocks if it does not return 1 + ** it tells us if we should continue with the journal_end, or just return + */ + if (!check_journal_end(th, p_s_sb, nblocks, flags)) { + p_s_sb->s_dirt = 1; + wake_queued_writers(p_s_sb); + reiserfs_async_progress_wait(p_s_sb); + goto out ; + } + + /* check_journal_end might set these, check again */ + if (journal->j_next_full_flush) { + flush = 1 ; + } + + /* + ** j must wait means we have to flush the log blocks, and the real blocks for + ** this transaction + */ + if (journal->j_must_wait > 0) { + flush = 1 ; + } + +#ifdef REISERFS_PREALLOCATE + /* quota ops might need to nest, setup the 
journal_info pointer for them */ + current->journal_info = th ; + reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into + * the transaction */ + current->journal_info = th->t_handle_save ; +#endif + + /* setup description block */ + d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ; + set_buffer_uptodate(d_bh); + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; + memset(d_bh->b_data, 0, d_bh->b_size) ; + memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; + set_desc_trans_id(desc, journal->j_trans_id) ; + + /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; + commit = (struct reiserfs_journal_commit *)c_bh->b_data ; + memset(c_bh->b_data, 0, c_bh->b_size) ; + set_commit_trans_id(commit, journal->j_trans_id) ; + set_buffer_uptodate(c_bh) ; + + /* init this journal list */ + jl = journal->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + down(&jl->j_commit_lock); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0) ; + jl->j_trans_id = journal->j_trans_id ; + jl->j_timestamp = journal->j_trans_start_time ; + jl->j_commit_bh = c_bh ; + jl->j_start = journal->j_start ; + jl->j_len = journal->j_len ; + atomic_set(&jl->j_nonzerolen, journal->j_len) ; + atomic_set(&jl->j_commit_left, journal->j_len + 2); + jl->j_realblock = NULL ; + + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. 
+ ** for each real block, add it to the journal list hash, + ** copy into real block index array in the commit or desc block + */ + trans_half = journal_trans_half(p_s_sb->s_blocksize); + for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) { + if (buffer_journaled (cn->bh)) { + jl_cn = get_cnode(p_s_sb) ; + if (!jl_cn) { + reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; + } + if (i == 0) { + jl->j_realblock = jl_cn ; + } + jl_cn->prev = last_cn ; + jl_cn->next = NULL ; + if (last_cn) { + last_cn->next = jl_cn ; + } + last_cn = jl_cn ; + /* make sure the block we are trying to log is not a block + of journal or reserved area */ + + if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) { + reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ; + } + jl_cn->blocknr = cn->bh->b_blocknr ; + jl_cn->state = 0 ; + jl_cn->sb = p_s_sb; + jl_cn->bh = cn->bh ; + jl_cn->jlist = jl; + insert_journal_hash(journal->j_list_hash_table, jl_cn) ; + if (i < trans_half) { + desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; + } else { + commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ; + } + } else { + i-- ; + } + } + set_desc_trans_len(desc, journal->j_len) ; + set_desc_mount_id(desc, journal->j_mount_id) ; + set_desc_trans_id(desc, journal->j_trans_id) ; + set_commit_trans_len(commit, journal->j_len); + + /* special check in case all buffers in the journal were marked for not logging */ + if (journal->j_len == 0) { + BUG(); + } + + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. 
Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + cur_write_start = journal->j_start ; + cn = journal->j_first ; + jindex = 1 ; /* start at one so we don't get the desc again */ + while(cn) { + clear_buffer_journal_new (cn->bh); + /* copy all the real blocks into log area. dirty log blocks */ + if (buffer_journaled (cn->bh)) { + struct buffer_head *tmp_bh ; + char *addr; + struct page *page; + tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; + set_buffer_uptodate(tmp_bh); + page = cn->bh->b_page; + addr = kmap(page); + memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data), + cn->bh->b_size); + kunmap(page); + mark_buffer_dirty(tmp_bh); + jindex++ ; + set_buffer_journal_dirty (cn->bh); + clear_buffer_journaled (cn->bh); + } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ; + brelse(cn->bh) ; + } + next = cn->next ; + free_cnode(p_s_sb, cn) ; + cn = next ; + cond_resched(); + } + + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
+ */ + + journal->j_current_jl = alloc_journal_list(p_s_sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &journal->j_journal_list); + list_add_tail(&jl->j_working_list, &journal->j_working_list); + journal->j_num_work_lists++; + + /* reset journal values for the next transaction */ + old_start = journal->j_start ; + journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); + atomic_set(&(journal->j_wcount), 0) ; + journal->j_bcount = 0 ; + journal->j_last = NULL ; + journal->j_first = NULL ; + journal->j_len = 0 ; + journal->j_trans_start_time = 0 ; + journal->j_trans_id++ ; + journal->j_current_jl->j_trans_id = journal->j_trans_id; + journal->j_must_wait = 0 ; + journal->j_len_alloc = 0 ; + journal->j_next_full_flush = 0 ; + journal->j_next_async_flush = 0 ; + init_journal_hash(p_s_sb) ; + + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_tail_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_tail_bh_list)) + BUG(); + up(&jl->j_commit_lock); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. 
+ */ + if (flush) { + flush_commit_list(p_s_sb, jl, 1) ; + flush_journal_list(p_s_sb, jl, 1) ; + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) + queue_delayed_work(commit_wq, &journal->j_work, HZ/10); + + + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. + */ +first_jl: + list_for_each_safe(entry, safe, &journal->j_journal_list) { + temp_jl = JOURNAL_LIST_ENTRY(entry); + if (journal->j_start <= temp_jl->j_start) { + if ((journal->j_start + journal->j_trans_max + 1) >= + temp_jl->j_start) + { + flush_used_journal_lists(p_s_sb, temp_jl); + goto first_jl; + } else if ((journal->j_start + + journal->j_trans_max + 1) < + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) + { + /* if we don't cross into the next transaction and we don't + * wrap, there is no way we can overlap any later transactions + * break now + */ + break; + } + } else if ((journal->j_start + + journal->j_trans_max + 1) > + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) + { + if (((journal->j_start + journal->j_trans_max + 1) % + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start) + { + flush_used_journal_lists(p_s_sb, temp_jl); + goto first_jl; + } else { + /* we don't overlap anything from out start to the end of the + * log, and our wrapped portion doesn't overlap anything at + * the start of the log. We can break + */ + break; + } + } + } + flush_old_journal_lists(p_s_sb); + + journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ; + + if (!(journal->j_current_jl->j_list_bitmap)) { + reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ; + } + + atomic_set(&(journal->j_jlock), 0) ; + unlock_journal(p_s_sb) ; + /* wake up any body waiting to join. 
*/ + clear_bit(J_WRITERS_QUEUED, &journal->j_state); + wake_up(&(journal->j_join_wait)) ; + + if (!flush && wait_on_commit && + journal_list_still_alive(p_s_sb, commit_trans_id)) { + flush_commit_list(p_s_sb, jl, 1) ; + } +out: + reiserfs_check_lock_depth(p_s_sb, "journal end2"); + + memset (th, 0, sizeof (*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = p_s_sb; + + return journal->j_errno; +} + +static void +__reiserfs_journal_abort_hard (struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL (sb); + if (test_bit (J_ABORTED, &journal->j_state)) + return; + + printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", + reiserfs_bdevname (sb)); + + sb->s_flags |= MS_RDONLY; + set_bit (J_ABORTED, &journal->j_state); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif +} + +static void +__reiserfs_journal_abort_soft (struct super_block *sb, int errno) +{ + struct reiserfs_journal *journal = SB_JOURNAL (sb); + if (test_bit (J_ABORTED, &journal->j_state)) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __reiserfs_journal_abort_hard (sb); +} + +void +reiserfs_journal_abort (struct super_block *sb, int errno) +{ + return __reiserfs_journal_abort_soft (sb, errno); +} diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c new file mode 100644 index 000000000000..2406608fc5cd --- /dev/null +++ b/fs/reiserfs/lbalance.c @@ -0,0 +1,1222 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <linux/string.h> +#include <linux/time.h> +#include <linux/reiserfs_fs.h> +#include <linux/buffer_head.h> + +/* these are used in do_balance.c */ + +/* leaf_move_items + leaf_shift_left + leaf_shift_right + leaf_delete_items + leaf_insert_into_buf + 
leaf_paste_in_buffer + leaf_cut_from_buffer + leaf_paste_entries + */ + + +/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source, + int last_first, int item_num, int from, int copy_count) +{ + struct buffer_head * dest = dest_bi->bi_bh; + int item_num_in_dest; /* either the number of target item, + or if we must create a new item, + the number of the item we will + create it next to */ + struct item_head * ih; + struct reiserfs_de_head * deh; + int copy_records_len; /* length of all records in item to be copied */ + char * records; + + ih = B_N_PITEM_HEAD (source, item_num); + + RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item"); + + /* length of all record to be copied and first byte of the last of them */ + deh = B_I_DEH (source, ih); + if (copy_count) { + copy_records_len = (from ? deh_location( &(deh[from - 1]) ) : + ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1])); + records = source->b_data + ih_location(ih) + + deh_location( &(deh[from + copy_count - 1])); + } else { + copy_records_len = 0; + records = NULL; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 
0 : -1) : (B_NR_ITEMS(dest) - 1); + + /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + if ( (item_num_in_dest == - 1) || + (last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) { + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + put_ih_version( &new_ih, KEY_FORMAT_3_5 ); + /* calculate item len */ + put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len ); + put_ih_entry_count( &new_ih, 0 ); + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < I_ENTRY_COUNT(ih)) { + set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) ); + /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/ + } else { + /* no entries will be copied to this item in this function */ + set_le_ih_k_offset (&new_ih, U32_MAX); + /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + } + set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, records, 0 + ); + } + + item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0; + + leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest, + (last_first == FIRST_TO_LAST) ? 
I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0, + copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len + ); +} + + +/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or + part of it or nothing (see the return 0 below) from SOURCE to the end + (if last_first) or beginning (!last_first) of the DEST */ +/* returns 1 if anything was copied, else 0 */ +static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int bytes_or_entries) +{ + struct buffer_head * dest = dest_bi->bi_bh; + int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + struct item_head * ih; + struct item_head * dih; + + dest_nr_item = B_NR_ITEMS(dest); + + if ( last_first == FIRST_TO_LAST ) { + /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects + or of different types ) then there is no need to treat this item differently from the other items + that we copy, so we return */ + ih = B_N_PITEM_HEAD (src, 0); + dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1); + if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size))) + /* there is nothing to merge */ + return 0; + + RFALSE( ! 
ih_item_len(ih), "vs-10010: item can not have empty length"); + + if ( is_direntry_le_ih (ih) ) { + if ( bytes_or_entries == -1 ) + /* copy all entries to dest */ + bytes_or_entries = ih_entry_count(ih); + leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries); + return 1; + } + + /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST + part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header + */ + if ( bytes_or_entries == -1 ) + bytes_or_entries = ih_item_len(ih); + +#ifdef CONFIG_REISERFS_CHECK + else { + if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih)) + if (get_ih_free_space (ih)) + reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: " + "last unformatted node must be filled entirely (%h)", + ih); + } +#endif + + /* merge first item (or its part) of src buffer with the last + item of dest buffer. Both are of the same file */ + leaf_paste_in_buffer (dest_bi, + dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0 + ); + + if (is_indirect_le_ih (dih)) { + RFALSE( get_ih_free_space (dih), + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); + if (bytes_or_entries == ih_item_len(ih)) + set_ih_free_space (dih, get_ih_free_space (ih)); + } + + return 1; + } + + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* ( DEST is empty or last item of SOURCE and first item of DEST + are the items of different object or of different types ) + */ + src_nr_item = B_NR_ITEMS (src); + ih = B_N_PITEM_HEAD (src, src_nr_item - 1); + dih = B_N_PITEM_HEAD (dest, 0); + + if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size)) + return 0; + + if ( is_direntry_le_ih (ih)) { + if ( bytes_or_entries == -1 ) + /* bytes_or_entries = entries number in last item body of SOURCE */ + bytes_or_entries = ih_entry_count(ih); + + 
leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries); + return 1; + } + + /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; + part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; + don't create new item header + */ + + RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih), + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); + + if ( bytes_or_entries == -1 ) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih_item_len(ih); + + RFALSE( le_ih_k_offset (dih) != + le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size), + "vs-10050: items %h and %h do not match", ih, dih); + + /* change first item key of the DEST */ + set_le_ih_k_offset (dih, le_ih_k_offset (ih)); + + /* item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type (dih, le_ih_k_type (ih)); + } else { + /* merge to right only part of item */ + RFALSE( ih_item_len(ih) <= bytes_or_entries, + "vs-10060: no so much bytes %lu (needed %lu)", + ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries); + + /* change first item key of the DEST */ + if ( is_direct_le_ih (dih) ) { + RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries, + "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries); + set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries); + } else { + RFALSE( le_ih_k_offset (dih) <= + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, + "vs-10080: dih %h, bytes_or_entries(%d)", + dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size); + set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size)); + } + } + + leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 
0); + return 1; +} + + +/* copy cpy_mun items from buffer src to buffer dest + * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest + */ +static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int first, int cpy_num) +{ + struct buffer_head * dest; + int nr, free_space; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head * blkh; + struct item_head * ih; + + RFALSE( last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, + "vs-10090: bad last_first parameter %d", last_first); + RFALSE( B_NR_ITEMS (src) - first < cpy_num, + "vs-10100: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items"); + RFALSE( ! dest_bi, "vs-10120: can not copy negative amount of items"); + + dest = dest_bi->bi_bh; + + RFALSE( ! dest, "vs-10130: can not copy negative amount of items"); + + if (cpy_num == 0) + return; + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item( blkh ); + free_space = blkh_free_space(blkh); + + /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + dest_before = (last_first == LAST_TO_FIRST) ? 
0 : nr; + + /* location of head of first new item */ + ih = B_N_PITEM_HEAD (dest, dest_before); + + RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE, + "vs-10140: not enough free space for headers %d (needed %d)", + B_FREE_SPACE (dest), cpy_num * IH_SIZE); + + /* prepare space for headers */ + memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE); + + /* copy item headers */ + memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE); + + free_space -= (IH_SIZE * cpy_num); + set_blkh_free_space( blkh, free_space ); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih-1); + for (i = dest_before; i < nr + cpy_num; i ++) { + location -= ih_item_len( ih + i - dest_before ); + put_ih_location( ih + i - dest_before, location ); + } + + /* prepare space for items */ + last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) ); + last_inserted_loc = ih_location( &(ih[cpy_num-1]) ); + + /* check free space */ + RFALSE( free_space < j - last_inserted_loc, + "vs-10150: not enough free space for items %d (needed %d)", + free_space, j - last_inserted_loc); + + memmove (dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)), + j - last_inserted_loc); + + /* sizes, item number */ + set_blkh_nr_item( blkh, nr + cpy_num ); + set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) ); + + do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position); + RFALSE( dc_block_number(t_dc) != dest->b_blocknr, + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", + ( long unsigned ) dest->b_blocknr, + ( long unsigned ) dc_block_number(t_dc)); + put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) ); + + 
do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0); + } +} + + +/* This function splits the (liquid) item into two items (useful when + shifting part of an item into another node.) */ +static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head * dest = dest_bi->bi_bh; + struct item_head * ih; + + RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item"); + + if ( last_first == FIRST_TO_LAST ) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num))) + leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + part defined by 'cpy_bytes'; create new item header; change old item_header (????); + n_ih = new item_header; + */ + memcpy (&n_ih, ih, IH_SIZE); + put_ih_item_len( &n_ih, cpy_bytes ); + if (is_indirect_le_ih (ih)) { + RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + ( long unsigned ) get_ih_free_space (ih)); + set_ih_free_space (&n_ih, 0); + } + + RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0); + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num))) + leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the 
DEST + part defined by 'cpy_bytes'; create new item header; + n_ih = new item_header; + */ + memcpy (&n_ih, ih, SHORT_KEY_SIZE); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + if (is_direct_le_ih (ih)) { + set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type (&n_ih, TYPE_DIRECT); + set_ih_free_space (&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE( !cpy_bytes && get_ih_free_space (ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size); + set_le_ih_k_type (&n_ih, TYPE_INDIRECT); + set_ih_free_space (&n_ih, get_ih_free_space (ih)); + } + + /* set item length */ + put_ih_item_len( &n_ih, cpy_bytes ); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0); + } + } +} + + +/* If cpy_bytes equals minus one then copy cpy_num whole items from SOURCE to DEST. + If cpy_bytes is not equal to minus one then copy cpy_num-1 whole items from SOURCE to DEST. + From the one remaining (boundary) item copy cpy_bytes bytes for a regular item and cpy_bytes directory entries for a + directory item. + Returns 0 when cpy_num == 0, otherwise the value returned by leaf_copy_boundary_item. */ +static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num, + int cpy_bytes) +{ + struct buffer_head * dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE( !dest || !src, "vs-10210: !dest || !src"); + RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE( B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. 
%d", B_NR_ITEMS(src), cpy_num); + RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num); + + if ( cpy_num == 0 ) + return 0; + + if ( last_first == FIRST_TO_LAST ) { + /* copy items to left */ + pos = 0; + if ( cpy_num == 1 ) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if ( cpy_num == 0 ) + return i; + pos += i; + if ( cpy_bytes == -1 ) + /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num); + else { + /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1); + + /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS (src); + if ( cpy_num == 1 ) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + i = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if ( cpy_num == 0 ) + return i; + + pos = src_nr_item - cpy_num - i; + if ( cpy_bytes == -1 ) { + /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num); + } else { + /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1); + + /* copy part of the item which number is pos to the begin of the DEST */ + leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, 
cpy_bytes); + } + } + return i; +} + + +/* there are several types of copying: from S[0] to L[0], from S[0] to R[0], + from R[0] to L[0], from L[0] to R[0] and from S[0] to Snew. for each of these we have to define the buffer, parent and + position of the destination and source buffers, plus the copy direction stored in *first_last */ +static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi, + struct buffer_info * src_bi, int * first_last, + struct buffer_head * Snew) +{ + memset (dest_bi, 0, sizeof (struct buffer_info)); + memset (src_bi, 0, sizeof (struct buffer_info)); + + /* define dest and src buffers, their parents and positions, and the copy direction */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); /* src->b_item_order */ + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position (tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position (tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position (tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position (tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = 
tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position (tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position (tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode); + } + RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); +} + + + + +/* copy mov_num items and mov_bytes of the (mov_num-1)th item to + neighbor. Delete them from source */ +int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew); + + ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes); + + leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes); + + + return ret_value; +} + + +/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) + from S[0] to L[0] and replace the delimiting key */ +int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes) +{ + struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int i; + + /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + + if ( shift_num ) { + if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */ + + RFALSE( shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); +#ifdef CONFIG_REISERFS_CHECK + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb ("vs-10275"); + reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION (tb->tb_path, 1) == 0) + replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE( (shift_bytes != -1 && + !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0)) + && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) && + (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + + return i; +} + + + + + +/* CLEANING STOPPED HERE */ + + + + +/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +int leaf_shift_right( + struct tree_balance * tb, + int shift_num, + int shift_bytes + ) +{ + // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int ret_value; + + /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + + /* replace rkey in 
CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + } + + return ret_value; +} + + + +static void leaf_delete_items_entirely (struct buffer_info * bi, + int first, int del_num); +/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. + If not. + If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of + the first item. Part defined by del_bytes. Don't delete first item header + If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of + the last item . Part defined by del_bytes. Don't delete last item header. +*/ +void leaf_delete_items (struct buffer_info * cur_bi, int last_first, + int first, int del_num, int del_bytes) +{ + struct buffer_head * bh; + int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh); + + RFALSE( !bh, "10155: bh is not defined"); + RFALSE( del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", del_num); + RFALSE( first < 0 || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", + first, first + del_num, item_amount); + + if ( del_num == 0 ) + return; + + if ( first == 0 && del_num == item_amount && del_bytes == -1 ) { + make_empty_node (cur_bi); + do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0); + return; + } + + if ( del_bytes == -1 ) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely (cur_bi, first, del_num); + else { + if ( last_first == FIRST_TO_LAST ) { + /* delete del_num-1 items beginning from item in position first */ + leaf_delete_items_entirely (cur_bi, first, del_num-1); + + /* delete the part of the first item of the bh + do not delete item header + */ + leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes); + } else { + struct item_head * ih; + int len; + + /* delete del_num-1 items beginning from item in position first+1 */ + leaf_delete_items_entirely (cur_bi, first+1, del_num-1); + + if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is directory */ + /* len = numbers of directory entries in this item */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* delete the part of the last item of the bh + do not delete item header + */ + leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes); + } + } +} + + +/* insert item into the leaf node in position before */ +void leaf_insert_into_buf (struct buffer_info * bi, int before, + struct item_head * inserted_item_ih, + const char * inserted_item_body, + int zeros_number) +{ + struct buffer_head * bh = bi->bi_bh; + int nr, free_space; + struct block_head * blkh; + struct item_head * ih; + int i; + int last_loc, unmoved_loc; + char * to; + + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space( blkh ); + + /* check free space */ + 
RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE( zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + + /* get item new item must be inserted before */ + ih = B_N_PITEM_HEAD (bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size; + unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size; + + + memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset (to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove (ih + 1, ih, IH_SIZE * (nr - before)); + memmove (ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i ++) + { + unmoved_loc -= ih_item_len( &(ih[i-before])); + put_ih_location( &(ih[i-before]), unmoved_loc ); + } + + /* sizes, free space, item number */ + set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 ); + set_blkh_free_space( blkh, + free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) ); + do_balance_mark_leaf_dirty (bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); + put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih))); + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* paste paste_size bytes to affected_item_num-th item. 
+ When item is a directory, this only prepare space for new entries */ +void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num, + int pos_in_item, int paste_size, + const char * body, + int zeros_number) +{ + struct buffer_head * bh = bi->bi_bh; + int nr, free_space; + struct block_head * blkh; + struct item_head * ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + + /* check free space */ + RFALSE( free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); + +#ifdef CONFIG_REISERFS_CHECK + if (zeros_number > paste_size) { + print_cur_tb ("10177"); + reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + + /* item to be appended */ + ih = B_N_PITEM_HEAD(bh, affected_item_num); + + last_loc = ih_location( &(ih[nr - affected_item_num - 1]) ); + unmoved_loc = affected_item_num ? 
ih_location( ih-1 ) : bh->b_size; + + /* prepare space */ + memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + + /* change locations */ + for (i = affected_item_num; i < nr; i ++) + put_ih_location( &(ih[i-affected_item_num]), + ih_location( &(ih[i-affected_item_num])) - paste_size ); + + if ( body ) { + if (!is_direntry_le_ih (ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove (bh->b_data + ih_location(ih) + paste_size, + bh->b_data + ih_location(ih), ih_item_len(ih)); + /* paste data in the head of item */ + memset (bh->b_data + ih_location(ih), 0, zeros_number); + memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number); + } else { + memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number); + memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number); + } + } + } + else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len( ih, ih_item_len(ih) + paste_size ); + + /* change free space */ + set_blkh_free_space( blkh, free_space - paste_size ); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); + put_dc_size( t_dc, dc_size(t_dc) + paste_size ); + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + does not have free space, so it moves DEHs and remaining records as + necessary. Return value is size of removed part of directory item + in bytes. 
*/ +static int leaf_cut_entries ( + struct buffer_head * bh, + struct item_head * ih, + int from, + int del_count + ) +{ + char * item; + struct reiserfs_de_head * deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char * prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + + /* make sure, that item is directory and there are enough entries to + remove */ + RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item"); + RFALSE( I_ENTRY_COUNT(ih) < from + del_count, + "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + I_ENTRY_COUNT(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH (bh, ih); + + /* first byte of remaining entries, those are BEFORE cut entries + (prev_record) and length of all removed records (cut_records_len) */ + prev_record_offset = (from ? deh_location( &(deh[from - 1])) : ih_item_len(ih)); + cut_records_len = prev_record_offset/*from_record*/ - + deh_location( &(deh[from + del_count - 1])); + prev_record = item + prev_record_offset; + + + /* adjust locations of remaining entries */ + for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --) + put_deh_location( &(deh[i]), + deh_location( &deh[i] ) - (DEH_SIZE * del_count ) ); + + for (i = 0; i < from; i ++) + put_deh_location( &(deh[i]), + deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) ); + + put_ih_entry_count( ih, ih_entry_count(ih) - del_count ); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove ((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove (prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * 
del_count + cut_records_len; +} + + +/* when cut item is part of regular file + pos_in_item - first byte that must be cut + cut_size - number of bytes to be cut beginning from pos_in_item + + when cut item is part of directory + pos_in_item - number of first deleted entry + cut_size - count of deleted entries + */ +void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head * bh = bi->bi_bh; + struct block_head * blkh; + struct item_head * ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = B_N_PITEM_HEAD (bh, cut_item_num); + + if (is_direntry_le_ih (ih)) { + /* first cut entry ()*/ + cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE( cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih))); + /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/ + } + } else { + /* item is direct or indirect */ + RFALSE( is_statdata_le_ih (ih), "10195: item is stat data"); + RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + ( long unsigned ) pos_in_item, ( long unsigned ) cut_size, + ( long unsigned ) ih_item_len (ih)); + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove( bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih (ih)) + set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size); + else { + set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size); + RFALSE( ih_item_len(ih) == 
cut_size && get_ih_free_space (ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + + /* location of the last item */ + last_loc = ih_location( &(ih[nr - cut_item_num - 1]) ); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? ih_location(ih-1) : bh->b_size; + + + /* shift */ + memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len( ih, ih_item_len(ih) - cut_size ); + + if (is_indirect_le_ih (ih)) { + if (pos_in_item) + set_ih_free_space (ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i ++) + put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size ); + + /* size, free space */ + set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size ); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); + put_dc_size( t_dc, dc_size(t_dc) - cut_size ); + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely (struct buffer_info * bi, + int first, int del_num) +{ + struct buffer_head * bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head * blkh; + struct item_head * ih; + + RFALSE( bh == NULL, "10210: buffer is 0"); + RFALSE( del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + RFALSE( first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, nr); + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node (bi); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + return; + } + + ih = B_N_PITEM_HEAD (bh, first); + + /* location of unmovable item */ + j 
= (first == 0) ? bh->b_size : ih_location(ih-1); + + /* delete items */ + last_loc = ih_location( &(ih[nr-1-first]) ); + last_removed_loc = ih_location( &(ih[del_num-1]) ); + + memmove (bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i ++) + put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) ); + + /* sizes, item number */ + set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num ); + set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) ); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); + put_dc_size( t_dc, dc_size(t_dc) - + (j - last_removed_loc + IH_SIZE * del_num)); + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + + + + +/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +void leaf_paste_entries ( + struct buffer_head * bh, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head * new_dehs, + const char * records, + int paste_size + ) +{ + struct item_head * ih; + char * item; + struct reiserfs_de_head * deh; + char * insert_point; + int i, old_entry_num; + + if (new_entry_count == 0) + return; + + ih = B_N_PITEM_HEAD(bh, item_num); + + /* make sure, that item is directory, and there are enough records in it */ + RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item"); + RFALSE( I_ENTRY_COUNT (ih) < before, + "10230: there are no entry we paste entries before. 
entry_count = %d, before = %d", + I_ENTRY_COUNT (ih), before); + + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH (bh, ih); + + /* new records will be pasted at this point */ + insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --) + put_deh_location( &(deh[i]), + deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count )); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i ++) + put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size ); + + old_entry_num = I_ENTRY_COUNT(ih); + put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count ); + + /* prepare space for pasted records */ + memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy (insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy (deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i ++) + { + put_deh_location( &(deh[i]), + deh_location( &(deh[i] )) + + (- deh_location( &(new_dehs[new_entry_count - 1])) + + insert_point + DEH_SIZE * new_entry_count - item)); + } + + + /* change item key if necessary (when we paste before 0-th entry */ + if (!before) + { + set_le_ih_k_offset (ih, deh_offset(new_dehs)); +/* memcpy (&ih->ih_key.k_offset, + &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ + } + +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH (bh, ih); + for (i = 0; 
i < I_ENTRY_COUNT(ih); i ++) { + next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0; + prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0; + + if (prev && prev <= deh_location( &(deh[i]))) + reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location( &(deh[i]))) + reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c new file mode 100644 index 000000000000..80e92d9b81cb --- /dev/null +++ b/fs/reiserfs/namei.c @@ -0,0 +1,1491 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/bitops.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_acl.h> +#include <linux/reiserfs_xattr.h> +#include <linux/smp_lock.h> +#include <linux/quotaops.h> + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--; + +// directory item contains array of entry headers. 
This performs +// binary search through that array for the entry whose deh_offset equals off. +// On a match it stores the entry index in de->de_entry_num and returns NAME_FOUND; +// otherwise it stores the insertion position (lbound) in de->de_entry_num and returns NAME_NOT_FOUND +static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off) +{ + struct item_head * ih = de->de_ih; + struct reiserfs_de_head * deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = I_ENTRY_COUNT (ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { + if (off < deh_offset (deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset (deh + j)) { + lbound = j + 1; + continue; + } + // this is not a full name match yet, only the third key component (off) matched + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + + +// point de at the item the path refers to: sets de_bh, de_ih, de_deh and de_item_num from path +static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path) +{ + de->de_bh = get_last_bh (path); + de->de_ih = get_ih (path); + de->de_deh = B_I_DEH (de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION (path); +} + + +// expects de_bh, de_ih, de_deh (points to first element of array), de_item_num and de_entry_num to be set already; +// fills in de_entrylen, de_namelen and de_name for entry number de_entry_num +inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de) +{ + struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; + + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + + de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? 
SD_SIZE : 0); + de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen (de->de_name); +} + + +// what entry points to +static inline void set_de_object_key (struct reiserfs_dir_entry * de) +{ + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + de->de_dir_id = deh_dir_id( &(de->de_deh[de->de_entry_num])); + de->de_objectid = deh_objectid( &(de->de_deh[de->de_entry_num])); +} + + +static inline void store_de_entry_key (struct reiserfs_dir_entry * de) +{ + struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; + + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh)); + set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY); +} + + +/* We assign a key to each directory item, and place multiple entries +in a single directory item. A directory item has a key equal to the +key of the first directory entry in it. + +This function first calls search_by_key, then, if item whose first +entry matches is not found it looks for the entry inside directory +item found by search_by_key. Fills the path to the entry, and to the +entry position in the item + +*/ + +/* The function is NOT SCHEDULE-SAFE! 
*/ +int search_by_entry_key (struct super_block * sb, const struct cpu_key * key, + struct path * path, struct reiserfs_dir_entry * de) +{ + int retval; + + retval = search_item (sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION (path)) { + reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); + pathrelse(path) ; + return IO_ERROR ; + } + PATH_LAST_POSITION (path) --; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse (path); + reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here"); + return IO_ERROR; + } + + set_de_item_location (de, path); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih (de->de_ih) || + COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) { + print_block (de->de_bh, 0, -1, -1); + reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or " + "does not belong to the same directory as key %K", de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* binary search in directory item by third componen t of the + key. sets de->de_entry_num of de */ + retval = bin_search_in_dir_item (de, cpu_key_k_offset (key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + set_de_name_and_namelen (de); + set_de_object_key (de); + } + return retval; +} + + + +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ + +/* The third component is hashed, and you can choose from more than + one hash function. Per directory hashes are not yet implemented + but are thought about. This function should be moved to hashes.c + Jedi, please do so. -Hans */ + +static __u32 get_third_component (struct super_block * s, + const char * name, int len) +{ + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' 
&& name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function (name, len); + + // take bits from 7-th to 30-th including both bounds + res = GET_HASH_VALUE(res); + if (res == 0) + // needed to have no names before "." and ".." those have hash + // value == 0 and generation conters 1 and 2 accordingly + res = 128; + return res + MAX_GENERATION_NUMBER; +} + + +static int reiserfs_match (struct reiserfs_dir_entry * de, + const char * name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = (de_visible (de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE); + + return retval; +} + + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + + /* used when hash collisions exist */ + + +static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de, + const char * name, int namelen) +{ + struct reiserfs_de_head * deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == I_ENTRY_COUNT (de->de_ih) || + GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) { + i --; + } + + RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); + + deh += i; + + for (; i >= 0; i --, deh --) { + if (GET_HASH_VALUE (deh_offset (deh)) != + GET_HASH_VALUE (cpu_key_k_offset (key))) { + // hash value does not match, no need to check whole name + return NAME_NOT_FOUND; + } + + /* mark, that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), (unsigned long *)de->de_gen_number_bit_string); + + // calculate pointer to name and namelen + de->de_entry_num = i; + set_de_name_and_namelen (de); + + if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) { + // de's de_name, de_namelen, de_recordlen are set. 
Fill the rest: + + // key of pointed object + set_de_object_key (de); + + store_de_entry_key (de); + + // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + return retval; + } + } + + if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0) + /* we have reached left most entry in the node. In common we + have to go to the left neighbor, but if generation counter + is 0 already, we know for sure, that there is no name with + the same hash value */ + // FIXME: this work correctly only because hash value can not + // be 0. Btw, in case of Yura's hash it is probably possible, + // so, this is a bug + return NAME_NOT_FOUND; + + RFALSE( de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + + return GOTO_PREVIOUS_ITEM; +} + + +// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND +// FIXME: should add something like IOERROR +static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen, + struct path * path_to_entry, struct reiserfs_dir_entry * de) +{ + struct cpu_key key_to_search; + int retval; + + + if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key (&key_to_search, dir, + get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); + + while (1) { + retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_warning (dir->i_sb, "zam-7001: io error in %s", + __FUNCTION__); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = linear_search_in_dir_item (&key_to_search, de, name, namelen); + if (retval != GOTO_PREVIOUS_ITEM) { + /* there is no need to scan directory anymore. 
Given entry found or does not exist */ + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* there is left neighboring item of this directory and given entry can be there */ + set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1); + pathrelse (path_to_entry); + + } /* while (1) */ +} + + +static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + int retval; + struct inode * inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH (path_to_entry); + + if (REISERFS_MAX_NAME (dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + reiserfs_write_lock(dir->i_sb); + de.de_gen_number_bit_string = NULL; + retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); + pathrelse (&path_to_entry); + if (retval == NAME_FOUND) { + /* Hide the .reiserfs_priv directory */ + if (reiserfs_xattrs (dir->i_sb) && + !old_format_only(dir->i_sb) && + REISERFS_SB(dir->i_sb)->priv_root && + REISERFS_SB(dir->i_sb)->priv_root->d_inode && + de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) { + reiserfs_write_unlock (dir->i_sb); + return ERR_PTR (-EACCES); + } + + inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-EACCES); + } + + /* Propogate the priv_object flag so we know we're in the priv tree */ + if (is_reiserfs_priv_object (dir)) + reiserfs_mark_inode_private (inode); + } + reiserfs_write_unlock(dir->i_sb); + if ( retval == IO_ERROR ) { + return ERR_PTR(-EIO); + } + + if (inode) + return d_splice_alias(inode, dentry); + + d_add(dentry, inode); + return NULL; +} + + +/* +** looks up the dentry of the parent directory for child. 
+** taken from ext2_get_parent +*/ +struct dentry *reiserfs_get_parent(struct dentry *child) +{ + int retval; + struct inode * inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH (path_to_entry); + struct dentry *parent; + struct inode *dir = child->d_inode ; + + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de); + pathrelse (&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + reiserfs_write_unlock(dir->i_sb); + + if (!inode || IS_ERR(inode)) { + return ERR_PTR(-EACCES); + } + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + return parent; +} + + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir, + const char * name, int namelen, struct inode * inode, + int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head * deh; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + int gen_number; + char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc + if we create file with short name */ + char * buffer; + int buflen, paste_size; + int retval; + + BUG_ON (!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. 
compose it */ + make_cpu_key (&entry_key, dir, + get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP (namelen); + if (buflen > sizeof (small_buf)) { + buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb); + if (buffer == 0) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset( deh, cpu_key_k_offset( &entry_key ) ); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */ + deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */ + + /* copy name */ + memcpy ((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen); + + /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + mark_de_without_sd (deh); + visible ? 
mark_de_visible (deh) : mark_de_hidden (deh); + + /* find the proper place for the new entry */ + memset (bit_string, 0, sizeof (bit_string)); + de.de_gen_number_bit_string = (char *)bit_string; + retval = reiserfs_find_entry (dir, name, namelen, &path, &de); + if( retval != NAME_NOT_FOUND ) { + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + + if ( retval == IO_ERROR ) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" " + "has returned unexpected value (%d)", + __FUNCTION__, retval); + } + + return -EEXIST; + } + + gen_number = find_first_zero_bit ((unsigned long *)bit_string, MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset (&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number ); + + if (gen_number != 0) { /* we need to re-search for the insertion point */ + if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { + reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: " + "entry with this key (%K) already exists", + &entry_key); + + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EBUSY; + } + } + + /* perform the insertion of the entry that we have prepared */ + retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + if (retval) { + 
reiserfs_check_path(&path) ; + return retval; + } + + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (!S_ISDIR (inode->i_mode) && visible) + // reiserfs_mkdir or reiserfs_rename will do that by itself + reiserfs_update_sd (th, dir); + + reiserfs_check_path(&path) ; + return 0; +} + +/* quota utility function, call if you've had to abort after calling +** new_inode_init, and have not called reiserfs_new_inode yet. +** This should only be called on inodes that do not have stat data +** inserted into the tree yet. +*/ +static int drop_new_inode(struct inode *inode) { + DQUOT_DROP(inode); + make_bad_inode(inode) ; + inode->i_flags |= S_NOQUOTA; + iput(inode) ; + return 0 ; +} + +/* utility function that does setup for reiserfs_new_inode. +** DQUOT_INIT needs lots of credits so it's better to have it +** outside of a transaction, so we had to pull some bits of +** reiserfs_new_inode out into this func. +*/ +static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { + + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode->i_uid = current->fsuid; + inode->i_mode = mode; + + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current->fsgid; + } + DQUOT_INIT(inode); + return 0 ; +} + +static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int retval; + struct inode * inode; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + struct reiserfs_transaction_handle th ; + int locked; + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM ; + } + new_inode_init(inode, dir, mode); + + locked = reiserfs_cache_default_acl (dir); + + 
reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode (inode); + goto out_failed; + } + + retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); + if (retval) + goto out_failed; + + if (locked) { + reiserfs_write_unlock_xattrs (dir->i_sb); + locked = 0; + } + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations ; + + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count) ; + if (err) + retval = err; + iput (inode); + goto out_failed; + } + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count) ; + +out_failed: + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + + +static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + int retval; + struct inode * inode; + struct reiserfs_transaction_handle th ; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + int locked; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM ; + } + new_inode_init(inode, dir, mode); + + locked = reiserfs_cache_default_acl (dir); + + reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count) ; + if (retval) { + drop_new_inode (inode); + 
goto out_failed; + } + + retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode); + if (retval) { + goto out_failed; + } + + if (locked) { + reiserfs_write_unlock_xattrs (dir->i_sb); + locked = 0; + } + + + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev) ; + + //FIXME: needed for block and char devices only + reiserfs_update_sd (&th, inode); + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count) ; + if (err) + retval = err; + iput (inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count) ; + +out_failed: + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + + +static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode) +{ + int retval; + struct inode * inode; + struct reiserfs_transaction_handle th ; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + int locked; + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + REISERFS_I(dir)->new_packing_locality = 1; +#endif + mode = S_IFDIR | mode; + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM ; + } + new_inode_init(inode, dir, mode); + + locked = reiserfs_cache_default_acl (dir); + + reiserfs_write_lock(dir->i_sb); + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count) ; + if (retval) 
{ + drop_new_inode (inode); + goto out_failed; + } + + + /* inc the link count now, so another writer doesn't overflow it while + ** we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode (&th, dir, mode, NULL/*symlink*/, + old_format_only (dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode); + if (retval) { + dir->i_nlink-- ; + goto out_failed; + } + + if (locked) { + reiserfs_write_unlock_xattrs (dir->i_sb); + locked = 0; + } + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + // note, _this_ add_entry will not update dir's stat data + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + int err; + inode->i_nlink = 0; + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd (&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count) ; + if (err) + retval = err; + iput (inode); + goto out_failed; + } + + // the above add_entry did not update dir's stat data + reiserfs_update_sd (&th, dir); + + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count) ; +out_failed: + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static inline int reiserfs_empty_dir(struct inode *inode) { + /* we can cheat because an old format dir cannot have + ** EMPTY_DIR_SIZE, and a new format dir cannot have + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** regardless of disk format version, the directory is empty. 
+ */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0 ; + } + return 1 ; +} + +static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval, err; + struct inode * inode; + struct reiserfs_transaction_handle th ; + int jbegin_count; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + + + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory */ + jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count) ; + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if ( retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = dentry->d_inode; + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, + NULL, /* page */ + 0/*new file size - not used here*/); + if (retval < 0) + goto end_rmdir; + + if ( inode->i_nlink != 2 && inode->i_nlink != 1 ) + reiserfs_warning (inode->i_sb, "%s: empty directory has nlink " + "!= 2 (%d)", __FUNCTION__, inode->i_nlink); + + inode->i_nlink = 0; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd (&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd (&th, dir); + + /* prevent 
empty directory from getting lost */ + add_save_link (&th, inode, 0/* not truncate */); + + retval = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; +out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_rmdir: + /* we must release path, because we did not call + reiserfs_cut_from_item, or reiserfs_cut_from_item does not + release path if operation was not complete */ + pathrelse (&path); + err = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; +} + +static int reiserfs_unlink (struct inode * dir, struct dentry *dentry) +{ + int retval, err; + struct inode * inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH (path); + struct reiserfs_transaction_handle th ; + int jbegin_count; + unsigned long savelink; + + inode = dentry->d_inode; + + /* in this transaction we can be doing at max two balancings and update + two stat datas, we change quotas of the owner of the directory and of + the owner of the parent directory */ + jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count) ; + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning (inode->i_sb, "%s: deleting nonexistent file " + "(%s:%lu), %d", __FUNCTION__, + reiserfs_bdevname (inode->i_sb), inode->i_ino, + inode->i_nlink); + 
inode->i_nlink = 1; + } + + inode->i_nlink--; + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + + retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0); + if (retval < 0) { + inode->i_nlink++; + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd (&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd (&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link (&th, inode, 0/* not truncate */); + + retval = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_unlink: + pathrelse (&path); + err = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; + if (err) + retval = err; +out_unlink: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_symlink (struct inode * parent_dir, + struct dentry * dentry, const char * symname) +{ + int retval; + struct inode * inode; + char * name; + int item_len; + struct reiserfs_transaction_handle th ; + int mode = S_IFLNK | S_IRWXUGO; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM ; + } + new_inode_init(inode, parent_dir, mode); + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP (strlen (symname)); + if (item_len > MAX_DIRECT_ITEM_LEN (parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = reiserfs_kmalloc (item_len, GFP_NOFS, parent_dir->i_sb); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy (name, 
symname, strlen (symname)); + padd_item (name, item_len, strlen (symname)); + + /* We would inherit the default ACL here, but symlinks don't get ACLs */ + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count) ; + if (retval) { + drop_new_inode (inode); + reiserfs_kfree (name, item_len, parent_dir->i_sb); + goto out_failed; + } + + retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), + dentry, inode); + reiserfs_kfree (name, item_len, parent_dir->i_sb); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(parent_dir) ; + + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + // must be sure this inode is written with this transaction + // + //reiserfs_update_sd (&th, inode, READ_BLOCKS); + + retval = reiserfs_add_entry (&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1/*visible*/); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + err = journal_end(&th, parent_dir->i_sb, jbegin_count) ; + if (err) + retval = err; + iput (inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + retval = journal_end(&th, parent_dir->i_sb, jbegin_count) ; +out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; +} + +static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) +{ + int retval; + struct inode *inode = old_dentry->d_inode; + struct reiserfs_transaction_handle th ; + /* We need blocks for transaction + update of quotas for the owners of the directory */ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + //FIXME: sd_nlink is 32 bit for new files + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + if (inode->i_nlink == 0) { + 
reiserfs_write_unlock(dir->i_sb); + return -ENOENT; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inode->i_nlink++; + + retval = journal_begin(&th, dir->i_sb, jbegin_count) ; + if (retval) { + inode->i_nlink--; + reiserfs_write_unlock (dir->i_sb); + return retval; + } + + /* create new entry */ + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + + reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(dir) ; + + if (retval) { + int err; + inode->i_nlink--; + err = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd (&th, inode); + + atomic_inc(&inode->i_count) ; + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_write_unlock(dir->i_sb); + return retval; +} + + +// de contains information pointing to an entry which +static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de) +{ + struct reiserfs_dir_entry tmp = *de; + + // recalculate pointer to name and name length + set_de_name_and_namelen (&tmp); + // FIXME: could check more + if (tmp.de_namelen != len || memcmp (name, de->de_name, len)) + return 0; + return 1; +} + + +static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode) +{ + if (!de_still_valid (name, len, de)) + return 0; + + if (inode) { + if (!de_visible (de->de_deh + de->de_entry_num)) + reiserfs_panic (NULL, "vs-7042: entry_points_to_object: entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 
1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible (de->de_deh + de->de_entry_num)) + reiserfs_panic (NULL, "vs-7043: entry_points_to_object: entry must be visible"); + + return 1; +} + + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key) +{ + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path. If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH (old_entry_path); + INITIALIZE_PATH (new_entry_path); + INITIALIZE_PATH (dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode * old_inode, * new_dentry_inode; + struct reiserfs_transaction_handle th ; + int jbegin_count ; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* three balancings: (1) old name removal, (2) new name insertion + and (3) maybe "save" link insertion + stat data updates: (1) old directory, + (2) new directory and (3) maybe old object stat data (when it is + directory) and (4) maybe stat data of object to which new entry + pointed initially and (5) maybe block containing ".." 
of + renamed directory + quota updates: two parent directories */ + jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS; + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; + + // make sure, that oldname still exists and points to an object we + // are going to rename + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, + &old_entry_path, &old_de); + pathrelse (&old_entry_path); + if (retval == IO_ERROR) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; + } + + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + // make sure, that directory being renamed has correct ".." + // and that its new parent directory has not too many links + // already + + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* directory is renamed, its parent directory will be changed, + ** so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de); + pathrelse (&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. 
must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + } + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count) ; + if (retval) { + reiserfs_write_unlock (old_dir->i_sb); + return retval; + } + + /* add new entry (or find the existing one) */ + retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, + old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic (old_dir->i_sb, + "vs-7050: new entry is found, new inode == 0\n"); + } + } else if (retval) { + int err = journal_end(&th, old_dir->i_sb, jbegin_count) ; + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; + } + + reiserfs_update_inode_transaction(old_dir) ; + reiserfs_update_inode_transaction(new_dir) ; + + /* this makes it so an fsync on an open fd for the old name will + ** commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode) ; + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode) ; + + while (1) { + // look for old name using corresponding entry key (found by reiserfs_find_entry) + if ((retval = search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ; + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1) ; + + // look for new name by reiserfs_find_entry + new_de.de_gen_number_bit_string = NULL; + retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len, + &new_entry_path, &new_de); + // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from + // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. 
+ if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ; + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ; + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key, + &dot_dot_entry_path, &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, get_ih(&dot_dot_entry_path)) ; + // node containing ".." gets into transaction + reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ; + } + /* we should check seals here, not do + this stuff, yes? Then, having + gathered everything into RAM we + should lock the buffers, yes? -Hans */ + /* probably. our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path + ** issues -chris + */ + /* sanity checking before doing the rename - avoid races many + ** of the above checks could have scheduled. We have to be + ** sure our items haven't been shifted by another process. 
+ */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object (old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); + reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if ( item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object ( "..", 2, &dot_dot_de, old_dir) ) { + reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); + reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); + reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); + continue; + } + } + + RFALSE( S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), "" ); + + break; + } + + /* ok, all the changes can be done in one fell swoop when we + have claimed all the buffers needed.*/ + + mark_de_visible (new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode)); + journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh); + + mark_de_hidden (old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty (&th, old_dir->i_sb, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of + renamed object */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + // adjust link number of the victim + if (S_ISDIR(new_dentry_inode->i_mode)) { + new_dentry_inode->i_nlink = 0; + } else { + new_dentry_inode->i_nlink--; + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; + } + + if 
(S_ISDIR(old_inode_mode)) { + // adjust ".." of renamed directory + set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir)); + journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh); + + if (!new_dentry_inode) + /* there (in new_dir) was no directory, so it got new link + (".." of renamed directory) */ + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. " of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); + } + + // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + pathrelse (&new_entry_path); + pathrelse (&dot_dot_entry_path); + + // FIXME: this reiserfs_cut_from_item's return value may screw up + // anybody, but it will panic if will not be able to find the + // entry. This needs one more clean up + if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0) + reiserfs_warning (old_dir->i_sb, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd (&th, old_dir); + reiserfs_update_sd (&th, new_dir); + reiserfs_update_sd (&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link (&th, new_dentry_inode, 0/* not truncate */); + reiserfs_update_sd (&th, new_dentry_inode); + } + + retval = journal_end(&th, old_dir->i_sb, jbegin_count) ; + reiserfs_write_unlock(old_dir->i_sb); + return retval; +} + +/* + * directories can handle most operations... 
+ */ +struct inode_operations reiserfs_dir_inode_operations = { + //&reiserfs_dir_operations, /* default_file_ops */ + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + +/* + * symlink operations.. same as page_symlink_inode_operations, with xattr + * stuff added + */ +struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; + + +/* + * special file operations.. just xattr/acl stuff + */ +struct inode_operations reiserfs_special_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c new file mode 100644 index 000000000000..0785c43a7486 --- /dev/null +++ b/fs/reiserfs/objectid.c @@ -0,0 +1,206 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/config.h> +#include <linux/string.h> +#include <linux/random.h> +#include <linux/time.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_fs_sb.h> + +// find where objectid map starts +#define objectid_map(s,rs) (old_format_only (s) ? 
\ + (__u32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__u32 *)((rs) + 1)) + + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map (struct super_block * s, __u32 * map) +{ + if (le32_to_cpu (map[0]) != 1) + reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx", + ( long unsigned int ) le32_to_cpu (map[0])); + + // FIXME: add something else here +} + +#else +static void check_objectid_map (struct super_block * s, __u32 * map) +{;} +#endif + + +/* When we allocate objectids we allocate the first unused objectid. + Each sequence of objectids in use (the odd sequences) is followed + by a sequence of objectids not in use (the even sequences). We + only need to record the last objectid in each of these sequences + (both the odd and even sequences) in order to fully define the + boundaries of the sequences. A consequence of allocating the first + objectid not in use is that under most conditions this scheme is + extremely compact. The exception is immediately after a sequence + of operations which deletes a large number of objects of + non-sequential objectids, and even then it will become compact + again as soon as more objects are created. Note that many + interesting optimizations of layout could result from complicating + objectid assignment, but we have deferred making them for now. 
*/ + + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + __u32 * map = objectid_map (s, rs); + __u32 unused_objectid; + + BUG_ON (!th->t_trans_id); + + check_objectid_map (s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + /* comment needed -Hans */ + unused_objectid = le32_to_cpu (map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning (s, "%s: no more object ids", __FUNCTION__); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ; + return 0; + } + + /* This incrementation allocates the first unused objectid. That + is to say, the first entry on the objectid map is the first + unused objectid, and by incrementing it we use it. See below + where we check to see if we eliminated a sequence of unused + objectids.... */ + map[1] = cpu_to_le32 (unused_objectid + 1); + + /* Now we check to see if we eliminated the last remaining member of + the first even sequence (and can eliminate the sequence by + eliminating its last objectid from oids), and can collapse the + first two odd sequences into one sequence. If so, then the net + result is to eliminate a pair of objectids from oids. We do this + by shifting the entire map to the left. 
*/ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove (map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); + } + + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); + return unused_objectid; +} + + +/* makes object identifier unused */ +void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + __u32 * map = objectid_map (s, rs); + int i = 0; + + BUG_ON (!th->t_trans_id); + //return; + check_objectid_map (s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); + + /* start at the beginning of the objectid map (i = 0) and go to + the end of it (i = disk_sb->s_oid_cursize). Linear search is + what we use, though it is possible that binary search would be + more efficient after performing lots of deletions (which is + when oids is large.) We only check even i's. */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu (map[i])) { + /* This incrementation unallocates the objectid. */ + //map[i]++; + map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1); + + /* Did we unallocate the last member of an odd sequence, and can shrink oids? 
*/ + if (map[i] == map[i+1]) { + /* shrink objectid map */ + memmove (map + i, map + i + 2, + (sb_oid_cursize(rs) - i - 2) * sizeof (__u32)); + //disk_sb->s_oid_cursize -= 2; + set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); + + RFALSE( sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu (map[i]) && + objectid_to_release < le32_to_cpu (map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) { + //objectid_map[i+1]--; + map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1); + return; + } + + /* JDM comparing two little-endian values for equality -- safe */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + /* objectid map must be expanded, but there is no space */ + PROC_INFO_INC( s, leaked_oid ); + return; + } + + /* expand the objectid map*/ + memmove (map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32 (objectid_to_release); + map[i + 2] = cpu_to_le32 (objectid_to_release + 1); + set_sb_oid_cursize( rs, sb_oid_cursize(rs) + 2 ); + return; + } + i += 2; + } + + reiserfs_warning (s, "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", + ( long unsigned ) objectid_to_release); +} + + +int reiserfs_convert_objectid_map_v1(struct super_block *s) { + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1 ; + __u32 *objectid_map, *new_objectid_map ; + int i ; + + disk_sb_v1=(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__u32 *)(disk_sb_v1 + 1) ; + new_objectid_map = (__u32 *)(disk_sb + 1) ; + + if (cur_size > 
new_size) { + /* mark everyone used that was listed as free at the end of the objectid + ** map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1] ; + set_sb_oid_cursize(disk_sb,new_size) ; + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1 ; i >= 0 ; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i] ; + } + + + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb,new_size) ; + + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)) ; + generate_random_uuid(disk_sb->s_uuid); + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ; + return 0 ; +} + diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c new file mode 100644 index 000000000000..16fdca1d4bd7 --- /dev/null +++ b/fs/reiserfs/prints.c @@ -0,0 +1,727 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/reiserfs_fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> + +#include <stdarg.h> + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + + +static char * reiserfs_cpu_offset (struct cpu_key * key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf (off_buf, "%Lu(%Lu)", + (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)), + (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key))); + else + sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key)); + return off_buf; +} + + +static char * le_offset (struct reiserfs_key * key) +{ + int version; + + version = le_key_version (key); + if (le_key_k_type (version, key) == TYPE_DIRENTRY) + sprintf (off_buf, "%Lu(%Lu)", + (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)), + (unsigned long long)GET_GENERATION_NUMBER 
(le_key_k_offset (version, key))); + else + sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key)); + return off_buf; +} + + +static char * cpu_type (struct cpu_key * key) +{ + if (cpu_key_k_type (key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type (key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type (key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type (key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + + +static char * le_type (struct reiserfs_key * key) +{ + int version; + + version = le_key_version (key); + + if (le_key_k_type (version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type (version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type (version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type (version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + + +/* %k */ +static void sprintf_le_key (char * buf, struct reiserfs_key * key) +{ + if (key) + sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id), + le32_to_cpu (key->k_objectid), le_offset (key), le_type (key)); + else + sprintf (buf, "[NULL]"); +} + + +/* %K */ +static void sprintf_cpu_key (char * buf, struct cpu_key * key) +{ + if (key) + sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset (key), + cpu_type (key)); + else + sprintf (buf, "[NULL]"); +} + +static void sprintf_de_head( char *buf, struct reiserfs_de_head *deh ) +{ + if( deh ) + sprintf( buf, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", deh_offset(deh), deh_dir_id(deh), + deh_objectid(deh), deh_location(deh), deh_state(deh) ); + else + sprintf( buf, "[NULL]" ); + +} + +static void sprintf_item_head (char * buf, struct item_head * ih) +{ + if (ih) { + strcpy (buf, (ih_version (ih) == KEY_FORMAT_3_6) ? 
"*3.6* " : "*3.5*"); + sprintf_le_key (buf + strlen (buf), &(ih->ih_key)); + sprintf (buf + strlen (buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space (ih)); + } else + sprintf (buf, "[NULL]"); +} + + +static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de) +{ + char name[20]; + + memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + + +static void sprintf_block_head (char * buf, struct buffer_head * bh) +{ + sprintf (buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL (bh), B_NR_ITEMS (bh), B_FREE_SPACE (bh)); +} + + +static void sprintf_buffer_head (char * buf, struct buffer_head * bh) +{ + char b[BDEVNAME_SIZE]; + + sprintf (buf, "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname (bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, + atomic_read (&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty (bh) ? "DIRTY" : "CLEAN", + buffer_locked (bh) ? "LOCKED" : "UNLOCKED"); +} + + +static void sprintf_disk_child (char * buf, struct disk_child * dc) +{ + sprintf (buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), dc_size(dc)); +} + + +static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip) +{ + char * k = fmt; + + *skip = 0; + + while ((k = strchr (k, '%')) != NULL) + { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a' ) { + *what = k[1]; + break; + } + (*skip) ++; + k ++; + } + return k; +} + + +/* debugging reiserfs we used to print out a lot of different + variables, like keys, item headers, buffer heads etc. Values of + most fields matter. So it took a long time just to write + appropriative printk. 
With this reiserfs_warning you can use format + specification for complex structures like you used to do with + printfs for integers, doubles and pointers. For instance, to print + out key structure you have to write just: + reiserfs_warning (sb, "bad key %k", key); + instead of + printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + key->k_offset, key->k_uniqueness); +*/ + + +/* Expand fmt and args into the static error_buf. fmt is copied into + fmt_buf and cut at every reiserfs specific specifier: the text in + front of it goes through vsprintf, the specifier itself is rendered + by the matching sprintf_* helper. Nothing here is bounds checked, + so fmt has to fit into fmt_buf and the result into error_buf. */ +static void +prepare_error_buf( const char *fmt, va_list args ) +{ + char * fmt1 = fmt_buf; + char * k; + char * p = error_buf; + int i, j, what, skip; + + strcpy (fmt1, fmt); + + while( (k = is_there_reiserfs_struct( fmt1, &what, &skip )) != NULL ) + { + /* end the ordinary part of the format at the reiserfs specifier */ + *k = 0; + + p += vsprintf (p, fmt1, args); + + /* step over one argument for each '%' that is_there_reiserfs_struct + counted in front of the specifier; each is fetched as an int and + the value in j is never read */ + for (i = 0; i < skip; i ++) + j = va_arg (args, int); + + switch (what) { + case 'k': + sprintf_le_key (p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key (p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head (p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child (p, va_arg(args, struct disk_child *)); + break; + case 'z': + sprintf_block_head (p, va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head (p, va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head (p, va_arg(args, struct reiserfs_de_head *)); + break; + } + + p += strlen (p); + /* go on behind the two character specifier */ + fmt1 = k + 2; + } + /* the rest of the format holds no reiserfs specifier */ + vsprintf (p, fmt1, args); + +} + + +/* in addition to usual conversion specifiers this accepts reiserfs + specific conversion specifiers: + %k to print little endian key, + %K to print cpu key, + %h to print item_head, + %t to print directory entry + %y to print disk_child + %z to print block head (arg must be struct buffer_head *) + %b to print buffer_head + %a to print directory entry head (struct reiserfs_de_head) +*/ + +#define do_reiserfs_warning(fmt)\ +{\ + va_list args;\ + va_start( args, fmt );\ + prepare_error_buf( fmt, args );\ + va_end( args );\ +} + +void reiserfs_warning (struct super_block 
*sb, const char * fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk (KERN_WARNING "ReiserFS: %s: warning: %s\n", + reiserfs_bdevname (sb), error_buf); + else + printk (KERN_WARNING "ReiserFS: warning: %s\n", error_buf); +} + +/* No newline.. reiserfs_info calls can be followed by printk's */ +void reiserfs_info (struct super_block *sb, const char * fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk (KERN_NOTICE "ReiserFS: %s: %s", + reiserfs_bdevname (sb), error_buf); + else + printk (KERN_NOTICE "ReiserFS: %s", error_buf); +} + +/* No newline.. reiserfs_printk calls can be followed by printk's. + error_buf is passed as an argument and never as the format: it + already holds the expanded message and may contain '%' (for + instance in a name printed by %t), which must not be expanded a + second time. */ +static void reiserfs_printk (const char * fmt, ...) +{ + do_reiserfs_warning(fmt); + printk ("%s", error_buf); +} + +/* prints only when CONFIG_REISERFS_CHECK is set; level is not looked at */ +void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning(fmt); + if (s) + printk (KERN_DEBUG "ReiserFS: %s: %s\n", + reiserfs_bdevname (s), error_buf); + else + printk (KERN_DEBUG "ReiserFS: %s\n", error_buf); +#endif +} + +/* The format: + + maintainer-errorid: [function-name:] message + + where errorid is unique to the maintainer and function-name is + optional, is recommended, so that anyone can easily find the bug + with a simple grep for the short to type string + maintainer-errorid. Don't bother with reusing errorids, there are + lots of numbers out there. + + Example: + + reiserfs_panic( + p_sb, "reiser-29: reiserfs_new_blocknrs: " + "one of search_start or rn(%d) is equal to MAX_B_NUM," + "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", + rn, bh + ); + + Regular panic()s sometimes clear the screen before the message can + be read, thus the need for the while loop. 
+ + Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it + pointless complexity): + + panics in reiserfs_fs.h have numbers from 1000 to 1999 + super.c 2000 to 2999 + preserve.c (unused) 3000 to 3999 + bitmap.c 4000 to 4999 + stree.c 5000 to 5999 + prints.c 6000 to 6999 + namei.c 7000 to 7999 + fix_nodes.c 8000 to 8999 + dir.c 9000 to 9999 + lbalance.c 10000 to 10999 + ibalance.c 11000 to 11999 not ready + do_balan.c 12000 to 12999 + inode.c 13000 to 13999 + file.c 14000 to 14999 + objectid.c 15000 - 15999 + buffer.c 16000 - 16999 + symlink.c 17000 - 17999 + + . */ + + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + +void reiserfs_panic (struct super_block * sb, const char * fmt, ...) +{ + do_reiserfs_warning(fmt); + printk (KERN_EMERG "REISERFS: panic (device %s): %s\n", + reiserfs_bdevname (sb), error_buf); + BUG (); + + /* this is not actually called, but makes reiserfs_panic() "noreturn" */ + panic ("REISERFS: panic (device %s): %s\n", + reiserfs_bdevname (sb), error_buf); +} + +void +reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...) 
+{ + do_reiserfs_warning (fmt); + + /* no printk level here: panic () prints the message with its own + prefix, a level inside the text would show up literally */ + if (reiserfs_error_panic (sb)) { + panic ("REISERFS: panic (device %s): %s\n", + reiserfs_bdevname (sb), error_buf); + } + + /* a read-only filesystem returns here without logging or aborting + the journal */ + if (sb->s_flags & MS_RDONLY) + return; + + printk (KERN_CRIT "REISERFS: abort (device %s): %s\n", + reiserfs_bdevname (sb), error_buf); + + sb->s_flags |= MS_RDONLY; + reiserfs_journal_abort (sb, errno); +} + +/* this prints internal nodes (4 keys/items in line) (dc_number, + dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + dc_size)...*/ +static int print_internal (struct buffer_head * bh, int first, int last) +{ + struct reiserfs_key * key; + struct disk_child * dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL (bh)) + return 1; + + check_internal (bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS (bh); + } else { + from = first; + to = last < B_NR_ITEMS (bh) ? last : B_NR_ITEMS (bh); + } + + reiserfs_printk ("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD (bh, from); + reiserfs_printk ("PTR %d: %y ", from, dc); + + for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) { + reiserfs_printk ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk ("\n"); + } + printk ("\n"); + return 0; +} + + + + + +static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last) +{ + struct block_head * blkh; + struct item_head * ih; + int i, nr; + int from, to; + + if (!B_IS_ITEMS_LEVEL (bh)) + return 1; + + check_leaf (bh); + + blkh = B_BLK_HEAD (bh); + ih = B_N_PITEM_HEAD (bh,0); + nr = blkh_nr_item(blkh); + + printk ("\n===================================================================\n"); + reiserfs_printk ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > nr - 1) + from = 0; + else + 
from = first; + + if (last < 0 || last > nr ) + to = nr; + else + to = last; + + ih += from; + printk ("-------------------------------------------------------------------------------\n"); + printk ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih ++) { + printk ("-------------------------------------------------------------------------------\n"); + reiserfs_printk ("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item (ih, B_I_PITEM (bh, ih)); + } + + printk ("===================================================================\n"); + + return 0; +} + +char * reiserfs_hashname(int code) +{ + if ( code == YURA_HASH) + return "rupasov"; + if ( code == TEA_HASH) + return "tea"; + if ( code == R5_HASH) + return "r5"; + + return "unknown"; +} + +/* return 1 if this is not super block */ +static int print_super_block (struct buffer_head * bh) +{ + struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? + "3.6" : "3.5"); + } else { + return 1; + } + + printk ("%s\'s super block is in block %llu\n", bdevname (bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk ("Reiserfs version %s\n", version ); + printk ("Block count %u\n", sb_block_count(rs)); + printk ("Blocksize %d\n", sb_blocksize(rs)); + printk ("Free blocks %u\n", sb_free_blocks(rs)); + // FIXME: this would be confusing if + // someone stores reiserfs super block in some data block ;) +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? 
sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) - + sb_free_blocks(rs); + printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", + skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)) , data_blocks); + printk ("Root block %u\n", sb_root_block(rs)); + printk ("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk ("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk ("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk ("FS state %d\n", sb_fs_state(rs)); + printk ("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk ("Tree height %d\n", sb_tree_height(rs)); + return 0; +} + +static int print_desc_block (struct buffer_head * bh) +{ + struct reiserfs_journal_desc * desc; + + if (memcmp(get_journal_desc_magic (bh), JOURNAL_DESC_MAGIC, 8)) + return 1; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + printk ("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id (desc), get_desc_mount_id (desc), + get_desc_trans_len (desc)); + + return 0; +} + + +void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last) +{ + va_list args; + int mode, first, last; + + va_start (args, bh); + + if ( ! 
bh ) { + printk("print_block: buffer is NULL\n"); + return; + } + + mode = va_arg (args, int); + first = va_arg (args, int); + last = va_arg (args, int); + if (print_leaf (bh, mode, first, last)) + if (print_internal (bh, first, last)) + if (print_super_block (bh)) + if (print_desc_block (bh)) + printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr); +} + + + +static char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb (struct tree_balance * tb) +{ + int h = 0; + int i; + struct buffer_head * tbSh, * tbFh; + + if (!tb) + return; + + sprintf (print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item); + + for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) { + if (PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length && + PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER (tb->tb_path, h); + tbFh = PATH_H_PPARENT (tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf (print_tb_buf + strlen (print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr):(-1LL), + (tbSh) ? atomic_read (&(tbSh->b_count)) : -1, + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr):(-1LL), + (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr):(-1LL), + (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr):(-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]->b_blocknr):(-1LL), + (tb->FR[h]) ? (long long)(tb->FR[h]->b_blocknr):(-1LL), + (tb->CFL[h]) ? 
(long long)(tb->CFL[h]->b_blocknr):(-1LL), + (tb->CFR[h]) ? (long long)(tb->CFR[h]->b_blocknr):(-1LL)); + } + + sprintf (print_tb_buf + strlen (print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0], + tb->s0num, tb->s1num,tb->s1bytes, tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf (print_tb_buf + strlen (print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf (print_tb_buf + strlen (print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++) + sprintf (print_tb_buf + strlen (print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]->b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0, + (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? 
"\n" : ", "); + + sprintf (print_tb_buf + strlen (print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb (char * mes) +{ + printk ("%s\n%s", mes, print_tb_buf); +} + +static void check_leaf_block_head (struct buffer_head * bh) +{ + struct block_head * blkh; + int nr; + + blkh = B_BLK_HEAD (bh); + nr = blkh_nr_item(blkh); + if ( nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic (NULL, "vs-6010: check_leaf_block_head: invalid item number %z", bh); + if ( blkh_free_space(blkh) > + bh->b_size - BLKH_SIZE - IH_SIZE * nr ) + reiserfs_panic (NULL, "vs-6020: check_leaf_block_head: invalid free space %z", bh); + +} + +static void check_internal_block_head (struct buffer_head * bh) +{ + struct block_head * blkh; + + blkh = B_BLK_HEAD (bh); + if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT)) + reiserfs_panic (NULL, "vs-6025: check_internal_block_head: invalid level %z", bh); + + if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic (NULL, "vs-6030: check_internal_block_head: invalid item number %z", bh); + + if (B_FREE_SPACE (bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1)) + reiserfs_panic (NULL, "vs-6040: check_internal_block_head: invalid free space %z", bh); + +} + + +void check_leaf (struct buffer_head * bh) +{ + int i; + struct item_head * ih; + + if (!bh) + return; + check_leaf_block_head (bh); + for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++) + op_check_item (ih, B_I_PITEM (bh, ih)); +} + + +void check_internal (struct buffer_head * bh) +{ + if (!bh) + return; + check_internal_block_head (bh); +} + + +void print_statistics (struct super_block * s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ +bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, 
REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ + +} diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c new file mode 100644 index 000000000000..f4ea81ae0e0f --- /dev/null +++ b/fs/reiserfs/procfs.c @@ -0,0 +1,664 @@ +/* -*- linux-c -*- */ + +/* fs/reiserfs/procfs.c */ + +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* proc info support a la one created by Sizif@Botik.RU for PGC */ + +/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/time.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_fs_sb.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/proc_fs.h> + +#if defined( REISERFS_PROC_INFO ) + +/* + * LOCKING: + * + * We rely on new Alexander Viro's super-block locking. + * + */ + +static int show_version(struct seq_file *m, struct super_block *sb) +{ + char *format; + + if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6) ) { + format = "3.6"; + } else if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5) ) { + format = "3.5"; + } else { + format = "unknown"; + } + + seq_printf(m, "%s format\twith checks %s\n", + format, +#if defined( CONFIG_REISERFS_CHECK ) + "on" +#else + "off" +#endif + ); + return 0; +} + +int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset, + int count, int *eof, void *data ) +{ + *start = buffer; + *eof = 1; + return 0; +} + +#define SF( x ) ( r -> x ) +#define SFP( x ) SF( s_proc_info_data.x ) +#define SFPL( x ) SFP( x[ level ] ) +#define SFPF( x ) SFP( scan_bitmap.x ) +#define SFPJ( x ) SFP( journal.x ) + +#define D2C( x ) le16_to_cpu( x ) +#define D4C( x ) le32_to_cpu( x ) +#define DF( x ) D2C( rs -> s_v1.x ) +#define DFL( x ) D4C( rs -> s_v1.x ) + +#define objectid_map( s, rs ) (old_format_only (s) ? 
\ + (__u32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ + (__u32 *)(rs + 1)) +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) + +#define DJF( x ) le32_to_cpu( rs -> x ) +#define DJV( x ) le32_to_cpu( s_v1 -> x ) +#define DJP( x ) le32_to_cpu( jp -> x ) +#define JF( x ) ( r -> s_journal -> x ) + +static int show_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "state: \t%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" + "gen. counter: \t%i\n" + "s_kmallocs: \t%i\n" + "s_disk_reads: \t%i\n" + "s_disk_writes: \t%i\n" + "s_fix_nodes: \t%i\n" + "s_do_balance: \t%i\n" + "s_unneeded_left_neighbor: \t%i\n" + "s_good_search_by_key_reada: \t%i\n" + "s_bmaps: \t%i\n" + "s_bmaps_without_search: \t%i\n" + "s_direct2indirect: \t%i\n" + "s_indirect2direct: \t%i\n" + "\n" + "max_hash_collisions: \t%i\n" + + "breads: \t%lu\n" + "bread_misses: \t%lu\n" + + "search_by_key: \t%lu\n" + "search_by_key_fs_changed: \t%lu\n" + "search_by_key_restarted: \t%lu\n" + + "insert_item_restarted: \t%lu\n" + "paste_into_item_restarted: \t%lu\n" + "cut_from_item_restarted: \t%lu\n" + "delete_solid_item_restarted: \t%lu\n" + "delete_item_restarted: \t%lu\n" + + "leaked_oid: \t%lu\n" + "leaves_removable: \t%lu\n", + + SF( s_mount_state ) == REISERFS_VALID_FS ? + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", + reiserfs_r5_hash( sb ) ? "FORCE_R5 " : "", + reiserfs_rupasov_hash( sb ) ? "FORCE_RUPASOV " : "", + reiserfs_tea_hash( sb ) ? "FORCE_TEA " : "", + reiserfs_hash_detect( sb ) ? "DETECT_HASH " : "", + reiserfs_no_border( sb ) ? "NO_BORDER " : "BORDER ", + reiserfs_no_unhashed_relocation( sb ) ? "NO_UNHASHED_RELOCATION " : "", + reiserfs_hashed_relocation( sb ) ? "UNHASHED_RELOCATION " : "", + reiserfs_test4( sb ) ? "TEST4 " : "", + have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ", + replay_only( sb ) ? "REPLAY_ONLY " : "", + convert_reiserfs( sb ) ? 
"CONV " : "", + + atomic_read( &r -> s_generation_counter ), + SF( s_kmallocs ), + SF( s_disk_reads ), + SF( s_disk_writes ), + SF( s_fix_nodes ), + SF( s_do_balance ), + SF( s_unneeded_left_neighbor ), + SF( s_good_search_by_key_reada ), + SF( s_bmaps ), + SF( s_bmaps_without_search ), + SF( s_direct2indirect ), + SF( s_indirect2direct ), + SFP( max_hash_collisions ), + SFP( breads ), + SFP( bread_miss ), + SFP( search_by_key ), + SFP( search_by_key_fs_changed ), + SFP( search_by_key_restarted ), + + SFP( insert_item_restarted ), + SFP( paste_into_item_restarted ), + SFP( cut_from_item_restarted ), + SFP( delete_solid_item_restarted ), + SFP( delete_item_restarted ), + + SFP( leaked_oid ), + SFP( leaves_removable ) ); + + return 0; +} + +static int show_per_level(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + int level; + + seq_printf(m, "level\t" + " balances" + " [sbk: reads" + " fs_changed" + " restarted]" + " free space" + " items" + " can_remove" + " lnum" + " rnum" + " lbytes" + " rbytes" + " get_neig" + " get_neig_res" + " need_l_neig" + " need_r_neig" + "\n" + + ); + + for( level = 0 ; level < MAX_HEIGHT ; ++ level ) { + seq_printf(m, "%i\t" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12li" + " %12li" + " %12li" + " %12li" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + "\n", + level, + SFPL( balance_at ), + SFPL( sbk_read_at ), + SFPL( sbk_fs_changed ), + SFPL( sbk_restarted ), + SFPL( free_at ), + SFPL( items_at ), + SFPL( can_node_be_removed ), + SFPL( lnum ), + SFPL( rnum ), + SFPL( lbytes ), + SFPL( rbytes ), + SFPL( get_neighbors ), + SFPL( get_neighbors_restart ), + SFPL( need_l_neighbor ), + SFPL( need_r_neighbor ) + ); + } + return 0; +} + +static int show_bitmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "free_block: %lu\n" + " scan_bitmap:" + " wait" + " bmap" + " retry" + " stolen" + " 
journal_hint" + "journal_nohint" + "\n" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + "\n", + SFP( free_block ), + SFPF( call ), + SFPF( wait ), + SFPF( bmap ), + SFPF( retry ), + SFPF( stolen ), + SFPF( in_journal_hint ), + SFPF( in_journal_nohint ) ); + + return 0; +} + +static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info -> s_rs; + int hash_code = DFL( s_hash_function_code ); + __u32 flags = DJF( s_flags ); + + seq_printf(m, "block_count: \t%i\n" + "free_blocks: \t%i\n" + "root_block: \t%i\n" + "blocksize: \t%i\n" + "oid_maxsize: \t%i\n" + "oid_cursize: \t%i\n" + "umount_state: \t%i\n" + "magic: \t%10.10s\n" + "fs_state: \t%i\n" + "hash: \t%s\n" + "tree_height: \t%i\n" + "bmap_nr: \t%i\n" + "version: \t%i\n" + "flags: \t%x[%s]\n" + "reserved_for_journal: \t%i\n", + + DFL( s_block_count ), + DFL( s_free_blocks ), + DFL( s_root_block ), + DF( s_blocksize ), + DF( s_oid_maxsize ), + DF( s_oid_cursize ), + DF( s_umount_state ), + rs -> s_v1.s_magic, + DF( s_fs_state ), + hash_code == TEA_HASH ? "tea" : + ( hash_code == YURA_HASH ) ? "rupasov" : + ( hash_code == R5_HASH ) ? "r5" : + ( hash_code == UNSET_HASH ) ? "unset" : "unknown", + DF( s_tree_height ), + DF( s_bmap_nr ), + DF( s_version ), + flags, + ( flags & reiserfs_attrs_cleared ) + ? "attrs_cleared" : "", + DF (s_reserved_for_journal)); + + return 0; +} + +static int show_oidmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info -> s_rs; + unsigned int mapsize = le16_to_cpu( rs -> s_v1.s_oid_cursize ); + unsigned long total_used = 0; + int i; + + for( i = 0 ; i < mapsize ; ++i ) { + __u32 right; + + right = ( i == mapsize - 1 ) ? MAX_KEY_OBJECTID : MAP( i + 1 ); + seq_printf(m, "%s: [ %x .. %x )\n", + ( i & 1 ) ? "free" : "used", MAP( i ), right ); + if( ! 
( i & 1 ) ) { + total_used += right - MAP( i ); + } + } +#if defined( REISERFS_USE_OIDMAPF ) + if( sb_info -> oidmap.use_file && ( sb_info -> oidmap.mapf != NULL ) ) { + loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size; + total_used += size / sizeof( reiserfs_oidinterval_d_t ); + } +#endif + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", + mapsize, + mapsize, le16_to_cpu( rs -> s_v1.s_oid_maxsize ), + total_used); + return 0; +} + +static int show_journal(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + struct reiserfs_super_block *rs = r -> s_rs; + struct journal_params *jp = &rs->s_v1.s_journal; + char b[BDEVNAME_SIZE]; + + + seq_printf(m, /* on-disk fields */ + "jp_journal_1st_block: \t%i\n" + "jp_journal_dev: \t%s[%x]\n" + "jp_journal_size: \t%i\n" + "jp_journal_trans_max: \t%i\n" + "jp_journal_magic: \t%i\n" + "jp_journal_max_batch: \t%i\n" + "jp_journal_max_commit_age: \t%i\n" + "jp_journal_max_trans_age: \t%i\n" + /* incore fields */ + "j_1st_reserved_block: \t%i\n" + "j_state: \t%li\n" + "j_trans_id: \t%lu\n" + "j_mount_id: \t%lu\n" + "j_start: \t%lu\n" + "j_len: \t%lu\n" + "j_len_alloc: \t%lu\n" + "j_wcount: \t%i\n" + "j_bcount: \t%lu\n" + "j_first_unflushed_offset: \t%lu\n" + "j_last_flush_trans_id: \t%lu\n" + "j_trans_start_time: \t%li\n" + "j_list_bitmap_index: \t%i\n" + "j_must_wait: \t%i\n" + "j_next_full_flush: \t%i\n" + "j_next_async_flush: \t%i\n" + "j_cnode_used: \t%i\n" + "j_cnode_free: \t%i\n" + "\n" + /* reiserfs_proc_info_data_t.journal fields */ + "in_journal: \t%12lu\n" + "in_journal_bitmap: \t%12lu\n" + "in_journal_reusable: \t%12lu\n" + "lock_journal: \t%12lu\n" + "lock_journal_wait: \t%12lu\n" + "journal_begin: \t%12lu\n" + "journal_relock_writers: \t%12lu\n" + "journal_relock_wcount: \t%12lu\n" + "mark_dirty: \t%12lu\n" + "mark_dirty_already: \t%12lu\n" + "mark_dirty_notjournal: \t%12lu\n" + "restore_prepared: \t%12lu\n" + "prepare: \t%12lu\n" + "prepare_retry: 
\t%12lu\n", + + DJP( jp_journal_1st_block ), + bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + DJP( jp_journal_dev ), + DJP( jp_journal_size ), + DJP( jp_journal_trans_max ), + DJP( jp_journal_magic ), + DJP( jp_journal_max_batch ), + SB_JOURNAL(sb)->j_max_commit_age, + DJP( jp_journal_max_trans_age ), + + JF( j_1st_reserved_block ), + JF( j_state ), + JF( j_trans_id ), + JF( j_mount_id ), + JF( j_start ), + JF( j_len ), + JF( j_len_alloc ), + atomic_read( & r -> s_journal -> j_wcount ), + JF( j_bcount ), + JF( j_first_unflushed_offset ), + JF( j_last_flush_trans_id ), + JF( j_trans_start_time ), + JF( j_list_bitmap_index ), + JF( j_must_wait ), + JF( j_next_full_flush ), + JF( j_next_async_flush ), + JF( j_cnode_used ), + JF( j_cnode_free ), + + SFPJ( in_journal ), + SFPJ( in_journal_bitmap ), + SFPJ( in_journal_reusable ), + SFPJ( lock_journal ), + SFPJ( lock_journal_wait ), + SFPJ( journal_being ), + SFPJ( journal_relock_writers ), + SFPJ( journal_relock_wcount ), + SFPJ( mark_dirty ), + SFPJ( mark_dirty_already ), + SFPJ( mark_dirty_notjournal ), + SFPJ( restore_prepared ), + SFPJ( prepare ), + SFPJ( prepare_retry ) + ); + return 0; +} + +/* iterator */ +static int test_sb(struct super_block *sb, void *data) +{ + return data == sb; +} + +static int set_sb(struct super_block *sb, void *data) +{ + return -ENOENT; +} + +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct proc_dir_entry *de = m->private; + struct super_block *s = de->parent->data; + loff_t l = *pos; + + if (l) + return NULL; + + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) + return NULL; + + up_write(&s->s_umount); + + if (de->deleted) { + deactivate_super(s); + return NULL; + } + + return s; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + if (v) + deactivate_super(v); + return NULL; +} + +static void r_stop(struct seq_file *m, void *v) +{ + if (v) + deactivate_super(v); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct 
proc_dir_entry *de = m->private; + int (*show)(struct seq_file *, struct super_block *) = de->data; + return show(m, v); +} + +static struct seq_operations r_ops = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int r_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &r_ops); + + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE(inode); + } + return ret; +} + +static struct file_operations r_file_operations = { + .open = r_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *proc_info_root = NULL; +static const char proc_info_root_name[] = "fs/reiserfs"; + +static void add_file(struct super_block *sb, char *name, + int (*func)(struct seq_file *, struct super_block *)) +{ + struct proc_dir_entry *de; + de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir); + if (de) { + de->data = func; + de->proc_fops = &r_file_operations; + } +} + +int reiserfs_proc_info_init( struct super_block *sb ) +{ + spin_lock_init( & __PINFO( sb ).lock ); + REISERFS_SB(sb)->procdir = proc_mkdir(reiserfs_bdevname (sb), proc_info_root); + if( REISERFS_SB(sb)->procdir ) { + REISERFS_SB(sb)->procdir->owner = THIS_MODULE; + REISERFS_SB(sb)->procdir->data = sb; + add_file(sb, "version", show_version); + add_file(sb, "super", show_super); + add_file(sb, "per-level", show_per_level); + add_file(sb, "bitmap", show_bitmap); + add_file(sb, "on-disk-super", show_on_disk_super); + add_file(sb, "oidmap", show_oidmap); + add_file(sb, "journal", show_journal); + return 0; + } + reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", + proc_info_root_name, reiserfs_bdevname (sb) ); + return 1; +} + +int reiserfs_proc_info_done( struct super_block *sb ) +{ + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + if (de) { + remove_proc_entry("journal", de); + remove_proc_entry("oidmap", de); + remove_proc_entry("on-disk-super", de); + 
remove_proc_entry("bitmap", de); + remove_proc_entry("per-level", de); + remove_proc_entry("super", de); + remove_proc_entry("version", de); + } + spin_lock( & __PINFO( sb ).lock ); + __PINFO( sb ).exiting = 1; + spin_unlock( & __PINFO( sb ).lock ); + if ( proc_info_root ) { + remove_proc_entry( reiserfs_bdevname (sb), proc_info_root ); + REISERFS_SB(sb)->procdir = NULL; + } + return 0; +} + +struct proc_dir_entry *reiserfs_proc_register_global( char *name, + read_proc_t *func ) +{ + return ( proc_info_root ) ? create_proc_read_entry( name, 0, + proc_info_root, + func, NULL ) : NULL; +} + +void reiserfs_proc_unregister_global( const char *name ) +{ + remove_proc_entry( name, proc_info_root ); +} + +int reiserfs_proc_info_global_init( void ) +{ + if( proc_info_root == NULL ) { + proc_info_root = proc_mkdir(proc_info_root_name, NULL); + if( proc_info_root ) { + proc_info_root -> owner = THIS_MODULE; + } else { + reiserfs_warning (NULL, + "reiserfs: cannot create /proc/%s", + proc_info_root_name ); + return 1; + } + } + return 0; +} + +int reiserfs_proc_info_global_done( void ) +{ + if ( proc_info_root != NULL ) { + proc_info_root = NULL; + remove_proc_entry(proc_info_root_name, NULL); + } + return 0; +} + +/* REISERFS_PROC_INFO */ +#else + +int reiserfs_proc_info_init( struct super_block *sb ) { return 0; } +int reiserfs_proc_info_done( struct super_block *sb ) { return 0; } + +struct proc_dir_entry *reiserfs_proc_register_global( char *name, + read_proc_t *func ) +{ return NULL; } + +void reiserfs_proc_unregister_global( const char *name ) {;} + +int reiserfs_proc_info_global_init( void ) { return 0; } +int reiserfs_proc_info_global_done( void ) { return 0; } + +int reiserfs_global_version_in_proc( char *buffer, char **start, + off_t offset, + int count, int *eof, void *data ) +{ return 0; } + +/* REISERFS_PROC_INFO */ +#endif + +/* + * $Log: procfs.c,v $ + * Revision 1.1.8.2 2001/07/15 17:08:42 god + * . use get_super() in procfs.c + * . 
remove remove_save_link() from reiserfs_do_truncate() + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + * Revision 1.1.8.1 2001/07/11 16:48:50 god + * proc info support + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + */ + +/* + * Make Linus happy. + * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c new file mode 100644 index 000000000000..170012078b76 --- /dev/null +++ b/fs/reiserfs/resize.c @@ -0,0 +1,182 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_fs_sb.h> +#include <linux/buffer_head.h> + +int reiserfs_resize (struct super_block * s, unsigned long block_count_new) +{ + int err = 0; + struct reiserfs_super_block * sb; + struct reiserfs_bitmap_info *bitmap; + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); + struct buffer_head * bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap * jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size ; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size */ + bh = sb_bread(s, block_count_new - 1); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + 
bforget(bh); + + /* old disk layout detection; those partitions can be mounted, but + * cannot be resized */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES ) { + printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; + + /* count bitmap blocks in new fs */ + bmap_nr_new = block_count_new / ( s->s_blocksize * 8 ); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = SB_BMAP_NR(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + unlock_super(s) ; + return -ENOMEM ; + } + /* the new journal bitmaps are zero filled, now we copy in the bitmap + ** node pointers from the old journal bitmap structs, and then + ** transfer the new data structures into the journal struct. + ** + ** using the copy_size var below allows this code to work for + ** both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? 
bmap_nr_new : bmap_nr ; + copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ; + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + struct reiserfs_bitmap_node **node_tmp ; + jb = SB_JOURNAL(s)->j_list_bitmap + i ; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ; + + /* just in case vfree schedules on us, copy the new + ** pointer into the journal struct before freeing the + ** old one + */ + node_tmp = jb->bitmaps ; + jb->bitmaps = jbitmap[i].bitmaps ; + vfree(node_tmp) ; + } + + /* allocate additional bitmap blocks, reallocate array of bitmap + * block pointers */ + bitmap = vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* Journal bitmaps are still supersized, but the memory isn't + * leaked, so I guess it's ok */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + memset (bitmap, 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the journal + * transaction begins, and the new bitmaps don't matter if the + * transaction fails. */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8); + memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb)); + reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data); + + set_buffer_uptodate(bitmap[i].bh); + mark_buffer_dirty(bitmap[i].bh) ; + sync_dirty_buffer(bitmap[i].bh); + // update bitmap_info stuff + bitmap[i].first_zero_hint=1; + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + } + /* free old bitmap blocks array */ + SB_AP_BITMAP(s) = bitmap; + vfree (old_bitmap); + } + + /* begin transaction, if there was an error, it's fine. Yes, we have + * incorrect bitmaps now, but none of it is ever going to touch the + * disk anyway. 
*/ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* correct last bitmap blocks in old and new disk layout */ + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_test_and_clear_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr - 1].bh->b_data); + SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r; + if ( !SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint) + SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r; + + journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh); + + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_test_and_set_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr_new - 1].bh->b_data); + journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh); + + SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= s->s_blocksize * 8 - block_r_new; + /* Extreme case where last bitmap is the only valid block in itself. */ + if ( !SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count ) + SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + free_blocks = SB_FREE_BLOCKS(s); + PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + PUT_SB_BMAP_NR(s, bmap_nr_new); + s->s_dirt = 1; + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + return journal_end(&th, s, 10); +} diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c new file mode 100644 index 000000000000..73ec5212178b --- /dev/null +++ b/fs/reiserfs/stree.c @@ -0,0 +1,2073 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. 
Pinchuk pap@namesys.botik.ru
 * Programm System Institute
 * Pereslavl-Zalessky Russia
 */

/*
 * This file contains functions dealing with S+tree
 *
 * B_IS_IN_TREE
 * copy_item_head
 * comp_short_keys
 * comp_keys
 * comp_short_le_keys
 * le_key2cpu_key
 * comp_le_keys
 * bin_search
 * get_lkey
 * get_rkey
 * key_in_buffer
 * decrement_bcount
 * decrement_counters_in_path
 * reiserfs_check_path
 * pathrelse_and_restore
 * pathrelse
 * search_by_key_reada
 * search_by_key
 * search_for_position_by_key
 * comp_items
 * prepare_for_direct_item
 * prepare_for_direntry_item
 * prepare_for_delete_or_cut
 * calc_deleted_bytes_number
 * init_tb_struct
 * padd_item
 * reiserfs_delete_item
 * reiserfs_delete_solid_item
 * reiserfs_delete_object
 * maybe_indirect_to_direct
 * indirect_to_direct_roll_back
 * reiserfs_cut_from_item
 * truncate_directory
 * reiserfs_do_truncate
 * reiserfs_paste_into_item
 * reiserfs_insert_item
 */

#include <linux/config.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/reiserfs_fs.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/* Does the buffer contain a disk block which is in the tree.
   Returns non-zero unless the block's level is FREE_LEVEL; a level above
   MAX_HEIGHT is reported through the RFALSE check. */
inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
{

  RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT,
	  "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);

  return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
}

//
// Copy an item head (IH_SIZE bytes) from p_v_from to p_v_to as raw
// bytes, so the copy stays in the same little-endian form as the source.
//
inline void copy_item_head(struct item_head * p_v_to,
			   const struct item_head * p_v_from)
{
  memcpy (p_v_to, p_v_from, IH_SIZE);
}


/* k1 is pointer to on-disk structure which is stored in little-endian
   form. k2 is pointer to cpu variable. For key of items of the same
   object this returns 0. 
+ Returns: -1 if key1 < key2 + 0 if key1 == key2 + 1 if key1 > key2 */ +inline int comp_short_keys (const struct reiserfs_key * le_key, + const struct cpu_key * cpu_key) +{ + __u32 * p_s_le_u32, * p_s_cpu_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_le_u32 = (__u32 *)le_key; + p_s_cpu_u32 = (__u32 *)&cpu_key->on_disk_key; + for( ; n_key_length--; ++p_s_le_u32, ++p_s_cpu_u32 ) { + if ( le32_to_cpu (*p_s_le_u32) < *p_s_cpu_u32 ) + return -1; + if ( le32_to_cpu (*p_s_le_u32) > *p_s_cpu_u32 ) + return 1; + } + + return 0; +} + + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. + Compare keys using all 4 key fields. + Returns: -1 if key1 < key2 0 + if key1 = key2 1 if key1 > key2 */ +static inline int comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key) +{ + int retval; + + retval = comp_short_keys (le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key)) + return -1; + if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key)) + return -1; + + if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key)) + return 1; + + return 0; +} + + +inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2) +{ + __u32 * p_s_1_u32, * p_s_2_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_1_u32 = (__u32 *)key1; + p_s_2_u32 = (__u32 *)key2; + for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) { + if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) ) + return -1; + if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) ) + return 1; + } + return 0; +} + +inline void le_key2cpu_key (struct cpu_key * to, const 
struct reiserfs_key * from) +{ + to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid); + + // find out version of the key + to->version = le_key_version (from); + if (to->version == KEY_FORMAT_3_5) { + to->on_disk_key.u.k_offset_v1.k_offset = le32_to_cpu (from->u.k_offset_v1.k_offset); + to->on_disk_key.u.k_offset_v1.k_uniqueness = le32_to_cpu (from->u.k_offset_v1.k_uniqueness); + } else { + to->on_disk_key.u.k_offset_v2.k_offset = offset_v2_k_offset(&from->u.k_offset_v2); + to->on_disk_key.u.k_offset_v2.k_type = offset_v2_k_type(&from->u.k_offset_v2); + } +} + + + +// this does not say which one is bigger, it only returns 1 if keys +// are not equal, 0 otherwise +inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2) +{ + return memcmp (k1, k2, sizeof (struct reiserfs_key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key * + * Returns: 1 if found, 0 if not found; * + * *p_n_pos = number of the searched element if found, else the * + * number of the first element that is larger than p_v_key. * + **************************************************************************/ +/* For those not familiar with binary search: n_lbound is the leftmost item that it + could be, n_rbound the rightmost item that it could be. We examine the item + halfway between n_lbound and n_rbound, and that tells us either that we can increase + n_lbound, or decrease n_rbound, or that we have found it, or if n_lbound <= n_rbound that + there are no possible items, and we have not found it. With each examination we + cut the number of possible items it could be by one more than half rounded down, + or we find it. */ +static inline int bin_search ( + const void * p_v_key, /* Key to search for. */ + const void * p_v_base,/* First item in the array. 
*/ + int p_n_num, /* Number of items in the array. */ + int p_n_width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, p_n_width + is actually the item header size not + the item size. */ + int * p_n_pos /* Number of the searched for element. */ + ) { + int n_rbound, n_lbound, n_j; + + for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 ) + switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) { + case -1: n_lbound = n_j + 1; continue; + case 1: n_rbound = n_j - 1; continue; + case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */ + } + + /* bin_search did not find given key, it returns position of key, + that is minimal and greater than the given one. */ + *p_n_pos = n_lbound; + return ITEM_NOT_FOUND; +} + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + + + +/* Minimal possible key. It is never in the tree. */ +const struct reiserfs_key MIN_KEY = {0, 0, {{0, 0},}}; + +/* Maximal possible key. It is never in the tree. */ +const struct reiserfs_key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}}; + + +/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom + of the path, and going upwards. We must check the path's validity at each step. If the key is not in + the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this + case we return a special key, either MIN_KEY or MAX_KEY. 
*/ +static inline const struct reiserfs_key * get_lkey ( + const struct path * p_s_chk_path, + const struct super_block * p_s_sb + ) { + int n_position, n_path_offset = p_s_chk_path->path_length; + struct buffer_head * p_s_parent; + + RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { + + RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + return &MAX_KEY; + /* Check whether position in the parent is correct. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) + return &MAX_KEY; + /* Return delimiting key if position in the parent is not equal to zero. */ + if ( n_position ) + return B_N_PDELIM_KEY(p_s_parent, n_position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_sb) ) + return &MIN_KEY; + return &MAX_KEY; +} + + +/* Get delimiting key of the buffer at the path and its right neighbor. */ +inline const struct reiserfs_key * get_rkey ( + const struct path * p_s_chk_path, + const struct super_block * p_s_sb + ) { + int n_position, + n_path_offset = p_s_chk_path->path_length; + struct buffer_head * p_s_parent; + + RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { + + RFALSE( ! 
buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) + return &MIN_KEY; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) + return &MIN_KEY; + /* Return delimiting key if position in the parent is not the last one. */ + if ( n_position != B_NR_ITEMS(p_s_parent) ) + return B_N_PDELIM_KEY(p_s_parent, n_position); + } + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_sb) ) + return &MAX_KEY; + return &MIN_KEY; +} + + +/* Check whether a key is contained in the tree rooted from a buffer at a path. */ +/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in + the path. These delimiting keys are stored at least one level above that buffer in the tree. If the + buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in + this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ +static inline int key_in_buffer ( + struct path * p_s_chk_path, /* Path which should be checked. */ + const struct cpu_key * p_s_key, /* Key which should be checked. */ + struct super_block * p_s_sb /* Super block pointer. */ + ) { + + RFALSE( ! 
p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET || + p_s_chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + p_s_key, p_s_chk_path->path_length); + RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 ) + /* left delimiting key is bigger, that the key we look for */ + return 0; + // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) + if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 ) + /* p_s_key must be less than right delimitiing key */ + return 0; + return 1; +} + + +inline void decrement_bcount( + struct buffer_head * p_s_bh + ) { + if ( p_s_bh ) { + if ( atomic_read (&(p_s_bh->b_count)) ) { + put_bh(p_s_bh) ; + return; + } + reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh); + } +} + + +/* Decrement b_count field of the all buffers in the path. */ +void decrement_counters_in_path ( + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + + RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET || + n_path_offset > EXTENDED_MAX_HEIGHT - 1, + "PAP-5080: invalid path offset of %d", n_path_offset); + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { + struct buffer_head * bh; + + bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); + decrement_bcount (bh); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + + +int reiserfs_check_path(struct path *p) { + RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed") ; + return 0 ; +} + + +/* Release all buffers in the path. 
Restore dirty bits clean +** when preparing the buffer for the log +** +** only called from fix_nodes() +*/ +void pathrelse_and_restore ( + struct super_block *s, + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + + RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { + reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path, + n_path_offset)); + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* Release all buffers in the path. */ +void pathrelse ( + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + + RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + + + +static int is_leaf (char * buf, int blocksize, struct buffer_head * bh) +{ + struct block_head * blkh; + struct item_head * ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning (NULL, "is_leaf: this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh); + return 0; + } + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih)); + if (used_space != blocksize - blkh_free_space(blkh)) { + /* free space does not match to calculated amount of use space */ + reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh); + return 
0; + } + + // FIXME: it is_leaf will hit performance too much - we may have + // return 1 here + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i ++, ih ++) { + if ( le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning (NULL, "is_leaf: wrong item type for item %h",ih); + return 0; + } + if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) { + reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih); + return 0; + } + if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) { + reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih); + return 0; + } + if (prev_location - ih_location (ih) != ih_item_len (ih)) { + reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih); + return 0; + } + prev_location = ih_location (ih); + } + + // one may imagine much more checks + return 1; +} + + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal (char * buf, int blocksize, struct buffer_head * bh) +{ + struct block_head * blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning (NULL, "is_internal: this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + /* for internal which is not root we might check min number of keys */ + reiserfs_warning (NULL, "is_internal: number of key seems wrong: %z", bh); + return 0; + } + + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh); + return 0; + } + + // one may imagine much more checks + return 1; +} + + +// make sure that bh contains formatted 
node of reiserfs tree of +// 'level'-th level +static int is_tree_node (struct buffer_head * bh, int level) +{ + if (B_LEVEL (bh) != level) { + reiserfs_warning (NULL, "is_tree_node: node level %d does not match to the expected one %d", + B_LEVEL (bh), level); + return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf (bh->b_data, bh->b_size, bh); + + return is_internal (bh->b_data, bh->b_size, bh); +} + + + +#define SEARCH_BY_KEY_READA 16 + +/* The function is NOT SCHEDULE-SAFE! */ +static void search_by_key_reada (struct super_block * s, + struct buffer_head **bh, + unsigned long *b, int num) +{ + int i,j; + + for (i = 0 ; i < num ; i++) { + bh[i] = sb_getblk (s, b[i]); + } + for (j = 0 ; j < i ; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this buffer + */ + if (!buffer_uptodate(bh[j])) + ll_rw_block(READA, 1, bh + j); + brelse(bh[j]); + } +} + +/************************************************************************** + * Algorithm SearchByKey * + * look for item in the Disk S+Tree by its key * + * Input: p_s_sb - super block * + * p_s_key - pointer to the key to search * + * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * + * p_s_search_path - path from the root to the needed leaf * + **************************************************************************/ + +/* This function fills up the path from the root to the leaf as it + descends the tree looking for the key. It uses reiserfs_bread to + try to find buffers in the cache given their block number. If it + does not find them in the cache it reads them from disk. For each + node search_by_key finds using reiserfs_bread it then uses + bin_search to look through that node. bin_search will find the + position of the block_number of the next node if it is looking + through an internal node. 
If it is looking through a leaf node + bin_search will find the position of the item which has key either + equal to given key, or which is the maximal key less than the given + key. search_by_key returns a path that must be checked for the + correctness of the top of the path but need not be checked for the + correctness of the bottom of the path */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_key (struct super_block * p_s_sb, + const struct cpu_key * p_s_key, /* Key to search. */ + struct path * p_s_search_path, /* This structure was + allocated and initialized + by the calling + function. It is filled up + by this function. */ + int n_stop_level /* How far down the tree to search. To + stop at leaf level - set to + DISK_LEAF_NODE_LEVEL */ + ) { + int n_block_number; + int expected_level; + struct buffer_head * p_s_bh; + struct path_element * p_s_last_element; + int n_node_level, n_retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + unsigned long reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; + +#ifdef CONFIG_REISERFS_CHECK + int n_repeat_counter = 0; +#endif + + PROC_INFO_INC( p_s_sb, search_by_key ); + + /* As we add each node to a path we increase its count. This means that + we must be careful to release all nodes in a path before we either + discard the path struct or re-use the path struct, as we do here. */ + + decrement_counters_in_path(p_s_search_path); + + right_neighbor_of_leaf_node = 0; + + /* With each iteration of this loop we search through the items in the + current node, and calculate the next current node(next path element) + for the next iteration of this loop.. 
*/ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = -1; + while ( 1 ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( !(++n_repeat_counter % 50000) ) + reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:" + "there were %d iterations of while loop " + "looking for key %K", + current->comm, n_repeat_counter, p_s_key); +#endif + + /* prep path to have another element added to it. */ + p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length); + fs_gen = get_generation (p_s_sb); + + /* Read the next tree node, and set the last element in the path to + have a pointer to it. */ + if ((p_s_bh = p_s_last_element->pe_buffer = + sb_getblk(p_s_sb, n_block_number)) ) { + if (!buffer_uptodate(p_s_bh) && reada_count > 1) { + search_by_key_reada (p_s_sb, reada_bh, + reada_blocks, reada_count); + } + ll_rw_block(READ, 1, &p_s_bh); + wait_on_buffer(p_s_bh); + if (!buffer_uptodate(p_s_bh)) + goto io_error; + } else { +io_error: + p_s_search_path->path_length --; + pathrelse(p_s_search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT (p_s_sb); + expected_level --; + + /* It is possible that schedule occurred. We must check whether the key + to search is still in the tree rooted from the current buffer. If + not then repeat search from the root. */ + if ( fs_changed (fs_gen, p_s_sb) && + (!B_IS_IN_TREE (p_s_bh) || + B_LEVEL(p_s_bh) != expected_level || + !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { + PROC_INFO_INC( p_s_sb, search_by_key_fs_changed ); + PROC_INFO_INC( p_s_sb, search_by_key_restarted ); + PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] ); + decrement_counters_in_path(p_s_search_path); + + /* Get the root block number so that we can repeat the search + starting from the root. 
*/ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + + /* only check that the key is in the buffer if p_s_key is not + equal to the MAX_KEY. Latter case is only possible in + "finish_unfinished()" processing during mount. */ + RFALSE( comp_keys( &MAX_KEY, p_s_key ) && + ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb), + "PAP-5130: key is not in the buffer"); +#ifdef CONFIG_REISERFS_CHECK + if ( cur_tb ) { + print_cur_tb ("5140"); + reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!"); + } +#endif + + // make sure, that the node contents look like a node of + // certain level + if (!is_tree_node (p_s_bh, expected_level)) { + reiserfs_warning (p_s_sb, "vs-5150: search_by_key: " + "invalid format found in block %ld. Fsck?", + p_s_bh->b_blocknr); + pathrelse (p_s_search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + n_node_level = B_LEVEL (p_s_bh); + + PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 ); + + RFALSE( n_node_level < n_stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + n_node_level, n_stop_level); + + n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), + B_NR_ITEMS(p_s_bh), + ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE, + &(p_s_last_element->pe_position)); + if (n_node_level == n_stop_level) { + return n_retval; + } + + /* we are not in the stop level */ + if (n_retval == ITEM_FOUND) + /* item has been found, so we choose the pointer which is to the right of the found one */ + p_s_last_element->pe_position++; + + /* if item was not found we choose the position which is to + the left of the found item. This requires no code, + bin_search did it already.*/ + + /* So we have chosen a position in the current node which is + an internal node. Now we calculate child block number by + position in the node. 
*/ + n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); + + /* if we are going to read leaf nodes, try for read ahead as well */ + if ((p_s_search_path->reada & PATH_READA) && + n_node_level == DISK_LEAF_NODE_LEVEL + 1) + { + int pos = p_s_last_element->pe_position; + int limit = B_NR_ITEMS(p_s_bh); + struct reiserfs_key *le_key; + + if (p_s_search_path->reada & PATH_READA_BACK) + limit = 0; + while(reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos); + if (p_s_search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = B_N_PDELIM_KEY(p_s_bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + p_s_key->on_disk_key.k_objectid) + { + break; + } + } + } + } +} + + +/* Form the path to an item and position in this item which contains + file byte defined by p_s_key. If there is no such item + corresponding to the key, we point the path to the item with + maximal key less than p_s_key, and *p_n_pos_in_item is set to one + past the last entry/byte in the item. If searching for entry in a + directory item, and it is not found, *p_n_pos_in_item is set to one + entry more than the entry with maximal key which is less than the + sought key. + + Note that if there is no entry in this same node which is one more, + then we point to an imaginary entry. for direct items, the + position is in units of bytes, for indirect items the position is + in units of blocknr entries, for directory items the position is in + units of directory entries. */ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key (struct super_block * p_s_sb, /* Pointer to the super block. */ + const struct cpu_key * p_cpu_key, /* Key to search (cpu variable) */ + struct path * p_s_search_path /* Filled up by this function. 
*/ + ) { + struct item_head * p_le_ih; /* pointer to on-disk structure */ + int n_blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if ( is_direntry_cpu_key (p_cpu_key) ) + return search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item (p_s_sb, p_cpu_key, p_s_search_path); + if (retval == IO_ERROR) + return retval; + if ( retval == ITEM_FOUND ) { + + RFALSE( ! ih_item_len( + B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), + PATH_LAST_POSITION(p_s_search_path))), + "PAP-5165: item length equals zero"); + + pos_in_item(p_s_search_path) = 0; + return POSITION_FOUND; + } + + RFALSE( ! PATH_LAST_POSITION(p_s_search_path), + "PAP-5170: position equals zero"); + + /* Item is not found. Set path to the previous item. */ + p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path)); + n_blk_size = p_s_sb->s_blocksize; + + if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) { + return FILE_NOT_FOUND; + } + + // FIXME: quite ugly this far + + item_offset = le_ih_k_offset (p_le_ih); + offset = cpu_key_k_offset (p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path.*/ + if (item_offset <= offset && + item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) { + pos_in_item (p_s_search_path) = offset - item_offset; + if ( is_indirect_le_ih(p_le_ih) ) { + pos_in_item (p_s_search_path) /= n_blk_size; + } + return POSITION_FOUND; + } + + /* Needed byte is not contained in the item pointed to by the + path. Set pos_in_item out of the item. */ + if ( is_indirect_le_ih (p_le_ih) ) + pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item (p_s_search_path) = ih_item_len( p_le_ih ); + + return POSITION_NOT_FOUND; +} + + +/* Compare given item and item pointed to by the path. 
*/ +int comp_items (const struct item_head * stored_ih, const struct path * p_s_path) +{ + struct buffer_head * p_s_bh; + struct item_head * ih; + + /* Last buffer at the path is not in the tree. */ + if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) ) + return 1; + + /* Last path position is invalid. */ + if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) ) + return 1; + + /* we need only to know, whether it is the same item */ + ih = get_ih (p_s_path); + return memcmp (stored_ih, ih, IH_SIZE); +} + + +/* unformatted nodes are not logged anymore, ever. This is safe +** now +*/ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +// block can not be forgotten as it is in I/O or held by someone +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + + + +// prepare for delete or cut of direct item +static inline int prepare_for_direct_item (struct path * path, + struct item_head * le_ih, + struct inode * inode, + loff_t new_file_length, + int * cut_size) +{ + loff_t round_len; + + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + // new file gets truncated + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) { + // + round_len = ROUND_UP (new_file_length); + /* this was n_new_file_length < le_ih ... */ + if ( round_len < le_ih_k_offset (le_ih) ) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + + // old file: items may have any length + + if ( new_file_length < le_ih_k_offset (le_ih) ) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. 
*/ + } + /* Calculate first position and size for cutting from item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih))); + return M_CUT; /* Cut from this item. */ +} + + +static inline int prepare_for_direntry_item (struct path * path, + struct item_head * le_ih, + struct inode * inode, + loff_t new_file_length, + int * cut_size) +{ + if (le_ih_k_offset (le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset (inode)) { + RFALSE( ih_entry_count (le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + } + + if ( ih_entry_count (le_ih) == 1 ) { + /* Delete the directory item such as there is one record only + in this item*/ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path))); + return M_CUT; +} + + +/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. + If the path points to an indirect item, remove some number of its unformatted nodes. + In case of file truncate calculate whether this item must be deleted/truncated or last + unformatted node of this item will be converted to a direct item. + This function returns a determination of what balance mode the calling function should employ. */ +static char prepare_for_delete_or_cut( + struct reiserfs_transaction_handle *th, + struct inode * inode, + struct path * p_s_path, + const struct cpu_key * p_s_item_key, + int * p_n_removed, /* Number of unformatted nodes which were removed + from end of the file. */ + int * p_n_cut_size, + unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. 
*/ + ) { + struct super_block * p_s_sb = inode->i_sb; + struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_path); + struct buffer_head * p_s_bh = PATH_PLAST_BUFFER(p_s_path); + + BUG_ON (!th->t_trans_id); + + /* Stat_data item. */ + if ( is_statdata_le_ih (p_le_ih) ) { + + RFALSE( n_new_file_length != max_reiserfs_offset (inode), + "PAP-5210: mode must be M_DELETE"); + + *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } + + + /* Directory item. */ + if ( is_direntry_le_ih (p_le_ih) ) + return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); + + /* Direct item. */ + if ( is_direct_le_ih (p_le_ih) ) + return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); + + + /* Case of an indirect item. */ + { + int n_unfm_number, /* Number of the item unformatted nodes. */ + n_counter, + n_blk_size; + __u32 * p_n_unfm_pointer; /* Pointer to the unformatted node number. */ + __u32 tmp; + struct item_head s_ih; /* Item header. */ + char c_mode; /* Returned mode of the balance. */ + int need_research; + + + n_blk_size = p_s_sb->s_blocksize; + + /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */ + do { + need_research = 0; + p_s_bh = PATH_PLAST_BUFFER(p_s_path); + /* Copy indirect item header to a temp variable. */ + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + /* Calculate number of unformatted nodes in this item. */ + n_unfm_number = I_UNFM_NUM(&s_ih); + + RFALSE( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number || + pos_in_item (p_s_path) + 1 != n_unfm_number, + "PAP-5240: invalid item %h " + "n_unfm_number = %d *p_n_pos_in_item = %d", + &s_ih, n_unfm_number, pos_in_item (p_s_path)); + + /* Calculate balance mode and position in the item to remove unformatted nodes. */ + if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. 
*/ + pos_in_item (p_s_path) = 0; + *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); + c_mode = M_DELETE; + } + else { /* Case of truncate. */ + if ( n_new_file_length < le_ih_k_offset (&s_ih) ) { + pos_in_item (p_s_path) = 0; + *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); + c_mode = M_DELETE; /* Delete this item. */ + } + else { + /* indirect item must be truncated starting from *p_n_pos_in_item-th position */ + pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits; + + RFALSE( pos_in_item (p_s_path) > n_unfm_number, + "PAP-5250: invalid position in the item"); + + /* Either convert last unformatted node of indirect item to direct item or increase + its free space. */ + if ( pos_in_item (p_s_path) == n_unfm_number ) { + *p_n_cut_size = 0; /* Nothing to cut. */ + return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */ + } + /* Calculate size to cut. */ + *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE); + + c_mode = M_CUT; /* Cut from this indirect item. */ + } + } + + RFALSE( n_unfm_number <= pos_in_item (p_s_path), + "PAP-5260: invalid position in the indirect item"); + + /* pointers to be cut */ + n_unfm_number -= pos_in_item (p_s_path); + /* Set pointer to the last unformatted node pointer that is to be cut. */ + p_n_unfm_pointer = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed; + + + /* We go through the unformatted nodes pointers of the indirect + item and look for the unformatted nodes in the cache. If we + found some of them we free it, zero corresponding indirect item + entry and log buffer containing that indirect item. For this we + need to prepare last path element for logging. If some + unformatted node has b_count > 1 we must not free this + unformatted node since it is in use. 
*/ + reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); + // note: path could be changed, first line in for loop takes care + // of it + + for (n_counter = *p_n_removed; + n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + cond_resched(); + if (item_moved (&s_ih, p_s_path)) { + need_research = 1 ; + break; + } + RFALSE( p_n_unfm_pointer < (__u32 *)B_I_PITEM(p_s_bh, &s_ih) || + p_n_unfm_pointer > (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1, + "vs-5265: pointer out of range"); + + /* Hole, nothing to remove. */ + if ( ! get_block_num(p_n_unfm_pointer,0) ) { + (*p_n_removed)++; + continue; + } + + (*p_n_removed)++; + + tmp = get_block_num(p_n_unfm_pointer,0); + put_block_num(p_n_unfm_pointer, 0, 0); + journal_mark_dirty (th, p_s_sb, p_s_bh); + reiserfs_free_block(th, inode, tmp, 1); + if ( item_moved (&s_ih, p_s_path) ) { + need_research = 1; + break ; + } + } + + /* a trick. If the buffer has been logged, this + ** will do nothing. If we've broken the loop without + ** logging it, it will restore the buffer + ** + */ + reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); + + /* This loop can be optimized. */ + } while ( (*p_n_removed < n_unfm_number || need_research) && + search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND ); + + RFALSE( *p_n_removed < n_unfm_number, + "PAP-5310: indirect item is not found"); + RFALSE( item_moved (&s_ih, p_s_path), + "after while, comp failed, retry") ; + + if (c_mode == M_CUT) + pos_in_item (p_s_path) *= UNFM_P_SIZE; + return c_mode; + } +} + +/* Calculate number of bytes which will be deleted or cut during balance */ +static int calc_deleted_bytes_number( + struct tree_balance * p_s_tb, + char c_mode + ) { + int n_del_size; + struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); + + if ( is_statdata_le_ih (p_le_ih) ) + return 0; + + n_del_size = ( c_mode == M_DELETE ) ? 
ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; + if ( is_direntry_le_ih (p_le_ih) ) { + // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ + // we can't use EMPTY_DIR_SIZE, as old format dirs have a different + // empty size. ick. FIXME, is this right? + // + return n_del_size ; + } + + if ( is_indirect_le_ih (p_le_ih) ) + n_del_size = (n_del_size/UNFM_P_SIZE)* + (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih); + return n_del_size; +} + +static void init_tb_struct( + struct reiserfs_transaction_handle *th, + struct tree_balance * p_s_tb, + struct super_block * p_s_sb, + struct path * p_s_path, + int n_size + ) { + + BUG_ON (!th->t_trans_id); + + memset (p_s_tb,'\0',sizeof(struct tree_balance)); + p_s_tb->transaction_handle = th ; + p_s_tb->tb_sb = p_s_sb; + p_s_tb->tb_path = p_s_path; + PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + p_s_tb->insert_size[0] = n_size; +} + + + +void padd_item (char * item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length; ) + item [--i] = 0; +} + +#ifdef REISERQUOTA_DEBUG +char key2type(struct reiserfs_key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif + +/* Delete object item. */ +int reiserfs_delete_item (struct reiserfs_transaction_handle *th, + struct path * p_s_path, /* Path to the deleted item. */ + const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. 
*/ + struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */ + struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */ +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int n_ret_value, + n_del_size, + n_removed; + +#ifdef CONFIG_REISERFS_CHECK + char c_mode; + int n_iter = 0; +#endif + + BUG_ON (!th->t_trans_id); + + init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/); + + while ( 1 ) { + n_removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + n_iter++; + c_mode = +#endif + prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode)); + + RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + s_del_balance.insert_size[0] = n_del_size; + + n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if ( n_ret_value != REPEAT_SEARCH ) + break; + + PROC_INFO_INC( p_s_sb, delete_item_restarted ); + + // file system changed, repeat search + n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == IO_ERROR) + break; + if (n_ret_value == FILE_NOT_FOUND) { + reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: " + "no items of the file %K found", p_s_item_key); + break; + } + } /* while (1) */ + + if ( n_ret_value != CARRY_ON ) { + unfix_nodes(&s_del_balance); + return 0; + } + + // reiserfs_delete_item returns item length when success + n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(p_s_path) ; + quota_cut_bytes = ih_item_len(q_ih) ; + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. 
+ ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0 ; + } + } + + if ( p_s_un_bh ) { + int off; + char *data ; + + /* We are in direct2indirect conversion, so move tail contents + to the unformatted node */ + /* note, we do the copy before preparing the buffer because we + ** don't care about the contents of the unformatted node yet. + ** the only thing we really care about is the direct item's data + ** is in the unformatted node. + ** + ** Otherwise, we would have to call reiserfs_prepare_for_journal on + ** the unformatted node, which might schedule, meaning we'd have to + ** loop all the way back up to the start of the while loop. + ** + ** The unformatted node must be dirtied later on. We can't be + ** sure here if the entire tail has been deleted yet. + ** + ** p_s_un_bh is from the page cache (all unformatted nodes are + ** from the page cache) and might be a highmem page. So, we + ** can't use p_s_un_bh->b_data. + ** -clm + */ + + data = kmap_atomic(p_s_un_bh->b_page, KM_USER0); + off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value); + kunmap_atomic(data, KM_USER0); + } + /* Perform balancing after all resources have been collected at once. 
*/ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + + /* Return deleted body length */ + return n_ret_value; +} + + +/* Summary Of Mechanisms For Handling Collisions Between Processes: + + deletion of the body of the object is performed by iput(), with the + result that if multiple processes are operating on a file, the + deletion of the body of the file is deferred until the last process + that has an open inode performs its iput(). + + writes and truncates are protected from collisions by use of + semaphores. + + creates, linking, and mknod are protected from collisions with other + processes by making the reiserfs_add_entry() the last step in the + creation, and then rolling back all changes if there was a collision. + - Hans +*/ + + +/* this deletes item which never gets split */ +void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_key * key) +{ + struct tree_balance tb; + INITIALIZE_PATH (path); + int item_len = 0; + int tb_init = 0 ; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON (!th->t_trans_id); + + le_key2cpu_key (&cpu_key, key); + + while (1) { + retval = search_item (th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_warning (th->t_super, + "vs-5350: reiserfs_delete_solid_item: " + "i/o failure occurred trying to delete %K", + &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse (&path); + // No need for a warning, if there is just no free space to insert '..' 
item into the newly-created subdir + if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \ + (unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) ) + reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key); + break; + } + if (!tb_init) { + tb_init = 1 ; + item_len = ih_item_len( PATH_PITEM_HEAD(&path) ); + init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ; + + retval = fix_nodes (M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC( th -> t_super, delete_solid_item_restarted ); + continue; + } + + if (retval == CARRY_ON) { + do_balance (&tb, NULL, NULL, M_DELETE); + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key)); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); + } + break; + } + + // IO_ERROR, NO_DISK_SPACE, etc + reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: " + "could not delete %K due to fix_nodes failure", &cpu_key); + unfix_nodes (&tb); + break; + } + + reiserfs_check_path(&path) ; +} + + +int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode) +{ + int err; + inode->i_size = 0; + BUG_ON (!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." 
*/ + err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/); + if (err) + return err; + +#if defined( USE_INODE_GENERATION_COUNTER ) + if( !old_format_only ( th -> t_super ) ) + { + __u32 *inode_generation; + + inode_generation = + &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation; + *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 ); + } +/* USE_INODE_GENERATION_COUNTER */ +#endif + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); + + return err; +} + +static void +unmap_buffers(struct page *page, loff_t pos) { + struct buffer_head *bh ; + struct buffer_head *head ; + struct buffer_head *next ; + unsigned long tail_index ; + unsigned long cur_index ; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1) ; + cur_index = 0 ; + head = page_buffers(page) ; + bh = head ; + do { + next = bh->b_this_page ; + + /* we want to unmap the buffers that contain the tail, and + ** all the buffers after it (since the tail must be at the + ** end of the file). We don't want to unmap file data + ** before the tail, since it might be dirty and waiting to + ** reach disk + */ + cur_index += bh->b_size ; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh) ; + } + bh = next ; + } while (bh != head) ; + if ( PAGE_SIZE == bh->b_size ) { + clear_page_dirty(page); + } + } + } +} + +static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, + struct page *page, + struct path * p_s_path, + const struct cpu_key * p_s_item_key, + loff_t n_new_file_size, + char * p_c_mode + ) { + struct super_block * p_s_sb = p_s_inode->i_sb; + int n_block_size = p_s_sb->s_blocksize; + int cut_bytes; + BUG_ON (!th->t_trans_id); + + if (n_new_file_size != p_s_inode->i_size) + BUG (); + + /* the page being sent in could be NULL if there was an i/o error + ** reading in the last block. 
The user will hit problems trying to + ** read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&p_s_inode->i_count) > 1 || + !tail_has_to_be_packed (p_s_inode) || + !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { + // leave tail in an unformatted node + *p_c_mode = M_SKIP_BALANCING; + cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); + pathrelse(p_s_path); + return cut_bytes; + } + /* Permorm the conversion to a direct_item. */ + /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/ + return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); +} + + +/* we did indirect_to_direct conversion. And we have inserted direct + item successesfully, but there were no disk space to cut unfm + pointer being converted. Therefore we have to delete inserted + direct item(s) */ +static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON (!th->t_trans_id); + + make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!! 
+ tail_key.key_length = 4; + + tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND) + reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item"); + RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION (path) --; + + removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/); + RFALSE( removed <= 0 || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed); + } + reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); + //mark_file_without_tail (inode); + mark_inode_dirty (inode); +} + + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th, + struct path * p_s_path, + struct cpu_key * p_s_item_key, + struct inode * p_s_inode, + struct page *page, + loff_t n_new_file_size) +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + /* Every function which is going to call do_balance must first + create a tree_balance structure. Then it must fill up this + structure by using the init_tb_struct and fix_nodes functions. + After that we can make tree balancing. */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int n_cut_size = 0, /* Amount to be cut. */ + n_ret_value = CARRY_ON, + n_removed = 0, /* Number of the removed unformatted nodes. */ + n_is_inode_locked = 0; + char c_mode; /* Mode of the balance. 
*/ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + + BUG_ON (!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size); + + + /* Repeat this loop until we either cut the item without needing + to balance, or we fix_nodes without schedule occurring */ + while ( 1 ) { + /* Determine the balance mode, position of the first byte to + be cut, and size to be cut. In case of the indirect item + free unformatted nodes which are pointed to by the cut + pointers. */ + + c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, + &n_cut_size, n_new_file_size); + if ( c_mode == M_CONVERT ) { + /* convert last unformatted node to direct item or leave + tail in the unformatted node */ + RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice"); + + n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key, + n_new_file_size, &c_mode); + if ( c_mode == M_SKIP_BALANCING ) + /* tail has been left in the unformatted node */ + return n_ret_value; + + n_is_inode_locked = 1; + + /* removing of last unformatted node will change value we + have to return to truncate. Save it */ + retval2 = n_ret_value; + /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/ + + /* So, we have performed the first part of the conversion: + inserting the new direct item. Now we are removing the + last unformatted node pointer. Set key to search for + it. 
*/ + set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT); + p_s_item_key->key_length = 4; + n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1)); + tail_pos = n_new_file_size; + set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1); + if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){ + print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1); + reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key); + } + continue; + } + if (n_cut_size == 0) { + pathrelse (p_s_path); + return 0; + } + + s_cut_balance.insert_size[0] = n_cut_size; + + n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL); + if ( n_ret_value != REPEAT_SEARCH ) + break; + + PROC_INFO_INC( p_s_sb, cut_from_item_restarted ); + + n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == POSITION_FOUND) + continue; + + reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key); + unfix_nodes (&s_cut_balance); + return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + if ( n_ret_value != CARRY_ON ) { + if ( n_is_inode_locked ) { + // FIXME: this seems to be not needed: we are always able + // to cut item + indirect_to_direct_roll_back (th, p_s_inode, p_s_path); + } + if (n_ret_value == NO_DISK_SPACE) + reiserfs_warning (p_s_sb, "NO_DISK_SPACE"); + unfix_nodes (&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + + RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); + + /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = ( c_mode == M_DELETE ) ? 
ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0]; + if (retval2 == -1) + n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); + else + n_ret_value = retval2; + + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (c_mode == M_DELETE && + (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { + // FIXME: this is to keep 3.5 happy + REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ; + } else { + quota_cut_bytes = 0 ; + } + } +#ifdef CONFIG_REISERFS_CHECK + if (n_is_inode_locked) { + struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + /* we are going to complete indirect2direct conversion. Make + sure, that we exactly remove last unformatted node pointer + of the item */ + if (!is_indirect_le_ih (le_ih)) + reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: " + "item must be indirect %h", le_ih); + + if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: " + "completing indirect2direct conversion indirect item %h " + "being deleted must be of 4 byte long", le_ih); + + if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: " + "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* it would be useful to make sure, that right neighboring + item is direct item of this file */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, c_mode); + if ( n_is_inode_locked ) { + /* we've done an indirect->direct conversion. 
when the data block + ** was freed, it was removed from the list of blocks that must + ** be flushed before the transaction commits, make sure to + ** unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?'); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + return n_ret_value; +} + +static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) +{ + BUG_ON (!th->t_trans_id); + if (inode->i_nlink) + reiserfs_warning (inode->i_sb, + "vs-5655: truncate_directory: link count != 0"); + + set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET); + set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); + reiserfs_update_sd(th, inode) ; + set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET); + set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA); +} + + + + +/* Truncate file to the new size. Note, this must be called with a transaction + already started */ +int reiserfs_do_truncate (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, /* ->i_size contains new + size */ + struct page *page, /* up to date for last block */ + int update_timestamps /* when it is called by + file_release to convert + the tail - no timestamps + should be updated */ + ) { + INITIALIZE_PATH (s_search_path); /* Path to the current object item. */ + struct item_head * p_le_ih; /* Pointer to an item header. */ + struct cpu_key s_item_key; /* Key to search for a previous file item. */ + loff_t n_file_size, /* Old file size. */ + n_new_file_size;/* New file size. */ + int n_deleted; /* Number of deleted or truncated bytes. 
*/ + int retval; + int err = 0; + + BUG_ON (!th->t_trans_id); + if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) ) + return 0; + + if (S_ISDIR(p_s_inode->i_mode)) { + // deletion of directory - no need to update timestamps + truncate_directory (th, p_s_inode); + return 0; + } + + /* Get new file size. */ + n_new_file_size = p_s_inode->i_size; + + // FIXME: note, that key type is unimportant here + make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3); + + retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path); + if (retval == IO_ERROR) { + reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: " + "i/o failure occurred trying to truncate %K", &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: " + "wrong result %d of search for %K", retval, &s_item_key); + + err = -EIO; + goto out; + } + + s_search_path.pos_in_item --; + + /* Get real file size (total length of all file items) */ + p_le_ih = PATH_PITEM_HEAD(&s_search_path); + if ( is_statdata_le_ih (p_le_ih) ) + n_file_size = 0; + else { + loff_t offset = le_ih_k_offset (p_le_ih); + int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize); + + /* this may mismatch with real file size: if last direct item + had no padding zeros and last unformatted node had no free + space, this file would have this file size */ + n_file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (n_new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if ( n_file_size == 0 || n_file_size < n_new_file_size ) { + goto update_and_out ; + } + + /* Update key to search for the last file item. */ + set_cpu_key_k_offset (&s_item_key, n_file_size); + + do { + /* Cut or delete file item. 
*/ + n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size); + if (n_deleted < 0) { + reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path) ; + return 0; + } + + RFALSE( n_deleted > n_file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + n_deleted, n_file_size, &s_item_key); + + /* Change key to search the last file item. */ + n_file_size -= n_deleted; + + set_cpu_key_k_offset (&s_item_key, n_file_size); + + /* While there are bytes to truncate and previous file item is presented in the tree. */ + + /* + ** This loop could take a really long time, and could log + ** many more blocks than a transaction can hold. So, we do a polite + ** journal end here, and if the transaction needs ending, we make + ** sure the file is consistent before ending the current trans + ** and starting a new one + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + int orig_len_alloc = th->t_blocks_allocated ; + decrement_counters_in_path(&s_search_path) ; + + if (update_timestamps) { + p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, p_s_inode) ; + + err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ; + if (err) + goto out; + err = journal_begin (th, p_s_inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 6); + if (err) + goto out; + reiserfs_update_inode_transaction(p_s_inode) ; + } + } while ( n_file_size > ROUND_UP (n_new_file_size) && + search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND ) ; + + RFALSE( n_file_size > ROUND_UP (n_new_file_size), + "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); + +update_and_out: + if (update_timestamps) { + // this is truncate, not file closing + p_s_inode->i_mtime = 
p_s_inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd (th, p_s_inode); + +out: + pathrelse(&s_search_path) ; + return err; +} + + +#ifdef CONFIG_REISERFS_CHECK +// this makes sure, that we __append__, not overwrite or add holes +static void check_research_for_paste (struct path * path, + const struct cpu_key * p_s_key) +{ + struct item_head * found_ih = get_ih (path); + + if (is_direct_le_ih (found_ih)) { + if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != + cpu_key_k_offset (p_s_key) || + op_bytes_number (found_ih, get_last_bh (path)->b_size) != pos_in_item (path)) + reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: " + "found direct item %h or position (%d) does not match to key %K", + found_ih, pos_in_item (path), p_s_key); + } + if (is_indirect_le_ih (found_ih)) { + if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || + I_UNFM_NUM (found_ih) != pos_in_item (path) || + get_ih_free_space (found_ih) != 0) + reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: " + "found indirect item (%h) or position (%d) does not match to key (%K)", + found_ih, pos_in_item (path), p_s_key); + } +} +#endif /* config reiserfs check */ + + +/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ +int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, + struct path * p_s_search_path, /* Path to the pasted item. */ + const struct cpu_key * p_s_key, /* Key to search for the needed item.*/ + struct inode * inode, /* Inode item belongs to */ + const char * p_c_body, /* Pointer to the bytes to paste. */ + int n_pasted_size) /* Size of pasted bytes. 
*/ +{ + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + + BUG_ON (!th->t_trans_id); + + fs_gen = get_generation(inode->i_sb) ; + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { + pathrelse(p_s_search_path); + return -EDQUOT; + } + init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_paste_balance.key = p_s_key->on_disk_key; +#endif + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == +REPEAT_SEARCH ) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC( th -> t_super, paste_into_item_restarted ); + retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); + if (retval == IO_ERROR) { + retval = -EIO ; + goto error_out ; + } + if (retval == POSITION_FOUND) { + reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key); + retval = -EEXIST ; + goto error_out ; + } + +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste (p_s_search_path, p_s_key); +#endif + } + + /* Perform balancing after all resources are collected by fix_nodes, and + accessing them will not risk triggering schedule. */ + if ( retval == CARRY_ON ) { + do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ? 
-ENOSPC : -EIO; +error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); + return retval ; +} + + +/* Insert new item into the buffer at the path. */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct path * p_s_path, /* Path to the inserteded item. */ + const struct cpu_key * key, + struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ + struct inode * inode, + const char * p_c_body) /* Pointer to the bytes to insert. */ +{ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0 ; + int quota_bytes = 0 ; + + BUG_ON (!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(p_s_ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) { + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
*/ + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { + pathrelse(p_s_path); + return -EDQUOT; + } + } + init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_ins_balance.key = key->on_disk_key; +#endif + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC( th -> t_super, insert_item_restarted ); + retval = search_item (th->t_super, key, p_s_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out ; + } + if (retval == ITEM_FOUND) { + reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: " + "key %K already exists in the tree", key); + retval = -EEXIST ; + goto error_out; + } + } + + /* make balancing after all resources will be collected at a time */ + if ( retval == CARRY_ON ) { + do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); + return 0; + } + + retval = (retval == NO_DISK_SPACE) ? 
-ENOSPC : -EIO; +error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + if (inode) + DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ; + return retval; +} + + + + diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c new file mode 100644 index 000000000000..bcdf2438d152 --- /dev/null +++ b/fs/reiserfs/super.c @@ -0,0 +1,2148 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to add the LFS fixes + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/time.h> +#include <asm/uaccess.h> +#include <linux/reiserfs_fs.h> +#include <linux/reiserfs_acl.h> +#include <linux/reiserfs_xattr.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/vfs.h> +#include <linux/namespace.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/quotaops.h> + +struct file_system_type reiserfs_fs_type; + +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; + +int is_reiserfs_3_5 (struct reiserfs_super_block * rs) +{ + return !strncmp (rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen (reiserfs_3_5_magic_string)); +} + + +int is_reiserfs_3_6 (struct reiserfs_super_block * rs) +{ + return !strncmp (rs->s_v1.s_magic, 
reiserfs_3_6_magic_string, + strlen (reiserfs_3_6_magic_string)); +} + + +int is_reiserfs_jr (struct reiserfs_super_block * rs) +{ + return !strncmp (rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen (reiserfs_jr_magic_string)); +} + + +static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs) +{ + return (is_reiserfs_3_5 (rs) || is_reiserfs_3_6 (rs) || + is_reiserfs_jr (rs)); +} + +static int reiserfs_remount (struct super_block * s, int * flags, char * data); +static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); + +static int reiserfs_sync_fs (struct super_block * s, int wait) +{ + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. + * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + } else { + s->s_dirt = 0; + } + return 0; +} + +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s, 1); +} + +static void reiserfs_write_super_lockfs (struct super_block * s) +{ + struct reiserfs_transaction_handle th ; + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1) ; + if (err) { + reiserfs_block_writes(&th) ; + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + reiserfs_block_writes(&th) ; + journal_end_sync(&th, s, 1) ; + } + } + s->s_dirt = 0; + reiserfs_write_unlock(s); +} + +static void reiserfs_unlockfs(struct super_block *s) { + reiserfs_allow_writes(s) ; +} + +extern const struct reiserfs_key MAX_KEY; + + +/* this is used to delete "save link" when there are no items of a + file it points to. 
It can either happen if unlink is completed but + "save unlink" removal, or if file has both unlink and truncate + pending and as unlink completes first (because key of "save link" + protecting unlink is bigger that a key lf "save link" which + protects truncate), so there left no items to make truncate + completion on */ +static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free) +{ + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item (&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid)); + + return journal_end (&th, s, JOURNAL_PER_BALANCE_CNT); +} + +#ifdef CONFIG_QUOTA +static int reiserfs_quota_on_mount(struct super_block *, int); +#endif + +/* look for uncompleted unlinks and truncates and complete them */ +static int finish_unfinished (struct super_block * s) +{ + INITIALIZE_PATH (path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key; + int retval = 0; + struct item_head * ih; + struct buffer_head * bh; + int item_pos; + char * item; + int done; + struct inode * inode; + int truncate; +#ifdef CONFIG_QUOTA + int i; + int ms_active_set; +#endif + + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key = MAX_KEY; + max_cpu_key.key_length = 3; + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, "reiserfs: cannot turn on journalled quota: 
error %d", ret); + } + } +#endif + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + retval = search_item (s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_warning (s, "vs-2140: finish_unfinished: search_by_key returned %d", + retval); + break; + } + + bh = get_last_bh (&path); + item_pos = get_item_pos (&path); + if (item_pos != B_NR_ITEMS (bh)) { + reiserfs_warning (s, "vs-2060: finish_unfinished: wrong position found"); + break; + } + item_pos --; + ih = B_N_PITEM_HEAD (bh, item_pos); + + if (le32_to_cpu (ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih (ih)) + truncate = 1; + else + truncate = 0; + + /* reiserfs_iget needs k_dirid and k_objectid only */ + item = B_I_PITEM (bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu (*(__u32 *)item); + obj_key.on_disk_key.k_objectid = le32_to_cpu (ih->ih_key.k_objectid); + obj_key.on_disk_key.u.k_offset_v1.k_offset = 0; + obj_key.on_disk_key.u.k_offset_v1.k_uniqueness = 0; + + pathrelse (&path); + + inode = reiserfs_iget (s, &obj_key); + if (!inode) { + /* the unlink almost completed, it just did not manage to remove + "save" link and release objectid */ + reiserfs_warning (s, "vs-2180: finish_unfinished: iget failed for %K", + &obj_key); + retval = remove_save_link_only (s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ + reiserfs_warning (s, "vs-2185: finish_unfinished: file %K is not unlinked", + &obj_key); + retval = remove_save_link_only (s, &save_link_key, 0); + continue; + } + DQUOT_INIT(inode); + + if (truncate && S_ISDIR (inode->i_mode) ) { + /* We got a truncate request for a dir which is impossible. + The only imaginable way is to execute unfinished truncate request + then boot into old kernel, remove the file and create dir with + the same key. 
*/ + reiserfs_warning(s, "green-2101: impossible truncate on a directory %k. Please report", INODE_PKEY (inode)); + retval = remove_save_link_only (s, &save_link_key, 0); + truncate = 0; + iput (inode); + continue; + } + + if (truncate) { + REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; + /* not completed truncate found. New size was committed together + with "save" link */ + reiserfs_info (s, "Truncating %k to %Ld ..", + INODE_PKEY (inode), inode->i_size); + reiserfs_truncate_file (inode, 0/*don't update modification time*/); + retval = remove_save_link (inode, truncate); + } else { + REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info (s, "Removing %k..", INODE_PKEY (inode)); + /* removal gets completed in iput */ + retval = 0; + } + + iput (inode); + printk ("done\n"); + done ++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i]) + vfs_quota_off_mount(s, i); + } + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; +#endif + pathrelse (&path); + if (done) + reiserfs_info (s, "There were %d uncompleted unlinks/truncates. " + "Completed\n", done); + return retval; +} + +/* to protect file being unlinked from getting lost we "safe" link files + being unlinked. This link will be deleted in the same transaction with last + item of file. 
mounting the filesytem we scan all these links and remove + files which almost got lost */ +void add_save_link (struct reiserfs_transaction_handle * th, + struct inode * inode, int truncate) +{ + INITIALIZE_PATH (path); + int retval; + struct cpu_key key; + struct item_head ih; + __u32 link; + + BUG_ON (!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE( truncate && + ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ), + "saved link already exists for truncated inode %lx", + ( long ) inode -> i_ino ); + RFALSE( !truncate && + ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ), + "saved link already exists for unlinked inode %lx", + ( long ) inode -> i_ino ); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset (&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type (&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head (&ih, &key, key.version, 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, + 4/*length*/, 0xffff/*free space*/); + } else { + /* truncate */ + if (S_ISDIR (inode->i_mode)) + reiserfs_warning(inode->i_sb, "green-2102: Adding a truncate savelink for a directory %k! 
Please report", INODE_PKEY(inode)); + set_cpu_key_k_offset (&key, 1); + set_cpu_key_k_type (&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head (&ih, &key, key.version, 1, TYPE_INDIRECT, + 4/*length*/, 0/*free space*/); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item (inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if ( retval != -ENOSPC ) + reiserfs_warning (inode->i_sb, "vs-2100: add_save_link:" + "search_by_key (%K) returned %d", &key, retval); + pathrelse (&path); + return; + } + + /* body of "save" link */ + link = INODE_PKEY (inode)->k_dir_id; + + /* put "save" link inot tree, don't charge quota to anyone */ + retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d", + retval); + } else { + if( truncate ) + REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; + else + REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; + } +} + + +/* this opens transaction unlike add_save_link */ +int remove_save_link (struct inode * inode, int truncate) +{ + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32 (MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY (inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset (KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset (KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + + if( ( truncate && + ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) || + ( !truncate && + ( 
REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) ) + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item (&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid (&th, inode->i_ino); + REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask; + } else + REISERFS_I(inode) -> i_flags &= ~i_link_saved_truncate_mask; + + return journal_end (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); +} + + +static void reiserfs_put_super (struct super_block * s) +{ + int i; + struct reiserfs_transaction_handle th ; + th.t_trans_id = 0; + + if (REISERFS_SB(s)->xattr_root) { + d_invalidate (REISERFS_SB(s)->xattr_root); + dput (REISERFS_SB(s)->xattr_root); + } + + if (REISERFS_SB(s)->priv_root) { + d_invalidate (REISERFS_SB(s)->priv_root); + dput (REISERFS_SB(s)->priv_root); + } + + /* change file system state to current state if it was mounted with read-write permissions */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + set_sb_umount_state( SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state ); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + } + } + + /* note, journal_release checks for readonly mount, and can decide not + ** to do a journal_end + */ + journal_release(&th, s) ; + + for (i = 0; i < SB_BMAP_NR (s); i ++) + brelse (SB_AP_BITMAP (s)[i].bh); + + vfree (SB_AP_BITMAP (s)); + + brelse (SB_BUFFER_WITH_SB (s)); + + print_statistics (s); + + if (REISERFS_SB(s)->s_kmallocs != 0) { + reiserfs_warning (s, "vs-2004: reiserfs_put_super: allocated memory left %d", + REISERFS_SB(s)->s_kmallocs); + } + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning (s, "green-2005: reiserfs_put_super: reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_proc_info_done( s ); + + kfree(s->s_fs_info); + s->s_fs_info = NULL; + + return; +} + +static kmem_cache_t * reiserfs_inode_cachep; + +static struct inode 
*reiserfs_alloc_inode(struct super_block *sb) +{ + struct reiserfs_inode_info *ei; + ei = (struct reiserfs_inode_info *)kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void reiserfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_LIST_HEAD(&ei->i_prealloc_list) ; + inode_init_once(&ei->vfs_inode); + ei->i_acl_access = NULL; + ei->i_acl_default = NULL; + } +} + +static int init_inodecache(void) +{ + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", + sizeof(struct reiserfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (reiserfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + if (kmem_cache_destroy(reiserfs_inode_cachep)) + reiserfs_warning (NULL, "reiserfs_inode_cache: not all structures were freed"); +} + +/* we don't mark inodes dirty, we just log them */ +static void reiserfs_dirty_inode (struct inode * inode) { + struct reiserfs_transaction_handle th ; + + int err = 0; + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, "clm-6006: writing inode %lu on readonly FS", + inode->i_ino) ; + return ; + } + reiserfs_write_lock(inode->i_sb); + + /* this is really only used for atime updates, so they don't have + ** to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1) ; + if (err) { + reiserfs_write_unlock (inode->i_sb); + return; + } + reiserfs_update_sd (&th, inode); + journal_end(&th, inode->i_sb, 1) ; + reiserfs_write_unlock(inode->i_sb); +} + +static void reiserfs_clear_inode (struct inode *inode) +{ + struct posix_acl *acl; + + acl = 
REISERFS_I(inode)->i_acl_access; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_access = NULL; + + acl = REISERFS_I(inode)->i_acl_default; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_default = NULL; +} + +#ifdef CONFIG_QUOTA +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t); +#endif + +static struct super_operations reiserfs_sops = +{ + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .delete_inode = reiserfs_delete_inode, + .clear_inode = reiserfs_clear_inode, + .put_super = reiserfs_put_super, + .write_super = reiserfs_write_super, + .sync_fs = reiserfs_sync_fs, + .write_super_lockfs = reiserfs_write_super_lockfs, + .unlockfs = reiserfs_unlockfs, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, +#ifdef CONFIG_QUOTA + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, +#endif +}; + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") + +static int reiserfs_dquot_initialize(struct inode *, int); +static int reiserfs_dquot_drop(struct inode *); +static int reiserfs_write_dquot(struct dquot *); +static int reiserfs_acquire_dquot(struct dquot *); +static int reiserfs_release_dquot(struct dquot *); +static int reiserfs_mark_dquot_dirty(struct dquot *); +static int reiserfs_write_info(struct super_block *, int); +static int reiserfs_quota_on(struct super_block *, int, int, char *); + +static struct dquot_operations reiserfs_quota_operations = +{ + .initialize = reiserfs_dquot_initialize, + .drop = reiserfs_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = 
reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, +}; + +static struct quotactl_ops reiserfs_qctl_operations = +{ + .quota_on = reiserfs_quota_on, + .quota_off = vfs_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk, +}; +#endif + +static struct export_operations reiserfs_export_ops = { + .encode_fh = reiserfs_encode_fh, + .decode_fh = reiserfs_decode_fh, + .get_parent = reiserfs_get_parent, + .get_dentry = reiserfs_get_dentry, +} ; + +/* this struct is used in reiserfs_getopt () for containing the value for those + mount options that have values rather than being toggles. */ +typedef struct { + char * value; + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ +} arg_desc_t; + +/* Set this bit in arg_required to allow empty arguments */ +#define REISERFS_OPT_ALLOWEMPTY 31 + +/* this struct is used in reiserfs_getopt() for describing the set of reiserfs + mount options */ +typedef struct { + char * option_name; + int arg_required; /* 0 if argument is not required, not 0 otherwise */ + const arg_desc_t * values; /* list of values accepted by an option */ + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. 
This is + applied BEFORE setmask */ +} opt_desc_t; + +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)}, + {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)}, + {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)}, + {NULL, 0} +}; + +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1<<REISERFS_BARRIER_NONE, 1<<REISERFS_BARRIER_FLUSH}, + {"flush", 1<<REISERFS_BARRIER_FLUSH, 1<<REISERFS_BARRIER_NONE}, + {NULL, 0} +}; + +/* possible values for "-o block-allocator=" and bits which are to be set in + s_mount_opt of reiserfs specific part of in-core super block */ +static const arg_desc_t balloc[] = { + {"noborder", 1<<REISERFS_NO_BORDER, 0}, + {"border", 0, 1<<REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1<<REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1<<REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1<<REISERFS_TEST4, 0}, + {"notest4", 0, 1<<REISERFS_TEST4}, + {NULL, 0, 0} +}; + +static const arg_desc_t tails[] = { + {"on", 1<<REISERFS_LARGETAIL, 1<<REISERFS_SMALLTAIL}, + {"off", 0, (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, + {"small", 1<<REISERFS_SMALLTAIL, 1<<REISERFS_LARGETAIL}, + {NULL, 0, 0} +}; + +static const arg_desc_t error_actions[] = { + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, +#endif + {NULL, 0, 0}, +}; + +int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k. + There might be broken applications that are + confused by this. 
Use nolargeio mount option + to get usual i/o size = PAGE_SIZE. + */ + +/* proceed only one option from a list *cur - string containing of mount options + opts - array of options which are accepted + opt_arg - if option is found and requires an argument and if it is specifed + in the input - pointer to the argument is stored here + bit_flags - if option requires to set a certain bit - it is set here + return -1 if unknown option is found, opt->arg_required otherwise */ +static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * opts, char ** opt_arg, + unsigned long * bit_flags) +{ + char * p; + /* foo=bar, + ^ ^ ^ + | | +-- option_end + | +-- arg_start + +-- option_start + */ + const opt_desc_t * opt; + const arg_desc_t * arg; + + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr (p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur) ++; + } + + if ( !strncmp (p, "alloc=", 6) ) { + /* Ugly special case, probably we should redo options parser so that + it can understand several arguments for some options, also so that + it can fill several bitfields with option values. 
*/ + if ( reiserfs_parse_alloc_options( s, p + 6) ) { + return -1; + } else { + return 0; + } + } + + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt ++) { + if (!strncmp (p, opt->option_name, strlen (opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning (s, "%s not supported.", p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning (s, "%s not supported.", p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning (s, "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen (opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning (s, "the option \"%s\" does not require an argument", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning (s, "the option \"%s\" requires an argument", opt->option_name); + return -1; + } + break; + default: + reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name); + return -1; + } + + /* move to the argument, or to next option if argument is not required */ + p ++; + + if ( opt->arg_required && !(opt->arg_required & (1<<REISERFS_OPT_ALLOWEMPTY)) && !strlen (p) ) { + /* this catches "option=," if not allowed */ + reiserfs_warning (s, "empty argument for \"%s\"", opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1<<REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg ++) { + if (!strcmp (p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + } + return opt->arg_required; + } + } + + reiserfs_warning (s, "bad value \"%s\" for option \"%s\"", p, opt->option_name); + return -1; +} + 
+/* returns 0 if something is wrong in option string, 1 - otherwise */ +static int reiserfs_parse_options (struct super_block * s, char * options, /* string given via mount's -o */ + unsigned long * mount_options, + /* after the parsing phase, contains the + collection of bitflags defining what + mount options were selected. */ + unsigned long * blocks, /* strtol-ed from NNN of resize=NNN */ + char ** jdev_name, + unsigned int * commit_max_age) +{ + int c; + char * arg = NULL; + char * pos; + opt_desc_t opts[] = { + /* Compatibility stuff, so that -o notail for old setups still work */ + {"tails", .arg_required = 't', .values = tails}, + {"notail", .clrmask = (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, + {"conv", .setmask = 1<<REISERFS_CONVERT}, + {"attrs", .setmask = 1<<REISERFS_ATTRS}, + {"noattrs", .clrmask = 1<<REISERFS_ATTRS}, +#ifdef CONFIG_REISERFS_FS_XATTR + {"user_xattr", .setmask = 1<<REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1<<REISERFS_XATTRS_USER}, +#else + {"user_xattr", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl", .setmask = 1<<REISERFS_POSIXACL}, + {"noacl", .clrmask = 1<<REISERFS_POSIXACL}, +#else + {"acl", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, + {"noacl", .clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, +#endif + {"nolog",}, /* This is unsupported */ + {"replayonly", .setmask = 1<<REPLAYONLY}, + {"block-allocator", .arg_required = 'a', .values = balloc}, + {"data", .arg_required = 'd', .values = logging_mode}, + {"barrier", .arg_required = 'b', .values = barrier_mode}, + {"resize", .arg_required = 'r', .values = NULL}, + {"jdev", .arg_required = 'j', .values = NULL}, + {"nolargeio", .arg_required = 'w', .values = NULL}, + {"commit", .arg_required = 'c', .values = NULL}, + {"usrquota",}, + {"grpquota",}, + {"errors", .arg_required = 'e', .values = error_actions}, + {"usrjquota", .arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), 
.values = NULL}, + {"grpjquota", .arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL}, + {"jqfmt", .arg_required = 'f', .values = NULL}, + {NULL,} + }; + + *blocks = 0; + if (!options || !*options) + /* use default configuration: create tails, journaling on, no + conversion to newest format */ + return 1; + + for (pos = options; pos; ) { + c = reiserfs_getopt (s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ + return 0; + + if (c == 'r') { + char * p; + + p = NULL; + /* "resize=NNN" */ + *blocks = simple_strtoul (arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); + return 0; + } + } + + if ( c == 'c' ) { + char *p = NULL; + unsigned long val = simple_strtoul (arg, &p, 0); + /* commit=NNN (time in seconds) */ + if ( *p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); return 0; + } + *commit_max_age = (unsigned int)val; + } + + if ( c == 'w' ) { + char *p=NULL; + int val = simple_strtoul (arg, &p, 0); + + if ( *p != '\0') { + reiserfs_warning (s, "reiserfs_parse_options: non-numeric value %s for nolargeio option", arg); + return 0; + } + if ( val ) + reiserfs_default_io_size = PAGE_SIZE; + else + reiserfs_default_io_size = 128 * 1024; + } + + if (c == 'j') { + if (arg && *arg && jdev_name) { + if ( *jdev_name ) { //Hm, already assigned? + reiserfs_warning (s, "reiserfs_parse_options: journal device was already specified to be %s", *jdev_name); + return 0; + } + *jdev_name = arg; + } + } + +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_enabled(s)) { + reiserfs_warning(s, "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); + return 0; + } + if (*arg) { /* Some filename specified? 
*/ + if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) { + reiserfs_warning(s, "reiserfs_parse_options: %s quota file already specified.", QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, "reiserfs_parse_options: quotafile must be on filesystem root."); + return 0; + } + REISERFS_SB(s)->s_qf_names[qtype] = kmalloc(strlen(arg)+1, GFP_KERNEL); + if (!REISERFS_SB(s)->s_qf_names[qtype]) { + reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); + return 0; + } + strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); + } + else { + if (REISERFS_SB(s)->s_qf_names[qtype]) { + kfree(REISERFS_SB(s)->s_qf_names[qtype]); + REISERFS_SB(s)->s_qf_names[qtype] = NULL; + } + } + } + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); + return 0; + } + } +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, "reiserfs_parse_options: journalled quota options not supported."); + return 0; + } +#endif + } + +#ifdef CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { + reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified."); + return 0; + } +#endif + return 1; +} + +static void switch_data_mode(struct super_block *s, unsigned long mode) { + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info (s, 
"switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info (s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info (s, "switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) { + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + +static void handle_attrs( struct super_block *s ) +{ + struct reiserfs_super_block * rs; + + if( reiserfs_attrs( s ) ) { + rs = SB_DISK_SUPER_BLOCK (s); + if( old_format_only(s) ) { + reiserfs_warning(s, "reiserfs: cannot support attributes on 3.5.x disk format" ); + REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); + return; + } + if( !( le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared ) ) { + reiserfs_warning(s, "reiserfs: cannot support attributes until flag is set in super-block" ); + REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); + } + } +} + +static int reiserfs_remount (struct super_block * s, int * mount_flags, char * arg) +{ + struct reiserfs_super_block * rs; + struct reiserfs_transaction_handle th ; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err; +#ifdef CONFIG_QUOTA + int 
i; +#endif + + rs = SB_DISK_SUPER_BLOCK (s); + + if (!reiserfs_parse_options(s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + if (REISERFS_SB(s)->s_qf_names[i]) { + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = NULL; + } +#endif + return -EINVAL; + } + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + + /* Update the bitmask, taking care to keep + * the bits we're not allowed to change here */ + REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)->s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if(commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + else if(commit_max_age == 0) + { + /* 0 means restore defaults. 
*/ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if(blocks) { + int rc = reiserfs_resize(s, blocks); + if (rc != 0) + return rc; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init (s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + return 0; + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + return 0; + } + + err = journal_begin(&th, s, 10) ; + if (err) + return err; + + /* Mounting a rw partition read-only. */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init (s, *mount_flags); + return 0; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted (journal)) + return journal->j_errno; + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; + s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ + err = journal_begin(&th, s, 10) ; + if (err) + return err; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state( rs, REISERFS_ERROR_FS ); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1 ; + err = journal_end(&th, s, 10) ; + if (err) + return err; + s->s_dirt = 0; + + if (!( *mount_flags & MS_RDONLY ) ) { + 
finish_unfinished( s ); + reiserfs_xattr_init (s, *mount_flags); + } + + return 0; +} + +/* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk. + * @sb - superblock for this filesystem + * @bi - the bitmap info to be loaded. Requires that bi->bh is valid. + * + * This routine counts how many free bits there are, finding the first zero + * as a side effect. Could also be implemented as a loop of test_bit() calls, or + * a loop of find_first_zero_bit() calls. This implementation is similar to + * find_first_zero_bit(), but doesn't return after it finds the first bit. + * Should only be called on fs mount, but should be fairly efficient anyways. + * + * bi->first_zero_hint is considered unset if it == 0, since the bitmap itself + * will * invariably occupt block 0 represented in the bitmap. The only + * exception to this is when free_count also == 0, since there will be no + * free blocks at all. + */ + +static void load_bitmap_info_data (struct super_block *sb, + struct reiserfs_bitmap_info *bi) +{ + unsigned long *cur = (unsigned long *)bi->bh->b_data; + + while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) { + + /* No need to scan if all 0's or all 1's. 
+ * Since we're only counting 0's, we can simply ignore all 1's */ + if (*cur == 0) { + if (bi->first_zero_hint == 0) { + bi->first_zero_hint = ((char *)cur - bi->bh->b_data) << 3; + } + bi->free_count += sizeof(unsigned long)*8; + } else if (*cur != ~0L) { + int b; + for (b = 0; b < sizeof(unsigned long)*8; b++) { + if (!reiserfs_test_le_bit (b, cur)) { + bi->free_count ++; + if (bi->first_zero_hint == 0) + bi->first_zero_hint = + (((char *)cur - bi->bh->b_data) << 3) + b; + } + } + } + cur ++; + } + +#ifdef CONFIG_REISERFS_CHECK +// This outputs a lot of unneded info on big FSes +// reiserfs_warning ("bitmap loaded from block %d: %d free blocks", +// bi->bh->b_blocknr, bi->free_count); +#endif +} + +static int read_bitmaps (struct super_block * s) +{ + int i, bmap_nr; + + SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + if (SB_AP_BITMAP (s) == 0) + return 1; + memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + for (i = 0, bmap_nr = REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1; + i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) { + SB_AP_BITMAP (s)[i].bh = sb_getblk(s, bmap_nr); + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) + ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh); + } + for (i = 0; i < SB_BMAP_NR(s); i++) { + wait_on_buffer(SB_AP_BITMAP (s)[i].bh); + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { + reiserfs_warning(s,"sh-2029: reiserfs read_bitmaps: " + "bitmap block (#%lu) reading failed", + SB_AP_BITMAP(s)[i].bh->b_blocknr); + for (i = 0; i < SB_BMAP_NR(s); i++) + brelse(SB_AP_BITMAP(s)[i].bh); + vfree(SB_AP_BITMAP(s)); + SB_AP_BITMAP(s) = NULL; + return 1; + } + load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); + } + return 0; +} + +static int read_old_bitmaps (struct super_block * s) +{ + int i ; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s); + int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ + + /* read 
true bitmap */ + SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); + if (SB_AP_BITMAP (s) == 0) + return 1; + + memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); + + for (i = 0; i < sb_bmap_nr(rs); i ++) { + SB_AP_BITMAP (s)[i].bh = sb_bread (s, bmp1 + i); + if (!SB_AP_BITMAP (s)[i].bh) + return 1; + load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); + } + + return 0; +} + +static int read_super_block (struct super_block * s, int offset) +{ + struct buffer_head * bh; + struct reiserfs_super_block * rs; + int fs_blocksize; + + + bh = sb_bread (s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning (s, "sh-2006: read_super_block: " + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string (rs)) { + brelse (bh); + return 1; + } + + // + // ok, reiserfs signature (old or new) found in at the given offset + // + fs_blocksize = sb_blocksize(rs); + brelse (bh); + sb_set_blocksize (s, fs_blocksize); + + bh = sb_bread (s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning (s, "sh-2007: read_super_block: " + "bread failed (dev %s, block %lu, size %lu)\n", + reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning (s, "sh-2011: read_super_block: " + "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", + reiserfs_bdevname (s), (unsigned long long)bh->b_blocknr, s->s_blocksize); + brelse (bh); + return 1; + } + + if ( rs->s_v1.s_root_block == -1 ) { + brelse(bh) ; + reiserfs_warning (s, "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a completion. 
If that fails\n" + "get newer reiserfsprogs package"); + return 1; + } + + SB_BUFFER_WITH_SB (s) = bh; + SB_DISK_SUPER_BLOCK (s) = rs; + + if (is_reiserfs_jr (rs)) { + /* magic is of non-standard journal filesystem, look at s_version to + find which format is in use */ + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_warning (s, "read_super_block: found reiserfs format \"3.6\"" + " with non-standard journal"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_warning (s, "read_super_block: found reiserfs format \"3.5\"" + " with non-standard journal"); + else { + reiserfs_warning (s, "sh-2012: read_super_block: found unknown " + "format \"%u\" of reiserfs with non-standard magic", + sb_version(rs)); + return 1; + } + } + else + /* s_version of standard format may contain incorrect information, + so we just look at the magic string */ + reiserfs_info (s, "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5 (rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; +#ifdef CONFIG_QUOTA + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; +#endif + + /* new format is limited by the 32 bit wide i_blocks field, want to + ** be one full block below that. 
+ */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize ; + return 0; +} + + + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) { + int i ; + ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ; + wait_on_buffer(SB_BUFFER_WITH_SB(s)) ; + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning (s, "reread_meta_blocks, error reading the super") ; + return 1 ; + } + + for (i = 0; i < SB_BMAP_NR(s) ; i++) { + ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)) ; + wait_on_buffer(SB_AP_BITMAP(s)[i].bh) ; + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { + reiserfs_warning (s, "reread_meta_blocks, error reading bitmap block number %d at %llu", + i, (unsigned long long)SB_AP_BITMAP(s)[i].bh->b_blocknr) ; + return 1 ; + } + } + return 0 ; + +} + + +///////////////////////////////////////////////////// +// hash detection stuff + + +// if root directory is empty - we set default - Yura's - hash and +// warn about it +// FIXME: we look for only one name in a directory. 
If tea and yura +// bith have the same value - we ask user to send report to the +// mailing list +static __u32 find_hash_out (struct super_block * s) +{ + int retval; + struct inode * inode; + struct cpu_key key; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + __u32 hash = DEFAULT_HASH; + + inode = s->s_root->d_inode; + + do { // Some serious "goto"-hater was there ;) + u32 teahash, r5hash, yurahash; + + make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key (s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse (&path); + return UNSET_HASH ; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num --; + set_de_name_and_namelen (&de); + if (deh_offset( &(de.de_deh[de.de_entry_num]) ) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) { + hash = YURA_HASH ; + } + reiserfs_warning(s,"FS seems to be empty, autodetect " + "is using the default hash"); + break; + } + r5hash=GET_HASH_VALUE (r5_hash (de.de_name, de.de_namelen)); + teahash=GET_HASH_VALUE (keyed_hash (de.de_name, de.de_namelen)); + yurahash=GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen)); + if ( ( (teahash == r5hash) && (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) ) || + ( (teahash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) || + ( (r5hash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ) { + reiserfs_warning(s,"Unable to automatically detect hash function. 
" + "Please mount with -o hash={tea,rupasov,r5}", + reiserfs_bdevname (s)); + hash = UNSET_HASH; + break; + } + if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == yurahash) + hash = YURA_HASH; + else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == teahash) + hash = TEA_HASH; + else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == r5hash) + hash = R5_HASH; + else { + reiserfs_warning (s,"Unrecognised hash function"); + hash = UNSET_HASH; + } + } while (0); + + pathrelse (&path); + return hash; +} + +// finds out which hash names are sorted with +static int what_hash (struct super_block * s) +{ + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* reiserfs_hash_detect() == true if any of the hash mount options + ** were used. We must check them to make sure the user isn't + ** using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out (s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* detection has found the hash, and we must check against the + ** mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning (s, "Error, %s hash detected, " + "unable to force rupasov hash", reiserfs_hashname(code)) ; + code = UNSET_HASH ; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning (s, "Error, %s hash detected, " + "unable to force tea hash", reiserfs_hashname(code)) ; + code = UNSET_HASH ; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning (s, "Error, %s hash detected, " + "unable to force r5 hash", reiserfs_hashname(code)) ; + code = UNSET_HASH ; + } + } else { + /* find_hash_out was not called or could not determine the hash */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH ; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH ; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH ; + } + } + + /* if we are mounted RW, and we have a new valid hash 
code, update + ** the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; +} + +// return pointer to appropriate function +static hashf_t hash_function (struct super_block * s) +{ + switch (what_hash (s)) { + case TEA_HASH: + reiserfs_info (s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info (s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info (s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +// this is used to set up correct value for old partitions +static int function2code (hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG() ; // should never happen + + return 0; +} + +#define SWARN(silent, s, ...) \ + if (!(silent)) \ + reiserfs_warning (s, __VA_ARGS__) + +static int reiserfs_fill_super (struct super_block * s, void * data, int silent) +{ + struct inode *root_inode; + int j; + struct reiserfs_transaction_handle th ; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0 ; + struct reiserfs_iget_args args ; + struct reiserfs_super_block * rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + + sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) { + errval = -ENOMEM; + goto error; + } + s->s_fs_info = sbi; + memset (sbi, 0, sizeof (struct reiserfs_sb_info)); + /* Set default values for options: non-aggressive tails, RO on errors */ + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); + /* no preallocation minimum, be smart in + reiserfs_file_write instead */ + REISERFS_SB(s)->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks 
(17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* Initialize the rwsem for xattr dir */ + init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); + + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + + jdev_name = NULL; + if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { + goto error; + } + + if (blocks) { + SWARN (silent, s, "jmacd-7: reiserfs_fill_super: resize option " + "for remount only"); + goto error; + } + + /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + if (!read_super_block (s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + /* try new format (64-th 1k block), which can contain reiserfs super block */ + else if (read_super_block (s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", reiserfs_bdevname (s)); + goto error; + } + + rs = SB_DISK_SUPER_BLOCK (s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. If the check fails then abort and scream, + because bad stuff will happen otherwise. */ + if ( s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs)*sb_blocksize(rs)) { + SWARN (silent, s, "Filesystem on %s cannot be mounted because it is bigger than the device", reiserfs_bdevname(s)); + SWARN(silent, s, "You may need to run fsck or increase size of your LVM partition"); + SWARN(silent, s, "Or may be you forgot to reboot after fdisk when it told you to"); + goto error; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS ; + + if (old_format ? 
read_old_bitmaps(s) : read_bitmaps(s)) { + SWARN(silent, s, "jmacd-8: reiserfs_fill_super: unable to read bitmap"); + goto error; + } +#ifdef CONFIG_REISERFS_CHECK + SWARN (silent, s, "CONFIG_REISERFS_CHECK is set ON"); + SWARN (silent, s, "- it is slow mode for debugging."); +#endif + + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) + { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info (s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info (s, "using ordered data mode\n"); + } else { + reiserfs_info (s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + + // set_device_ro(s->s_dev, 1) ; + if( journal_init(s, jdev_name, old_format, commit_max_age) ) { + SWARN(silent, s, "sh-2022: reiserfs_fill_super: unable to initialize journal space") ; + goto error ; + } else { + jinit_done = 1 ; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + if (reread_meta_blocks(s)) { + SWARN(silent, s, "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init") ; + goto error ; + } + + if (replay_only (s)) + goto error; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, "clm-7000: Detected readonly device, marking FS readonly") ; + s->s_flags |= MS_RDONLY ; + } + args.objectid = REISERFS_ROOT_OBJECTID ; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID ; + root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); + if (!root_inode) { + SWARN(silent, s, "jmacd-10: reiserfs_fill_super: get root inode failed"); + goto error; + } + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = 
d_alloc_root(root_inode); + if (!s->s_root) { + iput(root_inode); + goto error; + } + + // define and initialize hash function + sbi->s_hash_function = hash_function (s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root) ; + s->s_root = NULL ; + goto error ; + } + + if (is_reiserfs_3_5 (rs) || (is_reiserfs_jr (rs) && SB_VERSION (s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &(sbi->s_properties)); + else + set_bit(REISERFS_3_6, &(sbi->s_properties)); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1) ; + if (errval) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + + set_sb_umount_state( rs, REISERFS_ERROR_FS ); + set_sb_fs_state (rs, 0); + + if (old_format_only(s)) { + /* filesystem of format 3.5 either with standard or non-standard + journal */ + if (convert_reiserfs (s)) { + /* and -o conv is given */ + if(!silent) + reiserfs_info (s,"converting 3.5 filesystem to the 3.6 format") ; + + if (is_reiserfs_3_5 (rs)) + /* put magic string of 3.6 format. 
2.2 will not be able to + mount this filesystem anymore */ + memcpy (rs->s_v1.s_magic, reiserfs_3_6_magic_string, + sizeof (reiserfs_3_6_magic_string)); + + set_sb_version(rs,REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s) ; + set_bit(REISERFS_3_6, &(sbi->s_properties)); + clear_bit(REISERFS_3_5, &(sbi->s_properties)); + } else if (!silent){ + reiserfs_info (s, "using 3.5.x disk format\n") ; + } + } + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + errval = journal_end(&th, s, 1) ; + if (errval) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + + if ((errval = reiserfs_xattr_init (s, s->s_flags))) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + + /* look for files which were to be removed in previous session */ + finish_unfinished (s); + } else { + if ( old_format_only(s) && !silent) { + reiserfs_info (s, "using 3.5.x disk format\n") ; + } + + if ((errval = reiserfs_xattr_init (s, s->s_flags))) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + } + // mark hash in super block: it could be unset. 
overwrite should be ok + set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) ); + + handle_attrs( s ); + + reiserfs_proc_info_init( s ); + + init_waitqueue_head (&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + return (0); + + error: + if (jinit_done) { /* kill the commit thread, free journal ram */ + journal_release_error(NULL, s) ; + } + if (SB_DISK_SUPER_BLOCK (s)) { + for (j = 0; j < SB_BMAP_NR (s); j ++) { + if (SB_AP_BITMAP (s)) + brelse (SB_AP_BITMAP (s)[j].bh); + } + if (SB_AP_BITMAP (s)) + vfree (SB_AP_BITMAP (s)); + } + if (SB_BUFFER_WITH_SB (s)) + brelse(SB_BUFFER_WITH_SB (s)); +#ifdef CONFIG_QUOTA + for (j = 0; j < MAXQUOTAS; j++) { + if (sbi->s_qf_names[j]) + kfree(sbi->s_qf_names[j]); + } +#endif + if (sbi != NULL) { + kfree(sbi); + } + + s->s_fs_info = NULL; + return errval; +} + + +static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf) +{ + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + + buf->f_namelen = (REISERFS_MAX_NAME (s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = s->s_blocksize; + /* changed to accommodate gcc folks.*/ + buf->f_type = REISERFS_SUPER_MAGIC; + return 0; +} + +#ifdef CONFIG_QUOTA +static int reiserfs_dquot_initialize(struct inode *inode, int type) +{ + struct reiserfs_transaction_handle th; + int ret; + + /* We may create quota structure so we need to reserve enough blocks */ + reiserfs_write_lock(inode->i_sb); + journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); + ret = dquot_initialize(inode, type); + journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_dquot_drop(struct inode *inode) +{ + struct reiserfs_transaction_handle th; + int ret; + + /* We may delete quota structure so we need to reserve enough blocks */ + reiserfs_write_lock(inode->i_sb); + 
journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); + ret = dquot_drop(inode); + journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_write_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret; + + reiserfs_write_lock(dquot->dq_sb); + journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); + ret = dquot_commit(dquot); + journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_acquire_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret; + + reiserfs_write_lock(dquot->dq_sb); + journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); + ret = dquot_acquire(dquot); + journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_release_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret; + + reiserfs_write_lock(dquot->dq_sb); + journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); + ret = dquot_release(dquot); + journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journalling quotas? 
*/ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } + else + return dquot_mark_dquot_dirty(dquot); +} + +static int reiserfs_write_info(struct super_block *sb, int type) +{ + struct reiserfs_transaction_handle th; + int ret; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + journal_begin(&th, sb, 2); + ret = dquot_commit_info(sb, type); + journal_end(&th, sb, 2); + reiserfs_write_unlock(sb); + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int reiserfs_quota_on_mount(struct super_block *sb, int type) +{ + int err; + struct dentry *dentry; + struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type], + .hash = 0, + .len = strlen(REISERFS_SB(sb)->s_qf_names[type])}; + + dentry = lookup_hash(&name, sb->s_root); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry); + /* Now invalidate and put the dentry - quota got its own reference + * to inode and dentry has at least wrong hash so we had better + * throw it away */ + d_invalidate(dentry); + dput(dentry); + return err; +} + +/* + * Standard function to be called on quota_on + */ +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, char *path) +{ + int err; + struct nameidata nd; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + /* Quotafile not on the same filesystem? */ + if (nd.mnt->mnt_sb != sb) { + path_release(&nd); + return -EXDEV; + } + /* We must not pack tails for quota files on reiserfs for quota IO to work */ + if (!REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask) { + reiserfs_warning(sb, "reiserfs: Quota file must have tail packing disabled."); + path_release(&nd); + return -EINVAL; + } + /* Not journalling quota? No more tests needed... 
*/ + if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && + !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); + } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " + "Journalled quota will not work."); + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* Quota files are without tails so we can safely use this function */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? 
*/ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t reiserfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + down(&inode->i_sem); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, sb, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + up(&inode->i_sem); + return len - towrite; +} + +#endif + +static struct super_block* +get_super_block (struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return get_sb_bdev(fs_type, flags, 
dev_name, data, reiserfs_fill_super); +} + +static int __init +init_reiserfs_fs ( void ) +{ + int ret; + + if ((ret = init_inodecache ())) { + return ret; + } + + if ((ret = reiserfs_xattr_register_handlers ())) + goto failed_reiserfs_xattr_register_handlers; + + reiserfs_proc_info_global_init (); + reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc); + + ret = register_filesystem (& reiserfs_fs_type); + + if (ret == 0) { + return 0; + } + + reiserfs_xattr_unregister_handlers (); + +failed_reiserfs_xattr_register_handlers: + reiserfs_proc_unregister_global ("version"); + reiserfs_proc_info_global_done (); + destroy_inodecache (); + + return ret; +} + +static void __exit +exit_reiserfs_fs ( void ) +{ + reiserfs_xattr_unregister_handlers (); + reiserfs_proc_unregister_global ("version"); + reiserfs_proc_info_global_done (); + unregister_filesystem (& reiserfs_fs_type); + destroy_inodecache (); +} + +struct file_system_type reiserfs_fs_type = { + .owner = THIS_MODULE, + .name = "reiserfs", + .get_sb = get_super_block, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_DESCRIPTION ("ReiserFS journaled filesystem"); +MODULE_AUTHOR ("Hans Reiser <reiser@namesys.com>"); +MODULE_LICENSE ("GPL"); + +module_init (init_reiserfs_fs); +module_exit (exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c new file mode 100644 index 000000000000..6191909d5165 --- /dev/null +++ b/fs/reiserfs/tail_conversion.c @@ -0,0 +1,276 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + +#include <linux/config.h> +#include <linux/time.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/reiserfs_fs.h> + +/* access to tail : when one is going to read tail it must make sure, that is not running. + direct2indirect and indirect2direct can not run concurrently */ + + +/* Converts direct items to an unformatted node. 
Panics if file has no + tail. -ENOSPC if no disk space for conversion */ +/* path points to first direct item of the file regarless of how many of + them are there */ +int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode, + struct path * path, struct buffer_head * unbh, + loff_t tail_offset) +{ + struct super_block * sb = inode->i_sb; + struct buffer_head *up_to_date_bh ; + struct item_head * p_le_ih = PATH_PITEM_HEAD (path); + unsigned long total_tail = 0 ; + struct cpu_key end_key; /* Key to search for the last byte of the + converted item. */ + struct item_head ind_ih; /* new indirect item to be inserted or + key of unfm pointer to be pasted */ + int n_blk_size, + n_retval; /* returned value for reiserfs_insert_item and clones */ + unp_t unfm_ptr; /* Handle on an unformatted node + that will be inserted in the + tree. */ + + BUG_ON (!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect ++; + + n_blk_size = sb->s_blocksize; + + /* and key to search for append or insert pointer to the new + unformatted node. */ + copy_item_head (&ind_ih, p_le_ih); + set_le_ih_k_offset (&ind_ih, tail_offset); + set_le_ih_k_type (&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + // FIXME: we could avoid this + if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) { + reiserfs_warning (sb, "PAP-14030: direct2indirect: " + "pasted or inserted byte exists in the tree %K. " + "Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = PATH_PITEM_HEAD (path); + + unfm_ptr = cpu_to_le32 (unbh->b_blocknr); + + if ( is_statdata_le_ih (p_le_ih) ) { + /* Insert new indirect item. 
*/ + set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len( &ind_ih, UNFM_P_SIZE ); + PATH_LAST_POSITION (path)++; + n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, UNFM_P_SIZE); + } + if ( n_retval ) { + return n_retval; + } + + // note: from here there are two keys which have matching first + // three key components. They only differ by the fourth one. + + + /* Set the key to search for the direct items of the file */ + make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4); + + /* Move bytes from the direct items to the new unformatted node + and delete them. */ + while (1) { + int tail_size; + + /* end_key.k_offset is set so, that we will always have found + last item of the file */ + if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) + reiserfs_panic (sb, "PAP-14050: direct2indirect: " + "direct item (%K) not found", &end_key); + p_le_ih = PATH_PITEM_HEAD (path); + RFALSE( !is_direct_le_ih (p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset (p_le_ih) & (n_blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* we only send the unbh pointer if the buffer is not up to date. + ** this avoids overwriting good data from writepage() with old data + ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. 
+ */ + if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL ; + } else { + up_to_date_bh = unbh ; + } + n_retval = reiserfs_delete_item (th, path, &end_key, inode, + up_to_date_bh) ; + + total_tail += n_retval ; + if (tail_size == n_retval) + // done: file does not have direct items anymore + break; + + } + /* if we've copied bytes from disk into the page, we need to zero + ** out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr=kmap_atomic(up_to_date_bh->b_page, KM_USER0); + memset(kaddr + pgoff, 0, n_blk_size - total_tail) ; + kunmap_atomic(kaddr, KM_USER0); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} + + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) { + lock_buffer(bh) ; + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG() ; + } + clear_buffer_dirty(bh) ; + /* Remove the buffer from whatever list it belongs to. We are mostly + interested in removing it from per-sb j_dirty_buffers list, to avoid + BUG() on attempt to write not mapped buffer */ + if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh) ; + clear_buffer_req(bh) ; + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh) ; +} + +/* this first locks inode (neither reads nor sync are permitted), + reads tail through page cache, insert direct item. When direct item + inserted successfully inode is left locked. Return value is always + what we expect from it (number of cut bytes). 
But when tail remains + in the unformatted node, we set mode to SKIP_BALANCING and unlock + inode */ +int indirect2direct (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, + struct page *page, + struct path * p_s_path, /* path to the indirect item. */ + const struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char * p_c_mode) +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + struct item_head s_ih; + unsigned long n_block_size = p_s_sb->s_blocksize; + char * tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + BUG_ON (!th->t_trans_id); + + REISERFS_SB(p_s_sb)->s_indirect2direct ++; + + *p_c_mode = M_SKIP_BALANCING; + + /* store item head path points to. */ + copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path)); + + tail_len = (n_new_file_size & (n_block_size - 1)); + if (get_inode_sd_version (p_s_inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP (tail_len); + else + round_tail_len = tail_len; + + pos = le_ih_k_offset (&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; + pos1 = pos; + + // we are protected by i_sem. 
The tail can not disapper, not + // append can be done either + // we are in truncate or packing tail in file_release + + tail = (char *)kmap(page) ; /* this can schedule */ + + if (path_changed (&s_ih, p_s_path)) { + /* re-search indirect item */ + if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ) + reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: " + "item to be converted %K does not exist", p_s_item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset (&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; + if (pos != pos1) + reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: " + "tail position changed while we were reading it"); +#endif + } + + + /* Set direct item header to insert. */ + make_le_item_head (&s_ih, NULL, get_inode_item_key_version (p_s_inode), pos1 + 1, + TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/); + + /* we want a pointer to the first byte of the tail in the page. + ** the page was locked and this part of the page was up to date when + ** indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ; + + PATH_LAST_POSITION(p_s_path)++; + + key = *p_s_item_key; + set_cpu_key_k_type (&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, + tail ? tail : NULL) < 0 ) { + /* No disk memory. So we can not convert last unformatted node + to the direct item. In this case we used to adjust + indirect items's ih_free_space. Now ih_free_space is not + used, it would be ideal to write zeros to corresponding + unformatted node. 
For now i_size is considered as guard for + going out of file size */ + kunmap(page) ; + return n_block_size - round_tail_len; + } + kunmap(page) ; + + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, p_s_inode); + + // note: we have now the same as in above direct2indirect + // conversion: there are two keys which have matching first three + // key components. They only differ by the fouhth one. + + /* We have inserted new direct item and must remove last + unformatted node. */ + *p_c_mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + //mark_file_with_tail (p_s_inode, pos1 + 1); + REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; + + return n_block_size - round_tail_len; +} + + + diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c new file mode 100644 index 000000000000..45582fe8b466 --- /dev/null +++ b/fs/reiserfs/xattr.c @@ -0,0 +1,1450 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com> + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in it's own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. 
+ * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel, in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + */ + +#include <linux/reiserfs_fs.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/xattr.h> +#include <linux/reiserfs_xattr.h> +#include <linux/reiserfs_acl.h> +#include <linux/mbcache.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> +#include <linux/smp_lock.h> +#include <linux/stat.h> +#include <asm/semaphore.h> + +#define FL_READONLY 128 +#define FL_DIR_SEM_HELD 256 +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + +static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix); + +static struct dentry * +create_xa_root (struct super_block *sb) +{ + struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root); + struct dentry *xaroot; + + /* This needs to be created at mount-time */ + if (!privroot) + return ERR_PTR(-EOPNOTSUPP); + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + int err; + down (&privroot->d_inode->i_sem); + err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700); + up (&privroot->d_inode->i_sem); + + if (err) { + dput (xaroot); + dput (privroot); + return ERR_PTR (err); + } + REISERFS_SB(sb)->xattr_root = dget (xaroot); + } + +out: + dput (privroot); + return xaroot; +} + +/* This will return a dentry, or error, refering to the xa root directory. + * If the xa root doesn't exist yet, the dentry will be returned without + * an associated inode. This dentry can be used with ->mkdir to create + * the xa directory. 
*/ +static struct dentry * +__get_xa_root (struct super_block *s) +{ + struct dentry *privroot = dget (REISERFS_SB(s)->priv_root); + struct dentry *xaroot = NULL; + + if (IS_ERR (privroot) || !privroot) + return privroot; + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + dput (xaroot); + xaroot = NULL; + goto out; + } + + REISERFS_SB(s)->xattr_root = dget (xaroot); + +out: + dput (privroot); + return xaroot; +} + +/* Returns the dentry (or NULL) referring to the root of the extended + * attribute directory tree. If it has already been retreived, it is used. + * Otherwise, we attempt to retreive it from disk. It may also return + * a pointer-encoded error. + */ +static inline struct dentry * +get_xa_root (struct super_block *s) +{ + struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root); + + if (!dentry) + dentry = __get_xa_root (s); + + return dentry; +} + +/* Opens the directory corresponding to the inode's extended attribute store. + * If flags allow, the tree to the directory may be created. If creation is + * prohibited, -ENODATA is returned. 
*/
static struct dentry *
open_xa_dir (const struct inode *inode, int flags)
{
    struct dentry *xaroot, *xadir;
    /* NOTE(review): "%X.%X" of two 32-bit values can need 17 characters plus
     * the terminating NUL, in which case snprintf() silently drops the last
     * digit.  The size is left unchanged because it decides the names of
     * directories that already exist on disk -- confirm before widening. */
    char namebuf[17];

    xaroot = get_xa_root (inode->i_sb);
    if (IS_ERR (xaroot)) {
        return xaroot;
    } else if (!xaroot) {
        /* no xa root yet: create it only when the caller may create */
        if (flags == 0 || flags & XATTR_CREATE) {
            xaroot = create_xa_root (inode->i_sb);
            if (IS_ERR (xaroot))
                return xaroot;
        }
        if (!xaroot)
            return ERR_PTR (-ENODATA);
    }

    /* ok, we have xaroot open */

    /* directory name is "<objectid>.<generation>" in capital hex */
    snprintf (namebuf, sizeof (namebuf), "%X.%X",
              le32_to_cpu (INODE_PKEY (inode)->k_objectid),
              inode->i_generation);
    xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf));
    if (IS_ERR (xadir)) {
        dput (xaroot);
        return xadir;
    }

    if (!xadir->d_inode) {
        int err;
        if (flags == 0 || flags & XATTR_CREATE) {
            /* Although there is nothing else trying to create this directory,
             * another directory with the same hash may be created, so we need
             * to protect against that: hold the parent's i_sem across
             * ->mkdir, the same way create_xa_root() does. */
            down (&xaroot->d_inode->i_sem);
            err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700);
            up (&xaroot->d_inode->i_sem);
            if (err) {
                dput (xaroot);
                dput (xadir);
                return ERR_PTR (err);
            }
        }
        /* still negative: creation was not allowed */
        if (!xadir->d_inode) {
            dput (xaroot);
            dput (xadir);
            return ERR_PTR (-ENODATA);
        }
    }

    dput (xaroot);
    return xadir;
}

/* Returns a dentry corresponding to a specific extended attribute file
 * for the inode. If flags allow, the file is created. Otherwise, a
 * valid or negative dentry, or an error is returned.
*/ +static struct dentry * +get_xa_file_dentry (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir (inode, flags); + if (IS_ERR (xadir)) { + return ERR_PTR (PTR_ERR (xadir)); + } else if (xadir && !xadir->d_inode) { + dput (xadir); + return ERR_PTR (-ENODATA); + } + + xafile = lookup_one_len (name, xadir, strlen (name)); + if (IS_ERR (xafile)) { + dput (xadir); + return ERR_PTR (PTR_ERR (xafile)); + } + + if (xafile->d_inode) { /* file exists */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + dput (xafile); + goto out; + } + } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { + goto out; + } else { + /* inode->i_sem is down, so nothing else can try to create + * the same xattr */ + err = xadir->d_inode->i_op->create (xadir->d_inode, xafile, + 0700|S_IFREG, NULL); + + if (err) { + dput (xafile); + goto out; + } + } + +out: + dput (xadir); + if (err) + xafile = ERR_PTR (err); + return xafile; +} + + +/* Opens a file pointer to the attribute associated with inode */ +static struct file * +open_xa_file (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xafile; + struct file *fp; + + xafile = get_xa_file_dentry (inode, name, flags); + if (IS_ERR (xafile)) + return ERR_PTR (PTR_ERR (xafile)); + else if (!xafile->d_inode) { + dput (xafile); + return ERR_PTR (-ENODATA); + } + + fp = dentry_open (xafile, NULL, O_RDWR); + /* dentry_open dputs the dentry if it fails */ + + return fp; +} + + +/* + * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but + * we need to drop the path before calling the filldir struct. That + * would be a big performance hit to the non-xattr case, so I've copied + * the whole thing for now. --clm + * + * the big difference is that I go backwards through the directory, + * and don't mess with f->f_pos, but the idea is the same. Do some + * action on each and every entry in the directory. 
+ * + * we're called with i_sem held, so there are no worries about the directory + * changing underneath us. + */ +static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH (path_to_entry); + struct buffer_head * bh; + int entry_num; + struct item_head * ih, tmp_ih; + int search_res; + char * local_buf; + loff_t next_pos; + char small_buf[32] ; /* avoid kmalloc if we can */ + struct reiserfs_de_head *deh; + int d_reclen; + char * d_name; + off_t d_off; + ino_t d_ino; + struct reiserfs_dir_entry de; + + + /* form key for search the next directory entry using f_pos field of + file structure */ + next_pos = max_reiserfs_offset(inode); + + while (1) { +research: + if (next_pos <= DOT_DOT_OFFSET) + break; + make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); + + search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + pathrelse(&path_to_entry); + return -EIO; + } + + if (search_res == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + entry_num = de.de_entry_num; + deh = &(de.de_deh[entry_num]); + + bh = de.de_bh; + ih = de.de_ih; + + if (!is_direntry_le_ih(ih)) { + reiserfs_warning(inode->i_sb, "not direntry %h", ih); + break; + } + copy_item_head(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + + if (deh_offset(deh) <= DOT_DOT_OFFSET) { + break; + } + + /* look for the previous entry in the directory */ + next_pos = deh_offset (deh) - 1; + + if (!de_visible (deh)) + /* it is hidden entry */ + continue; + + d_reclen = entry_length(bh, ih, entry_num); + 
d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); + d_off = deh_offset (deh); + d_ino = deh_objectid (deh); + + if (!d_name[d_reclen - 1]) + d_reclen = strlen (d_name); + + if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ + /* too big to send back to VFS */ + continue ; + } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) + continue; + + if (d_reclen <= 32) { + local_buf = small_buf ; + } else { + local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; + if (!local_buf) { + pathrelse (&path_to_entry); + return -ENOMEM ; + } + if (item_moved (&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + + /* sigh, must retry. Do this same offset again */ + next_pos = d_off; + goto research; + } + } + + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy (local_buf, d_name, d_reclen); + + /* the filldir function might need to start transactions, + * or do who knows what. 
Release the path now that we've + * copied all the important stuff out of the deh + */ + pathrelse (&path_to_entry); + + if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + } /* while */ + +end: + pathrelse (&path_to_entry); + return 0; +} + +/* + * this could be done with dedicated readdir ops for the xattr files, + * but I want to get something working asap + * this is stolen from vfs_readdir + * + */ +static +int xattr_readdir(struct file *file, filldir_t filler, void *buf) +{ + struct inode *inode = file->f_dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + down(&inode->i_sem); +// down(&inode->i_zombie); + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + lock_kernel(); + res = __xattr_readdir(file, buf, filler); + unlock_kernel(); + } +// up(&inode->i_zombie); + up(&inode->i_sem); +out: + return res; +} + + +/* Internal operations on file data */ +static inline void +reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page * +reiserfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occured - GFP_NOFS avoids this */ + mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; + page = read_cache_page (mapping, n, + (filler_t*)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + + if (PageError(page)) + goto fail; + } + return page; + +fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 +xattr_hash (const char *msg, int len) +{ + return csum_partial (msg, len, 0); +} + +/* Generic 
extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) +{ + int err = 0; + struct file *fp; + struct page *page; + char *data; + struct address_space *mapping; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct inode *xinode; + struct iattr newattrs; + __u32 xahash = 0; + + if (IS_RDONLY (inode)) + return -EROFS; + + if (IS_IMMUTABLE (inode) || IS_APPEND (inode)) + return -EPERM; + + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + /* Empty xattrs are ok, they're just empty files, no hash */ + if (buffer && buffer_size) + xahash = xattr_hash (buffer, buffer_size); + +open_file: + fp = open_xa_file (inode, name, flags); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* we need to copy it off.. */ + if (xinode->i_nlink > 1) { + fput(fp); + err = reiserfs_xattr_del (inode, name); + if (err < 0) + goto out; + /* We just killed the old one, we're not replacing anymore */ + if (flags & XATTR_REPLACE) + flags &= ~XATTR_REPLACE; + goto open_file; + } + + /* Resize it so we're ok to write there */ + newattrs.ia_size = buffer_size; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down (&xinode->i_sem); + err = notify_change(fp->f_dentry, &newattrs); + if (err) + goto out_filp; + + mapping = xinode->i_mapping; + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_filp; + } + + lock_page (page); + data = page_address (page); + + if (file_pos == 0) { + 
struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32 (xahash); + } + + err = mapping->a_ops->prepare_write (fp, page, page_offset, + page_offset + chunk + skip); + if (!err) { + if (buffer) + memcpy (data + skip, buffer + buffer_pos, chunk); + err = mapping->a_ops->commit_write (fp, page, page_offset, + page_offset + chunk + skip); + } + unlock_page (page); + reiserfs_put_page (page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + /* We can't mark the inode dirty if it's not hashed. This is the case + * when we're inheriting the default ACL. If we dirty it, the inode + * gets marked dirty, but won't (ever) make it onto the dirty list until + * it's synced explicitly to clear I_DIRTY. This is bad. 
*/ + if (!hlist_unhashed(&inode->i_hash)) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty (inode); + } + +out_filp: + up (&xinode->i_sem); + fput(fp); + +out: + return err; +} + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct file *fp; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + struct inode *xinode; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + fp = open_xa_file (inode, name, FL_READONLY); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + isize = xinode->i_size; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof (struct reiserfs_xattr_header); + goto out_dput; + } + + if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_dput; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_dput; + } + + lock_page (page); + data = page_address (page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. 
*/ + if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) { + unlock_page (page); + reiserfs_put_page (page); + reiserfs_warning (inode->i_sb, "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY (inode)); + err = -EIO; + goto out_dput; + } + hash = le32_to_cpu (rxh->h_hash); + } + memcpy (buffer + buffer_pos, data + skip, chunk); + unlock_page (page); + reiserfs_put_page (page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof (struct reiserfs_xattr_header); + + if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) { + reiserfs_warning (inode->i_sb, "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY (inode)); + err = -EIO; + } + +out_dput: + fput(fp); + +out: + return err; +} + +static int +__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen) +{ + struct dentry *dentry; + struct inode *dir = xadir->d_inode; + int err = 0; + + dentry = lookup_one_len (name, xadir, namelen); + if (IS_ERR (dentry)) { + err = PTR_ERR (dentry); + goto out; + } else if (!dentry->d_inode) { + err = -ENODATA; + goto out_file; + } + + /* Skip directories.. */ + if (S_ISDIR (dentry->d_inode->i_mode)) + goto out_file; + + if (!is_reiserfs_priv_object (dentry->d_inode)) { + reiserfs_warning (dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have " + "priv flag set [parent is %sset].", + le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid), + xadir->d_name.len, xadir->d_name.name, namelen, name, + is_reiserfs_priv_object (xadir->d_inode) ? 
"" : "not "); + dput (dentry); + return -EIO; + } + + err = dir->i_op->unlink (dir, dentry); + if (!err) + d_delete (dentry); + +out_file: + dput (dentry); + +out: + return err; +} + + +int +reiserfs_xattr_del (struct inode *inode, const char *name) +{ + struct dentry *dir; + int err; + + if (IS_RDONLY (inode)) + return -EROFS; + + dir = open_xa_dir (inode, FL_READONLY); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } + + err = __reiserfs_xattr_del (dir, name, strlen (name)); + dput (dir); + + if (!err) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty (inode); + } + +out: + return err; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ + +static int +reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct dentry *xadir = (struct dentry *)buf; + + return __reiserfs_xattr_del (xadir, name, namelen); + +} + +/* This is called w/ inode->i_sem downed */ +int +reiserfs_delete_xattrs (struct inode *inode) +{ + struct file *fp; + struct dentry *dir, *root; + int err = 0; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + return 0; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir); + if (err) { + unlock_kernel (); + goto out_dir; + } + + /* Leftovers 
besides . and .. -- that's not good. */ + if (dir->d_inode->i_nlink <= 2) { + root = get_xa_root (inode->i_sb); + reiserfs_write_lock_xattrs (inode->i_sb); + err = vfs_rmdir (root->d_inode, dir); + reiserfs_write_unlock_xattrs (inode->i_sb); + dput (root); + } else { + reiserfs_warning (inode->i_sb, + "Couldn't remove all entries in directory"); + } + unlock_kernel (); + +out_dir: + fput(fp); + +out: + if (!err) + REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + return err; +} + +struct reiserfs_chown_buf { + struct inode *inode; + struct dentry *xadir; + struct iattr *attrs; +}; + +/* XXX: If there is a better way to do this, I'd love to hear about it */ +static int +reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; + struct dentry *xafile, *xadir = chown_buf->xadir; + struct iattr *attrs = chown_buf->attrs; + int err = 0; + + xafile = lookup_one_len (name, xadir, namelen); + if (IS_ERR (xafile)) + return PTR_ERR (xafile); + else if (!xafile->d_inode) { + dput (xafile); + return -ENODATA; + } + + if (!S_ISDIR (xafile->d_inode->i_mode)) + err = notify_change (xafile, attrs); + dput (xafile); + + return err; +} + +int +reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_chown_buf buf; + unsigned int ia_valid = attrs->ia_valid; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + if (PTR_ERR (dir) != -ENODATA) + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + goto out; + } 
+ + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + + attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); + buf.xadir = dir; + buf.attrs = attrs; + buf.inode = inode; + + err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf); + if (err) { + unlock_kernel (); + goto out_dir; + } + + err = notify_change (dir, attrs); + unlock_kernel (); + +out_dir: + fput(fp); + +out: + attrs->ia_valid = ia_valid; + return err; +} + + +/* Actual operations that are exported to VFS-land */ + +/* + * Inode operation getxattr() + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, + size_t size) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->get (dentry->d_inode, name, buffer, size); + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* + * Inode operation setxattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + int lock; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EROFS; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + lock = !has_xattr_dir (dentry->d_inode); + if (lock) + reiserfs_write_lock_xattrs (dentry->d_sb); + 
else + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->set (dentry->d_inode, name, value, size, flags); + if (lock) + reiserfs_write_unlock_xattrs (dentry->d_sb); + else + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* + * Inode operation removexattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_removexattr (struct dentry *dentry, const char *name) +{ + int err; + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EPERM; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + + /* Deletion pre-operation */ + if (xah->del) { + err = xah->del (dentry->d_inode, name); + if (err) + goto out; + } + + err = reiserfs_xattr_del (dentry->d_inode, name); + + dentry->d_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty (dentry->d_inode); + +out: + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* This is what filldir will use: + * r_pos will always contain the amount of space required for the entire + * list. If r_pos becomes larger than r_size, we need more space and we + * return an error indicating this. If r_pos is less than r_size, then we've + * filled the buffer successfully and we return success */ +struct reiserfs_listxattr_buf { + int r_pos; + int r_size; + char *r_buf; + struct inode *r_inode; +}; + +static int +reiserfs_listxattr_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; + int len = 0; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' 
|| namelen != 2))) { + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + if (!xah) return 0; /* Unsupported xattr name, skip it */ + + /* We call ->list() twice because the operation isn't required to just + * return the name back - we want to make sure we have enough space */ + len += xah->list (b->r_inode, name, namelen, NULL); + + if (len) { + if (b->r_pos + len + 1 <= b->r_size) { + char *p = b->r_buf + b->r_pos; + p += xah->list (b->r_inode, name, namelen, p); + *p++ = '\0'; + } + b->r_pos += len + 1; + } + } + + return 0; +} +/* + * Inode operation listxattr() + * + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_listxattr_buf buf; + + if (!dentry->d_inode) + return -EINVAL; + + if (!reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + dir = open_xa_dir (dentry->d_inode, FL_READONLY); + reiserfs_read_unlock_xattrs (dentry->d_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + buf.r_buf = buffer; + buf.r_size = buffer ? 
size : 0; + buf.r_pos = 0; + buf.r_inode = dentry->d_inode; + + REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; + + err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf); + if (err) + goto out_dir; + + if (buf.r_pos > buf.r_size && buffer != NULL) + err = -ERANGE; + else + err = buf.r_pos; + +out_dir: + fput(fp); + +out: + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* This is the implementation for the xattr plugin infrastructure */ +static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers); +static DEFINE_RWLOCK(handler_lock); + +static struct reiserfs_xattr_handler * +find_xattr_handler_prefix (const char *prefix) +{ + struct reiserfs_xattr_handler *xah = NULL; + struct list_head *p; + + read_lock (&handler_lock); + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0) + break; + xah = NULL; + } + + read_unlock (&handler_lock); + return xah; +} + +static void +__unregister_handlers (void) +{ + struct reiserfs_xattr_handler *xah; + struct list_head *p, *tmp; + + list_for_each_safe (p, tmp, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->exit) + xah->exit(); + + list_del_init (p); + } + INIT_LIST_HEAD (&xattr_handlers); +} + +int __init +reiserfs_xattr_register_handlers (void) +{ + int err = 0; + struct reiserfs_xattr_handler *xah; + struct list_head *p; + + write_lock (&handler_lock); + + /* If we're already initialized, nothing to do */ + if (!list_empty (&xattr_handlers)) { + write_unlock (&handler_lock); + return 0; + } + + /* Add the handlers */ + list_add_tail (&user_handler.handlers, &xattr_handlers); + list_add_tail (&trusted_handler.handlers, &xattr_handlers); +#ifdef CONFIG_REISERFS_FS_SECURITY + list_add_tail (&security_handler.handlers, &xattr_handlers); +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + list_add_tail 
(&posix_acl_access_handler.handlers, &xattr_handlers); + list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers); +#endif + + /* Run initializers, if available */ + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->init) { + err = xah->init (); + if (err) { + list_del_init (p); + break; + } + } + } + + /* Clean up other handlers, if any failed */ + if (err) + __unregister_handlers (); + + write_unlock (&handler_lock); + return err; +} + +void +reiserfs_xattr_unregister_handlers (void) +{ + write_lock (&handler_lock); + __unregister_handlers (); + write_unlock (&handler_lock); +} + +/* This will catch lookups from the fs root to .reiserfs_priv */ +static int +xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name) +{ + struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; + if (name->len == priv_root->d_name.len && + name->hash == priv_root->d_name.hash && + !memcmp (name->name, priv_root->d_name.name, name->len)) { + return -ENOENT; + } else if (q1->len == name->len && + !memcmp(q1->name, name->name, name->len)) + return 0; + return 1; +} + +static struct dentry_operations xattr_lookup_poison_ops = { + .d_compare = xattr_lookup_poison, +}; + + +/* We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options */ +int +reiserfs_xattr_init (struct super_block *s, int mount_flags) +{ + int err = 0; + + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (!old_format_only (s)) { + set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } else if (reiserfs_xattrs_optional (s)) { + /* Old format filesystem, but optional xattrs have been enabled + * at mount time. Error out. */ + reiserfs_warning (s, "xattrs/ACLs not supported on pre v3.6 " + "format filesystem. 
Failing mount."); + err = -EOPNOTSUPP; + goto error; + } else { + /* Old format filesystem, but no optional xattrs have been enabled. This + * means we silently disable xattrs on the filesystem. */ + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* If we don't have the privroot located yet - go find it */ + if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) { + struct dentry *dentry; + dentry = lookup_one_len (PRIVROOT_NAME, s->s_root, + strlen (PRIVROOT_NAME)); + if (!IS_ERR (dentry)) { + if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { + struct inode *inode = dentry->d_parent->d_inode; + down (&inode->i_sem); + err = inode->i_op->mkdir (inode, dentry, 0700); + up (&inode->i_sem); + if (err) { + dput (dentry); + dentry = NULL; + } + + if (dentry && dentry->d_inode) + reiserfs_warning (s, "Created %s on %s - reserved for " + "xattr storage.", PRIVROOT_NAME, + reiserfs_bdevname (inode->i_sb)); + } else if (!dentry->d_inode) { + dput (dentry); + dentry = NULL; + } + } else + err = PTR_ERR (dentry); + + if (!err && dentry) { + s->s_root->d_op = &xattr_lookup_poison_ops; + reiserfs_mark_inode_private (dentry->d_inode); + REISERFS_SB(s)->priv_root = dentry; + } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ + /* If we're read-only it just means that the dir hasn't been + * created. Not an error -- just no xattrs on the fs. We'll + * check again if we go read-write */ + reiserfs_warning (s, "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. Failing mount."); + err = -EOPNOTSUPP; + } + } + +error: + /* This is only nonzero if there was an error initializing the xattr + * directory or if there is a condition where we don't support them. 
*/ + if (err) { + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + s->s_flags = s->s_flags & ~MS_POSIXACL; + if (reiserfs_posixacl (s)) + s->s_flags |= MS_POSIXACL; + + return err; +} + +static int +__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, + int need_lock) +{ + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + /* We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ + if (is_reiserfs_priv_object (inode)) + return 0; + + if (current->fsuid == inode->i_uid) { + mode >>= 6; +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + } else if (reiserfs_posixacl(inode->i_sb) && + get_inode_sd_version (inode) != STAT_DATA_V1) { + struct posix_acl *acl; + + /* ACL can't contain additional permissions if + the ACL_MASK entry is 0 */ + if (!(mode & S_IRWXG)) + goto check_groups; + + if (need_lock) { + reiserfs_read_lock_xattr_i (inode); + reiserfs_read_lock_xattrs (inode->i_sb); + } + acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS); + if (need_lock) { + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + } + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto check_groups; + return PTR_ERR (acl); + } + + if (acl) { + int err = posix_acl_permission (inode, acl, mask); + posix_acl_release (acl); + if (err == -EACCES) { + goto check_capabilities; + } + return err; + } else { + goto check_groups; + } +#endif + } else { +check_groups: + if (in_group_p(inode->i_gid)) + 
mode >>= 3; + } + + /* + * If the DACs are ok we don't need any capability check. + */ + if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + return 0; + +check_capabilities: + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if (!(mask & MAY_EXEC) || + (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. + */ + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +int +reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 1); +} + +int +reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 0); +} diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c new file mode 100644 index 000000000000..e302071903a1 --- /dev/null +++ b/fs/reiserfs/xattr_acl.c @@ -0,0 +1,571 @@ +#include <linux/fs.h> +#include <linux/posix_acl.h> +#include <linux/reiserfs_fs.h> +#include <linux/errno.h> +#include <linux/pagemap.h> +#include <linux/xattr.h> +#include <linux/xattr_acl.h> +#include <linux/reiserfs_xattr.h> +#include <linux/reiserfs_acl.h> +#include <asm/uaccess.h> + +static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl); + +static int +xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + return PTR_ERR(acl); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; 
+ + error = reiserfs_set_acl (inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + + +static int +xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + + acl = reiserfs_get_acl (inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *)value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + reiserfs_acl_entry *entry = + (reiserfs_acl_entry *)value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + 
return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +posix_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n=0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
+ * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +struct posix_acl * +reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl, **p_acl; + size_t size; + int retval; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + break; + default: + return ERR_PTR (-EINVAL); + } + + if (IS_ERR (*p_acl)) { + if (PTR_ERR (*p_acl) == -ENODATA) + return NULL; + } else if (*p_acl != NULL) + return posix_acl_dup (*p_acl); + + size = reiserfs_xattr_get (inode, name, NULL, 0); + if ((int)size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + *p_acl = ERR_PTR (-ENODATA); + return NULL; + } + return ERR_PTR (size); + } + + value = kmalloc (size, GFP_NOFS); + if (!value) + return ERR_PTR (-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* This shouldn't actually happen as it should have + been caught above.. but just in case */ + acl = NULL; + *p_acl = ERR_PTR (-ENODATA); + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = posix_acl_from_disk(value, retval); + *p_acl = posix_acl_dup (acl); + } + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). 
+ * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +static int +reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + struct posix_acl **p_acl; + size_t size; + int error; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode (acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + if (!S_ISDIR (inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + error = reiserfs_xattr_set(inode, name, value, size, 0); + } else { + error = reiserfs_xattr_del (inode, name); + if (error == -ENODATA) { + /* This may seem odd here, but it means that the ACL was set + * with a value representable with mode bits. If there was + * an ACL before, reiserfs_xattr_del already dirtied the inode. 
+ */ + mark_inode_dirty (inode); + error = 0; + } + } + + if (value) + kfree(value); + + if (!error) { + /* Release the old one */ + if (!IS_ERR (*p_acl) && *p_acl) + posix_acl_release (*p_acl); + + if (acl == NULL) + *p_acl = ERR_PTR (-ENODATA); + else + *p_acl = posix_acl_dup (acl); + } + + return error; +} + +/* dir->i_sem: down, + * inode is new and not released into the wild yet */ +int +reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode) +{ + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK (inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version (dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (is_reiserfs_priv_object (dir)) { + reiserfs_mark_inode_private (inode); + goto apply_umask; + } + + acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT); + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto apply_umask; + return PTR_ERR (acl); + } + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR (inode->i_mode)) { + err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone (acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + + need_acl = posix_acl_create_masq (acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. 
*/ + if (need_acl > 0) { + err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy); + if (err) + goto cleanup_copy; + } + } +cleanup_copy: + posix_acl_release (acl_copy); +cleanup: + posix_acl_release (acl); + } else { +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current->fs->umask; + } + + return err; +} + +/* Looks up and caches the result of the default ACL. + * We do this so that we don't need to carry the xattr_sem into + * reiserfs_new_inode if we don't need to */ +int +reiserfs_cache_default_acl (struct inode *inode) +{ + int ret = 0; + if (reiserfs_posixacl (inode->i_sb) && + !is_reiserfs_priv_object (inode)) { + struct posix_acl *acl; + reiserfs_read_lock_xattr_i (inode); + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT); + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + ret = acl ? 1 : 0; + posix_acl_release (acl); + } + + return ret; +} + +int +reiserfs_acl_chmod (struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) + { + return 0; + } + + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + int lock = !has_xattr_dir (inode); + reiserfs_write_lock_xattr_i (inode); + if (lock) + reiserfs_write_lock_xattrs (inode->i_sb); + else + reiserfs_read_lock_xattrs (inode->i_sb); + error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + if (lock) + reiserfs_write_unlock_xattrs (inode->i_sb); + else + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_write_unlock_xattr_i (inode); + } + 
posix_acl_release(clone); + return error; +} + +static int +posix_acl_access_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +posix_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +posix_acl_access_del (struct inode *inode, const char *name) +{ + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_access; + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_access_handler = { + .prefix = XATTR_NAME_ACL_ACCESS, + .get = posix_acl_access_get, + .set = posix_acl_access_set, + .del = posix_acl_access_del, + .list = posix_acl_access_list, +}; + +static int +posix_acl_default_get (struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +posix_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int +posix_acl_default_del (struct inode *inode, const char *name) +{ + struct 
reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_default; + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_default_handler = { + .prefix = XATTR_NAME_ACL_DEFAULT, + .get = posix_acl_default_get, + .set = posix_acl_default_set, + .del = posix_acl_default_del, + .list = posix_acl_default_list, +}; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c new file mode 100644 index 000000000000..e044d5117117 --- /dev/null +++ b/fs/reiserfs/xattr_security.c @@ -0,0 +1,69 @@ +#include <linux/reiserfs_fs.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/xattr.h> +#include <linux/reiserfs_xattr.h> +#include <asm/uaccess.h> + +#define XATTR_SECURITY_PREFIX "security." 
+ +static int +security_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +security_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +security_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return 0; +} + +static int +security_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (is_reiserfs_priv_object(inode)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = security_get, + .set = security_set, + .del = security_del, + .list = security_list, +}; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c new file mode 100644 index 000000000000..43762197fb0a --- /dev/null +++ b/fs/reiserfs/xattr_trusted.c @@ -0,0 +1,81 @@ +#include <linux/reiserfs_fs.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/xattr.h> +#include <linux/reiserfs_xattr.h> +#include <asm/uaccess.h> + +#define XATTR_TRUSTED_PREFIX "trusted." 
+ +static int +trusted_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +trusted_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +trusted_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return 0; +} + +static int +trusted_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (!reiserfs_xattrs (inode->i_sb)) + return 0; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = trusted_get, + .set = trusted_set, + .del = trusted_del, + .list = trusted_list, +}; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c new file mode 100644 index 000000000000..0772806466a8 --- /dev/null +++ b/fs/reiserfs/xattr_user.c @@ -0,0 +1,99 @@ +#include <linux/reiserfs_fs.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/xattr.h> +#include <linux/reiserfs_xattr.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +# include 
<linux/reiserfs_acl.h> +#endif + +#define XATTR_USER_PREFIX "user." + +static int +user_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + error = reiserfs_permission_locked (inode, MAY_READ, NULL); + if (error) + return error; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +user_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +user_del (struct inode *inode, const char *name) +{ + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return 0; +} + +static int +user_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_xattrs_user (inode->i_sb)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = user_get, + .set = user_set, + .del = user_del, + .list = user_list, +}; |