From f720e3ba558680cc7dd3995d005bdc8ee2ef46af Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 6 Jun 2007 15:28:35 -0500 Subject: JFS: Whitespace cleanup and remove some dead code Signed-off-by: Dave Kleikamp --- fs/jfs/endian24.h | 2 +- fs/jfs/jfs_dinode.h | 42 +++--- fs/jfs/jfs_dmap.c | 419 +++++++++++++++++++++++++------------------------- fs/jfs/jfs_dmap.h | 118 +++++++-------- fs/jfs/jfs_dtree.c | 105 ++++++------- fs/jfs/jfs_dtree.h | 2 +- fs/jfs/jfs_extent.c | 102 ++++++------- fs/jfs/jfs_filsys.h | 13 +- fs/jfs/jfs_imap.c | 293 +++++++++++++++++------------------ fs/jfs/jfs_imap.h | 98 ++++++------ fs/jfs/jfs_incore.h | 4 +- fs/jfs/jfs_logmgr.c | 68 ++++----- fs/jfs/jfs_logmgr.h | 26 ++-- fs/jfs/jfs_mount.c | 6 +- fs/jfs/jfs_txnmgr.c | 286 +++++++++++++++++------------------ fs/jfs/jfs_txnmgr.h | 2 +- fs/jfs/jfs_types.h | 20 +-- fs/jfs/jfs_umount.c | 2 +- fs/jfs/jfs_xtree.c | 428 ++++++++++++++++++++++++++-------------------------- fs/jfs/jfs_xtree.h | 48 +++--- fs/jfs/namei.c | 26 ++-- fs/jfs/resize.c | 48 +++--- fs/jfs/xattr.c | 6 +- 23 files changed, 1066 insertions(+), 1098 deletions(-) (limited to 'fs') diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h index 79494c4f2b10..fa92f7f1d0d0 100644 --- a/fs/jfs/endian24.h +++ b/fs/jfs/endian24.h @@ -29,7 +29,7 @@ __u32 __x = (x); \ ((__u32)( \ ((__x & (__u32)0x000000ffUL) << 16) | \ - (__x & (__u32)0x0000ff00UL) | \ + (__x & (__u32)0x0000ff00UL) | \ ((__x & (__u32)0x00ff0000UL) >> 16) )); \ }) diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index 40b20111383c..c387540d3425 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -19,23 +19,23 @@ #define _H_JFS_DINODE /* - * jfs_dinode.h: on-disk inode manager + * jfs_dinode.h: on-disk inode manager */ -#define INODESLOTSIZE 128 -#define L2INODESLOTSIZE 7 -#define log2INODESIZE 9 /* log2(bytes per dinode) */ +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ /* - * on-disk inode : 512 bytes + * on-disk inode : 512 bytes * * note: align 64-bit fields on 8-byte boundary. */ struct dinode { /* - * I. base area (128 bytes) - * ------------------------ + * I. base area (128 bytes) + * ------------------------ * * define generic/POSIX attributes */ @@ -70,16 +70,16 @@ struct dinode { __le32 di_acltype; /* 4: Type of ACL */ /* - * Extension Areas. + * Extension Areas. * - * Historically, the inode was partitioned into 4 128-byte areas, - * the last 3 being defined as unions which could have multiple - * uses. The first 96 bytes had been completely unused until - * an index table was added to the directory. It is now more - * useful to describe the last 3/4 of the inode as a single - * union. We would probably be better off redesigning the - * entire structure from scratch, but we don't want to break - * commonality with OS/2's JFS at this time. + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. */ union { struct { @@ -95,7 +95,7 @@ struct dinode { } _dir; /* (384) */ #define di_dirtable u._dir._table #define di_dtroot u._dir._dtroot -#define di_parent di_dtroot.header.idotdot +#define di_parent di_dtroot.header.idotdot #define di_DASD di_dtroot.header.DASD struct { @@ -127,14 +127,14 @@ struct dinode { #define di_inlinedata u._file._u2._special._u #define di_rdev u._file._u2._special._u._rdev #define di_fastsymlink u._file._u2._special._u._fastsymlink -#define di_inlineea u._file._u2._special._inlineea +#define di_inlineea u._file._u2._special._inlineea } u; }; /* extended mode bits (on-disk inode di_mode) */ -#define IFJOURNAL 0x00010000 /* journalled file */ -#define ISPARSE 0x00020000 /* sparse file enabled */ -#define INLINEEA 0x00040000 /* inline EA area free */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ #define ISWAPFILE 0x00800000 /* file open for pager swap space */ /* more extended mode bits: attributes for OS/2 */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f3b1ebb22280..e1985066b1c6 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -154,12 +154,12 @@ static const s8 budtab[256] = { * the in-core descriptor is initialized from disk. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory - * -EIO - i/o error + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error */ int dbMount(struct inode *ipbmap) { @@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap) * the memory for this descriptor is freed. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUnmount(struct inode *ipbmap, int mounterror) { @@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) { @@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) /* * NAME: dbUpdatePMap() * - * FUNCTION: update the allocation state (free or allocate) of the + * FUNCTION: update the allocation state (free or allocate) of the * specified block range in the persistent block allocation map. * * the blocks will be updated in the persistent map one * dmap at a time. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. - * free - 'true' if block range is to be freed from the persistent - * map; 'false' if it is to be allocated. - * blkno - starting block number of the range. - * nblocks - number of contiguous blocks in the range. - * tblk - transaction block; + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbUpdatePMap(struct inode *ipbmap, @@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap, /* * NAME: dbNextAG() * - * FUNCTION: find the preferred allocation group for new allocations. + * FUNCTION: find the preferred allocation group for new allocations. * * Within the allocation groups, we maintain a preferred * allocation group which consists of a group with at least @@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap, * empty ags around for large allocations. * * PARAMETERS: - * ipbmap - pointer to in-core inode for the block map. + * ipbmap - pointer to in-core inode for the block map. * * RETURN VALUES: - * the preferred allocation group number. + * the preferred allocation group number. */ int dbNextAG(struct inode *ipbmap) { @@ -656,7 +656,7 @@ unlock: /* * NAME: dbAlloc() * - * FUNCTION: attempt to allocate a specified number of contiguous free + * FUNCTION: attempt to allocate a specified number of contiguous free * blocks from the working allocation block map. * * the block allocation policy uses hints and a multi-step @@ -680,16 +680,16 @@ unlock: * size or requests that specify no hint value. * * PARAMETERS: - * ip - pointer to in-core inode; - * hint - allocation hint. - * nblocks - number of contiguous blocks in the range. - * results - on successful return, set to the starting block number + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number * of the newly allocated contiguous range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) { @@ -706,12 +706,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* assert that nblocks is valid */ assert(nblocks > 0); -#ifdef _STILL_TO_PORT - /* DASD limit check F226941 */ - if (OVER_LIMIT(ip, nblocks)) - return -ENOSPC; -#endif /* _STILL_TO_PORT */ - /* get the log2 number of blocks to be allocated. * if the number of blocks is not a log2 multiple, * it will be rounded up to the next log2 multiple. @@ -720,7 +714,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) bmp = JFS_SBI(ip->i_sb)->bmap; -//retry: /* serialize w.r.t.extendfs() */ mapSize = bmp->db_mapsize; /* the hint should be within the map */ @@ -879,17 +872,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* * NAME: dbAllocExact() * - * FUNCTION: try to allocate the requested extent; + * FUNCTION: try to allocate the requested extent; * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - extent address; - * nblocks - extent length; + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) { @@ -946,7 +939,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) /* * NAME: dbReAlloc() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -959,21 +952,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) * number of blocks required. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. - * results - on successful return, set to the starting block number + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number * of the existing allocation if the existing allocation * was extended in place or to a newly allocated contiguous * range if the existing allocation could not be extended * in place. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ int dbReAlloc(struct inode *ip, @@ -1004,7 +997,7 @@ dbReAlloc(struct inode *ip, /* * NAME: dbExtend() * - * FUNCTION: attempt to extend a current allocation by a specified + * FUNCTION: attempt to extend a current allocation by a specified * number of blocks. * * this routine attempts to satisfy the allocation request @@ -1013,16 +1006,16 @@ dbReAlloc(struct inode *ip, * immediately following the current allocation. * * PARAMETERS: - * ip - pointer to in-core inode requiring allocation. - * blkno - starting block of the current allocation. - * nblocks - number of contiguous blocks within the current + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current * allocation. - * addnblocks - number of blocks to add to the allocation. + * addnblocks - number of blocks to add to the allocation. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error */ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) { @@ -1109,19 +1102,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) /* * NAME: dbAllocNext() * - * FUNCTION: attempt to allocate the blocks of the specified block + * FUNCTION: attempt to allocate the blocks of the specified block * range within a dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - starting block number of the range. - * nblocks - number of contiguous free blocks of the range. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1233,7 +1226,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocNear() * - * FUNCTION: attempt to allocate a number of contiguous free blocks near + * FUNCTION: attempt to allocate a number of contiguous free blocks near * a specified block (hint) within a dmap. * * starting with the dmap leaf that covers the hint, we'll @@ -1242,18 +1235,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, * the desired free space. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap. - * blkno - block number to allocate near. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) held on entry/exit; */ @@ -1316,7 +1309,7 @@ dbAllocNear(struct bmap * bmp, /* * NAME: dbAllocAG() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks within the specified allocation group. * * unless the allocation group size is equal to the number @@ -1353,17 +1346,17 @@ dbAllocNear(struct bmap * bmp, * the allocation group. * * PARAMETERS: - * bmp - pointer to bmap descriptor + * bmp - pointer to bmap descriptor * agno - allocation group number. - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * note: IWRITE_LOCK(ipmap) held on entry/exit; */ @@ -1546,7 +1539,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbAllocAny() * - * FUNCTION: attempt to allocate the specified number of contiguous + * FUNCTION: attempt to allocate the specified number of contiguous * free blocks anywhere in the file system. * * dbAllocAny() attempts to find the sufficient free space by @@ -1556,16 +1549,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * desired free space is allocated. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks desired. - * l2nb - log2 number of contiguous free blocks desired. - * results - on successful return, set to the starting block number + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1598,9 +1591,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) /* * NAME: dbFindCtl() * - * FUNCTION: starting at a specified dmap control page level and block + * FUNCTION: starting at a specified dmap control page level and block * number, search down the dmap control levels for a range of - * contiguous free blocks large enough to satisfy an allocation + * contiguous free blocks large enough to satisfy an allocation * request for the specified number of free blocks. * * if sufficient contiguous free blocks are found, this routine @@ -1609,17 +1602,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) * is sufficient in size. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * level - starting dmap control page level. - * l2nb - log2 number of contiguous free blocks desired. - * *blkno - on entry, starting block number for conducting the search. + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. * on successful return, the first block within a dmap page * that contains or starts a range of contiguous free blocks. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1699,7 +1692,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) /* * NAME: dbAllocCtl() * - * FUNCTION: attempt to allocate a specified number of contiguous + * FUNCTION: attempt to allocate a specified number of contiguous * blocks starting within a specific dmap. * * this routine is called by higher level routines that search @@ -1726,18 +1719,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) * first dmap (i.e. blkno). * * PARAMETERS: - * bmp - pointer to bmap descriptor - * nblocks - actual number of contiguous free blocks to allocate. - * l2nb - log2 number of contiguous free blocks to allocate. - * blkno - starting block number of the dmap to start the allocation + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation * from. - * results - on successful return, set to the starting block number + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1870,7 +1863,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) /* * NAME: dbAllocDmapLev() * - * FUNCTION: attempt to allocate a specified number of contiguous blocks + * FUNCTION: attempt to allocate a specified number of contiguous blocks * from a specified dmap. * * this routine checks if the contiguous blocks are available. @@ -1878,17 +1871,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * returned. * * PARAMETERS: - * mp - pointer to bmap descriptor - * dp - pointer to dmap to attempt to allocate blocks from. - * l2nb - log2 number of contiguous block desired. - * nblocks - actual number of contiguous block desired. - * results - on successful return, set to the starting block number + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number * of the newly allocated range. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient disk resources - * -EIO - i/o error + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; @@ -1933,7 +1926,7 @@ dbAllocDmapLev(struct bmap * bmp, /* * NAME: dbAllocDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine allocates the specified blocks from the dmap @@ -1946,14 +1939,14 @@ dbAllocDmapLev(struct bmap * bmp, * covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate the block range from. - * blkno - starting block number of the block to be allocated. - * nblocks - number of blocks to be allocated. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -1989,7 +1982,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeDmap() * - * FUNCTION: adjust the disk allocation map to reflect the allocation + * FUNCTION: adjust the disk allocation map to reflect the allocation * of a specified block range within a dmap. * * this routine frees the specified blocks from the dmap through @@ -1997,18 +1990,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * causes the maximum string of free blocks within the dmap to * change (i.e. the value of the root of the dmap's dmtree), this * routine will cause this change to be reflected up through the - * appropriate levels of the dmap control pages by a call to + * appropriate levels of the dmap control pages by a call to * dbAdjCtl() for the L0 dmap control page that covers this dmap. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free the block range from. - * blkno - starting block number of the block to be freed. - * nblocks - number of blocks to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2055,7 +2048,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbAllocBits() * - * FUNCTION: allocate a specified block range from a dmap. + * FUNCTION: allocate a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2065,10 +2058,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmap's dmtree, as a whole, to reflect the allocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to allocate bits from. - * blkno - starting block number of the bits to be allocated. - * nblocks - number of bits to be allocated. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. * * RETURN VALUES: none * @@ -2149,7 +2142,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the allocated words. */ for (; nwords > 0; nwords -= nw) { - if (leaf[word] < BUDMIN) { + if (leaf[word] < BUDMIN) { jfs_error(bmp->db_ipbmap->i_sb, "dbAllocBits: leaf page " "corrupt"); @@ -2202,7 +2195,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* * NAME: dbFreeBits() * - * FUNCTION: free a specified block range from a dmap. + * FUNCTION: free a specified block range from a dmap. * * this routine updates the dmap to reflect the working * state allocation of the specified block range. it directly @@ -2212,10 +2205,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * dmtree, as a whole, to reflect the deallocated range. * * PARAMETERS: - * bmp - pointer to bmap descriptor - * dp - pointer to dmap to free bits from. - * blkno - starting block number of the bits to be freed. - * nblocks - number of bits to be freed. + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. * * RETURN VALUES: 0 for success * @@ -2388,19 +2381,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the new root value and the next dmap control page level to * be adjusted. * PARAMETERS: - * bmp - pointer to bmap descriptor - * blkno - the first block of a block range within a dmap. it is + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is * the allocation or deallocation of this block range that * requires the dmap control page to be adjusted. - * newval - the new value of the lower level dmap or dmap control + * newval - the new value of the lower level dmap or dmap control * page root. - * alloc - 'true' if adjustment is due to an allocation. - * level - current level of dmap control page (i.e. L0, L1, L2) to + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to * be adjusted. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ @@ -2544,16 +2537,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) /* * NAME: dbSplit() * - * FUNCTION: update the leaf of a dmtree with a new value, splitting + * FUNCTION: update the leaf of a dmtree with a new value, splitting * the leaf from the binary buddy system of the dmtree's * leaves, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * splitsz - the size the binary buddy system starting at the leaf + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf * must be split to, specified as the log2 number of blocks. - * newval - the new value for the leaf. + * newval - the new value for the leaf. * * RETURN VALUES: none * @@ -2600,7 +2593,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) /* * NAME: dbBackSplit() * - * FUNCTION: back split the binary buddy system of dmtree leaves + * FUNCTION: back split the binary buddy system of dmtree leaves * that hold a specified leaf until the specified leaf * starts its own binary buddy system. * @@ -2617,8 +2610,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) * in which a previous join operation must be backed out. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. * * RETURN VALUES: none * @@ -2692,14 +2685,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno) /* * NAME: dbJoin() * - * FUNCTION: update the leaf of a dmtree with a new value, joining + * FUNCTION: update the leaf of a dmtree with a new value, joining * the leaf with other leaves of the dmtree into a multi-leaf * binary buddy system, as required. * * PARAMETERS: - * tp - pointer to the tree containing the leaf. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2785,15 +2778,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) /* * NAME: dbAdjTree() * - * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * FUNCTION: update a leaf of a dmtree with a new value, adjusting * the dmtree, as required, to reflect the new leaf value. * the combination of any buddies must already be done before * this is called. * * PARAMETERS: - * tp - pointer to the tree to be adjusted. - * leafno - the number of the leaf to be updated. - * newval - the new value for the leaf. + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. * * RETURN VALUES: none */ @@ -2852,7 +2845,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) /* * NAME: dbFindLeaf() * - * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * FUNCTION: search a dmtree_t for sufficient free blocks, returning * the index of a leaf describing the free blocks if * sufficient free blocks are found. * @@ -2861,15 +2854,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * free space. * * PARAMETERS: - * tp - pointer to the tree to be searched. - * l2nb - log2 number of free blocks to search for. + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. * leafidx - return pointer to be set to the index of the leaf * describing at least l2nb free blocks if sufficient * free blocks are found. * * RETURN VALUES: - * 0 - success - * -ENOSPC - insufficient free blocks. + * 0 - success + * -ENOSPC - insufficient free blocks. */ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) { @@ -2916,18 +2909,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) /* * NAME: dbFindBits() * - * FUNCTION: find a specified number of binary buddy free bits within a + * FUNCTION: find a specified number of binary buddy free bits within a * dmap bitmap word value. * * this routine searches the bitmap value for (1 << l2nb) free * bits at (1 << l2nb) alignments within the value. * * PARAMETERS: - * word - dmap bitmap word value. - * l2nb - number of free bits specified as a log2 number. + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. * * RETURN VALUES: - * starting bit number of free bits. + * starting bit number of free bits. */ static int dbFindBits(u32 word, int l2nb) { @@ -2963,14 +2956,14 @@ static int dbFindBits(u32 word, int l2nb) /* * NAME: dbMaxBud(u8 *cp) * - * FUNCTION: determine the largest binary buddy string of free + * FUNCTION: determine the largest binary buddy string of free * bits within 32-bits of the map. * * PARAMETERS: - * cp - pointer to the 32-bit value. + * cp - pointer to the 32-bit value. * * RETURN VALUES: - * largest binary buddy of free bits within a dmap word. + * largest binary buddy of free bits within a dmap word. */ static int dbMaxBud(u8 * cp) { @@ -3000,14 +2993,14 @@ static int dbMaxBud(u8 * cp) /* * NAME: cnttz(uint word) * - * FUNCTION: determine the number of trailing zeros within a 32-bit + * FUNCTION: determine the number of trailing zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of trailing zeros + * count of trailing zeros */ static int cnttz(u32 word) { @@ -3025,14 +3018,14 @@ static int cnttz(u32 word) /* * NAME: cntlz(u32 value) * - * FUNCTION: determine the number of leading zeros within a 32-bit + * FUNCTION: determine the number of leading zeros within a 32-bit * value. * * PARAMETERS: - * value - 32-bit value to be examined. + * value - 32-bit value to be examined. * * RETURN VALUES: - * count of leading zeros + * count of leading zeros */ static int cntlz(u32 value) { @@ -3050,14 +3043,14 @@ static int cntlz(u32 value) * NAME: blkstol2(s64 nb) * * FUNCTION: convert a block count to its log2 value. if the block - * count is not a l2 multiple, it is rounded up to the next + * count is not a l2 multiple, it is rounded up to the next * larger l2 multiple. * * PARAMETERS: - * nb - number of blocks + * nb - number of blocks * * RETURN VALUES: - * log2 number of blocks + * log2 number of blocks */ static int blkstol2(s64 nb) { @@ -3099,13 +3092,13 @@ static int blkstol2(s64 nb) * at a time. * * PARAMETERS: - * ip - pointer to in-core inode; - * blkno - starting block number to be freed. - * nblocks - number of blocks to be freed. + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * 0 - success + * -EIO - i/o error */ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) { @@ -3278,10 +3271,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, * L2 * | * L1---------------------------------L1 - * | | - * L0---------L0---------L0 L0---------L0---------L0 - * | | | | | | - * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm * * <---old---><----------------------------extend-----------------------> @@ -3307,7 +3300,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) (long long) blkno, (long long) nblocks, (long long) newsize); /* - * initialize bmap control page. + * initialize bmap control page. * * all the data in bmap control page should exclude * the mkfs hidden dmap page. @@ -3330,7 +3323,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; /* - * reconfigure db_agfree[] + * reconfigure db_agfree[] * from old AG configuration to new AG configuration; * * coalesce contiguous k (newAGSize/oldAGSize) AGs; @@ -3362,7 +3355,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_maxag = bmp->db_maxag / k; /* - * extend bmap + * extend bmap * * update bit maps and corresponding level control pages; * global control page db_nfree, db_agfree[agno], db_maxfreebud; @@ -3410,7 +3403,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) /* compute start L0 */ j = 0; l1leaf = l1dcp->stree + CTLLEAFIND; - p += nbperpage; /* 1st L0 of L1.k */ + p += nbperpage; /* 1st L0 of L1.k */ } /* @@ -3548,7 +3541,7 @@ errout: return -EIO; /* - * finalize bmap control page + * finalize bmap control page */ finalize: @@ -3567,7 +3560,7 @@ void dbFinalizeBmap(struct inode *ipbmap) int i, n; /* - * finalize bmap control page + * finalize bmap control page */ //finalize: /* @@ -3953,8 +3946,8 @@ static int dbGetL2AGSize(s64 nblocks) * convert number of map pages to the zero origin top dmapctl level */ #define BMAPPGTOLEV(npages) \ - (((npages) <= 3 + MAXL0PAGES) ? 0 \ - : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + (((npages) <= 3 + MAXL0PAGES) ? 0 : \ + ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) s64 dbMapFileSizeToMapSize(struct inode * ipbmap) { @@ -3981,8 +3974,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) factor = (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); complete = (u32) npages / factor; - ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL - : ((i == 1) ? LPERCTL : 1)); + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : + ((i == 1) ? LPERCTL : 1)); /* pages in last/incomplete child */ npages = (u32) npages % factor; diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 45ea454c74bd..11e6d471b364 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTODMAP(b,s) \ - ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 0 @@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp) * - 1 is added to account for the control page of the map. */ #define BLKTOL0(b,s) \ - (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) /* * convert disk block number to the logical block number of the LEVEL 1 @@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp) * at the specified level which describes the disk block. */ #define BLKTOCTL(b,s,l) \ - (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) /* * convert aggregate map size to the zero origin dmapctl level of the @@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp) * dmaptree must be consistent with dmapctl. */ struct dmaptree { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of first tree leaf */ - __le32 height; /* 4: height of the tree */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ s8 budmin; /* 1: min l2 tree leaf value to combine */ - s8 stree[TREESIZE]; /* TREESIZE: tree */ - u8 pad[2]; /* 2: pad to word boundary */ -}; /* - 360 - */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ /* * dmap page per 8K blocks bitmap */ struct dmap { - __le32 nblocks; /* 4: num blks covered by this dmap */ - __le32 nfree; /* 4: num of free blks in this dmap */ - __le64 start; /* 8: starting blkno for this dmap */ - struct dmaptree tree; /* 360: dmap tree */ - u8 pad[1672]; /* 1672: pad to 2048 bytes */ - __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ - __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ -}; /* - 4096 - */ + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ /* * disk map control page per level. @@ -173,14 +173,14 @@ struct dmap { * dmapctl must be consistent with dmaptree. */ struct dmapctl { - __le32 nleafs; /* 4: number of tree leafs */ - __le32 l2nleafs; /* 4: l2 number of tree leafs */ - __le32 leafidx; /* 4: index of the first tree leaf */ - __le32 height; /* 4: height of tree */ - s8 budmin; /* 1: minimum l2 tree leaf value */ - s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ - u8 pad[2714]; /* 2714: pad to 4096 */ -}; /* - 4096 - */ + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ /* * common definition for dmaptree within dmap and dmapctl @@ -202,41 +202,41 @@ typedef union dmtree { * on-disk aggregate disk allocation map descriptor. */ struct dbmap_disk { - __le64 dn_mapsize; /* 8: number of blocks in aggregate */ - __le64 dn_nfree; /* 8: num free blks in aggregate map */ - __le32 dn_l2nbperpage; /* 4: number of blks per page */ - __le32 dn_numag; /* 4: total number of ags */ - __le32 dn_maxlevel; /* 4: number of active ags */ - __le32 dn_maxag; /* 4: max active alloc group number */ - __le32 dn_agpref; /* 4: preferred alloc group (hint) */ - __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ - __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ - __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ - __le32 dn_agstart; /* 4: start tree index at AG height */ - __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ - __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ - __le64 dn_agsize; /* 8: num of blks per alloc group */ - s8 dn_maxfreebud; /* 1: max free buddy system */ - u8 pad[3007]; /* 3007: pad to 4096 */ -}; /* - 4096 - */ + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ struct dbmap { - s64 dn_mapsize; /* number of blocks in aggregate */ - s64 dn_nfree; /* num free blks in aggregate map */ - int dn_l2nbperpage; /* number of blks per page */ - int dn_numag; /* total number of ags */ - int dn_maxlevel; /* number of active ags */ - int dn_maxag; /* max active alloc group number */ - int dn_agpref; /* preferred alloc group (hint) */ - int dn_aglevel; /* dmapctl level holding the AG */ - int dn_agheigth; /* height in dmapctl of the AG */ - int dn_agwidth; /* width in dmapctl of the AG */ - int dn_agstart; /* start tree index at AG height */ - int dn_agl2size; /* l2 num of blks per alloc group */ - s64 dn_agfree[MAXAG]; /* per AG free count */ - s64 dn_agsize; /* num of blks per alloc group */ - signed char dn_maxfreebud; /* max free buddy system */ -}; /* - 4096 - */ + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheigth; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ /* * in-memory aggregate disk allocation map descriptor. */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 6d62f3222892..c14ba3cfa818 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, lv = &llck->lv[llck->index]; /* - * Linelock slot size is twice the size of directory table - * slot size. 512 entries per page. + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; @@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, btstack->nsplit = 1; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. @@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip, struct lv *lv; /* - * retrieve search result + * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of @@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip, DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* - * insert entry for new key + * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { @@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip, data.leaf.ino = *fsn; /* - * leaf page does not have enough room for new entry: + * leaf page does not have enough room for new entry: * - * extend/split the leaf page; + * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ @@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip, } /* - * leaf page does have enough room for new entry: + * leaf page does have enough room for new entry: * - * insert the new data entry into the leaf page; + * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* @@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page + * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* - * split root leaf page: + * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* @@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid, } /* - * extend first leaf page + * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) @@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid, } /* - * split leaf page into and a new right page . + * split leaf page into and a new right page . * * return pinned and its extent descriptor */ @@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, rp->header.freecnt = rp->header.maxslot - fsi; /* - * sequential append at tail: append without split + * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, left = 0; /* - * compute fill factor for split pages + * compute fill factor for split pages * * traces the next entry to move to rp * traces the next entry to stay in sp @@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, /* poins to the 1st entry to move */ /* - * move entries to right page + * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * @@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid, return (rc); /* - * extend the extent + * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid, } /* - * extend the page + * extend the page */ sp->header.self = *pxd; @@ -1739,9 +1739,6 @@ static int dtExtendPage(tid_t tid, /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; -#ifdef _STILL_TO_PORT - bmSetXD(smp, xaddr, xsize); -#endif /* _STILL_TO_PORT */ /* * copy old stbl to new stbl at start of extended area @@ -1836,7 +1833,7 @@ static int dtExtendPage(tid_t tid, } /* - * update parent entry on the parent/root page + * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page @@ -1904,7 +1901,7 @@ static int dtSplitRoot(tid_t tid, sp = &JFS_IP(ip)->i_dtroot; /* - * allocate/initialize a single (right) child page + * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; @@ -1943,7 +1940,7 @@ static int dtSplitRoot(tid_t tid, rp->header.prev = 0; /* - * move in-line root page into new right page extent + * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); @@ -2016,7 +2013,7 @@ static int dtSplitRoot(tid_t tid, dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* - * reset parent/root page + * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. @@ -2102,7 +2099,7 @@ int dtDelete(tid_t tid, dtpage_t *np; /* - * search for the entry to delete: + * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ @@ -2253,7 +2250,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, int i; /* - * keep the root leaf page which has become empty + * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* @@ -2269,7 +2266,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, } /* - * free the non-root leaf page + * free the non-root leaf page */ /* * acquire a transaction lock on the page @@ -2299,7 +2296,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, discard_metapage(fmp); /* - * propagate page deletion up the directory tree + * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. @@ -2440,10 +2437,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, #ifdef _NOTYET /* - * NAME: dtRelocate() + * NAME: dtRelocate() * - * FUNCTION: relocate dtpage (internal or leaf) of directory; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. */ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, s64 nxaddr) @@ -2471,8 +2468,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, xlen); /* - * 1. get the internal parent dtpage covering - * router entry for the tartget page to be relocated; + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; */ rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); if (rc) @@ -2483,7 +2480,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, jfs_info("dtRelocate: parent router entry validated."); /* - * 2. relocate the target dtpage + * 2. relocate the target dtpage */ /* read in the target page from src extent */ DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); @@ -2581,9 +2578,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, /* update the buffer extent descriptor of the dtpage */ xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; -#ifdef _STILL_TO_PORT - bmSetXD(mp, nxaddr, xsize); -#endif /* _STILL_TO_PORT */ + /* unpin the relocated page */ DT_PUTPAGE(mp); jfs_info("dtRelocate: target dtpage relocated."); @@ -2594,7 +2589,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, */ /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; */ /* for dtpage relocation, write a LOG_NOREDOPAGE record * for the source dtpage (logredo() will init NoRedoPage @@ -2609,7 +2604,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, pxdlock->index = 1; /* - * 4. update the parent router entry for relocation; + * 4. update the parent router entry for relocation; * * acquire tlck for the parent entry covering the target dtpage; * write LOG_REDOPAGE to apply after image only; @@ -2637,7 +2632,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, * NAME: dtSearchNode() * * FUNCTION: Search for an dtpage containing a specified address - * This function is mainly used by defragfs utility. + * This function is mainly used by defragfs utility. * * NOTE: Search result on stack, the found page is pinned at exit. * The result page must be an internal dtpage. @@ -2660,7 +2655,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, BT_CLR(btstack); /* reset stack */ /* - * descend tree to the level with specified leftmost page + * descend tree to the level with specified leftmost page * * by convention, root bn = 0. */ @@ -2699,7 +2694,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, } /* - * search each page at the current levevl + * search each page at the current levevl */ loop: stbl = DT_GETSTBL(p); @@ -3044,9 +3039,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. - * Special cases: 0 = . - * 1 = .. - * -1 = End of directory + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory */ do_index = 1; @@ -3128,10 +3123,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." - * pn > 0: Real entries, pn=1 -> leftmost page - * pn = index = -1: No more entries + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries */ dtpos = filp->f_pos; if (dtpos == 0) { @@ -3351,7 +3346,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) BT_CLR(btstack); /* reset stack */ /* - * descend leftmost path of the tree + * descend leftmost path of the tree * * by convention, root bn = 0. */ @@ -4531,7 +4526,7 @@ int dtModify(tid_t tid, struct inode *ip, struct ldtentry *entry; /* - * search for the entry to modify: + * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index af8513f78648..8561c6ecece0 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -35,7 +35,7 @@ typedef union { /* - * entry segment/slot + * entry segment/slot * * an entry consists of type dependent head/only segment/slot and * additional segments/slots linked vi next field; diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index a35bdca6a805..7ae1e3281de9 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); #endif static s64 extRoundDown(s64 nb); -#define DPD(a) (printk("(a): %d\n",(a))) -#define DPC(a) (printk("(a): %c\n",(a))) +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) #define DPL1(a) \ { \ if ((a) >> 32) \ @@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb); printk("(a): %x\n",(a) << 32); \ } -#define DPD1(a) (printk("(a): %d ",(a))) -#define DPX(a) (printk("(a): %08x\n",(a))) -#define DPX1(a) (printk("(a): %08x ",(a))) -#define DPS(a) (printk("%s\n",(a))) -#define DPE(a) (printk("\nENTERING: %s\n",(a))) -#define DPE1(a) (printk("\nENTERING: %s",(a))) -#define DPS1(a) (printk(" %s ",(a))) +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) /* * NAME: extAlloc() * - * FUNCTION: allocate an extent for a specified page range within a + * FUNCTION: allocate an extent for a specified page range within a * file. * * PARAMETERS: @@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb); * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) @@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) #ifdef _NOTYET /* - * NAME: extRealloc() + * NAME: extRealloc() * - * FUNCTION: extend the allocation of a file extent containing a + * FUNCTION: extend the allocation of a file extent containing a * partial back last page. * * PARAMETERS: @@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) * should be marked as allocated but not recorded. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) { @@ -345,9 +345,9 @@ exit: /* - * NAME: extHint() + * NAME: extHint() * - * FUNCTION: produce an extent allocation hint for a file offset. + * FUNCTION: produce an extent allocation hint for a file offset. * * PARAMETERS: * ip - the inode of the file. @@ -356,8 +356,8 @@ exit: * the hint. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int extHint(struct inode *ip, s64 offset, xad_t * xp) { @@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) lxdl.nlxd = 1; lxdl.lxd = &lxd; LXDoffset(&lxd, prev) - LXDlength(&lxd, nbperpage); + LXDlength(&lxd, nbperpage); xadl.maxnxad = 1; xadl.nxad = 0; @@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) return (rc); - /* check if not extent exists for the previous page. + /* check if no extent exists for the previous page. * this is possible for sparse files. */ if (xadl.nxad == 0) { -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); return (0); } @@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) */ xp->flag &= XAD_NOTRECORDED; - if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { + if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { jfs_error(ip->i_sb, "extHint: corrupt xtree"); return -EIO; - } + } return (0); } /* - * NAME: extRecord() + * NAME: extRecord() * - * FUNCTION: change a page with a file from not recorded to recorded. + * FUNCTION: change a page with a file from not recorded to recorded. * * PARAMETERS: * ip - inode of the file. * cp - cbuf of the file page. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extRecord(struct inode *ip, xad_t * xp) { @@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp) #ifdef _NOTYET /* - * NAME: extFill() + * NAME: extFill() * - * FUNCTION: allocate disk space for a file page that represents + * FUNCTION: allocate disk space for a file page that represents * a file hole. * * PARAMETERS: @@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp) * cp - cbuf of the file page represent the hole. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ int extFill(struct inode *ip, xad_t * xp) { int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; s64 blkno = offsetXAD(xp) >> ip->i_blkbits; -// assert(ISSPARSE(ip)); +// assert(ISSPARSE(ip)); /* initialize the extent allocation hint */ XADaddress(xp, 0); @@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp) /* * NAME: extBalloc() * - * FUNCTION: allocate disk blocks to form an extent. + * FUNCTION: allocate disk blocks to form an extent. * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks @@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp) * allocated block range. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) @@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* * NAME: extBrealloc() * - * FUNCTION: attempt to extend an extent's allocation. + * FUNCTION: attempt to extend an extent's allocation. * * Initially, we will try to extend the extent's allocation * in place. If this fails, we'll try to move the extent @@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * * PARAMETERS: * ip - the inode of the file. - * blkno - starting block number of the extents current allocation. - * nblks - number of blocks within the extents current allocation. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. * newnblks - pointer to a s64 value. on entry, this value is the * the new desired extent size (number of blocks). on * successful exit, this value is set to the extent's actual @@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * newblkno - the starting block number of the extents new allocation. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOSPC - insufficient disk resources. + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. */ static int extBrealloc(struct inode *ip, @@ -634,16 +634,16 @@ extBrealloc(struct inode *ip, /* - * NAME: extRoundDown() + * NAME: extRoundDown() * - * FUNCTION: round down a specified number of blocks to the next + * FUNCTION: round down a specified number of blocks to the next * smallest power of 2 number. * * PARAMETERS: * nb - the inode of the file. * * RETURN VALUES: - * next smallest power of 2 number. + * next smallest power of 2 number. */ static s64 extRoundDown(s64 nb) { diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 38f70ac03bec..b3f5463fbe52 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -34,9 +34,9 @@ #define JFS_UNICODE 0x00000001 /* unicode name */ /* mount time flags for error handling */ -#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ -#define JFS_ERR_CONTINUE 0x00000004 /* continue */ -#define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ /* Quota support */ #define JFS_USRQUOTA 0x00000010 @@ -83,7 +83,6 @@ /* case-insensitive name/directory support */ #define JFS_AIX 0x80000000 /* AIX support */ -/* POSIX name/directory support - Never implemented*/ /* * buffer cache configuration @@ -113,10 +112,10 @@ #define IDATASIZE 256 /* inode inline data size */ #define IXATTRSIZE 128 /* inode inline extended attribute size */ -#define XTPAGE_SIZE 4096 -#define log2_PAGESIZE 12 +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 -#define IAG_SIZE 4096 +#define IAG_SIZE 4096 #define IAG_EXTENT_SIZE 4096 #define INOSPERIAG 4096 /* number of disk inodes per iag */ #define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index c6530227cda6..f5c5227b0d1f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *); static void copy_to_dinode(struct dinode *, struct inode *); /* - * NAME: diMount() + * NAME: diMount() * - * FUNCTION: initialize the incore inode map control structures for + * FUNCTION: initialize the incore inode map control structures for * a fileset or aggregate init time. * - * the inode map's control structure (dinomap) is - * brought in from disk and placed in virtual memory. + * the inode map's control structure (dinomap) is + * brought in from disk and placed in virtual memory. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diMount(struct inode *ipimap) { @@ -180,18 +180,18 @@ int diMount(struct inode *ipimap) /* - * NAME: diUnmount() + * NAME: diUnmount() * - * FUNCTION: write to disk the incore inode map control structures for + * FUNCTION: write to disk the incore inode map control structures for * a fileset or aggregate at unmount time. * * PARAMETERS: - * ipimap - pointer to inode map inode for the aggregate or fileset. + * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. */ int diUnmount(struct inode *ipimap, int mounterror) { @@ -274,9 +274,9 @@ int diSync(struct inode *ipimap) /* - * NAME: diRead() + * NAME: diRead() * - * FUNCTION: initialize an incore inode from disk. + * FUNCTION: initialize an incore inode from disk. * * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the @@ -285,7 +285,7 @@ int diSync(struct inode *ipimap) * this routine handles incore inode initialization for * both "special" and "regular" inodes. special inodes * are those required early in the mount process and - * require special handling since much of the file system + * require special handling since much of the file system * is not yet initialized. these "special" inodes are * identified by a NULL inode map inode pointer and are * actually initialized by a call to diReadSpecial(). @@ -298,12 +298,12 @@ int diSync(struct inode *ipimap) * incore inode. * * PARAMETERS: - * ip - pointer to incore inode to be initialized from disk. + * ip - pointer to incore inode to be initialized from disk. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. - * -ENOMEM - insufficient memory + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory * */ int diRead(struct inode *ip) @@ -410,26 +410,26 @@ int diRead(struct inode *ip) /* - * NAME: diReadSpecial() + * NAME: diReadSpecial() * - * FUNCTION: initialize a 'special' inode from disk. + * FUNCTION: initialize a 'special' inode from disk. * * this routines handles aggregate level inodes. The * inode cache cannot differentiate between the * aggregate inodes and the filesystem inodes, so we * handle these here. We don't actually use the aggregate - * inode map, since these inodes are at a fixed location + * inode map, since these inodes are at a fixed location * and in some cases the aggregate inode map isn't initialized * yet. * * PARAMETERS: - * sb - filesystem superblock + * sb - filesystem superblock * inum - aggregate inode number * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: - * new inode - success - * NULL - i/o error. + * new inode - success + * NULL - i/o error. */ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) { @@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) } /* - * NAME: diWriteSpecial() + * NAME: diWriteSpecial() * - * FUNCTION: Write the special inode to disk + * FUNCTION: Write the special inode to disk * * PARAMETERS: - * ip - special inode + * ip - special inode * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: none @@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary) } /* - * NAME: diFreeSpecial() + * NAME: diFreeSpecial() * - * FUNCTION: Free allocated space for special inode + * FUNCTION: Free allocated space for special inode */ void diFreeSpecial(struct inode *ip) { @@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip) /* - * NAME: diWrite() + * NAME: diWrite() * - * FUNCTION: write the on-disk inode portion of the in-memory inode + * FUNCTION: write the on-disk inode portion of the in-memory inode * to its corresponding on-disk inode. * * on entry, the specifed incore inode should itself @@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip) * * PARAMETERS: * tid - transacation id - * ip - pointer to incore inode to be written to the inode extent. + * ip - pointer to incore inode to be written to the inode extent. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int diWrite(tid_t tid, struct inode *ip) { @@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip) ilinelock = (struct linelock *) & tlck->lock; /* - * regular file: 16 byte (XAD slot) granularity + * regular file: 16 byte (XAD slot) granularity */ if (type & tlckXTREE) { xtpage_t *p, *xp; @@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip) xad->flag &= ~(XAD_NEW | XAD_EXTENDED); } /* - * directory: 32 byte (directory entry slot) granularity + * directory: 32 byte (directory entry slot) granularity */ else if (type & tlckDTREE) { dtpage_t *p, *xp; @@ -800,9 +800,8 @@ int diWrite(tid_t tid, struct inode *ip) } /* - * lock/copy inode base: 128 byte slot granularity + * lock/copy inode base: 128 byte slot granularity */ -// baseDinode: lv = & dilinelock->lv[dilinelock->index]; lv->offset = dioffset >> L2INODESLOTSIZE; copy_to_dinode(dp, ip); @@ -813,17 +812,6 @@ int diWrite(tid_t tid, struct inode *ip) lv->length = 1; dilinelock->index++; -#ifdef _JFS_FASTDASD - /* - * We aren't logging changes to the DASD used in directory inodes, - * but we need to write them to disk. If we don't unmount cleanly, - * mount will recalculate the DASD used. - */ - if (S_ISDIR(ip->i_mode) - && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED)) - memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd)); -#endif /* _JFS_FASTDASD */ - /* release the buffer holding the updated on-disk inode. * the buffer will be later written by commit processing. */ @@ -834,9 +822,9 @@ int diWrite(tid_t tid, struct inode *ip) /* - * NAME: diFree(ip) + * NAME: diFree(ip) * - * FUNCTION: free a specified inode from the inode working map + * FUNCTION: free a specified inode from the inode working map * for a fileset or aggregate. * * if the inode to be freed represents the first (only) @@ -865,11 +853,11 @@ int diWrite(tid_t tid, struct inode *ip) * any updates and are held until all updates are complete. * * PARAMETERS: - * ip - inode to be freed. + * ip - inode to be freed. * * RETURN VALUES: - * 0 - success - * -EIO - i/o error. + * 0 - success + * -EIO - i/o error. */ int diFree(struct inode *ip) { @@ -964,8 +952,8 @@ int diFree(struct inode *ip) return -EIO; } /* - * inode extent still has some inodes or below low water mark: - * keep the inode extent; + * inode extent still has some inodes or below low water mark: + * keep the inode extent; */ if (bitmap || imap->im_agctl[agno].numfree < 96 || @@ -1047,12 +1035,12 @@ int diFree(struct inode *ip) /* - * inode extent has become free and above low water mark: - * free the inode extent; + * inode extent has become free and above low water mark: + * free the inode extent; */ /* - * prepare to update iag list(s) (careful update step 1) + * prepare to update iag list(s) (careful update step 1) */ amp = bmp = cmp = dmp = NULL; fwd = back = -1; @@ -1152,7 +1140,7 @@ int diFree(struct inode *ip) invalidate_pxd_metapages(ip, freepxd); /* - * update iag list(s) (careful update step 2) + * update iag list(s) (careful update step 2) */ /* add the iag to the ag extent free list if this is the * first free extent for the iag. @@ -1338,20 +1326,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) /* - * NAME: diAlloc(pip,dir,ip) + * NAME: diAlloc(pip,dir,ip) * - * FUNCTION: allocate a disk inode from the inode working map + * FUNCTION: allocate a disk inode from the inode working map * for a fileset or aggregate. * * PARAMETERS: - * pip - pointer to incore inode for the parent inode. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ int diAlloc(struct inode *pip, bool dir, struct inode *ip) { @@ -1433,7 +1421,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); /* - * try to allocate from the IAG + * try to allocate from the IAG */ /* check if the inode may be allocated from the iag * (i.e. the inode has free inodes or new extent can be added). @@ -1633,9 +1621,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* - * NAME: diAllocAG(imap,agno,dir,ip) + * NAME: diAllocAG(imap,agno,dir,ip) * - * FUNCTION: allocate a disk inode from the allocation group. + * FUNCTION: allocate a disk inode from the allocation group. * * this routine first determines if a new extent of free * inodes should be added for the allocation group, with @@ -1649,17 +1637,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * PRE CONDITION: Already have the AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group to allocate from. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to the new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1709,9 +1697,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocAny(imap,agno,dir,iap) + * NAME: diAllocAny(imap,agno,dir,iap) * - * FUNCTION: allocate a disk inode from any other allocation group. + * FUNCTION: allocate a disk inode from any other allocation group. * * this routine is called when an allocation attempt within * the primary allocation group has failed. if attempts to @@ -1719,17 +1707,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) * specified primary group. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - primary allocation group (to avoid). - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1772,9 +1760,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) /* - * NAME: diAllocIno(imap,agno,ip) + * NAME: diAllocIno(imap,agno,ip) * - * FUNCTION: allocate a disk inode from the allocation group's free + * FUNCTION: allocate a disk inode from the allocation group's free * inode list, returning an error if this free list is * empty (i.e. no iags on the list). * @@ -1785,16 +1773,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) * PRE CONDITION: Already have AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) { @@ -1890,7 +1878,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocExt(imap,agno,ip) + * NAME: diAllocExt(imap,agno,ip) * * FUNCTION: add a new extent of free inodes to an iag, allocating * an inode from this extent to satisfy the current allocation @@ -1910,16 +1898,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) * for the purpose of satisfying this request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group number. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) { @@ -2010,7 +1998,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) /* - * NAME: diAllocBit(imap,iagp,ino) + * NAME: diAllocBit(imap,iagp,ino) * * FUNCTION: allocate a backed inode from an iag. * @@ -2030,14 +2018,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * ino - inode number to be allocated within the iag. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) { @@ -2144,11 +2132,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) /* - * NAME: diNewExt(imap,iagp,extno) + * NAME: diNewExt(imap,iagp,extno) * - * FUNCTION: initialize a new extent of inodes for an iag, allocating - * the first inode of the extent for use for the current - * allocation request. + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. * * disk resources are allocated for the new extent of inodes * and the inodes themselves are initialized to reflect their @@ -2177,14 +2165,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * extno - extent number. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. */ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) { @@ -2430,7 +2418,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* - * NAME: diNewIAG(imap,iagnop,agno) + * NAME: diNewIAG(imap,iagnop,agno) * * FUNCTION: allocate a new iag for an allocation group. * @@ -2443,16 +2431,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * and returned to satisfy the request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagnop - pointer to an iag number set with the number of the + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the * newly allocated iag upon successful return. - * agno - allocation group number. + * agno - allocation group number. * bpp - Buffer pointer to be filled in with new IAG's buffer * * RETURN VALUES: - * 0 - success. - * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. * * serialization: * AG lock held on entry/exit; @@ -2461,7 +2449,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) * * note: new iag transaction: * . synchronously write iag; - * . write log of xtree and inode of imap; + * . write log of xtree and inode of imap; * . commit; * . synchronous write of xtree (right to left, bottom to top); * . at start of logredo(): init in-memory imap with one additional iag page; @@ -2481,9 +2469,6 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) s64 xaddr = 0; s64 blkno; tid_t tid; -#ifdef _STILL_TO_PORT - xad_t xad; -#endif /* _STILL_TO_PORT */ struct inode *iplist[1]; /* pick up pointers to the inode map and mount inodes */ @@ -2674,15 +2659,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) } /* - * NAME: diIAGRead() + * NAME: diIAGRead() * - * FUNCTION: get the buffer for the specified iag within a fileset + * FUNCTION: get the buffer for the specified iag within a fileset * or aggregate inode map. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagno - iag number. - * bpp - point to buffer pointer to be filled in on successful + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful * exit. * * SERIALIZATION: @@ -2691,8 +2676,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) * the read lock is unnecessary.) * * RETURN VALUES: - * 0 - success. - * -EIO - i/o error. + * 0 - success. + * -EIO - i/o error. */ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) { @@ -2712,17 +2697,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) } /* - * NAME: diFindFree() + * NAME: diFindFree() * - * FUNCTION: find the first free bit in a word starting at + * FUNCTION: find the first free bit in a word starting at * the specified bit position. * * PARAMETERS: - * word - word to be examined. - * start - starting bit position. + * word - word to be examined. + * start - starting bit position. * * RETURN VALUES: - * bit position of first free bit in the word or 32 if + * bit position of first free bit in the word or 32 if * no free bits were found. */ static int diFindFree(u32 word, int start) @@ -2897,7 +2882,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) atomic_read(&imap->im_numfree)); /* - * reconstruct imap + * reconstruct imap * * coalesce contiguous k (newAGSize/oldAGSize) AGs; * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; @@ -2913,7 +2898,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) } /* - * process each iag page of the map. + * process each iag page of the map. * * rebuild AG Free Inode List, AG Free Inode Extent List; */ @@ -2932,7 +2917,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) /* leave free iag in the free iag list */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { - release_metapage(bp); + release_metapage(bp); continue; } @@ -3063,13 +3048,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, } /* - * NAME: copy_from_dinode() + * NAME: copy_from_dinode() * - * FUNCTION: Copies inode info from disk inode to in-memory inode + * FUNCTION: Copies inode info from disk inode to in-memory inode * * RETURN VALUES: - * 0 - success - * -ENOMEM - insufficient memory + * 0 - success + * -ENOMEM - insufficient memory */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { @@ -3151,9 +3136,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } /* - * NAME: copy_to_dinode() + * NAME: copy_to_dinode() * - * FUNCTION: Copies inode info from in-memory inode to disk inode + * FUNCTION: Copies inode info from in-memory inode to disk inode */ static void copy_to_dinode(struct dinode * dip, struct inode *ip) { diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h index 4f9c346ed498..610a0e9d8941 100644 --- a/fs/jfs/jfs_imap.h +++ b/fs/jfs/jfs_imap.h @@ -24,17 +24,17 @@ * jfs_imap.h: disk inode manager */ -#define EXTSPERIAG 128 /* number of disk inode extent per iag */ -#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ -#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ #define EXTSPERSUM 32 /* number of extents per summary map entry */ #define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ #define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ -#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ -#define MAXAG 128 /* maximum number of allocation groups */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ -#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ -#define SMAPSIZE 16 /* bytes in the IAG summary maps */ +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ /* convert inode number to iag number */ #define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) @@ -60,31 +60,31 @@ * inode allocation group page (per 4096 inodes of an AG) */ struct iag { - __le64 agstart; /* 8: starting block of ag */ - __le32 iagnum; /* 4: inode allocation group number */ - __le32 inofreefwd; /* 4: ag inode free list forward */ - __le32 inofreeback; /* 4: ag inode free list back */ - __le32 extfreefwd; /* 4: ag inode extent free list forward */ - __le32 extfreeback; /* 4: ag inode extent free list back */ - __le32 iagfree; /* 4: iag free list */ + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ /* summary map: 1 bit per inode extent */ __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; - * note: this indicates free and backed - * inodes, if the extent is not backed the - * value will be 1. if the extent is - * backed but all inodes are being used the - * value will be 1. if the extent is - * backed but at least one of the inodes is - * free the value will be 0. + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. */ __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ - __le32 nfreeinos; /* 4: number of free inodes */ - __le32 nfreeexts; /* 4: number of free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ /* (72) */ u8 pad[1976]; /* 1976: pad to 2048 bytes */ /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ - __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ }; /* (4096) */ @@ -93,44 +93,44 @@ struct iag { * per AG control information (in inode map control page) */ struct iagctl_disk { - __le32 inofree; /* 4: free inode list anchor */ - __le32 extfree; /* 4: free extent list anchor */ - __le32 numinos; /* 4: number of backed inodes */ - __le32 numfree; /* 4: number of free inodes */ + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ }; /* (16) */ struct iagctl { - int inofree; /* free inode list anchor */ - int extfree; /* free extent list anchor */ - int numinos; /* number of backed inodes */ - int numfree; /* number of free inodes */ + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ }; /* * per fileset/aggregate inode map control page */ struct dinomap_disk { - __le32 in_freeiag; /* 4: free iag list anchor */ - __le32 in_nextiag; /* 4: next free iag number */ - __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ __le32 in_numfree; /* 4: num of free backed inodes */ __le32 in_nbperiext; /* 4: num of blocks per inode extent */ - __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ - __le32 in_diskblock; /* 4: for standalone test driver */ - __le32 in_maxag; /* 4: for standalone test driver */ - u8 pad[2016]; /* 2016: pad to 2048 */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ }; /* (4096) */ struct dinomap { - int in_freeiag; /* free iag list anchor */ - int in_nextiag; /* next free iag number */ - int in_numinos; /* num of backed inodes */ - int in_numfree; /* num of free backed inodes */ + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ int in_nbperiext; /* num of blocks per inode extent */ - int in_l2nbperiext; /* l2 of in_nbperiext */ - int in_diskblock; /* for standalone test driver */ - int in_maxag; /* for standalone test driver */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ struct iagctl in_agctl[MAXAG]; /* AG control information */ }; @@ -139,9 +139,9 @@ struct dinomap { */ struct inomap { struct dinomap im_imap; /* 4096: inode allocation control */ - struct inode *im_ipimap; /* 4: ptr to inode for imap */ - struct mutex im_freelock; /* 4: iag free list lock */ - struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ u32 *im_DBGdimap; atomic_t im_numinos; /* num of backed inodes */ atomic_t im_numfree; /* num of free backed inodes */ diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 8f453eff3c83..cb8f30985ad1 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -40,7 +40,7 @@ struct jfs_inode_info { uint mode2; /* jfs-specific mode */ uint saved_uid; /* saved for uid mount option */ uint saved_gid; /* saved for gid mount option */ - pxd_t ixpxd; /* inode extent descriptor */ + pxd_t ixpxd; /* inode extent descriptor */ dxd_t acl; /* dxd describing acl */ dxd_t ea; /* dxd describing ea */ time_t otime; /* time created */ @@ -190,7 +190,7 @@ struct jfs_sb_info { uint gengen; /* inode generation generator*/ uint inostamp; /* shows inode belongs to fileset*/ - /* Formerly in ipbmap */ + /* Formerly in ipbmap */ struct bmap *bmap; /* incore bmap descriptor */ struct nls_table *nls_tab; /* current codepage */ struct inode *direct_inode; /* metadata inode */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 44a2f33cb98d..88eae45a4644 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto writeRecord; /* - * initialize/update page/transaction recovery lsn + * initialize/update page/transaction recovery lsn */ lsn = log->lsn; @@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * initialize/update lsn of tblock of the page + * initialize/update lsn of tblock of the page * * transaction inherits oldest lsn of pages associated * with allocation/deallocation of resources (their @@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, LOGSYNC_UNLOCK(log, flags); /* - * write the log record + * write the log record */ writeRecord: lsn = lmWriteRecord(log, tblk, lrd, tlck); @@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, goto moveLrd; /* - * move log record data + * move log record data */ /* retrieve source meta-data page to log */ if (tlck->flag & tlckPAGELOCK) { @@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * move log record descriptor + * move log record descriptor */ moveLrd: lrd->length = cpu_to_le16(len); @@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_LOCK(log); /* - * write or queue the full page at the tail of write queue + * write or queue the full page at the tail of write queue */ /* get the tail tblk on commit queue */ if (list_empty(&log->cqueue)) @@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log) LOGGC_UNLOCK(log); /* - * allocate/initialize next page + * allocate/initialize next page */ /* if log wraps, the first data page of log is 2 * (0 never used, 1 is superblock). @@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) } /* - * forward syncpt + * forward syncpt */ /* if last sync is same as last syncpt, * invoke sync point forward processing to update sync. @@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) lsn = log->lsn; /* - * setup next syncpt trigger (SWAG) + * setup next syncpt trigger (SWAG) */ logsize = log->logsize; @@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) if (more < 2 * LOGPSIZE) { jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); /* - * log wrapping + * log wrapping * * option 1 - panic ? No.! * option 2 - shutdown file systems - * associated with log ? + * associated with log ? * option 3 - extend log ? */ /* @@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) /* * NAME: lmLogOpen() * - * FUNCTION: open the log on first open; + * FUNCTION: open the log on first open; * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode @@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb) init_waitqueue_head(&log->syncwait); /* - * external log as separate logical volume + * external log as separate logical volume * * file systems to log may have n-to-1 relationship; */ @@ -1155,7 +1155,7 @@ journal_found: return 0; /* - * unwind on error + * unwind on error */ shutdown: /* unwind lbmLogInit() */ list_del(&log->journal_list); @@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log) return 0; /* - * unwind on error + * unwind on error */ errout30: /* release log page */ log->wqueue = NULL; @@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb) if (test_bit(log_INLINELOG, &log->flag)) { /* - * in-line log in host file system + * in-line log in host file system */ rc = lmLogShutdown(log); kfree(log); @@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb) goto out; /* - * external log as separate logical volume + * external log as separate logical volume */ list_del(&log->journal_list); bdev = log->bdev; @@ -1723,7 +1723,7 @@ int lmLogShutdown(struct jfs_log * log) * * PARAMETE: log - pointer to logs inode. * fsdev - kdev_t of filesystem. - * serial - pointer to returned log serial number + * serial - pointer to returned log serial number * activate - insert/remove device from active list. * * RETURN: 0 - success @@ -1963,7 +1963,7 @@ static void lbmfree(struct lbuf * bp) * FUNCTION: add a log buffer to the log redrive list * * PARAMETER: - * bp - log buffer + * bp - log buffer * * NOTES: * Takes log_redrive_lock. @@ -2054,7 +2054,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, bp->l_flag = flag; /* - * insert bp at tail of write queue associated with log + * insert bp at tail of write queue associated with log * * (request is either for bp already/currently at head of queue * or new bp to be inserted at tail) @@ -2117,7 +2117,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); /* - * initiate pageout of the page + * initiate pageout of the page */ lbmStartIO(bp); } @@ -2128,7 +2128,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) * * FUNCTION: Interface to DD strategy routine * - * RETURN: none + * RETURN: none * * serialization: LCACHE_LOCK() is NOT held during log i/o; */ @@ -2222,7 +2222,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) bio_put(bio); /* - * pagein completion + * pagein completion */ if (bp->l_flag & lbmREAD) { bp->l_flag &= ~lbmREAD; @@ -2236,7 +2236,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * pageout completion + * pageout completion * * the bp at the head of write queue has completed pageout. * @@ -2302,7 +2302,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * synchronous pageout: + * synchronous pageout: * * buffer has not necessarily been removed from write queue * (e.g., synchronous write of partial-page with COMMIT): @@ -2316,7 +2316,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * Group Commit pageout: + * Group Commit pageout: */ else if (bp->l_flag & lbmGC) { LCACHE_UNLOCK(flags); @@ -2324,7 +2324,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) } /* - * asynchronous pageout: + * asynchronous pageout: * * buffer must have been removed from write queue: * insert buffer at head of freelist where it can be recycled @@ -2375,7 +2375,7 @@ int jfsIOWait(void *arg) * FUNCTION: format file system log * * PARAMETERS: - * log - volume log + * log - volume log * logAddress - start address of log space in FS block * logSize - length of log space in FS block; * @@ -2407,16 +2407,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) npages = logSize >> sbi->l2nbperpage; /* - * log space: + * log space: * * page 0 - reserved; * page 1 - log superblock; * page 2 - log data page: A SYNC log record is written - * into this page at logform time; + * into this page at logform time; * pages 3-N - log data page: set to empty log data pages; */ /* - * init log superblock: log page 1 + * init log superblock: log page 1 */ logsuper = (struct logsuper *) bp->l_ldata; @@ -2436,7 +2436,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * init pages 2 to npages-1 as log data pages: + * init pages 2 to npages-1 as log data pages: * * log page sequence number (lpsn) initialization: * @@ -2479,7 +2479,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) goto exit; /* - * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) */ for (lspn = 0; lspn < npages - 3; lspn++) { lp->h.page = lp->t.page = cpu_to_le32(lspn); @@ -2495,7 +2495,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) rc = 0; exit: /* - * finalize log + * finalize log */ /* release the buffer */ lbmFree(bp); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index a53fb17ea219..1f85ef0ec045 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -144,7 +144,7 @@ struct logpage { * * (this comment should be rewritten !) * jfs uses only "after" log records (only a single writer is allowed - * in a page, pages are written to temporary paging space if + * in a page, pages are written to temporary paging space if * if they must be written to disk before commit, and i/o is * scheduled for modified pages to their home location after * the log records containing the after values and the commit @@ -153,7 +153,7 @@ struct logpage { * * a log record consists of a data area of variable length followed by * a descriptor of fixed size LOGRDSIZE bytes. - * the data area is rounded up to an integral number of 4-bytes and + * the data area is rounded up to an integral number of 4-bytes and * must be no longer than LOGPSIZE. * the descriptor is of size of multiple of 4-bytes and aligned on a * 4-byte boundary. @@ -215,13 +215,13 @@ struct lrd { union { /* - * COMMIT: commit + * COMMIT: commit * * transaction commit: no type-dependent information; */ /* - * REDOPAGE: after-image + * REDOPAGE: after-image * * apply after-image; * @@ -236,7 +236,7 @@ struct lrd { } redopage; /* (20) */ /* - * NOREDOPAGE: the page is freed + * NOREDOPAGE: the page is freed * * do not apply after-image records which precede this record * in the log with the same page block number to this page. @@ -252,7 +252,7 @@ struct lrd { } noredopage; /* (20) */ /* - * UPDATEMAP: update block allocation map + * UPDATEMAP: update block allocation map * * either in-line PXD, * or out-of-line XADLIST; @@ -268,7 +268,7 @@ struct lrd { } updatemap; /* (20) */ /* - * NOREDOINOEXT: the inode extent is freed + * NOREDOINOEXT: the inode extent is freed * * do not apply after-image records which precede this * record in the log with the any of the 4 page block @@ -286,7 +286,7 @@ struct lrd { } noredoinoext; /* (20) */ /* - * SYNCPT: log sync point + * SYNCPT: log sync point * * replay log upto syncpt address specified; */ @@ -295,13 +295,13 @@ struct lrd { } syncpt; /* - * MOUNT: file system mount + * MOUNT: file system mount * * file system mount: no type-dependent information; */ /* - * ? FREEXTENT: free specified extent(s) + * ? FREEXTENT: free specified extent(s) * * free specified extent(s) from block allocation map * N.B.: nextents should be length of data/sizeof(xad_t) @@ -314,7 +314,7 @@ struct lrd { } freextent; /* - * ? NOREDOFILE: this file is freed + * ? NOREDOFILE: this file is freed * * do not apply records which precede this record in the log * with the same inode number. @@ -330,7 +330,7 @@ struct lrd { } noredofile; /* - * ? NEWPAGE: + * ? NEWPAGE: * * metadata type dependent */ @@ -342,7 +342,7 @@ struct lrd { } newpage; /* - * ? DUMMY: filler + * ? DUMMY: filler * * no type-dependent information */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 4dd479834897..644429acb8c0 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb); */ int jfs_mount(struct super_block *sb) { - int rc = 0; /* Return code */ + int rc = 0; /* Return code */ struct jfs_sb_info *sbi = JFS_SBI(sb); struct inode *ipaimap = NULL; struct inode *ipaimap2 = NULL; @@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb) sbi->ipaimap2 = NULL; /* - * mount (the only/single) fileset + * mount (the only/single) fileset */ /* * open fileset inode allocation map (aka fileset inode) @@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb) goto out; /* - * unwind on error + * unwind on error */ errout41: /* close fileset inode allocation map inode */ diFreeSpecial(ipimap); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 25430d0b0d59..f2dc4b986392 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -18,7 +18,7 @@ */ /* - * jfs_txnmgr.c: transaction manager + * jfs_txnmgr.c: transaction manager * * notes: * transaction starts with txBegin() and ends with txCommit() @@ -60,7 +60,7 @@ #include "jfs_debug.h" /* - * transaction management structures + * transaction management structures */ static struct { int freetid; /* index of a free tid structure */ @@ -103,19 +103,19 @@ module_param(nTxLock, int, 0); MODULE_PARM_DESC(nTxLock, "Number of transaction locks (max:65536)"); -struct tblock *TxBlock; /* transaction block table */ -static int TxLockLWM; /* Low water mark for number of txLocks used */ -static int TxLockHWM; /* High water mark for number of txLocks used */ -static int TxLockVHWM; /* Very High water mark */ -struct tlock *TxLock; /* transaction lock table */ +struct tblock *TxBlock; /* transaction block table */ +static int TxLockLWM; /* Low water mark for number of txLocks used */ +static int TxLockHWM; /* High water mark for number of txLocks used */ +static int TxLockVHWM; /* Very High water mark */ +struct tlock *TxLock; /* transaction lock table */ /* - * transaction management lock + * transaction management lock */ static DEFINE_SPINLOCK(jfsTxnLock); -#define TXN_LOCK() spin_lock(&jfsTxnLock) -#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) @@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) #define TXN_WAKEUP(event) wake_up_all(event) /* - * statistics + * statistics */ static struct { tid_t maxtid; /* 4: biggest tid ever used */ @@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, static void LogSyncRelease(struct metapage * mp); /* - * transaction block/lock management - * --------------------------------- + * transaction block/lock management + * --------------------------------- */ /* @@ -227,9 +227,9 @@ static void txLockFree(lid_t lid) } /* - * NAME: txInit() + * NAME: txInit() * - * FUNCTION: initialize transaction management structures + * FUNCTION: initialize transaction management structures * * RETURN: * @@ -333,9 +333,9 @@ int txInit(void) } /* - * NAME: txExit() + * NAME: txExit() * - * FUNCTION: clean up when module is unloaded + * FUNCTION: clean up when module is unloaded */ void txExit(void) { @@ -346,12 +346,12 @@ void txExit(void) } /* - * NAME: txBegin() + * NAME: txBegin() * - * FUNCTION: start a transaction. + * FUNCTION: start a transaction. * - * PARAMETER: sb - superblock - * flag - force for nested tx; + * PARAMETER: sb - superblock + * flag - force for nested tx; * * RETURN: tid - transaction id * @@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag) } /* - * NAME: txBeginAnon() + * NAME: txBeginAnon() * - * FUNCTION: start an anonymous transaction. + * FUNCTION: start an anonymous transaction. * Blocks if logsync or available tlocks are low to prevent * anonymous tlocks from depleting supply. * - * PARAMETER: sb - superblock + * PARAMETER: sb - superblock * * RETURN: none */ @@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb) } /* - * txEnd() + * txEnd() * * function: free specified transaction block. * - * logsync barrier processing: + * logsync barrier processing: * * serialization: */ @@ -577,13 +577,13 @@ wakeup: } /* - * txLock() + * txLock() * * function: acquire a transaction lock on the specified * * parameter: * - * return: transaction lock id + * return: transaction lock id * * serialization: */ @@ -857,17 +857,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, } /* - * NAME: txRelease() + * NAME: txRelease() * - * FUNCTION: Release buffers associated with transaction locks, but don't + * FUNCTION: Release buffers associated with transaction locks, but don't * mark homeok yet. The allows other transactions to modify * buffers, but won't let them go to disk until commit record * actually gets written. * * PARAMETER: - * tblk - + * tblk - * - * RETURN: Errors from subroutines. + * RETURN: Errors from subroutines. */ static void txRelease(struct tblock * tblk) { @@ -896,10 +896,10 @@ static void txRelease(struct tblock * tblk) } /* - * NAME: txUnlock() + * NAME: txUnlock() * - * FUNCTION: Initiates pageout of pages modified by tid in journalled - * objects and frees their lockwords. + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. */ static void txUnlock(struct tblock * tblk) { @@ -983,10 +983,10 @@ static void txUnlock(struct tblock * tblk) } /* - * txMaplock() + * txMaplock() * * function: allocate a transaction lock for freed page/entry; - * for freed page, maplock is used as xtlock/dtlock type; + * for freed page, maplock is used as xtlock/dtlock type; */ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) { @@ -1057,7 +1057,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) } /* - * txLinelock() + * txLinelock() * * function: allocate a transaction lock for log vector list */ @@ -1092,39 +1092,39 @@ struct linelock *txLinelock(struct linelock * tlock) } /* - * transaction commit management - * ----------------------------- + * transaction commit management + * ----------------------------- */ /* - * NAME: txCommit() - * - * FUNCTION: commit the changes to the objects specified in - * clist. For journalled segments only the - * changes of the caller are committed, ie by tid. - * for non-journalled segments the data are flushed to - * disk and then the change to the disk inode and indirect - * blocks committed (so blocks newly allocated to the - * segment will be made a part of the segment atomically). - * - * all of the segments specified in clist must be in - * one file system. no more than 6 segments are needed - * to handle all unix svcs. - * - * if the i_nlink field (i.e. disk inode link count) - * is zero, and the type of inode is a regular file or - * directory, or symbolic link , the inode is truncated - * to zero length. the truncation is committed but the - * VM resources are unaffected until it is closed (see - * iput and iclose). + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). * * PARAMETER: * * RETURN: * * serialization: - * on entry the inode lock on each segment is assumed - * to be held. + * on entry the inode lock on each segment is assumed + * to be held. * * i/o error: */ @@ -1175,7 +1175,7 @@ int txCommit(tid_t tid, /* transaction identifier */ if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) tblk->xflag |= COMMIT_LAZY; /* - * prepare non-journaled objects for commit + * prepare non-journaled objects for commit * * flush data pages of non-journaled file * to prevent the file getting non-initialized disk blocks @@ -1186,7 +1186,7 @@ int txCommit(tid_t tid, /* transaction identifier */ cd.nip = nip; /* - * acquire transaction lock on (on-disk) inodes + * acquire transaction lock on (on-disk) inodes * * update on-disk inode from in-memory inode * acquiring transaction locks for AFTER records @@ -1262,7 +1262,7 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * write log records from transaction locks + * write log records from transaction locks * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1294,7 +1294,7 @@ int txCommit(tid_t tid, /* transaction identifier */ !test_cflag(COMMIT_Nolink, tblk->u.ip))); /* - * write COMMIT log record + * write COMMIT log record */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; @@ -1303,7 +1303,7 @@ int txCommit(tid_t tid, /* transaction identifier */ lmGroupCommit(log, tblk); /* - * - transaction is now committed - + * - transaction is now committed - */ /* @@ -1314,11 +1314,11 @@ int txCommit(tid_t tid, /* transaction identifier */ txForce(tblk); /* - * update allocation map. + * update allocation map. * * update inode allocation map and inode: * free pager lock on memory object of inode if any. - * update block allocation map. + * update block allocation map. * * txUpdateMap() resets XAD_NEW in XAD. */ @@ -1326,7 +1326,7 @@ int txCommit(tid_t tid, /* transaction identifier */ txUpdateMap(tblk); /* - * free transaction locks and pageout/free pages + * free transaction locks and pageout/free pages */ txRelease(tblk); @@ -1335,7 +1335,7 @@ int txCommit(tid_t tid, /* transaction identifier */ /* - * reset in-memory object state + * reset in-memory object state */ for (k = 0; k < cd.nip; k++) { ip = cd.iplist[k]; @@ -1358,11 +1358,11 @@ int txCommit(tid_t tid, /* transaction identifier */ } /* - * NAME: txLog() + * NAME: txLog() * - * FUNCTION: Writes AFTER log records for all lines modified - * by tid for segments specified by inodes in comdata. - * Code assumes only WRITELOCKS are recorded in lockwords. + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. * * PARAMETERS: * @@ -1421,12 +1421,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) } /* - * diLog() + * diLog() * - * function: log inode tlock and format maplock to update bmap; + * function: log inode tlock and format maplock to update bmap; */ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd) + struct tlock * tlck, struct commit * cd) { int rc = 0; struct metapage *mp; @@ -1442,7 +1442,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd = &lrd->log.redopage.pxd; /* - * inode after image + * inode after image */ if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ @@ -1456,7 +1456,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, tlck->flag |= tlckWRITEPAGE; } else if (tlck->type & tlckFREE) { /* - * free inode extent + * free inode extent * * (pages of the freed inode extent have been invalidated and * a maplock for free of the extent has been formatted at @@ -1498,7 +1498,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, jfs_err("diLog: UFO type tlck:0x%p", tlck); #ifdef _JFS_WIP /* - * alloc/free external EA extent + * alloc/free external EA extent * * a maplock for txUpdateMap() to update bPWMAP for alloc/free * of the extent has been formatted at txLock() time; @@ -1534,9 +1534,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dataLog() + * dataLog() * - * function: log data tlock + * function: log data tlock */ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1580,9 +1580,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * dtLog() + * dtLog() * - * function: log dtree tlock and format maplock to update bmap; + * function: log dtree tlock and format maplock to update bmap; */ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1603,10 +1603,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); /* - * page extension via relocation: entry insertion; - * page extension in-place: entry insertion; - * new right page from page split, reinitialized in-line - * root from root page split: entry insertion; + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; */ if (tlck->type & (tlckNEW | tlckEXTEND)) { /* log after-image of the new page for logredo(): @@ -1641,8 +1641,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * entry insertion/deletion, - * sibling page link update (old right page before split); + * entry insertion/deletion, + * sibling page link update (old right page before split); */ if (tlck->type & (tlckENTRY | tlckRELINK)) { /* log after-image for logredo(): */ @@ -1658,11 +1658,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: page has been invalidated - * page relocation: source extent + * page deletion: page has been invalidated + * page relocation: source extent * - * a maplock for free of the page has been formatted - * at txLock() time); + * a maplock for free of the page has been formatted + * at txLock() time); */ if (tlck->type & (tlckFREE | tlckRELOCATE)) { /* log LOG_NOREDOPAGE of the deleted page for logredo() @@ -1683,9 +1683,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * xtLog() + * xtLog() * - * function: log xtree tlock and format maplock to update bmap; + * function: log xtree tlock and format maplock to update bmap; */ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -1725,8 +1725,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, xadlock = (struct xdlistlock *) maplock; /* - * entry insertion/extension; - * sibling page link update (old right page before split); + * entry insertion/extension; + * sibling page link update (old right page before split); */ if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { /* log after-image for logredo(): @@ -1801,7 +1801,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page deletion: file deletion/truncation (ref. xtTruncate()) + * page deletion: file deletion/truncation (ref. xtTruncate()) * * (page will be invalidated after log is written and bmap * is updated from the page); @@ -1908,13 +1908,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * page/entry truncation: file truncation (ref. xtTruncate()) + * page/entry truncation: file truncation (ref. xtTruncate()) * - * |----------+------+------+---------------| - * | | | - * | | hwm - hwm before truncation - * | next - truncation point - * lwm - lwm before truncation + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation * header ? */ if (tlck->type & tlckTRUNCATE) { @@ -1937,7 +1937,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, twm = xtlck->twm.offset; /* - * write log records + * write log records */ /* log after-image for logredo(): * @@ -1997,7 +1997,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * format maplock(s) for txUpdateMap() to update bmap + * format maplock(s) for txUpdateMap() to update bmap */ maplock->index = 0; @@ -2069,9 +2069,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * mapLog() + * mapLog() * - * function: log from maplock of freed data extents; + * function: log from maplock of freed data extents; */ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) @@ -2081,7 +2081,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, pxd_t *pxd; /* - * page relocation: free the source page extent + * page relocation: free the source page extent * * a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time saving the src @@ -2155,10 +2155,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } /* - * txEA() + * txEA() * - * function: acquire maplock for EA/ACL extents or - * set COMMIT_INLINE flag; + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; */ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) { @@ -2207,10 +2207,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) } /* - * txForce() + * txForce() * * function: synchronously write pages locked by transaction - * after txLog() but before txUpdateMap(); + * after txLog() but before txUpdateMap(); */ static void txForce(struct tblock * tblk) { @@ -2273,10 +2273,10 @@ static void txForce(struct tblock * tblk) } /* - * txUpdateMap() + * txUpdateMap() * - * function: update persistent allocation map (and working map - * if appropriate); + * function: update persistent allocation map (and working map + * if appropriate); * * parameter: */ @@ -2298,7 +2298,7 @@ static void txUpdateMap(struct tblock * tblk) /* - * update block allocation map + * update block allocation map * * update allocation state in pmap (and wmap) and * update lsn of the pmap page; @@ -2382,7 +2382,7 @@ static void txUpdateMap(struct tblock * tblk) } } /* - * update inode allocation map + * update inode allocation map * * update allocation state in pmap and * update lsn of the pmap page; @@ -2407,24 +2407,24 @@ static void txUpdateMap(struct tblock * tblk) } /* - * txAllocPMap() + * txAllocPMap() * * function: allocate from persistent map; * * parameter: - * ipbmap - - * malock - - * xad list: - * pxd: - * - * maptype - - * allocate from persistent map; - * free from persistent map; - * (e.g., tmp file - free from working map at releae - * of last reference); - * free from persistent and working map; - * - * lsn - log sequence number; + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; */ static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk) @@ -2478,9 +2478,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock, } /* - * txFreeMap() + * txFreeMap() * - * function: free from persistent and/or working map; + * function: free from persistent and/or working map; * * todo: optimization */ @@ -2579,9 +2579,9 @@ void txFreeMap(struct inode *ip, } /* - * txFreelock() + * txFreelock() * - * function: remove tlock from inode anonymous locklist + * function: remove tlock from inode anonymous locklist */ void txFreelock(struct inode *ip) { @@ -2619,7 +2619,7 @@ void txFreelock(struct inode *ip) } /* - * txAbort() + * txAbort() * * function: abort tx before commit; * @@ -2679,7 +2679,7 @@ void txAbort(tid_t tid, int dirty) } /* - * txLazyCommit(void) + * txLazyCommit(void) * * All transactions except those changing ipimap (COMMIT_FORCE) are * processed by this routine. This insures that the inode and block @@ -2728,7 +2728,7 @@ static void txLazyCommit(struct tblock * tblk) } /* - * jfs_lazycommit(void) + * jfs_lazycommit(void) * * To be run as a kernel daemon. If lbmIODone is called in an interrupt * context, or where blocking is not wanted, this routine will process @@ -2913,7 +2913,7 @@ void txResume(struct super_block *sb) } /* - * jfs_sync(void) + * jfs_sync(void) * * To be run as a kernel daemon. This is awakened when tlocks run low. * We write any inodes that have anonymous tlocks so they will become diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h index 7863cf21afca..ab7288937019 100644 --- a/fs/jfs/jfs_txnmgr.h +++ b/fs/jfs/jfs_txnmgr.h @@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */ */ struct tlock { lid_t next; /* 2: index next lockword on tid locklist - * next lockword on freelist + * next lockword on freelist */ tid_t tid; /* 2: transaction id holding lock */ diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h index 09b252958687..649f9817accd 100644 --- a/fs/jfs/jfs_types.h +++ b/fs/jfs/jfs_types.h @@ -21,7 +21,7 @@ /* * jfs_types.h: * - * basic type/utility definitions + * basic type/utility definitions * * note: this header file must be the 1st include file * of JFS include list in all JFS .c file. @@ -54,8 +54,8 @@ struct timestruc_t { */ #define LEFTMOSTONE 0x80000000 -#define HIGHORDER 0x80000000u /* high order bit on */ -#define ONES 0xffffffffu /* all bit on */ +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ /* * logical xd (lxd) @@ -148,7 +148,7 @@ typedef struct { #define sizeDXD(dxd) le32_to_cpu((dxd)->size) /* - * directory entry argument + * directory entry argument */ struct component_name { int namlen; @@ -160,14 +160,14 @@ struct component_name { * DASD limit information - stored in directory inode */ struct dasd { - u8 thresh; /* Alert Threshold (in percent) */ - u8 delta; /* Alert Threshold delta (in percent) */ + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ u8 rsrvd1; - u8 limit_hi; /* DASD limit (in logical blocks) */ - __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ u8 rsrvd2[3]; - u8 used_hi; /* DASD usage (in logical blocks) */ - __le32 used_lo; /* DASD usage (in logical blocks) */ + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ }; #define DASDLIMIT(dasdp) \ diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index a386f48c73fc..7971f37534a3 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb) jfs_info("UnMount JFS: sb:0x%p", sb); /* - * update superblock and close log + * update superblock and close log * * if mounted read-write and log based recovery was enabled */ diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index acc97c46d8a4..1543906a2e0d 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -16,7 +16,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* - * jfs_xtree.c: extent allocation descriptor B+-tree manager + * jfs_xtree.c: extent allocation descriptor B+-tree manager */ #include @@ -32,30 +32,30 @@ /* * xtree local flag */ -#define XT_INSERT 0x00000001 +#define XT_INSERT 0x00000001 /* - * xtree key/entry comparison: extent offset + * xtree key/entry comparison: extent offset * * return: - * -1: k < start of extent - * 0: start_of_extent <= k <= end_of_extent - * 1: k > end_of_extent + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent */ #define XT_CMP(CMP, K, X, OFFSET64)\ {\ - OFFSET64 = offsetXAD(X);\ - (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ - ((K) < OFFSET64) ? -1 : 0;\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ } /* write a xad entry */ #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ {\ - (XAD)->flag = (FLAG);\ - XADoffset((XAD), (OFF));\ - XADlength((XAD), (LEN));\ - XADaddress((XAD), (ADDR));\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ } #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) @@ -76,13 +76,13 @@ MP = NULL;\ RC = -EIO;\ }\ - }\ + }\ } /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) -#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) /* xtree entry parameter descriptor */ struct xtsplit { @@ -97,7 +97,7 @@ struct xtsplit { /* - * statistics + * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct { @@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); #endif /* _STILL_TO_PORT */ /* - * xtLookup() + * xtLookup() * * function: map a single page into a physical extent; */ @@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart, } /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart, /* - * xtLookupList() + * xtLookupList() * * function: map a single logical extent into a list of physical extent; * * parameter: - * struct inode *ip, - * struct lxdlist *lxdlist, lxd list (in) - * struct xadlist *xadlist, xad list (in/out) - * int flag) + * struct inode *ip, + * struct lxdlist *lxdlist, lxd list (in) + * struct xadlist *xadlist, xad list (in/out) + * int flag) * * coverage of lxd by xad under assumption of * . lxd's are ordered and disjoint. * . xad's are ordered and disjoint. * * return: - * 0: success + * 0: success * * note: a page being written (even a single byte) is backed fully, - * except the last page which is only backed with blocks - * required to cover the last byte; - * the extent backing a page is fully contained within an xad; + * except the last page which is only backed with blocks + * required to cover the last byte; + * the extent backing a page is fully contained within an xad; */ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, struct xadlist * xadlist, int flag) @@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, return rc; /* - * compute the physical extent covering logical extent + * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. @@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, if (lstart >= size) goto mapend; - /* compare with the current xad */ + /* compare with the current xad */ goto compare1; } /* lxd is covered by xad */ @@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* * lxd is partially covered by xad */ - else { /* (xend < lend) */ + else { /* (xend < lend) */ /* * get next xad @@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, /* - * xtSearch() + * xtSearch() * - * function: search for the xad entry covering specified offset. + * function: search for the xad entry covering specified offset. * * parameters: - * ip - file object; - * xoff - extent offset; - * nextp - address of next extent (if any) for search miss - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag (XT_INSERT); + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) @@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, btstack->nsplit = 0; /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit */ /* search hit - leaf page: * return the entry found @@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * search miss + * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, } /* - * xtInsert() + * xtInsert() * * function: * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag (XAD_NOTRECORDED): - * xoff - extent offset; - * xlen - extent length; - * xaddrp - extent address pointer (in/out): - * if (*xaddrp) - * caller allocated data extent at *xaddrp; - * else - * allocate data extent and return its xaddr; - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - * * return: */ @@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */ jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */ } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ /* * acquire a transaction lock on the leaf page; @@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */ /* - * xtSplitUp() + * xtSplitUp() * * function: - * split full pages as propagating insertion up the tree + * split full pages as propagating insertion up the tree * * parameter: - * tid - transaction id; - * ip - file object; - * split - entry parameter descriptor; - * btstack - traverse stack from xtSearch() + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() * * return: */ @@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid, /* - * xtSplitPage() + * xtSplitPage() * * function: - * split a full non-root page into - * original/split/left page and new right page - * i.e., the original/split page remains as left page. + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp, - * u64 *rbnp, + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. */ static int xtSplitPage(tid_t tid, struct inode *ip, @@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { - rc = -EDQUOT; - goto clean_up; + if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { + rc = -EDQUOT; + goto clean_up; } quota_allocation += lengthPXD(pxd); @@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip, skip = split->index; /* - * sequential append at tail (after last entry of last page) + * sequential append at tail (after last entry of last page) * * if splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is @@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip, } /* - * non-sequential insert (at possibly middle page) + * non-sequential insert (at possibly middle page) */ /* @@ -1465,25 +1465,24 @@ xtSplitPage(tid_t tid, struct inode *ip, /* - * xtSplitRoot() + * xtSplitRoot() * * function: - * split the full root page into - * original/root/split page and new right page - * i.e., root remains fixed in tree anchor (inode) and - * the root is copied to a single new right child page - * since root page << non-root page, and - * the split root page contains a single entry for the - * new right child page. + * split the full root page into original/root/split page and new + * right page + * i.e., root remains fixed in tree anchor (inode) and the root is + * copied to a single new right child page since root page << + * non-root page, and the split root page contains a single entry + * for the new right child page. * * parameter: - * int tid, - * struct inode *ip, - * struct xtsplit *split, - * struct metapage **rmpp) + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) * * return: - * Pointer to page in which to insert or NULL on error. + * Pointer to page in which to insert or NULL on error. */ static int xtSplitRoot(tid_t tid, @@ -1505,7 +1504,7 @@ xtSplitRoot(tid_t tid, INCREMENT(xtStat.split); /* - * allocate a single (right) child page + * allocate a single (right) child page */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; @@ -1573,7 +1572,7 @@ xtSplitRoot(tid_t tid, } /* - * reset the root + * reset the root * * init root with the single entry for the new right page * set the 1st entry offset to 0, which force the left-most key @@ -1610,7 +1609,7 @@ xtSplitRoot(tid_t tid, /* - * xtExtend() + * xtExtend() * * function: extend in-place; * @@ -1677,7 +1676,7 @@ int xtExtend(tid_t tid, /* transaction id */ goto extendOld; /* - * extent overflow: insert entry for new extent + * extent overflow: insert entry for new extent */ //insertNew: xoff = offsetXAD(xad) + MAXXLEN; @@ -1685,8 +1684,8 @@ int xtExtend(tid_t tid, /* transaction id */ nextindex = le16_to_cpu(p->header.nextindex); /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -1731,7 +1730,7 @@ int xtExtend(tid_t tid, /* transaction id */ } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1771,11 +1770,11 @@ int xtExtend(tid_t tid, /* transaction id */ #ifdef _NOTYET /* - * xtTailgate() + * xtTailgate() * * function: split existing 'tail' extent - * (split offset >= start offset of tail extent), and - * relocate and extend the split tail half; + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; * * note: existing extent may or may not have been committed. * caller is responsible for pager buffer cache update, and @@ -1804,7 +1803,7 @@ int xtTailgate(tid_t tid, /* transaction id */ /* printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - (ulong)xoff, xlen, (ulong)xaddr); + (ulong)xoff, xlen, (ulong)xaddr); */ /* there must exist extent to be tailgated */ @@ -1842,18 +1841,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", xad = &p->xad[index]; /* printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); */ if ((llen = xoff - offsetXAD(xad)) == 0) goto updateOld; /* - * partially replace extent: insert entry for new extent + * partially replace extent: insert entry for new extent */ //insertNew: /* - * if the leaf page is full, insert the new entry and - * propagate up the router entry for the new page from split + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -1898,7 +1897,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", } } /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ @@ -1955,17 +1954,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", #endif /* _NOTYET */ /* - * xtUpdate() + * xtUpdate() * * function: update XAD; * - * update extent for allocated_but_not_recorded or - * compressed extent; + * update extent for allocated_but_not_recorded or + * compressed extent; * * parameter: - * nxad - new XAD; - * logical extent of the specified XAD must be completely - * contained by an existing XAD; + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; */ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) { /* new XAD */ @@ -2416,19 +2415,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); /* - * xtAppend() + * xtAppend() * * function: grow in append mode from contiguous region specified ; * * parameter: - * tid - transaction id; - * ip - file object; - * xflag - extent flag: - * xoff - extent offset; - * maxblocks - max extent length; - * xlen - extent length (in/out); - * xaddrp - extent address pointer (in/out): - * flag - + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - * * return: */ @@ -2460,7 +2459,7 @@ int xtAppend(tid_t tid, /* transaction id */ (ulong) xoff, maxblocks, xlen, (ulong) xaddr); /* - * search for the entry location at which to insert: + * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). @@ -2482,13 +2481,13 @@ int xtAppend(tid_t tid, /* transaction id */ xlen = min(xlen, (int)(next - xoff)); //insert: /* - * insert entry for new extent + * insert entry for new extent */ xflag |= XAD_NEW; /* - * if the leaf page is full, split the page and - * propagate up the router entry for the new page from split + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ @@ -2545,7 +2544,7 @@ int xtAppend(tid_t tid, /* transaction id */ return 0; /* - * insert the new entry into the leaf page + * insert the new entry into the leaf page */ insertLeaf: /* @@ -2589,17 +2588,17 @@ int xtAppend(tid_t tid, /* transaction id */ /* - TBD for defragmentaion/reorganization - * - * xtDelete() + * xtDelete() * * function: - * delete the entry with the specified key. + * delete the entry with the specified key. * - * N.B.: whole extent of the entry is assumed to be deleted. + * N.B.: whole extent of the entry is assumed to be deleted. * * parameter: * * return: - * ENOENT: if the entry is not found. + * ENOENT: if the entry is not found. * * exception: */ @@ -2665,10 +2664,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) /* - TBD for defragmentaion/reorganization - * - * xtDeleteUp() + * xtDeleteUp() * * function: - * free empty pages as propagating deletion up the tree + * free empty pages as propagating deletion up the tree * * parameter: * @@ -2815,15 +2814,15 @@ xtDeleteUp(tid_t tid, struct inode *ip, /* - * NAME: xtRelocate() + * NAME: xtRelocate() * - * FUNCTION: relocate xtpage or data extent of regular file; - * This function is mainly used by defragfs utility. + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. * - * NOTE: This routine does not have the logic to handle - * uncommitted allocated extent. The caller should call - * txCommit() to commit all the allocation before call - * this routine. + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. */ int xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ @@ -2865,8 +2864,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); /* - * 1. get and validate the parent xtpage/xad entry - * covering the source extent to be relocated; + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; */ if (xtype == DATAEXT) { /* search in leaf entry */ @@ -2910,7 +2909,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ jfs_info("xtRelocate: parent xad entry validated."); /* - * 2. relocate the extent + * 2. relocate the extent */ if (xtype == DATAEXT) { /* if the extent is allocated-but-not-recorded @@ -2923,7 +2922,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_PUTPAGE(pmp); /* - * cmRelocate() + * cmRelocate() * * copy target data pages to be relocated; * @@ -2945,8 +2944,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pno = offset >> CM_L2BSIZE; npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; /* - npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - - (offset >> CM_L2BSIZE) + 1; + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; */ sxaddr = oxaddr; dxaddr = nxaddr; @@ -2981,7 +2980,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); jfs_info("xtRelocate: target data extent relocated."); - } else { /* (xtype == XTPAGE) */ + } else { /* (xtype == XTPAGE) */ /* * read in the target xtpage from the source extent; @@ -3026,16 +3025,14 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ */ if (lmp) { BT_MARK_DIRTY(lmp, ip); - tlck = - txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); lp->header.next = cpu_to_le64(nxaddr); XT_PUTPAGE(lmp); } if (rmp) { BT_MARK_DIRTY(rmp, ip); - tlck = - txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); rp->header.prev = cpu_to_le64(nxaddr); XT_PUTPAGE(rmp); } @@ -3062,7 +3059,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * scan may be skipped by commit() and logredo(); */ BT_MARK_DIRTY(mp, ip); - /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); xtlck = (struct xtlock *) & tlck->lock; @@ -3084,7 +3081,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ } /* - * 3. acquire maplock for the source extent to be freed; + * 3. acquire maplock for the source extent to be freed; * * acquire a maplock saving the src relocated extent address; * to free of the extent at commit time; @@ -3105,7 +3102,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ * is no buffer associated with this lock since the buffer * has been redirected to the target location. */ - else /* (xtype == XTPAGE) */ + else /* (xtype == XTPAGE) */ tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; @@ -3115,7 +3112,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ pxdlock->index = 1; /* - * 4. update the parent xad entry for relocation; + * 4. update the parent xad entry for relocation; * * acquire tlck for the parent entry with XAD_NEW as entry * update which will write LOG_REDOPAGE and update bmap for @@ -3143,22 +3140,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ /* - * xtSearchNode() + * xtSearchNode() * - * function: search for the internal xad entry covering specified extent. - * This function is mainly used by defragfs utility. + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. * * parameters: - * ip - file object; - * xad - extent to find; - * cmpp - comparison result: - * btstack - traverse stack; - * flag - search process flag; + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; * * returns: - * btstack contains (bn, index) of search path traversed to the entry. - * *cmpp is set to result of comparison with the entry returned. - * the page containing the entry is pinned at exit. + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. */ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ int *cmpp, struct btstack * btstack, int flag) @@ -3181,7 +3178,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ xaddr = addressXAD(xad); /* - * search down tree from root: + * search down tree from root: * * between two consecutive entries of and of * internal page, child page Pi contains entry with k, Ki <= K < Kj. @@ -3217,7 +3214,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* - * search hit + * search hit * * verify for exact match; */ @@ -3245,7 +3242,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ } /* - * search miss - non-leaf page: + * search miss - non-leaf page: * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. @@ -3268,15 +3265,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ /* - * xtRelink() + * xtRelink() * * function: - * link around a freed page. + * link around a freed page. * * Parameter: - * int tid, - * struct inode *ip, - * xtpage_t *p) + * int tid, + * struct inode *ip, + * xtpage_t *p) * * returns: */ @@ -3338,7 +3335,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) /* - * xtInitRoot() + * xtInitRoot() * * initialize file root (inline in inode) */ @@ -3385,42 +3382,42 @@ void xtInitRoot(tid_t tid, struct inode *ip) #define MAX_TRUNCATE_LEAVES 50 /* - * xtTruncate() + * xtTruncate() * * function: - * traverse for truncation logging backward bottom up; - * terminate at the last extent entry at the current subtree - * root page covering new down size. - * truncation may occur within the last extent entry. + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. * * parameter: - * int tid, - * struct inode *ip, - * s64 newsize, - * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} * * return: * * note: - * PWMAP: - * 1. truncate (non-COMMIT_NOLINK file) - * by jfs_truncate() or jfs_open(O_TRUNC): - * xtree is updated; + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; * 2. truncate index table of directory when last entry removed - * map update via tlock at commit time; - * PMAP: + * map update via tlock at commit time; + * PMAP: * Call xtTruncate_pmap instead - * WMAP: - * 1. remove (free zero link count) on last reference release - * (pmap has been freed at commit zero link count); - * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): - * xtree is updated; - * map update directly at truncation time; + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; * - * if (DELETE) - * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); - * else if (TRUNCATE) - * must write LOG_NOREDOPAGE for deleted index page; + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; * * pages may already have been tlocked by anonymous transactions * during file growth (i.e., write) before truncation; @@ -3493,7 +3490,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * retained in the new sized file. * if type is PMAP, the data and index pages are NOT * freed, and the data and index blocks are NOT freed - * from working map. + * from working map. * (this will allow continued access of data/index of * temporary file (zerolink count file truncated to zero-length)). */ @@ -3542,7 +3539,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) goto getChild; /* - * leaf page + * leaf page */ freed = 0; @@ -3916,7 +3913,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) } /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ @@ -3965,7 +3962,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) /* - * xtTruncate_pmap() + * xtTruncate_pmap() * * function: * Perform truncate to zero lenghth for deleted file, leaving the @@ -3974,9 +3971,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * is committed to disk. * * parameter: - * tid_t tid, - * struct inode *ip, - * s64 committed_size) + * tid_t tid, + * struct inode *ip, + * s64 committed_size) * * return: new committed size * @@ -4050,7 +4047,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } /* - * leaf page + * leaf page */ if (++locked_leaves > MAX_TRUNCATE_LEAVES) { @@ -4062,7 +4059,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) xoff = offsetXAD(xad); xlen = lengthXAD(xad); XT_PUTPAGE(mp); - return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; } tlck = txLock(tid, ip, mp, tlckXTREE); tlck->type = tlckXTREE | tlckFREE; @@ -4099,8 +4096,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; - xtlck->hwm.offset = - le16_to_cpu(p->header.nextindex) - 1; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; tlck->type = tlckXTREE | tlckFREE; XT_PUTPAGE(mp); @@ -4118,7 +4114,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) else index--; /* - * internal page: go down to child page of current entry + * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 164f6f2b1019..70815c8a3d6a 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -19,14 +19,14 @@ #define _H_JFS_XTREE /* - * jfs_xtree.h: extent allocation descriptor B+-tree manager + * jfs_xtree.h: extent allocation descriptor B+-tree manager */ #include "jfs_btree.h" /* - * extent allocation descriptor (xad) + * extent allocation descriptor (xad) */ typedef struct xad { unsigned flag:8; /* 1: flag */ @@ -38,30 +38,30 @@ typedef struct xad { __le32 addr2; /* 4: address in unit of fsblksize */ } xad_t; /* (16) */ -#define MAXXLEN ((1 << 24) - 1) +#define MAXXLEN ((1 << 24) - 1) -#define XTSLOTSIZE 16 -#define L2XTSLOTSIZE 4 +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 /* xad_t field construction */ #define XADoffset(xad, offset64)\ {\ - (xad)->off1 = ((u64)offset64) >> 32;\ - (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ } #define XADaddress(xad, address64)\ {\ - (xad)->addr1 = ((u64)address64) >> 32;\ - (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ } -#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) /* xad_t field extraction */ #define offsetXAD(xad)\ - ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) #define addressXAD(xad)\ - ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) -#define lengthXAD(xad) __le24_to_cpu((xad)->len) + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) /* xad list */ struct xadlist { @@ -71,22 +71,22 @@ struct xadlist { }; /* xad_t flags */ -#define XAD_NEW 0x01 /* new */ -#define XAD_EXTENDED 0x02 /* extended */ -#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ #define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ -#define XAD_COW 0x10 /* copy-on-write */ +#define XAD_COW 0x10 /* copy-on-write */ /* possible values for maxentry */ -#define XTROOTINITSLOT_DIR 6 -#define XTROOTINITSLOT 10 -#define XTROOTMAXSLOT 18 -#define XTPAGEMAXSLOT 256 -#define XTENTRYSTART 2 +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 /* - * xtree page: + * xtree page: */ typedef union { struct xtheader { @@ -106,7 +106,7 @@ typedef union { } xtpage_t; /* - * external declaration + * external declaration */ extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, int *pflag, s64 * paddr, int *plen, int flag); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 41c204771262..25161c4121e4 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * dentry - child directory dentry * * RETURN: -EINVAL - if name is . or .. - * -EINVAL - if . or .. exist but are invalid. + * -EINVAL - if . or .. exist but are invalid. * errors from subroutines * * note: @@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) inode_dec_link_count(ip); /* - * commit zero link count object + * commit zero link count object */ if (ip->i_nlink == 0) { assert(!test_cflag(COMMIT_Nolink, ip)); @@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) /* * NAME: commitZeroLink() * - * FUNCTION: for non-directory, called by jfs_remove(), + * FUNCTION: for non-directory, called by jfs_remove(), * truncate a regular file, directory or symbolic * link to zero length. return 0 if type is not * one of these. @@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) /* * NAME: jfs_free_zero_link() * - * FUNCTION: for non-directory, called by iClose(), + * FUNCTION: for non-directory, called by iClose(), * free resources of a file from cache and WORKING map * for a file previously committed with zero link count * while associated with a pager object, @@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry, * NAME: jfs_symlink(dip, dentry, name) * * FUNCTION: creates a symbolic link to by name - * in directory + * in directory * - * PARAMETER: dip - parent directory vnode - * dentry - dentry of symbolic link - * name - the path name of the existing object - * that will be the source of the link + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link * * RETURN: errors from subroutines * @@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, /* - * NAME: jfs_rename + * NAME: jfs_rename * - * FUNCTION: rename a file or directory + * FUNCTION: rename a file or directory */ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) @@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* - * NAME: jfs_mknod + * NAME: jfs_mknod * - * FUNCTION: Create a special file (device) + * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 79d625f3f733..71984ee95346 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -29,17 +29,17 @@ #include "jfs_txnmgr.h" #include "jfs_debug.h" -#define BITSPERPAGE (PSIZE << 3) -#define L2MEGABYTE 20 -#define MEGABYTE (1 << L2MEGABYTE) -#define MEGABYTE32 (MEGABYTE << 5) +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) /* convert block number to bmap file page number */ #define BLKTODMAPN(b)\ - (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) /* - * jfs_extendfs() + * jfs_extendfs() * * function: extend file system; * @@ -48,9 +48,9 @@ * workspace space * * input: - * new LVSize: in LV blocks (required) - * new LogSize: in LV blocks (optional) - * new FSSize: in LV blocks (optional) + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) * * new configuration: * 1. set new LogSize as specified or default from new LVSize; @@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * reconfigure LV spaces - * --------------------- + * reconfigure LV spaces + * --------------------- * * validate new size, or, if not specified, determine new size */ @@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) log_formatted = 1; } /* - * quiesce file system + * quiesce file system * * (prepare to move the inline log and to prevent map update) * @@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * extend block allocation map - * --------------------------- + * extend block allocation map + * --------------------------- * * extendfs() for new extension, retry after crash recovery; * @@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * s_size: aggregate size in physical blocks; */ /* - * compute the new block allocation map configuration + * compute the new block allocation map configuration * * map dinode: * di_size: map file size in byte; @@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) newNpages = BLKTODMAPN(t64) + 1; /* - * extend map from current map (WITHOUT growing mapfile) + * extend map from current map (WITHOUT growing mapfile) * * map new extension with unmapped part of the last partial * dmap page, if applicable, and extra page(s) allocated @@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) XSize -= nblocks; /* - * grow map file to cover remaining extension - * and/or one extra dmap page for next extendfs(); + * grow map file to cover remaining extension + * and/or one extra dmap page for next extendfs(); * * allocate new map pages and its backing blocks, and * update map file xtree @@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) dbFinalizeBmap(ipbmap); /* - * update inode allocation map - * --------------------------- + * update inode allocation map + * --------------------------- * * move iag lists from old to new iag; * agstart field is not updated for logredo() to reconstruct @@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) } /* - * finalize - * -------- + * finalize + * -------- * * extension is committed when on-disk super block is * updated with new descriptors: logredo will recover @@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) diFreeSpecial(ipbmap2); /* - * update superblock + * update superblock */ if ((rc = readSuper(sb, &bh))) goto error_out; @@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) resume: /* - * resume file system transactions + * resume file system transactions */ txResume(sb); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index b753ba216450..5c0f2de01316 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -63,9 +63,9 @@ * * On-disk: * - * FEALISTs are stored on disk using blocks allocated by dbAlloc() and - * written directly. An EA list may be in-lined in the inode if there is - * sufficient room available. + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. An EA list may be in-lined in the inode if there is + * sufficient room available. */ struct ea_buffer { -- cgit v1.2.1 From 209e101bf408a50acc426e32c8252daefacde5b0 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 6 Jun 2007 16:30:17 -0500 Subject: JFS: use print_hex_dump() rather than private dump_mem() function Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_debug.c | 28 ---------------------------- fs/jfs/jfs_debug.h | 2 -- fs/jfs/jfs_imap.c | 3 ++- fs/jfs/jfs_logmgr.c | 18 ++++++++++++------ fs/jfs/jfs_metapage.c | 3 ++- fs/jfs/jfs_txnmgr.c | 17 +++++++++++------ fs/jfs/xattr.c | 3 ++- 7 files changed, 29 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 9c5d59632aac..887f5759e536 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -26,34 +26,6 @@ #include "jfs_filsys.h" #include "jfs_debug.h" -#ifdef CONFIG_JFS_DEBUG -void dump_mem(char *label, void *data, int length) -{ - int i, j; - int *intptr = data; - char *charptr = data; - char buf[10], line[80]; - - printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length, - data); - for (i = 0; i < length; i += 16) { - line[0] = 0; - for (j = 0; (j < 4) && (i + j * 4 < length); j++) { - sprintf(buf, " %08x", intptr[i / 4 + j]); - strcat(line, buf); - } - buf[0] = ' '; - buf[2] = 0; - for (j = 0; (j < 16) && (i + j < length); j++) { - buf[1] = - isprint(charptr[i + j]) ? charptr[i + j] : '.'; - strcat(line, buf); - } - printk("%s\n", line); - } -} -#endif - #ifdef PROC_FS_JFS /* see jfs_debug.h */ static struct proc_dir_entry *base; diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index 7378798f0b21..044c1e654cc0 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,6 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern void dump_mem(char *label, void *data, int length); extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); /* information message: e.g., configuration, major event */ @@ -94,7 +93,6 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); * --------- */ #else /* CONFIG_JFS_DEBUG */ -#define dump_mem(label,data,length) do {} while (0) #define ASSERT(p) do {} while (0) #define jfs_info(fmt, arg...) do {} while (0) #define jfs_debug(fmt, arg...) do {} while (0) diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index f5c5227b0d1f..19da0e17e4de 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -890,7 +890,8 @@ int diFree(struct inode *ip) * the map. */ if (iagno >= imap->im_nextiag) { - dump_mem("imap", imap, 32); + printk(KERN_ERR "Dump of imap:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, imap, 32); jfs_error(ip->i_sb, "diFree: inum = %d, iagno = %d, nextiag = %d", (uint) inum, iagno, imap->im_nextiag); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 88eae45a4644..2917ede90d67 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1622,16 +1622,22 @@ void jfs_flush_journal(struct jfs_log *log, int wait) if (!list_empty(&log->synclist)) { struct logsyncblk *lp; + printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; - dump_mem("orphan metapage", lp, - sizeof(struct metapage)); - dump_mem("page", mp->page, sizeof(struct page)); + printk (KERN_ERR "orphan metapage:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, + lp, sizeof(struct metapage)); + printk (KERN_ERR "page:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, + mp->page, sizeof(struct page)); + } + else { + printk (KERN_ERR "orphan tblock:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, + lp, sizeof(struct tblock)); } - else - dump_mem("orphan tblock", lp, - sizeof(struct tblock)); } } #endif diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 43d4f69afbec..1c4fced1e6d8 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -472,7 +472,8 @@ add_failed: printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); goto skip; dump_bio: - dump_mem("bio", bio, sizeof(*bio)); + printk(KERN_ERR "JFS: dump of bio:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, bio, sizeof(*bio)); skip: bio_put(bio); unlock_page(page); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f2dc4b986392..d6f23f90ad36 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -829,12 +829,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, /* Only locks on ipimap or ipaimap should reach here */ /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { - jfs_err("txLock: trying to lock locked page!"); - dump_mem("ip", ip, sizeof(struct inode)); - dump_mem("mp", mp, sizeof(struct metapage)); - dump_mem("Locker's tblk", tid_to_tblock(tid), - sizeof(struct tblock)); - dump_mem("Tlock", tlck, sizeof(struct tlock)); + printk(KERN_ERR "txLock: trying to lock locked page!"); + printk(KERN_ERR "ip:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, ip, sizeof(*ip)); + printk(KERN_ERR "mp:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, mp, sizeof(*mp)); + printk(KERN_ERR "Locker's tblk:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, + tid_to_tblock(tid), sizeof(struct tblock)); + printk(KERN_ERR "Tlock:\n"); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, tlck, + sizeof(*tlck)); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 5c0f2de01316..2dcb13275429 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -590,7 +590,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { printk(KERN_ERR "ea_get: invalid extended attribute\n"); - dump_mem("xattr", ea_buf->xattr, ea_size); + print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, ea_buf->xattr, + ea_size); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; -- cgit v1.2.1 From dd9505879c3381e98fc46f74cd8c17f0b23d0cd7 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 11 Jun 2007 15:35:34 +0900 Subject: fs: hugetlbfs: Disable for shnommu. SH can turn CONFIG_MMU on and off, don't let us get to a state where hugetlbfs/hugetlbpage gets built when building for nommu. Signed-off-by: Paul Mundt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 0fa0c1193e81..7f211613ae1b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -991,7 +991,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read -- cgit v1.2.1 From 288e4d838d1e999c0515f85a337cacb2be233071 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 13 Jun 2007 10:17:50 -0500 Subject: JFS: Update print_hex_dump() syntax Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_imap.c | 4 ++-- fs/jfs/jfs_logmgr.c | 26 +++++++++++++------------- fs/jfs/jfs_metapage.c | 4 ++-- fs/jfs/jfs_txnmgr.c | 19 +++++++++---------- fs/jfs/xattr.c | 4 ++-- 5 files changed, 28 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 19da0e17e4de..3870ba8b9086 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -890,8 +890,8 @@ int diFree(struct inode *ip) * the map. */ if (iagno >= imap->im_nextiag) { - printk(KERN_ERR "Dump of imap:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, imap, 32); + print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, + imap, 32, 0); jfs_error(ip->i_sb, "diFree: inum = %d, iagno = %d, nextiag = %d", (uint) inum, iagno, imap->im_nextiag); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 2917ede90d67..de3e4a506dbc 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1626,22 +1626,22 @@ void jfs_flush_journal(struct jfs_log *log, int wait) list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; - printk (KERN_ERR "orphan metapage:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, - lp, sizeof(struct metapage)); - printk (KERN_ERR "page:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, - mp->page, sizeof(struct page)); - } - else { - printk (KERN_ERR "orphan tblock:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, - lp, sizeof(struct tblock)); - } + print_hex_dump(KERN_ERR, "metapage: ", + DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(struct metapage), 0); + print_hex_dump(KERN_ERR, "page: ", + DUMP_PREFIX_ADDRESS, 16, + sizeof(long), mp->page, + sizeof(struct page), 0); + } else + print_hex_dump(KERN_ERR, "tblock:", + DUMP_PREFIX_ADDRESS, 16, 4, + lp, sizeof(struct tblock), 0); } } +#else + WARN_ON(!list_empty(&log->synclist)); #endif - //assert(list_empty(&log->synclist)); clear_bit(log_FLUSH, &log->flag); } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1c4fced1e6d8..77c7f1129dde 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -472,8 +472,8 @@ add_failed: printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); goto skip; dump_bio: - printk(KERN_ERR "JFS: dump of bio:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, bio, sizeof(*bio)); + print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, + 4, bio, sizeof(*bio), 0); skip: bio_put(bio); unlock_page(page); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d6f23f90ad36..7aa1f7004eaf 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -830,16 +830,15 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { printk(KERN_ERR "txLock: trying to lock locked page!"); - printk(KERN_ERR "ip:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, ip, sizeof(*ip)); - printk(KERN_ERR "mp:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, mp, sizeof(*mp)); - printk(KERN_ERR "Locker's tblk:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, - tid_to_tblock(tid), sizeof(struct tblock)); - printk(KERN_ERR "Tlock:\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, tlck, - sizeof(*tlck)); + print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, + ip, sizeof(*ip), 0); + print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(*mp), 0); + print_hex_dump(KERN_ERR, "Locker's tblock: ", + DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), + sizeof(struct tblock), 0); + print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, + tlck, sizeof(*tlck), 0); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 2dcb13275429..b2375f0774b7 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -590,8 +590,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, DUMP_PREFIX_ADDRESS, ea_buf->xattr, - ea_size); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, ea_size, 1); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; -- cgit v1.2.1 From 266f5aa0970409bf1ebdf9fc4e65a1186eeed3c2 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Sat, 23 Jun 2007 17:16:46 -0700 Subject: ext2: disallow setting xip on remount Yan Zheng pointed out that ext2_remount lacks checking if -o xip should be enabled or not. This patch checks for presence of direct_access on the backing block device and if the blocksize meets the requirements. Signed-off-by: Carsten Otte Cc: Yan Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 16337bff0272..c9fd8cf6eaa9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1038,6 +1038,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + + if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { + printk("XIP: Unsupported blocksize\n"); + goto restore_opts; + } + es = sbi->s_es; if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && -- cgit v1.2.1 From e4a10a362cd1df6c23fe46f449d36b3f712e2824 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 23 Jun 2007 17:16:48 -0700 Subject: ext3: lost brelse in ext3_read_inode() One of error path in ext3_read_inode() leaks bh since brelse is forgoten. Signed-off-by: Kirill Korotaev Acked-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a6cb6171c3af..2a85ddee4740 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2677,8 +2677,10 @@ void ext3_read_inode(struct inode * inode) */ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT3_INODE_SIZE(inode->i_sb)) + EXT3_INODE_SIZE(inode->i_sb)) { + brelse (bh); goto bad_inode; + } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext3_inode) - -- cgit v1.2.1 From e5d2861f31474b373ce7754dc5122b414a176c64 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 23 Jun 2007 17:16:51 -0700 Subject: ext4: lost brelse in ext4_read_inode() One of error path in ext4_read_inode() leaks bh since brelse is forgoten. Signed-off-by: Kirill Korotaev Acked-by: Vasily Averin Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0bcf62a750ff..8416fa28c422 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2673,8 +2673,10 @@ void ext4_read_inode(struct inode * inode) */ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) + EXT4_INODE_SIZE(inode->i_sb)) { + brelse (bh); goto bad_inode; + } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext4_inode) - -- cgit v1.2.1 From 240e2df5c740d73fc08cac9989872212deb2d20e Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 27 Jun 2007 14:09:44 -0700 Subject: eCryptfs: fix write zeros behavior This patch fixes the processes involved in wiping regions of the data during truncate and write events, fixing a kernel hang in 2.6.22-rc4 while assuring that zero values are written out to the appropriate locations during events in which the i_size will change. The range passed to ecryptfs_truncate() from ecryptfs_prepare_write() includes the page that is the object of ecryptfs_prepare_write(). This leads to a kernel hang as read_cache_page() is executed on the same page in the ecryptfs_truncate() execution path. This patch remedies this by limiting the range passed to ecryptfs_truncate() so as to exclude the page that is the object of ecryptfs_prepare_write(); it also adds code to ecryptfs_prepare_write() to zero out the region of its own page when writing past the i_size position. This patch also modifies ecryptfs_truncate() so that when a file is truncated to a smaller size, eCryptfs will zero out the contents of the new last page from the new size through to the end of the last page. Signed-off-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/ecryptfs_kernel.h | 2 ++ fs/ecryptfs/inode.c | 19 ++++++++++++++++ fs/ecryptfs/mmap.c | 53 +++++++++++++++++++++++-------------------- 3 files changed, 50 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 403e3bad1455..1b9dd9a96f19 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -580,5 +580,7 @@ void ecryptfs_write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); +int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start, + int num_zeros); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1548be26b5e6..0981ae35ea16 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -800,6 +800,25 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_fput; } } else { /* new_length < i_size_read(inode) */ + pgoff_t index = 0; + int end_pos_in_page = -1; + + if (new_length != 0) { + index = ((new_length - 1) >> PAGE_CACHE_SHIFT); + end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK); + } + if (end_pos_in_page != (PAGE_CACHE_SIZE - 1)) { + if ((rc = ecryptfs_write_zeros(&fake_ecryptfs_file, + index, + (end_pos_in_page + 1), + ((PAGE_CACHE_SIZE - 1) + - end_pos_in_page)))) { + printk(KERN_ERR "Error attempting to zero out " + "the remainder of the end page on " + "reducing truncate; rc = [%d]\n", rc); + goto out_fput; + } + } vmtruncate(inode, new_length); rc = ecryptfs_write_inode_size_to_metadata( lower_file, lower_dentry->d_inode, inode, dentry, diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 55cec98a84e7..6df410c77264 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -56,9 +56,6 @@ static struct page *ecryptfs_get1page(struct file *file, int index) return read_mapping_page(mapping, index, (void *)file); } -static -int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros); - /** * ecryptfs_fill_zeros * @file: The ecryptfs file @@ -101,10 +98,13 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length) if (old_end_page_index == new_end_page_index) { /* Start and end are in the same page; we just need to * set a portion of the existing page to zero's */ - rc = write_zeros(file, index, (old_end_pos_in_page + 1), - (new_end_pos_in_page - old_end_pos_in_page)); + rc = ecryptfs_write_zeros(file, index, + (old_end_pos_in_page + 1), + (new_end_pos_in_page + - old_end_pos_in_page)); if (rc) - ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(" + "file=[%p], " "index=[0x%.16x], " "old_end_pos_in_page=[d], " "(PAGE_CACHE_SIZE - new_end_pos_in_page" @@ -117,10 +117,10 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length) goto out; } /* Fill the remainder of the previous last page with zeros */ - rc = write_zeros(file, index, (old_end_pos_in_page + 1), + rc = ecryptfs_write_zeros(file, index, (old_end_pos_in_page + 1), ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page)); if (rc) { - ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(file=[%p], " "index=[0x%.16x], old_end_pos_in_page=[d], " "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) " "returned [%d]\n", file, index, @@ -131,9 +131,10 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length) index++; while (index < new_end_page_index) { /* Fill all intermediate pages with zeros */ - rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_write_zeros(file, index, 0, PAGE_CACHE_SIZE); if (rc) { - ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(" + "file=[%p], " "index=[0x%.16x], " "old_end_pos_in_page=[d], " "(PAGE_CACHE_SIZE - new_end_pos_in_page" @@ -149,9 +150,9 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length) } /* Fill the portion at the beginning of the last new page with * zero's */ - rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1)); + rc = ecryptfs_write_zeros(file, index, 0, (new_end_pos_in_page + 1)); if (rc) { - ecryptfs_printk(KERN_ERR, "write_zeros(file=" + ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(file=" "[%p], index=[0x%.16x], 0, " "new_end_pos_in_page=[%d]" "returned [%d]\n", file, index, @@ -400,7 +401,6 @@ out: static int ecryptfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - loff_t pos; int rc = 0; if (from == 0 && to == PAGE_CACHE_SIZE) @@ -408,14 +408,19 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, up to date. */ if (!PageUptodate(page)) rc = ecryptfs_do_readpage(file, page, page->index); - pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - if (pos > i_size_read(page->mapping->host)) { - rc = ecryptfs_truncate(file->f_path.dentry, pos); - if (rc) { - printk(KERN_ERR "Error on attempt to " - "truncate to (higher) offset [%lld];" - " rc = [%d]\n", pos, rc); - goto out; + if (page->index != 0) { + loff_t end_of_prev_pg_pos = + (((loff_t)page->index << PAGE_CACHE_SHIFT) - 1); + + if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) { + rc = ecryptfs_truncate(file->f_path.dentry, + end_of_prev_pg_pos); + if (rc) { + printk(KERN_ERR "Error on attempt to " + "truncate to (higher) offset [%lld];" + " rc = [%d]\n", end_of_prev_pg_pos, rc); + goto out; + } } } out: @@ -753,7 +758,7 @@ out: } /** - * write_zeros + * ecryptfs_write_zeros * @file: The ecryptfs file * @index: The index in which we are writing * @start: The position after the last block of data @@ -763,8 +768,8 @@ out: * * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE */ -static -int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) +int +ecryptfs_write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) { int rc = 0; struct page *tmp_page; -- cgit v1.2.1 From e10f281bca03f45bdec91e67645c394eaec2f8f6 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 27 Jun 2007 14:09:44 -0700 Subject: eCryptfs: initialize crypt_stat in setattr Recent changes in eCryptfs have made it possible to get to ecryptfs_setattr() with an uninitialized crypt_stat struct. This results in a wide and colorful variety of unpleasantries. This patch properly initializes the crypt_stat structure in ecryptfs_setattr() when it is necessary to do so. Signed-off-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/inode.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0981ae35ea16..83e94fedd4e9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -894,9 +894,54 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); inode = dentry->d_inode; lower_inode = ecryptfs_inode_to_lower(inode); + lower_dentry = ecryptfs_dentry_to_lower(dentry); + mutex_lock(&crypt_stat->cs_mutex); + if (S_ISDIR(dentry->d_inode->i_mode)) + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + else if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + struct vfsmount *lower_mnt; + struct file *lower_file = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int lower_flags; + + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + lower_flags = O_RDONLY; + if ((rc = ecryptfs_open_lower_file(&lower_file, lower_dentry, + lower_mnt, lower_flags))) { + printk(KERN_ERR + "Error opening lower file; rc = [%d]\n", rc); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + if ((rc = ecryptfs_read_metadata(dentry, lower_file))) { + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Attempt to read file that " + "is not in a valid eCryptfs format, " + "and plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + + mutex_unlock(&crypt_stat->cs_mutex); + fput(lower_file); + goto out; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + fput(lower_file); + goto out; + } + fput(lower_file); + } + mutex_unlock(&crypt_stat->cs_mutex); if (ia->ia_valid & ATTR_SIZE) { ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", -- cgit v1.2.1 From d4c5cdb3e099da7cc64df622b02ff7659babe16e Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 27 Jun 2007 14:09:45 -0700 Subject: zero out last page for llseek/write When one llseek's past the end of the file and then writes, every page past the previous end of the file should be cleared. Trevor found that the code, as is, does not assure that the very last page is always cleared. This patch takes care of that. Signed-off-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/mmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 6df410c77264..7d5a43cb0d5c 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -422,6 +422,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, goto out; } } + if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host)) + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); } out: return rc; -- cgit v1.2.1 From f8738c5c5298d55ccfc26383f9f45af082a9be57 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 27 Jun 2007 14:09:59 -0700 Subject: avoid spurious POLLIN returns in signalfd The new code in kernel/signal.c does not allow fetching private signals from another task. This patch avoid spurious POLLIN returns from a signalfd poll(2) operation. Signed-off-by: Davide Libenzi Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/signalfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/signalfd.c b/fs/signalfd.c index f1da89203a9a..3b07f26d984d 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -133,7 +133,8 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait) * the peer disconnects. */ if (signalfd_lock(ctx, &lk)) { - if (next_signal(&lk.tsk->pending, &ctx->sigmask) > 0 || + if ((lk.tsk == current && + next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) || next_signal(&lk.tsk->signal->shared_pending, &ctx->sigmask) > 0) events |= POLLIN; -- cgit v1.2.1 From ddc80bd781590ef6eb8ce30a0f3ac88c5599e41c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 27 Jun 2007 14:10:08 -0700 Subject: ext2: fix return of uninitialised variable gcc correctly says fs/ext2/super.c: In function 'ext2_remount': fs/ext2/super.c:1055: warning: 'err' may be used uninitialized in this function Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index c9fd8cf6eaa9..5de5061eb331 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1043,6 +1043,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { printk("XIP: Unsupported blocksize\n"); + err = -EINVAL; goto restore_opts; } -- cgit v1.2.1 From edd5cd4a9424f22b0fa08bef5e299d41befd5622 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 27 Jun 2007 14:10:09 -0700 Subject: Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM Not all the world is an i386. Many architectures need 64-bit arguments to be aligned in suitable pairs of registers, and the original sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an argument register for padding after the first integer. Since we don't normally have more than 6 arguments for system calls, that left no room for the final argument on some architectures. Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which all fits nicely. In fact, ARM already had that, but called it sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement the needed compatibility routine. And stop the missing syscall check from bitching about the absence of sys_sync_file_range() if we've implemented sys_sync_file_range2() instead. Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64. Signed-off-by: David Woodhouse Acked-by: Russell King Cc: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 2f97576355b8..7cd005ea7639 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -236,6 +236,14 @@ out: return ret; } +/* It would be nice if people remember that not all the world's an i386 + when they introduce new system calls */ +asmlinkage long sys_sync_file_range2(int fd, unsigned int flags, + loff_t offset, loff_t nbytes) +{ + return sys_sync_file_range(fd, offset, nbytes, flags); +} + /* * `endbyte' is inclusive */ -- cgit v1.2.1 From fcb82f8835c1d71b4fe5de1d9894f45370f80dab Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 3 Jul 2007 15:28:55 -0700 Subject: dio: remove bogus refcounting BUG_ON Badari Pulavarty reported a case of this BUG_ON is triggering during testing. It's completely bogus and should be removed. It's trying to notice if we left references to the dio hanging around in the sync case. They should have been dropped as IO completed while this path was in dio_await_completion(). This condition will also be checked, via some twisty logic, by the BUG_ON(ret != -EIOCBQUEUED) a few lines lower. So to start this BUG_ON() is redundant. More fatally, it's dereferencing dio-> after having dropped its reference. It's only safe to dereference the dio after releasing the lock if the final reference was just dropped. Another CPU might free the dio in bio completion and reuse the memory after this path drops the dio lock but before the BUG_ON() is evaluated. This patch passed aio+dio regression unit tests and aio-stress on ext3. Signed-off-by: Zach Brown Cc: Badari Pulavarty Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 8593f3dfd299..52bb2638f7ab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1106,7 +1106,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - BUG_ON(!dio->is_async && ret2 != 0); + if (ret2 == 0) { ret = dio_complete(dio, offset, ret); kfree(dio); -- cgit v1.2.1 From e2baf4ed168589af8224d51f0ac50e65bcdee3f6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 3 Jul 2007 16:51:19 -0400 Subject: [JFFS2] Fix readinode failure when read_dnode() detects CRC failure. We should have stopped returning 1 from read_dnode() to indicate failure. We can just mark the damn thing obsolete immediately. But I missed a case where we don't. Signed-off-by: David Woodhouse --- fs/jffs2/readinode.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 12e83f67eee4..7b363786c2d2 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -210,8 +210,7 @@ static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info * * offset, and the one with the smallest length will come first in the * ordering. * - * Returns 0 if the node was inserted - * 1 if the node is obsolete (because we can't mark it so yet) + * Returns 0 if the node was handled (including marking it obsolete) * < 0 an if error occurred */ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, @@ -572,8 +571,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r * Helper function for jffs2_get_inode_nodes(). * It is called every time an directory entry node is found. * - * Returns: 0 on succes; - * 1 if the node should be marked obsolete; + * Returns: 0 on success; * negative error code on failure. */ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, @@ -680,8 +678,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r * Helper function for jffs2_get_inode_nodes(). * It is called every time an inode node is found. * - * Returns: 0 on success; - * 1 if the node should be marked obsolete; + * Returns: 0 on success (possibly after marking a bad node obsolete); * negative error code on failure. */ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, @@ -690,7 +687,7 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref { struct jffs2_tmp_dnode_info *tn; uint32_t len, csize; - int ret = 1; + int ret = 0; uint32_t crc; /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */ @@ -719,8 +716,9 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref /* Sanity checks */ if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) || unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) { - JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); - jffs2_dbg_dump_node(c, ref_offset(ref)); + JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); goto free_out; } @@ -775,6 +773,7 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) { JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc)); + jffs2_mark_node_obsolete(c, ref); goto free_out; } @@ -854,7 +853,6 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref * It is called every time an unknown node is found. * * Returns: 0 on success; - * 1 if the node should be marked obsolete; * negative error code on failure. */ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un) @@ -1088,10 +1086,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf } err = read_unknown(c, ref, &node->u); - if (err == 1) { - jffs2_mark_node_obsolete(c, ref); - break; - } else if (unlikely(err)) + if (unlikely(err)) goto free_out; } -- cgit v1.2.1 From ef7320edb1dd2cf6c969d1dcef4a9499a42f24da Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 6 Jul 2007 02:39:49 -0700 Subject: Fix elf_core_dump() when writing arch specific notes (spu coredumps) elf_core_dump() supports dumping arch specific ELF notes, via the #define ELF_CORE_WRITE_EXTRA_NOTES. Currently the only user of this is the powerpc spu coredump code. There is a bug in the handling of foffset WRT the arch notes, which causes us to erroneously increment foffset by the size of the arch notes, leaving a block of zeroes in the file, and causing all subsequent data in the file to be at + . eg: LOAD 0x050000 0x00100000 0x00000000 0x20000 0x20000 R E 0x10000 Tells us we should have a chunk of data at 0x50000. The truth is the data is at 0x90dbc = 0x50000 + 0x40dbc (the size of the arch notes). This bug prevents gdb from reading the core file correctly. The simplest fix is to simply remember the size of the arch notes, and add it to foffset after we've written the arch notes. The only drawback is that if the arch code doesn't write as many bytes as it said it would, we end up with a broken core dump again. For now I think that's a reasonable requirement. Tested on a Cell blade, gdb no longer complains about the core file being bogus. While I'm here I should point out that the spu coredump code does not work if we're dumping to a pipe - we'll have to wait for 23 to fix that. Signed-off-by: Michael Ellerman Acked-by: Arnd Bergmann Acked-by: Benjamin Herrenschmidt Acked-by: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fa8ea33ab0be..08e4414b8374 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1499,6 +1499,9 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) #endif int thread_status_size = 0; elf_addr_t *auxv; +#ifdef ELF_CORE_WRITE_EXTRA_NOTES + int extra_notes_size; +#endif /* * We no longer stop all VM operations. @@ -1628,7 +1631,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) sz += thread_status_size; #ifdef ELF_CORE_WRITE_EXTRA_NOTES - sz += ELF_CORE_EXTRA_NOTES_SIZE; + extra_notes_size = ELF_CORE_EXTRA_NOTES_SIZE; + sz += extra_notes_size; #endif fill_elf_note_phdr(&phdr, sz, offset); @@ -1674,6 +1678,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) #ifdef ELF_CORE_WRITE_EXTRA_NOTES ELF_CORE_WRITE_EXTRA_NOTES; + foffset += extra_notes_size; #endif /* write out the thread status notes section */ -- cgit v1.2.1 From 95511ad4342cd094e62c807f6631b9a19cc6b129 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 7 Jul 2007 06:14:02 +0200 Subject: DLM must depend on SYSFS The dependency of DLM on SYSFS got lost in commit 6ed7257b46709e87d79ac2b6b819b7e0c9184998 resulting in the following compile error with CONFIG_DLM=y, CONFIG_SYSFS=n: <-- snip --> ... LD .tmp_vmlinux1 fs/built-in.o: In function `dlm_lockspace_init': /home/bunk/linux/kernel-2.6/linux-2.6.22-rc6-mm1/fs/dlm/lockspace.c:231: undefined reference to `kernel_subsys' fs/built-in.o: In function `configfs_init': /home/bunk/linux/kernel-2.6/linux-2.6.22-rc6-mm1/fs/configfs/mount.c:143: undefined reference to `kernel_subsys' make[1]: *** [.tmp_vmlinux1] Error 1 <-- snip --> Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- fs/dlm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 69a94690e493..54bcc00ec8df 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,7 +3,7 @@ menu "Distributed Lock Manager" config DLM tristate "Distributed Lock Manager (DLM)" - depends on IPV6 || IPV6=n + depends on SYSFS && (IPV6 || IPV6=n) select CONFIGFS_FS select IP_SCTP help -- cgit v1.2.1 From 1e5de2837c166535f9bb4232bfe97ea1f9fc7a1c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Jul 2007 12:02:55 -0700 Subject: Fix permission checking for the new utimensat() system call Commit 1c710c896eb461895d3c399e15bb5f20b39c9073 added the utimensat() system call, but didn't handle the case of checking for the writability of the target right, when the target was a file descriptor, not a filename. We cannot use vfs_permission(MAY_WRITE) for that case, and need to simply check whether the file descriptor is writable. The oops from using the wrong function was noticed and narrowed down by Markus Trippelsdorf. Cc: Ulrich Drepper Cc: Markus Trippelsdorf Cc: Andrew Morton Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/utimes.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index 480f7c8c29da..b3c88952465f 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -106,9 +106,16 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (IS_IMMUTABLE(inode)) goto dput_and_out; - if (current->fsuid != inode->i_uid && - (error = vfs_permission(&nd, MAY_WRITE)) != 0) - goto dput_and_out; + if (current->fsuid != inode->i_uid) { + if (f) { + if (!(f->f_mode & FMODE_WRITE)) + goto dput_and_out; + } else { + error = vfs_permission(&nd, MAY_WRITE); + if (error) + goto dput_and_out; + } + } } mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); -- cgit v1.2.1 From b524fe646c9a226a847e30ca1221dc22e952f16b Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 2 May 2007 09:44:03 -0500 Subject: [GFS2] flush the glock completely in inode_go_sync Fix for bz #231910 When filemap_fdatawrite() is called on the inode mapping in data=ordered mode, it will add the glock to the log. In inode_go_sync(), if you do the gfs2_log_flush() before this, after the filemap_fdatawrite() call, the glock and its associated data buffers will be on the log again. This means you can demote a lock from exclusive, without having it flushed from the log. The attached patch simply moves the gfs2_log_flush up to after the filemap_fdatawrite() call. Originally, I tried moving the gfs2_log_flush to after gfs2_meta_sync(), but that caused me to trip the following assert. GFS2: fsid=cypher-36:test.0: fatal: assertion "!buffer_busy(bh)" failed GFS2: fsid=cypher-36:test.0: function = gfs2_ail_empty_gl, file = fs/gfs2/glops.c, line = 61 It appears that gfs2_log_flush() puts some of the glocks buffers in the busy state and the filemap_fdatawrite() call is necessary to flush them. This makes me worry slightly that a related problem could happen because of moving the gfs2_log_flush() after the initial filemap_fdatawrite(), but I assume that gfs2_ail_empty_gl() would catch that case as well. Signed-off-by: Benjamin E. Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 7b82657a9910..777ca46010e8 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -156,9 +156,9 @@ static void inode_go_sync(struct gfs2_glock *gl) ip = NULL; if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); if (ip) filemap_fdatawrite(ip->i_inode.i_mapping); + gfs2_log_flush(gl->gl_sbd, gl); gfs2_meta_sync(gl); if (ip) { struct address_space *mapping = ip->i_inode.i_mapping; -- cgit v1.2.1 From 3168b0780d06ace875696f8a648d04d6089654e5 Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Tue, 8 May 2007 09:18:58 +0100 Subject: [DLM] fix a couple of races Fix two races in fs/dlm/config.c: (1) Grab the configfs subsystem semaphore before calling config_group_find_obj() in get_space(). This solves a potential race between get_space() and concurrent mkdir(2) or rmdir(2). (2) Grab a reference on the found config_item _while_ holding the configfs subsystem semaphore in get_comm(), and not after it. This solves a potential race between get_comm() and concurrent rmdir(2). Signed-off-by: Satyam Sharma Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/config.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 822abdcd1434..5a3d390cc826 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -748,9 +748,16 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) static struct space *get_space(char *name) { + struct config_item *i; + if (!space_list) return NULL; - return to_space(config_group_find_obj(space_list, name)); + + down(&space_list->cg_subsys->su_sem); + i = config_group_find_obj(space_list, name); + up(&space_list->cg_subsys->su_sem); + + return to_space(i); } static void put_space(struct space *sp) @@ -776,20 +783,20 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) if (cm->nodeid != nodeid) continue; found = 1; + config_item_get(i); break; } else { if (!cm->addr_count || memcmp(cm->addr[0], addr, sizeof(*addr))) continue; found = 1; + config_item_get(i); break; } } up(&clusters_root.subsys.su_sem); - if (found) - config_item_get(i); - else + if (!found) cm = NULL; return cm; } -- cgit v1.2.1 From 7ae8fa8451dfb3879ecbc04f2760a707dc65b988 Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Wed, 9 May 2007 09:37:57 -0500 Subject: [GFS2] kernel changes to support new gfs2_grow command This is another revision of my gfs2 kernel patch that allows gfs2_grow to function properly. Steve Whitehouse expressed some concerns about the previous patch and I restructured it based on his comments. The previous patch was doing the statfs_change at file close time, under its own transaction. The current patch does the statfs_change inside the gfs2_commit_write function, which keeps it under the umbrella of the inode transaction. I can't call ri_update to re-read the rindex file during the transaction because the transaction may have outstanding unwritten buffers attached to the rgrps that would be otherwise blown away. So instead, I created a new function, gfs2_ri_total, that will re-read the rindex file just to total the file system space for the sake of the statfs_change. The ri_update will happen later, when gfs2 realizes the version number has changed, as it happened before my patch. Since the statfs_change is happening at write_commit time and there may be multiple writes to the rindex file for one grow operation. So one consequence of this restructuring is that instead of getting one kernel message to indicate the change, you may see several. For example, before when you did a gfs2_grow, you'd get a single message like: GFS2: File system extended by 247876 blocks (968MB) Now you get something like: GFS2: File system extended by 207896 blocks (812MB) GFS2: File system extended by 39980 blocks (156MB) This version has also been successfully run against the hours-long "gfs2_fsck_hellfire" test that does several gfs2_grow and gfs2_fsck while interjecting file system damage. It does this repeatedly under a variety Resource Group conditions. Signed-off-By: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 29 ++++++++++++++++++++++++- fs/gfs2/ops_address.h | 5 ++++- fs/gfs2/rgrp.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 86 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 30c15622174f..846c0ff75cff 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -449,6 +449,30 @@ out_uninit: return error; } +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + + spin_lock(&sdp->sd_statfs_spin); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); +} + /** * gfs2_commit_write - Commit write to a file * @file: The file to write to @@ -511,6 +535,9 @@ static int gfs2_commit_write(struct file *file, struct page *page, di->di_size = cpu_to_be64(inode->i_size); } + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + brelse(dibh); gfs2_trans_end(sdp); if (al->al_requested) { diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h index 35aaee4aa7e1..56c30daf895b 100644 --- a/fs/gfs2/ops_address.h +++ b/fs/gfs2/ops_address.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -18,5 +18,8 @@ extern const struct address_space_operations gfs2_file_aops; extern int gfs2_get_block(struct inode *inode, sector_t lblock, struct buffer_head *bh_result, int create); extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); #endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1727f5012efe..e857f405353b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -430,6 +430,38 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) return 0; } +/** + * gfs2_ri_total - Total up the file system space, according to the rindex. + * + */ +u64 gfs2_ri_total(struct gfs2_sbd *sdp) +{ + u64 total_data = 0; + struct inode *inode = sdp->sd_rindex; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_rindex_host ri; + char buf[sizeof(struct gfs2_rindex)]; + struct file_ra_state ra_state; + int error, rgrps; + + mutex_lock(&sdp->sd_rindex_mutex); + file_ra_state_init(&ra_state, inode->i_mapping); + for (rgrps = 0;; rgrps++) { + loff_t pos = rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) + break; + error = gfs2_internal_read(ip, &ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (error != sizeof(struct gfs2_rindex)) + break; + gfs2_rindex_in(&ri, buf); + total_data += ri.ri_data; + } + mutex_unlock(&sdp->sd_rindex_mutex); + return total_data; +} + /** * gfs2_ri_update - Pull in a new resource index from the disk * @gl: The glock covering the rindex inode @@ -447,7 +479,12 @@ static int gfs2_ri_update(struct gfs2_inode *ip) u64 junk = ip->i_di.di_size; int error; - if (do_div(junk, sizeof(struct gfs2_rindex))) { + /* If someone is holding the rindex file with a glock, they must + be updating it, in which case we may have partial entries. + In this case, we ignore the partials. */ + if (!gfs2_glock_is_held_excl(ip->i_gl) && + !gfs2_glock_is_held_shrd(ip->i_gl) && + do_div(junk, sizeof(struct gfs2_rindex))) { gfs2_consist_inode(ip); return -EIO; } @@ -457,6 +494,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip) file_ra_state_init(&ra_state, inode->i_mapping); for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) + break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); if (!error) @@ -978,18 +1018,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = &ip->i_alloc; - int error; + int error = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + /* We need to hold the rindex unless the inode we're using is + the rindex itself, in which case it's already held. */ + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ + error = gfs2_ri_update(ip); + if (error) return error; error = get_local_rgrp(ip); if (error) { - gfs2_glock_dq_uninit(&al->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&al->al_ri_gh); return error; } @@ -1019,7 +1066,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_rgd = NULL; gfs2_glock_dq_uninit(&al->al_rgd_gh); - gfs2_glock_dq_uninit(&al->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&al->al_ri_gh); } /** -- cgit v1.2.1 From 6c53267f05dc6689ff662efeec426d25d2c0ab84 Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Thu, 10 May 2007 16:54:38 -0500 Subject: [GFS2] Kernel changes to support new gfs2_grow command (part 2) To avoid code redundancy, I separated out the operational "guts" into a new function called read_rindex_entry. Then I made two functions: the closer-to-original gfs2_ri_update (without the special condition checks) and gfs2_ri_update_special that's designed with that condition in mind. (I don't like the name, but if you have a suggestion, I'm all ears). Oh, and there's an added benefit: we don't need all the ugly gotos anymore. ;) This patch has been tested with gfs2_fsck_hellfire (which runs for three and a half hours, btw). Signed-off-By: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 3 +- fs/gfs2/rgrp.c | 139 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 93 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 846c0ff75cff..e0b4e8c2b68d 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -469,7 +469,8 @@ static void adjust_fs_space(struct inode *inode) else new_free = 0; spin_unlock(&sdp->sd_statfs_spin); - fs_warn(sdp, "File system extended by %llu blocks.\n", new_free); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index e857f405353b..48a6461d601c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -463,9 +463,62 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) } /** - * gfs2_ri_update - Pull in a new resource index from the disk + * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode * + * Returns: 0 on success, error code otherwise + */ + +static int read_rindex_entry(struct gfs2_inode *ip, + struct file_ra_state *ra_state) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + char buf[sizeof(struct gfs2_rindex)]; + int error; + struct gfs2_rgrpd *rgd; + + error = gfs2_internal_read(ip, ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (!error) + return 0; + if (error != sizeof(struct gfs2_rindex)) { + if (error > 0) + error = -EIO; + return error; + } + + rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); + error = -ENOMEM; + if (!rgd) + return error; + + mutex_init(&rgd->rd_mutex); + lops_init_le(&rgd->rd_le, &gfs2_rg_lops); + rgd->rd_sbd = sdp; + + list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); + list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + + gfs2_rindex_in(&rgd->rd_ri, buf); + error = compute_bitstructs(rgd); + if (error) + return error; + + error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + return error; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + return error; +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @ip: pointer to the rindex inode + * * Returns: 0 on successful update, error code otherwise */ @@ -473,18 +526,11 @@ static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; - struct gfs2_rgrpd *rgd; - char buf[sizeof(struct gfs2_rindex)]; struct file_ra_state ra_state; u64 junk = ip->i_di.di_size; int error; - /* If someone is holding the rindex file with a glock, they must - be updating it, in which case we may have partial entries. - In this case, we ignore the partials. */ - if (!gfs2_glock_is_held_excl(ip->i_gl) && - !gfs2_glock_is_held_shrd(ip->i_gl) && - do_div(junk, sizeof(struct gfs2_rindex))) { + if (do_div(junk, sizeof(struct gfs2_rindex))) { gfs2_consist_inode(ip); return -EIO; } @@ -493,52 +539,49 @@ static int gfs2_ri_update(struct gfs2_inode *ip) file_ra_state_init(&ra_state, inode->i_mapping); for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - - if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) - break; - error = gfs2_internal_read(ip, &ra_state, buf, &pos, - sizeof(struct gfs2_rindex)); - if (!error) - break; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - goto fail; + error = read_rindex_entry(ip, &ra_state); + if (error) { + clear_rgrpdi(sdp); + return error; } + } - rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); - error = -ENOMEM; - if (!rgd) - goto fail; - - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); - rgd->rd_sbd = sdp; - - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - - gfs2_rindex_in(&rgd->rd_ri, buf); - error = compute_bitstructs(rgd); - if (error) - goto fail; + sdp->sd_rindex_vn = ip->i_gl->gl_vn; + return 0; +} - error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, - &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); - if (error) - goto fail; +/** + * gfs2_ri_update_special - Pull in a new resource index from the disk + * + * This is a special version that's safe to call from gfs2_inplace_reserve_i. + * In this case we know that we don't have any resource groups in memory yet. + * + * @ip: pointer to the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ +static int gfs2_ri_update_special(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct file_ra_state ra_state; + int error; - rgd->rd_gl->gl_object = rgd; - rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + file_ra_state_init(&ra_state, inode->i_mapping); + for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { + /* Ignore partials */ + if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > + ip->i_di.di_size) + break; + error = read_rindex_entry(ip, &ra_state); + if (error) { + clear_rgrpdi(sdp); + return error; + } } sdp->sd_rindex_vn = ip->i_gl->gl_vn; return 0; - -fail: - clear_rgrpdi(sdp); - return error; } /** @@ -1028,7 +1071,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) if (ip != GFS2_I(sdp->sd_rindex)) error = gfs2_rindex_hold(sdp, &al->al_ri_gh); else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update(ip); + error = gfs2_ri_update_special(ip); if (error) return error; -- cgit v1.2.1 From 0507ecf50f22e433592f5ec3a36dc831aaec2e02 Mon Sep 17 00:00:00 2001 From: Nate Diller Date: Thu, 10 May 2007 22:41:28 -0700 Subject: [GFS2] use zero_user_page Use zero_user_page() instead of open-coding it. Signed-off-by: Nate Diller Cc: Steven Whitehouse Signed-off-by: Andrew Morton --- fs/gfs2/bmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c53a5d2d0590..1c40c4bbf379 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -885,7 +885,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping) unsigned blocksize, iblock, length, pos; struct buffer_head *bh; struct page *page; - void *kaddr; int err; page = grab_cache_page(mapping, index); @@ -933,10 +932,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) gfs2_trans_add_bh(ip->i_gl, bh, 0); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, offset, length, KM_USER0); unlock: unlock_page(page); -- cgit v1.2.1 From cd81a4bac67d44742ab0aa1848f4a78e9d7e1093 Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Mon, 14 May 2007 12:42:18 -0500 Subject: [GFS2] Addendum patch 2 for gfs2_grow This addendum patch 2 corrects three things: 1. It fixes a stupid mistake in the previous addendum that broke gfs2. Ref: https://www.redhat.com/archives/cluster-devel/2007-May/msg00162.html 2. It fixes a problem that Dave Teigland pointed out regarding the external declarations in ops_address.h being in the wrong place. 3. It recasts a couple more %llu printks to (unsigned long long) as requested by Steve Whitehouse. I would have loved to put this all in one revised patch, but there was a rush to get some patches for RHEL5. Therefore, the previous patches were applied to the git tree "as is" and therefore, I'm posting another addendum. Sorry. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 7 ++++--- fs/gfs2/ops_address.c | 1 + fs/gfs2/ops_address.h | 3 --- fs/gfs2/rgrp.c | 6 +++--- fs/gfs2/rgrp.h | 1 + 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1815429a2978..c66c718013ef 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1823,7 +1823,8 @@ static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) print_dbg(gi, " Inode:\n"); print_dbg(gi, " num = %llu/%llu\n", - ip->i_num.no_formal_ino, ip->i_num.no_addr); + (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr); print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); print_dbg(gi, " i_flags ="); for (x = 0; x < 32; x++) @@ -1909,8 +1910,8 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) } if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", - gl->gl_demote_state, - (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ)); + gl->gl_demote_state, (unsigned long long) + (jiffies - gl->gl_demote_time)*(1000000/HZ)); } if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { if (!test_bit(GLF_LOCK, &gl->gl_flags) && diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index e0b4e8c2b68d..4913ef57b095 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -32,6 +32,7 @@ #include "trans.h" #include "rgrp.h" #include "ops_file.h" +#include "super.h" #include "util.h" #include "glops.h" diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h index 56c30daf895b..fa1b5b3d28b9 100644 --- a/fs/gfs2/ops_address.h +++ b/fs/gfs2/ops_address.h @@ -18,8 +18,5 @@ extern const struct address_space_operations gfs2_file_aops; extern int gfs2_get_block(struct inode *inode, sector_t lblock, struct buffer_head *bh_result, int create); extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); -extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, - s64 dinodes); #endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 48a6461d601c..a62c0f2d26d3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -527,10 +527,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 junk = ip->i_di.di_size; + u64 rgrp_count = ip->i_di.di_size; int error; - if (do_div(junk, sizeof(struct gfs2_rindex))) { + if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { gfs2_consist_inode(ip); return -EIO; } @@ -538,7 +538,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) clear_rgrpdi(sdp); file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { + for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { error = read_rindex_entry(ip, &ra_state); if (error) { clear_rgrpdi(sdp); diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b01e0cfc99b5..b4c6adfc6f2e 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -65,5 +65,6 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, int flags); void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +u64 gfs2_ri_total(struct gfs2_sbd *sdp); #endif /* __RGRP_DOT_H__ */ -- cgit v1.2.1 From 41d7db0ab437bc84f8a6e77cccc626ce937605ac Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 14 May 2007 17:43:26 +0100 Subject: [GFS2] Reduce size of struct gdlm_lock This patch removes the completion (which is rather large) from struct gdlm_lock in favour of using the wait_on_bit() functions. We don't need to add any extra fields to the structure to do this, so we save 32 bytes (on x86_64) per structure. This adds up to quite a lot when we may potentially have millions of these lock structures, Signed-off-by: Steven Whitehouse Acked-by: David Teigland --- fs/gfs2/locking/dlm/lock.c | 11 ++++++++--- fs/gfs2/locking/dlm/lock_dlm.h | 2 +- fs/gfs2/locking/dlm/thread.c | 11 +++++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index c305255bfe8a..542a797ac89a 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -174,7 +174,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, lp->cur = DLM_LOCK_IV; lp->lvb = NULL; lp->hold_null = NULL; - init_completion(&lp->ast_wait); INIT_LIST_HEAD(&lp->clist); INIT_LIST_HEAD(&lp->blist); INIT_LIST_HEAD(&lp->delay_list); @@ -399,6 +398,12 @@ static void gdlm_del_lvb(struct gdlm_lock *lp) lp->lksb.sb_lvbptr = NULL; } +static int gdlm_ast_wait(void *word) +{ + schedule(); + return 0; +} + /* This can do a synchronous dlm request (requiring a lock_dlm thread to get the completion) because gfs won't call hold_lvb() during a callback (from the context of a lock_dlm thread). */ @@ -424,10 +429,10 @@ static int hold_null_lock(struct gdlm_lock *lp) lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; set_bit(LFL_NOBAST, &lpn->flags); set_bit(LFL_INLOCK, &lpn->flags); + set_bit(LFL_AST_WAIT, &lpn->flags); - init_completion(&lpn->ast_wait); gdlm_do_lock(lpn); - wait_for_completion(&lpn->ast_wait); + wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE); error = lpn->lksb.sb_status; if (error) { printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index d074c6e6f9bf..24d70f73b651 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -101,6 +101,7 @@ enum { LFL_NOBAST = 10, LFL_HEADQUE = 11, LFL_UNLOCK_DELETE = 12, + LFL_AST_WAIT = 13, }; struct gdlm_lock { @@ -117,7 +118,6 @@ struct gdlm_lock { unsigned long flags; /* lock_dlm flags LFL_ */ int bast_mode; /* protected by async_lock */ - struct completion ast_wait; struct list_head clist; /* complete */ struct list_head blist; /* blocking */ diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index 9cf1f168eaf8..1aca51e45092 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -44,6 +44,13 @@ static void process_blocking(struct gdlm_lock *lp, int bast_mode) ls->fscb(ls->sdp, cb, &lp->lockname); } +static void wake_up_ast(struct gdlm_lock *lp) +{ + clear_bit(LFL_AST_WAIT, &lp->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&lp->flags, LFL_AST_WAIT); +} + static void process_complete(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; @@ -136,7 +143,7 @@ static void process_complete(struct gdlm_lock *lp) */ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - complete(&lp->ast_wait); + wake_up_ast(lp); return; } @@ -214,7 +221,7 @@ out: if (test_bit(LFL_INLOCK, &lp->flags)) { clear_bit(LFL_NOBLOCK, &lp->flags); lp->cur = lp->req; - complete(&lp->ast_wait); + wake_up_ast(lp); return; } -- cgit v1.2.1 From dbb7cae2a36170cd17ffbe286ec0c91a998740ff Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 15 May 2007 15:37:50 +0100 Subject: [GFS2] Clean up inode number handling This patch cleans up the inode number handling code. The main difference is that instead of looking up the inodes using a struct gfs2_inum_host we now use just the no_addr member of this structure. The tests relating to no_formal_ino can then be done by the calling code. This has advantages in that we want to do different things in different code paths if the no_formal_ino doesn't match. In the NFS patch we want to return -ESTALE, but in the ->lookup() path, its a bug in the fs if the no_formal_ino doesn't match and thus we can withdraw in this case. In order to later fix bz #201012, we need to be able to look up an inode without knowing no_formal_ino, as the only information that is known to us is the on-disk location of the inode in question. This patch will also help us to fix bz #236099 at a later date by cleaning up a lot of the code in that area. There are no user visible changes as a result of this patch and there are no changes to the on-disk format either. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 2 +- fs/gfs2/dir.c | 56 +++++++++++++++++++++++++++--------- fs/gfs2/dir.h | 9 +++--- fs/gfs2/glock.c | 4 +-- fs/gfs2/incore.h | 4 +-- fs/gfs2/inode.c | 80 ++++++++++++++++++++------------------------------- fs/gfs2/inode.h | 18 ++++++++---- fs/gfs2/meta_io.h | 2 +- fs/gfs2/ondisk.c | 37 +++++++----------------- fs/gfs2/ops_address.c | 4 +-- fs/gfs2/ops_dentry.c | 24 ++++++---------- fs/gfs2/ops_export.c | 29 +++++++++++-------- fs/gfs2/ops_file.c | 4 +-- fs/gfs2/ops_fstype.c | 16 +++++------ fs/gfs2/ops_inode.c | 20 ++++++------- fs/gfs2/rgrp.c | 6 ++-- fs/gfs2/super.c | 2 +- fs/gfs2/util.c | 4 +-- 18 files changed, 160 insertions(+), 161 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1c40c4bbf379..e76a887a89b2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1040,7 +1040,7 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_di.di_height = 0; ip->i_di.di_goal_meta = ip->i_di.di_goal_data = - ip->i_num.no_addr; + ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index a96fa07b3f3b..9cdd71cef59c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1456,7 +1456,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, if (dip->i_di.di_entries != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir %llu, " "ip->i_di.di_entries (%u) != g.offset (%u)\n", - (unsigned long long)dip->i_num.no_addr, + (unsigned long long)dip->i_no_addr, dip->i_di.di_entries, g.offset); error = -EIO; @@ -1488,24 +1488,54 @@ out: * Returns: errno */ -int gfs2_dir_search(struct inode *dir, const struct qstr *name, - struct gfs2_inum_host *inum, unsigned int *type) +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) { struct buffer_head *bh; struct gfs2_dirent *dent; + struct inode *inode; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return ERR_PTR(PTR_ERR(dent)); + inode = gfs2_inode_lookup(dir->i_sb, + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type)); + brelse(bh); + return inode; + } + return ERR_PTR(-ENOENT); +} + +int gfs2_dir_check(struct inode *dir, const struct qstr *name, + const struct gfs2_inode *ip) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int ret = -ENOENT; dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); - if (inum) - gfs2_inum_in(inum, (char *)&dent->de_inum); - if (type) - *type = be16_to_cpu(dent->de_type); + if (ip) { + if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr) + goto out; + if (be64_to_cpu(dent->de_inum.no_formal_ino) != + ip->i_no_formal_ino) + goto out; + if (unlikely(IF2DT(ip->i_inode.i_mode) != + be16_to_cpu(dent->de_type))) { + gfs2_consist_inode(GFS2_I(dir)); + ret = -EIO; + goto out; + } + } + ret = 0; +out: brelse(bh); - return 0; } - return -ENOENT; + return ret; } static int dir_new_leaf(struct inode *inode, const struct qstr *name) @@ -1565,7 +1595,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inum_host *inum, unsigned type) + const struct gfs2_inode *nip, unsigned type) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh; @@ -1580,7 +1610,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); - gfs2_inum_out(inum, (char *)&dent->de_inum); + gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(type); if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; @@ -1700,7 +1730,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) */ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - struct gfs2_inum_host *inum, unsigned int new_type) + const struct gfs2_inode *nip, unsigned int new_type) { struct buffer_head *bh; struct gfs2_dirent *dent; @@ -1715,7 +1745,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, return PTR_ERR(dent); gfs2_trans_add_bh(dip->i_gl, bh, 1); - gfs2_inum_out(inum, (char *)&dent->de_inum); + gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 48fe89046bba..8a468cac9328 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,15 +16,16 @@ struct inode; struct gfs2_inode; struct gfs2_inum; -int gfs2_dir_search(struct inode *dir, const struct qstr *filename, - struct gfs2_inum_host *inum, unsigned int *type); +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); +int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inum_host *inum, unsigned int type); + const struct gfs2_inode *ip, unsigned int type); int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir); int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - struct gfs2_inum_host *new_inum, unsigned int new_type); + const struct gfs2_inode *nip, unsigned int new_type); int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c66c718013ef..b3ed58551e74 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1823,8 +1823,8 @@ static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) print_dbg(gi, " Inode:\n"); print_dbg(gi, " num = %llu/%llu\n", - (unsigned long long)ip->i_num.no_formal_ino, - (unsigned long long)ip->i_num.no_addr); + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr); print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); print_dbg(gi, " i_flags ="); for (x = 0; x < 32; x++) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d995441373ab..00c3004a4c22 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -213,8 +213,8 @@ enum { struct gfs2_inode { struct inode i_inode; - struct gfs2_inum_host i_num; - + u64 i_no_addr; + u64 i_no_formal_ino; unsigned long i_flags; /* GIF_... */ struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index df0b8b3018b9..58f5a67e1c35 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -41,9 +41,9 @@ static int iget_test(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_inum_host *inum = opaque; + u64 *no_addr = opaque; - if (ip->i_num.no_addr == inum->no_addr && + if (ip->i_no_addr == *no_addr && inode->i_private != NULL) return 1; @@ -53,37 +53,37 @@ static int iget_test(struct inode *inode, void *opaque) static int iget_set(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_inum_host *inum = opaque; + u64 *no_addr = opaque; - ip->i_num = *inum; - inode->i_ino = inum->no_addr; + inode->i_ino = (unsigned long)*no_addr; + ip->i_no_addr = *no_addr; return 0; } -struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum) +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) { - return ilookup5(sb, (unsigned long)inum->no_addr, - iget_test, inum); + unsigned long hash = (unsigned long)no_addr; + return ilookup5(sb, hash, iget_test, &no_addr); } -static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum_host *inum) +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) { - return iget5_locked(sb, (unsigned long)inum->no_addr, - iget_test, iget_set, inum); + unsigned long hash = (unsigned long)no_addr; + return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } /** * gfs2_inode_lookup - Lookup an inode * @sb: The super block - * @inum: The inode number + * @no_addr: The inode number * @type: The type of the inode * * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned int type) +struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned int type) { - struct inode *inode = gfs2_iget(sb, inum); + struct inode *inode = gfs2_iget(sb, no_addr); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_glock *io_gl; int error; @@ -110,12 +110,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *i inode->i_op = &gfs2_dev_iops; } - error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) goto fail; ip->i_gl->gl_object = ip; - error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) goto fail_put; @@ -144,14 +144,12 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; - if (ip->i_num.no_addr != be64_to_cpu(str->di_num.no_addr)) { + if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); return -EIO; } - if (ip->i_num.no_formal_ino != be64_to_cpu(str->di_num.no_formal_ino)) - return -ESTALE; - + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); ip->i_inode.i_mode = be32_to_cpu(str->di_mode); ip->i_inode.i_rdev = 0; switch (ip->i_inode.i_mode & S_IFMT) { @@ -247,7 +245,7 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out_qs; - rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; @@ -366,8 +364,6 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct super_block *sb = dir->i_sb; struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_holder d_gh; - struct gfs2_inum_host inum; - unsigned int type; int error; struct inode *inode = NULL; int unlock = 0; @@ -395,12 +391,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, goto out; } - error = gfs2_dir_search(dir, name, &inum, &type); - if (error) - goto out; - - inode = gfs2_inode_lookup(sb, &inum, type); - + inode = gfs2_dir_search(dir, name); + if (IS_ERR(inode)) + error = PTR_ERR(inode); out: if (unlock) gfs2_glock_dq_uninit(&d_gh); @@ -548,7 +541,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, if (!dip->i_inode.i_nlink) return -EPERM; - error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL); + error = gfs2_dir_check(&dip->i_inode, name, NULL); switch (error) { case -ENOENT: error = 0; @@ -588,8 +581,7 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, *gid = current->fsgid; } -static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum_host *inum, - u64 *generation) +static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); int error; @@ -605,7 +597,7 @@ static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum_host *inum, if (error) goto out_ipreserv; - inum->no_addr = gfs2_alloc_di(dip, generation); + *no_addr = gfs2_alloc_di(dip, generation); gfs2_trans_end(sdp); @@ -760,7 +752,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_inode.i_mode)); + error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode)); if (error) goto fail_end_trans; @@ -844,7 +836,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, struct gfs2_inode *dip = ghs->gh_gl->gl_object; struct inode *dir = &dip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_inum_host inum; + struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; int error; u64 generation; @@ -864,7 +856,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = alloc_dinode(dip, &inum, &generation); + error = alloc_dinode(dip, &inum.no_addr, &generation); if (error) goto fail_gunlock; @@ -877,7 +869,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode)); + inode = gfs2_inode_lookup(dir->i_sb, inum.no_addr, IF2DT(mode)); if (IS_ERR(inode)) goto fail_gunlock2; @@ -976,10 +968,8 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, */ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) + const struct gfs2_inode *ip) { - struct gfs2_inum_host inum; - unsigned int type; int error; if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) @@ -997,18 +987,10 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - error = gfs2_dir_search(&dip->i_inode, name, &inum, &type); + error = gfs2_dir_check(&dip->i_inode, name, ip); if (error) return error; - if (!gfs2_inum_equal(&inum, &ip->i_num)) - return -ENOENT; - - if (IF2DT(ip->i_inode.i_mode) != type) { - gfs2_consist_inode(dip); - return -EIO; - } - return 0; } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index b57f448b15bc..05fc095d8540 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -10,17 +10,17 @@ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ -static inline int gfs2_is_stuffed(struct gfs2_inode *ip) +static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { return !ip->i_di.di_height; } -static inline int gfs2_is_jdata(struct gfs2_inode *ip) +static inline int gfs2_is_jdata(const struct gfs2_inode *ip) { return ip->i_di.di_flags & GFS2_DIF_JDATA; } -static inline int gfs2_is_dir(struct gfs2_inode *ip) +static inline int gfs2_is_dir(const struct gfs2_inode *ip) { return S_ISDIR(ip->i_inode.i_mode); } @@ -32,9 +32,15 @@ static inline void gfs2_set_inode_blocks(struct inode *inode) (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); } +static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, + u64 no_formal_ino) +{ + return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; +} + void gfs2_inode_attr_in(struct gfs2_inode *ip); -struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned type); -struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum); +struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned type); +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); int gfs2_inode_refresh(struct gfs2_inode *ip); @@ -47,7 +53,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip); int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); + const struct gfs2_inode *ip); int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); int gfs2_glock_nq_atime(struct gfs2_holder *gh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index e037425bc042..527bf19d9690 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -63,7 +63,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) { - return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp); + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp); } struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c index d9ecfd23a49e..cd4cf055c37d 100644 --- a/fs/gfs2/ondisk.c +++ b/fs/gfs2/ondisk.c @@ -33,26 +33,10 @@ * first arg: the cpu-order structure */ -void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf) +void gfs2_inum_out(const struct gfs2_inode *ip, struct gfs2_dirent *dent) { - const struct gfs2_inum *str = buf; - - no->no_formal_ino = be64_to_cpu(str->no_formal_ino); - no->no_addr = be64_to_cpu(str->no_addr); -} - -void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf) -{ - struct gfs2_inum *str = buf; - - str->no_formal_ino = cpu_to_be64(no->no_formal_ino); - str->no_addr = cpu_to_be64(no->no_addr); -} - -static void gfs2_inum_print(const struct gfs2_inum_host *no) -{ - printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino); - printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr); + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); } static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf) @@ -74,9 +58,10 @@ void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); sb->sb_bsize = be32_to_cpu(str->sb_bsize); sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); - - gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir); - gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); @@ -146,9 +131,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_header.__pad0 = 0; str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); str->di_header.__pad1 = 0; - - gfs2_inum_out(&ip->i_num, &str->di_num); - + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); str->di_mode = cpu_to_be32(ip->i_inode.i_mode); str->di_uid = cpu_to_be32(ip->i_inode.i_uid); str->di_gid = cpu_to_be32(ip->i_inode.i_gid); @@ -178,7 +162,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) { const struct gfs2_dinode_host *di = &ip->i_di; - gfs2_inum_print(&ip->i_num); + printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)ip->i_no_formal_ino); + printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 4913ef57b095..fb84478e1df6 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -757,8 +757,8 @@ static unsigned limit = 0; return; fs_warn(sdp, "ip = %llu %llu\n", - (unsigned long long)ip->i_num.no_formal_ino, - (unsigned long long)ip->i_num.no_addr); + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr); for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) fs_warn(sdp, "ip->i_cache[%u] = %s\n", diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index a6fdc52f554a..793e334d098e 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -21,6 +21,7 @@ #include "glock.h" #include "ops_dentry.h" #include "util.h" +#include "inode.h" /** * gfs2_drevalidate - Check directory lookup consistency @@ -40,14 +41,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) struct gfs2_inode *dip = GFS2_I(parent->d_inode); struct inode *inode = dentry->d_inode; struct gfs2_holder d_gh; - struct gfs2_inode *ip; - struct gfs2_inum_host inum; - unsigned int type; + struct gfs2_inode *ip = NULL; int error; int had_lock=0; - if (inode && is_bad_inode(inode)) - goto invalid; + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } if (sdp->sd_args.ar_localcaching) goto valid; @@ -59,7 +61,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) goto fail; } - error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); + error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); switch (error) { case 0: if (!inode) @@ -73,16 +75,6 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) goto fail_gunlock; } - ip = GFS2_I(inode); - - if (!gfs2_inum_equal(&ip->i_num, &inum)) - goto invalid_gunlock; - - if (IF2DT(ip->i_inode.i_mode) != type) { - gfs2_consist_inode(dip); - goto fail_gunlock; - } - valid_gunlock: if (!had_lock) gfs2_glock_dq_uninit(&d_gh); diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index aad918337a46..51a8a14deb29 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -75,10 +75,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, (connectable && *len < GFS2_LARGE_FH_SIZE)) return 255; - fh[0] = cpu_to_be32(ip->i_num.no_formal_ino >> 32); - fh[1] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF); - fh[2] = cpu_to_be32(ip->i_num.no_addr >> 32); - fh[3] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF); + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; if (!connectable || inode == sb->s_root->d_inode) @@ -90,10 +90,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, igrab(inode); spin_unlock(&dentry->d_lock); - fh[4] = cpu_to_be32(ip->i_num.no_formal_ino >> 32); - fh[5] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF); - fh[6] = cpu_to_be32(ip->i_num.no_addr >> 32); - fh[7] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF); + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); fh[8] = cpu_to_be32(inode->i_mode); fh[9] = 0; /* pad to double word */ @@ -144,7 +144,8 @@ static int gfs2_get_name(struct dentry *parent, char *name, ip = GFS2_I(inode); *name = 0; - gnfd.inum = ip->i_num; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; gnfd.name = name; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); @@ -202,9 +203,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) /* System files? */ - inode = gfs2_ilookup(sb, inum); + inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { - if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { iput(inode); return ERR_PTR(-ESTALE); } @@ -236,7 +237,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) gfs2_glock_dq_uninit(&rgd_gh); gfs2_glock_dq_uninit(&ri_gh); - inode = gfs2_inode_lookup(sb, inum, fh_obj->imode); + inode = gfs2_inode_lookup(sb, inum->no_addr, fh_obj->imode); if (!inode) goto fail; if (IS_ERR(inode)) { @@ -249,6 +250,10 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) iput(inode); goto fail; } + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + goto fail; + } error = -EIO; if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 064df8804582..550032c3b5f7 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -502,7 +502,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockname name = - { .ln_number = ip->i_num.no_addr, + { .ln_number = ip->i_no_addr, .ln_type = LM_TYPE_PLOCK }; if (!(fl->fl_flags & FL_POSIX)) @@ -557,7 +557,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_glock_dq_uninit(fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), - ip->i_num.no_addr, &gfs2_flock_glops, + ip->i_no_addr, &gfs2_flock_glops, CREATE, &gl); if (error) goto out; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 2c5f8e7def0d..c682371717ff 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -236,17 +236,17 @@ fail: return error; } -static struct inode *gfs2_lookup_root(struct super_block *sb, - struct gfs2_inum_host *inum) +static inline struct inode *gfs2_lookup_root(struct super_block *sb, + u64 no_addr) { - return gfs2_inode_lookup(sb, inum, DT_DIR); + return gfs2_inode_lookup(sb, no_addr, DT_DIR); } static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) { struct super_block *sb = sdp->sd_vfs; struct gfs2_holder sb_gh; - struct gfs2_inum_host *inum; + u64 no_addr; struct inode *inode; int error = 0; @@ -289,10 +289,10 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); /* Get the root inode */ - inum = &sdp->sd_sb.sb_root_dir; + no_addr = sdp->sd_sb.sb_root_dir.no_addr; if (sb->s_type == &gfs2meta_fs_type) - inum = &sdp->sd_sb.sb_master_dir; - inode = gfs2_lookup_root(sb, inum); + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + inode = gfs2_lookup_root(sb, no_addr); if (IS_ERR(inode)) { error = PTR_ERR(inode); fs_err(sdp, "can't read in root inode: %d\n", error); @@ -449,7 +449,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qinode; - inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir); + inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr); if (IS_ERR(inode)) { error = PTR_ERR(inode); fs_err(sdp, "can't read in master directory: %d\n", error); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d85f6e05cb95..f8ecfec4064b 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -157,7 +157,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock; - error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL); + error = gfs2_dir_check(dir, &dentry->d_name, NULL); switch (error) { case -ENOENT: break; @@ -217,8 +217,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_ipres; } - error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num, - IF2DT(inode->i_mode)); + error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode)); if (error) goto out_end_trans; @@ -275,7 +274,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); @@ -420,7 +419,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); - gfs2_inum_out(&dip->i_num, &dent->de_inum); + gfs2_inum_out(dip, dent); dent->de_type = cpu_to_be16(DT_DIR); gfs2_dinode_out(ip, di); @@ -472,7 +471,7 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); error = gfs2_glock_nq_m(3, ghs); @@ -614,7 +613,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, * this is the case of the target file already existing * so we unlink before doing the rename */ - nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr); + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); if (nrgd) gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } @@ -653,7 +652,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; - error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL); + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); switch (error) { case -ENOENT: error = 0; @@ -750,7 +749,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR); + error = gfs2_dir_mvino(ip, &name, nip, DT_DIR); if (error) goto out_end_trans; } else { @@ -768,8 +767,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num, - IF2DT(ip->i_inode.i_mode)); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode)); if (error) goto out_end_trans; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index a62c0f2d26d3..30eb428065c5 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1470,7 +1470,7 @@ void gfs2_unlink_di(struct inode *inode) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_rgrpd *rgd; - u64 blkno = ip->i_num.no_addr; + u64 blkno = ip->i_no_addr; rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); if (!rgd) @@ -1505,9 +1505,9 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { - gfs2_free_uninit_di(rgd, ip->i_num.no_addr); + gfs2_free_uninit_di(rgd, ip->i_no_addr); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, ip->i_num.no_addr, 1); + gfs2_meta_wipe(ip, ip->i_no_addr, 1); } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4fdda974dc83..faccffd19907 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -360,7 +360,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) name.len = sprintf(buf, "journal%u", sdp->sd_journals); name.hash = gfs2_disk_hash(name.name, name.len); - error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL); + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); if (error == -ENOENT) { error = 0; break; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 601eaa1b9ed6..3f5edc54e80a 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -115,8 +115,8 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, "GFS2: fsid=%s: inode = %llu %llu\n" "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino, - (unsigned long long)ip->i_num.no_addr, + sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, sdp->sd_fsname, function, file, line); return rv; } -- cgit v1.2.1 From 2a87ab080607d009b8b2a8706f4e27d70402ca9c Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Wed, 16 May 2007 17:02:19 -0500 Subject: [GFS2] Quotas non-functional - fix bug This patch fixes an error in the quota code where a 'struct gfs2_quota_lvb*' was being passed to gfs2_adjust_quota() instead of a 'struct gfs2_quota_data*'. Also moved 'struct gfs2_quota_lvb' from fs/gfs2/incore.h to include/linux/gfs2_ondisk.h as per Steve's suggestion. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 8 -------- fs/gfs2/quota.c | 4 +++- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 00c3004a4c22..b2079fcd2513 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -275,14 +275,6 @@ enum { QDF_LOCKED = 2, }; -struct gfs2_quota_lvb { - __be32 qb_magic; - u32 __pad; - __be64 qb_limit; /* Hard limit of # blocks to alloc */ - __be64 qb_warn; /* Warn user when alloc is above this # */ - __be64 qb_value; /* Current # blocks allocated */ -}; - struct gfs2_quota_data { struct list_head qd_list; unsigned int qd_count; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c186857e48a8..fcd3ee2c5b96 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -627,6 +627,8 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, err = 0; qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); qd->qd_qb.qb_value = cpu_to_be64(value); + ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC); + ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value); unlock: unlock_page(page); page_cache_release(page); @@ -709,7 +711,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) offset = qd2offset(qd); error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, (struct gfs2_quota_data *) - qd->qd_gl->gl_lvb); + qd); if (error) goto out_end_trans; -- cgit v1.2.1 From 916297aad5de2363dccd531873eda55d4d6afb57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 May 2007 15:56:13 -0400 Subject: [DLM] keep dlm from panicing when traversing rsb list in debugfs This problem was originally reported against GFS6.1, but the same issue exists in upstream DLM. This patch keeps the rsb iterator assigning under the rsbtbl list lock. Each time we process an rsb we grab a reference to it to make sure it is not freed out from underneath us, and then put it when we get the next rsb in the list or move onto another list. Signed-off-by: Josef Bacik Signed-off-by: Steven Whitehouse --- fs/dlm/debug_fs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 61ba670b9e02..9e27a1675794 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -17,6 +17,7 @@ #include #include "dlm_internal.h" +#include "lock.h" #define DLM_DEBUG_BUF_LEN 4096 static char debug_buf[DLM_DEBUG_BUF_LEN]; @@ -166,6 +167,9 @@ static int rsb_iter_next(struct rsb_iter *ri) read_lock(&ls->ls_rsbtbl[i].lock); if (!list_empty(&ls->ls_rsbtbl[i].list)) { ri->next = ls->ls_rsbtbl[i].list.next; + ri->rsb = list_entry(ri->next, struct dlm_rsb, + res_hashchain); + dlm_hold_rsb(ri->rsb); read_unlock(&ls->ls_rsbtbl[i].lock); break; } @@ -176,6 +180,7 @@ static int rsb_iter_next(struct rsb_iter *ri) if (ri->entry >= ls->ls_rsbtbl_size) return 1; } else { + struct dlm_rsb *old = ri->rsb; i = ri->entry; read_lock(&ls->ls_rsbtbl[i].lock); ri->next = ri->next->next; @@ -184,11 +189,13 @@ static int rsb_iter_next(struct rsb_iter *ri) ri->next = NULL; ri->entry++; read_unlock(&ls->ls_rsbtbl[i].lock); + dlm_put_rsb(old); goto top; } + ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); read_unlock(&ls->ls_rsbtbl[i].lock); + dlm_put_rsb(old); } - ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); return 0; } -- cgit v1.2.1 From 85e86edf951a8a39954c0ba1edbe4a58827dcd5c Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 08:58:15 -0500 Subject: [DLM] block scand during recovery [1/6] Don't let dlm_scand run during recovery since it may try to do a resource directory removal while the directory nodes are changing. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lock.c | 47 +++++++++++++++++++++++------------------------ fs/dlm/lock.h | 2 ++ fs/dlm/lockspace.c | 8 ++++++-- 3 files changed, 31 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d8d6e729f96b..09668ec2e279 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -194,17 +194,17 @@ void dlm_dump_rsb(struct dlm_rsb *r) /* Threads cannot use the lockspace while it's being recovered */ -static inline void lock_recovery(struct dlm_ls *ls) +static inline void dlm_lock_recovery(struct dlm_ls *ls) { down_read(&ls->ls_in_recovery); } -static inline void unlock_recovery(struct dlm_ls *ls) +void dlm_unlock_recovery(struct dlm_ls *ls) { up_read(&ls->ls_in_recovery); } -static inline int lock_recovery_try(struct dlm_ls *ls) +int dlm_lock_recovery_try(struct dlm_ls *ls) { return down_read_trylock(&ls->ls_in_recovery); } @@ -985,11 +985,10 @@ void dlm_scan_rsbs(struct dlm_ls *ls) { int i; - if (dlm_locking_stopped(ls)) - return; - for (i = 0; i < ls->ls_rsbtbl_size; i++) { shrink_bucket(ls, i); + if (dlm_locking_stopped(ls)) + break; cond_resched(); } } @@ -2274,7 +2273,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (!ls) return -EINVAL; - lock_recovery(ls); + dlm_lock_recovery(ls); if (convert) error = find_lkb(ls, lksb->sb_lkid, &lkb); @@ -2302,7 +2301,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EAGAIN) error = 0; out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); dlm_put_lockspace(ls); return error; } @@ -2322,7 +2321,7 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (!ls) return -EINVAL; - lock_recovery(ls); + dlm_lock_recovery(ls); error = find_lkb(ls, lkid, &lkb); if (error) @@ -2344,7 +2343,7 @@ int dlm_unlock(dlm_lockspace_t *lockspace, out_put: dlm_put_lkb(lkb); out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); dlm_put_lockspace(ls); return error; } @@ -3424,7 +3423,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) } } - if (lock_recovery_try(ls)) + if (dlm_lock_recovery_try(ls)) break; schedule(); } @@ -3503,7 +3502,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) log_error(ls, "unknown message type %d", ms->m_type); } - unlock_recovery(ls); + dlm_unlock_recovery(ls); out: dlm_put_lockspace(ls); dlm_astd_wake(); @@ -4040,7 +4039,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, struct dlm_args args; int error; - lock_recovery(ls); + dlm_lock_recovery(ls); error = create_lkb(ls, &lkb); if (error) { @@ -4094,7 +4093,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); spin_unlock(&ua->proc->locks_spin); out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); return error; } @@ -4106,7 +4105,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, struct dlm_user_args *ua; int error; - lock_recovery(ls); + dlm_lock_recovery(ls); error = find_lkb(ls, lkid, &lkb); if (error) @@ -4146,7 +4145,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, out_put: dlm_put_lkb(lkb); out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); kfree(ua_tmp); return error; } @@ -4159,7 +4158,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, struct dlm_user_args *ua; int error; - lock_recovery(ls); + dlm_lock_recovery(ls); error = find_lkb(ls, lkid, &lkb); if (error) @@ -4194,7 +4193,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, out_put: dlm_put_lkb(lkb); out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); kfree(ua_tmp); return error; } @@ -4207,7 +4206,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, struct dlm_user_args *ua; int error; - lock_recovery(ls); + dlm_lock_recovery(ls); error = find_lkb(ls, lkid, &lkb); if (error) @@ -4231,7 +4230,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, out_put: dlm_put_lkb(lkb); out: - unlock_recovery(ls); + dlm_unlock_recovery(ls); kfree(ua_tmp); return error; } @@ -4314,7 +4313,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) { struct dlm_lkb *lkb, *safe; - lock_recovery(ls); + dlm_lock_recovery(ls); while (1) { lkb = del_proc_lock(ls, proc); @@ -4347,7 +4346,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } mutex_unlock(&ls->ls_clear_proc_locks); - unlock_recovery(ls); + dlm_unlock_recovery(ls); } static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) @@ -4429,12 +4428,12 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, if (nodeid != dlm_our_nodeid()) { error = send_purge(ls, nodeid, pid); } else { - lock_recovery(ls); + dlm_lock_recovery(ls); if (pid == current->pid) purge_proc_locks(ls, proc); else do_purge(ls, nodeid, pid); - unlock_recovery(ls); + dlm_unlock_recovery(ls); } return error; } diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 64fc4ec40668..19403aa08739 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,6 +24,8 @@ void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); +int dlm_lock_recovery_try(struct dlm_ls *ls); +void dlm_unlock_recovery(struct dlm_ls *ls); int dlm_purge_locks(struct dlm_ls *ls); void dlm_purge_mstcpy_locks(struct dlm_rsb *r); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a677b2a5eed4..414a108df934 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -234,8 +234,12 @@ static int dlm_scand(void *data) struct dlm_ls *ls; while (!kthread_should_stop()) { - list_for_each_entry(ls, &lslist, ls_list) - dlm_scan_rsbs(ls); + list_for_each_entry(ls, &lslist, ls_list) { + if (dlm_lock_recovery_try(ls)) { + dlm_scan_rsbs(ls); + dlm_unlock_recovery(ls); + } + } schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; -- cgit v1.2.1 From 3ae1acf93a21512512f8a78430fcde5992dd208e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 08:59:31 -0500 Subject: [DLM] add lock timeouts and warnings [2/6] New features: lock timeouts and time warnings. If the DLM_LKF_TIMEOUT flag is set, then the request/conversion will be canceled after waiting the specified number of centiseconds (specified per lock). This feature is only available for locks requested through libdlm (can be enabled for kernel dlm users if there's a use for it.) If the new DLM_LSFL_TIMEWARN flag is set when creating the lockspace, then a warning message will be sent to userspace (using genetlink) after a request/conversion has been waiting for a given number of centiseconds (configurable per node). The time warnings will be used in the future to do deadlock detection in userspace. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/Makefile | 1 + fs/dlm/config.c | 8 ++- fs/dlm/config.h | 1 + fs/dlm/dlm_internal.h | 10 ++++ fs/dlm/lock.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++- fs/dlm/lock.h | 4 +- fs/dlm/lockspace.c | 10 +++- fs/dlm/main.c | 11 +++- fs/dlm/member.c | 5 +- fs/dlm/netlink.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/recoverd.c | 4 +- fs/dlm/user.c | 2 +- 12 files changed, 348 insertions(+), 9 deletions(-) create mode 100644 fs/dlm/netlink.c (limited to 'fs') diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 604cf7dc5f39..d248e60951ba 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -8,6 +8,7 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ + netlink.o \ lowcomms.o \ rcom.o \ recover.o \ diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 5a3d390cc826..2909abf1bbc3 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -90,6 +90,7 @@ struct cluster { unsigned int cl_scan_secs; unsigned int cl_log_debug; unsigned int cl_protocol; + unsigned int cl_timewarn_cs; }; enum { @@ -103,6 +104,7 @@ enum { CLUSTER_ATTR_SCAN_SECS, CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_TIMEWARN_CS, }; struct cluster_attribute { @@ -162,6 +164,7 @@ CLUSTER_ATTR(toss_secs, 1); CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(protocol, 0); +CLUSTER_ATTR(timewarn_cs, 1); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -174,6 +177,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, + [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, NULL, }; @@ -916,6 +920,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_PROTOCOL 0 +#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -927,6 +932,7 @@ struct dlm_config_info dlm_config = { .ci_toss_secs = DEFAULT_TOSS_SECS, .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, - .ci_protocol = DEFAULT_PROTOCOL + .ci_protocol = DEFAULT_PROTOCOL, + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 967cc3d72e5e..a3170fe22090 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -27,6 +27,7 @@ struct dlm_config_info { int ci_scan_secs; int ci_log_debug; int ci_protocol; + int ci_timewarn_cs; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 30994d68f6a0..65a5fc076b8a 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -213,8 +213,10 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 +#define DLM_IFL_TIMEOUT_CANCEL 0x00000004 struct dlm_lkb { struct dlm_rsb *lkb_resource; /* the rsb */ @@ -243,6 +245,9 @@ struct dlm_lkb { struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ + struct list_head lkb_time_list; + unsigned long lkb_timestamp; + unsigned long lkb_timeout_cs; char *lkb_lvbptr; struct dlm_lksb *lkb_lksb; /* caller's status block */ @@ -447,6 +452,9 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; + struct mutex ls_timeout_mutex; + struct list_head ls_timeout; + struct list_head ls_nodes; /* current nodes in ls */ struct list_head ls_nodes_gone; /* dead node list, recovery */ int ls_num_nodes; /* number of nodes in ls */ @@ -472,6 +480,7 @@ struct dlm_ls { struct task_struct *ls_recoverd_task; struct mutex ls_recoverd_active; spinlock_t ls_recover_lock; + unsigned long ls_recover_begin; /* jiffies timestamp */ uint32_t ls_recover_status; /* DLM_RS_ */ uint64_t ls_recover_seq; struct dlm_recover *ls_recover_args; @@ -501,6 +510,7 @@ struct dlm_ls { #define LSFL_RCOM_READY 3 #define LSFL_RCOM_WAIT 4 #define LSFL_UEVENT_WAIT 5 +#define LSFL_TIMEWARN 6 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 09668ec2e279..ab986dfbe6d3 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -82,10 +82,13 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); static int send_remove(struct dlm_rsb *r); static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, struct dlm_message *ms); static int receive_extralen(struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); +static void del_timeout(struct dlm_lkb *lkb); +void dlm_timeout_warn(struct dlm_lkb *lkb); /* * Lock compatibilty matrix - thanks Steve @@ -286,8 +289,17 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) if (is_master_copy(lkb)) return; + del_timeout(lkb); + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); + /* if the operation was a cancel, then return -DLM_ECANCEL, if a + timeout caused the cancel then return -ETIMEDOUT */ + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; + rv = -ETIMEDOUT; + } + lkb->lkb_lksb->sb_status = rv; lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; @@ -581,6 +593,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); + INIT_LIST_HEAD(&lkb->lkb_time_list); get_random_bytes(&bucket, sizeof(bucket)); bucket &= (ls->ls_lkbtbl_size - 1); @@ -993,6 +1006,125 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +static void add_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + if (is_master_copy(lkb)) + return; + + if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) + goto add_it; + + if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && + !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; + goto add_it; + } + return; + + add_it: + DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); + mutex_lock(&ls->ls_timeout_mutex); + hold_lkb(lkb); + lkb->lkb_timestamp = jiffies; + list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); + mutex_unlock(&ls->ls_timeout_mutex); +} + +static void del_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + mutex_lock(&ls->ls_timeout_mutex); + if (!list_empty(&lkb->lkb_time_list)) { + list_del_init(&lkb->lkb_time_list); + unhold_lkb(lkb); + } + mutex_unlock(&ls->ls_timeout_mutex); +} + +/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and + lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex + and then lock rsb because of lock ordering in add_timeout. We may need + to specify some special timeout-related bits in the lkb that are just to + be accessed under the timeout_mutex. */ + +void dlm_scan_timeout(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int do_cancel, do_warn; + + for (;;) { + if (dlm_locking_stopped(ls)) + break; + + do_cancel = 0; + do_warn = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && + time_after_eq(jiffies, lkb->lkb_timestamp + + lkb->lkb_timeout_cs * HZ/100)) + do_cancel = 1; + + if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && + time_after_eq(jiffies, lkb->lkb_timestamp + + dlm_config.ci_timewarn_cs * HZ/100)) + do_warn = 1; + + if (!do_cancel && !do_warn) + continue; + hold_lkb(lkb); + break; + } + mutex_unlock(&ls->ls_timeout_mutex); + + if (!do_cancel && !do_warn) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (do_warn) { + /* clear flag so we only warn once */ + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT)) + del_timeout(lkb); + dlm_timeout_warn(lkb); + } + + if (do_cancel) { + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; + del_timeout(lkb); + _cancel_lock(r, lkb); + } + + unlock_rsb(r); + unhold_rsb(r); + dlm_put_lkb(lkb); + } +} + +/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping + dlm_recoverd before checking/setting ls_recover_begin. */ + +void dlm_adjust_timeouts(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + long adj = jiffies - ls->ls_recover_begin; + + ls->ls_recover_begin = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) + lkb->lkb_timestamp += adj; + mutex_unlock(&ls->ls_timeout_mutex); +} + /* lkb is master or local copy */ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -1902,6 +2034,9 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) if (is_overlap(lkb)) goto out; + /* don't let scand try to do a cancel */ + del_timeout(lkb); + if (lkb->lkb_flags & DLM_IFL_RESEND) { lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; rv = -EBUSY; @@ -1933,6 +2068,9 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) if (is_overlap_unlock(lkb)) goto out; + /* don't let scand try to do a cancel */ + del_timeout(lkb); + if (lkb->lkb_flags & DLM_IFL_RESEND) { lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; rv = -EBUSY; @@ -1993,6 +2131,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) error = -EINPROGRESS; add_lkb(r, lkb, DLM_LKSTS_WAITING); send_blocking_asts(r, lkb); + add_timeout(lkb); goto out; } @@ -2040,6 +2179,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); send_blocking_asts(r, lkb); + add_timeout(lkb); goto out; } @@ -3110,9 +3250,10 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_remid = ms->m_lkid; if (is_altmode(lkb)) munge_altmode(lkb, ms); - if (result) + if (result) { add_lkb(r, lkb, DLM_LKSTS_WAITING); - else { + add_timeout(lkb); + } else { grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); } @@ -3178,6 +3319,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, munge_demoted(lkb, ms); del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); break; case 0: diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 19403aa08739..6b5b71f0e9dd 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -26,6 +26,8 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_timeout(struct dlm_ls *ls); +void dlm_adjust_timeouts(struct dlm_ls *ls); int dlm_purge_locks(struct dlm_ls *ls); void dlm_purge_mstcpy_locks(struct dlm_rsb *r); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 414a108df934..339a204d7479 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -237,6 +237,7 @@ static int dlm_scand(void *data) list_for_each_entry(ls, &lslist, ls_list) { if (dlm_lock_recovery_try(ls)) { dlm_scan_rsbs(ls); + dlm_scan_timeout(ls); dlm_unlock_recovery(ls); } } @@ -421,11 +422,16 @@ static int new_lockspace(char *name, int namelen, void **lockspace, goto out; memcpy(ls->ls_name, name, namelen); ls->ls_namelen = namelen; - ls->ls_exflags = flags; ls->ls_lvblen = lvblen; ls->ls_count = 0; ls->ls_flags = 0; + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have TIMEWARN active */ + if (flags & DLM_LSFL_TIMEWARN) + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + ls->ls_exflags = (flags & ~DLM_LSFL_TIMEWARN); + size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; @@ -465,6 +471,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); + INIT_LIST_HEAD(&ls->ls_timeout); + mutex_init(&ls->ls_timeout_mutex); INIT_LIST_HEAD(&ls->ls_nodes); INIT_LIST_HEAD(&ls->ls_nodes_gone); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 162fbae58fe5..eca2907f2386 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -25,6 +25,8 @@ void dlm_unregister_debugfs(void); static inline int dlm_register_debugfs(void) { return 0; } static inline void dlm_unregister_debugfs(void) { } #endif +int dlm_netlink_init(void); +void dlm_netlink_exit(void); static int __init init_dlm(void) { @@ -50,10 +52,16 @@ static int __init init_dlm(void) if (error) goto out_debug; + error = dlm_netlink_init(); + if (error) + goto out_user; + printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; + out_user: + dlm_user_exit(); out_debug: dlm_unregister_debugfs(); out_config: @@ -68,6 +76,7 @@ static int __init init_dlm(void) static void __exit exit_dlm(void) { + dlm_netlink_exit(); dlm_user_exit(); dlm_config_exit(); dlm_memory_exit(); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 85e2897bd740..f08faec3d854 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -284,6 +284,9 @@ int dlm_ls_stop(struct dlm_ls *ls) dlm_recoverd_suspend(ls); ls->ls_recover_status = 0; dlm_recoverd_resume(ls); + + if (!ls->ls_recover_begin) + ls->ls_recover_begin = jiffies; return 0; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c new file mode 100644 index 000000000000..804b32cd22c1 --- /dev/null +++ b/fs/dlm/netlink.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include + +#include "dlm_internal.h" + +static uint32_t dlm_nl_seqnum; +static uint32_t listener_nlpid; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, +}; + +static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) +{ + struct sk_buff *skb; + void *data; + + skb = genlmsg_new(size, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* add the message headers */ + data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd); + if (!data) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + return 0; +} + +static struct dlm_lock_data *mk_data(struct sk_buff *skb) +{ + struct nlattr *ret; + + ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data)); + if (!ret) + return NULL; + return nla_data(ret); +} + +static int send_data(struct sk_buff *skb) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *data = genlmsg_data(genlhdr); + int rv; + + rv = genlmsg_end(skb, data); + if (rv < 0) { + nlmsg_free(skb); + return rv; + } + + return genlmsg_unicast(skb, listener_nlpid); +} + +static int user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + listener_nlpid = info->snd_pid; + printk("user_cmd nlpid %u\n", listener_nlpid); + return 0; +} + +static struct genl_ops dlm_nl_ops = { + .cmd = DLM_CMD_HELLO, + .doit = user_cmd, +}; + +int dlm_netlink_init(void) +{ + int rv; + + rv = genl_register_family(&family); + if (rv) + return rv; + + rv = genl_register_ops(&family, &dlm_nl_ops); + if (rv < 0) + goto err; + return 0; + err: + genl_unregister_family(&family); + return rv; +} + +void dlm_netlink_exit(void) +{ + genl_unregister_ops(&family, &dlm_nl_ops); + genl_unregister_family(&family); +} + +static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + struct dlm_user_args *ua = (struct dlm_user_args *) lkb->lkb_astparam; + + memset(data, 0, sizeof(struct dlm_lock_data)); + + data->version = DLM_LOCK_DATA_VERSION; + data->nodeid = lkb->lkb_nodeid; + data->ownpid = lkb->lkb_ownpid; + data->id = lkb->lkb_id; + data->remid = lkb->lkb_remid; + data->status = lkb->lkb_status; + data->grmode = lkb->lkb_grmode; + data->rqmode = lkb->lkb_rqmode; + data->timestamp = lkb->lkb_timestamp; + if (ua) + data->xid = ua->xid; + if (r) { + data->lockspace_id = r->res_ls->ls_global_id; + data->resource_namelen = r->res_length; + memcpy(data->resource_name, r->res_name, r->res_length); + } +} + +void dlm_timeout_warn(struct dlm_lkb *lkb) +{ + struct dlm_lock_data *data; + struct sk_buff *send_skb; + size_t size; + int rv; + + log_debug(lkb->lkb_resource->res_ls, "timeout_warn %x", lkb->lkb_id); + + size = nla_total_size(sizeof(struct dlm_lock_data)) + + nla_total_size(0); /* why this? */ + + rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size); + if (rv < 0) + return; + + data = mk_data(send_skb); + if (!data) { + nlmsg_free(send_skb); + return; + } + + fill_data(data, lkb); + + send_data(send_skb); +} + diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 3cb636d60249..66575997861c 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -190,6 +190,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_clear_members_gone(ls); + dlm_adjust_timeouts(ls); + error = enable_locking(ls, rv->seq); if (error) { log_debug(ls, "enable_locking failed %d", error); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index b0201ec325a7..c7612da5b617 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -348,7 +348,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) return -EPERM; error = dlm_new_lockspace(params->name, strlen(params->name), - &lockspace, 0, DLM_USER_LVB_LEN); + &lockspace, params->flags, DLM_USER_LVB_LEN); if (error) return error; -- cgit v1.2.1 From d7db923ea4990edb5583bf54af868ba687a1bc84 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 09:00:32 -0500 Subject: [DLM] dlm_device interface changes [3/6] Change the user/kernel device interface used by libdlm: - Add ability for userspace to check the version of the interface. libdlm can now adapt to different versions of the kernel interface. - Increase the size of the flags passed in a lock request so all possible flags can be used from userspace. - Add an opaque "xid" value for each lock. This "transaction id" will be used later to associate locks with each other during deadlock detection. - Add a "timeout" value for each lock. This is used along with the DLM_LKF_TIMEOUT flag. Also, remove a fragment of unused code in device_read(). This patch requires updating libdlm which is backward compatible with older kernels. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/dlm_internal.h | 2 ++ fs/dlm/lock.c | 24 ++++++++++++----------- fs/dlm/lock.h | 6 ++++-- fs/dlm/user.c | 53 +++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 62 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 65a5fc076b8a..a8d6e993697c 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -151,6 +151,7 @@ struct dlm_args { void *bastaddr; int mode; struct dlm_lksb *lksb; + unsigned long timeout; }; @@ -528,6 +529,7 @@ struct dlm_user_args { void __user *castaddr; void __user *bastparam; void __user *bastaddr; + uint64_t xid; }; #define DLM_PROC_FLAGS_CLOSING 1 diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index ab986dfbe6d3..ad3797a37942 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1098,6 +1098,8 @@ void dlm_scan_timeout(struct dlm_ls *ls) } if (do_cancel) { + log_debug("timeout cancel %x node %d %s", lkb->lkb_id, + lkb->lkb_nodeid, r->res_name); lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; del_timeout(lkb); @@ -1864,7 +1866,7 @@ static void confirm_master(struct dlm_rsb *r, int error) } static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, - int namelen, uint32_t parent_lkid, void *ast, + int namelen, unsigned long timeout_cs, void *ast, void *astarg, void *bast, struct dlm_args *args) { int rv = -EINVAL; @@ -1907,10 +1909,6 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) goto out; - /* parent/child locks not yet supported */ - if (parent_lkid) - goto out; - if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) goto out; @@ -1922,6 +1920,7 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astaddr = ast; args->astparam = (long) astarg; args->bastaddr = bast; + args->timeout = timeout_cs; args->mode = mode; args->lksb = lksb; rv = 0; @@ -1976,6 +1975,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; + lkb->lkb_timeout_cs = args->timeout; rv = 0; out: return rv; @@ -2423,7 +2423,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast, + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); if (error) goto out_put; @@ -4175,7 +4175,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, - uint32_t parent_lkid) + unsigned long timeout_cs) { struct dlm_lkb *lkb; struct dlm_args args; @@ -4203,7 +4203,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, When DLM_IFL_USER is set, the dlm knows that this is a userspace lock and that lkb_astparam is the dlm_user_args structure. */ - error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid, + error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args); lkb->lkb_flags |= DLM_IFL_USER; ua->old_mode = DLM_LOCK_IV; @@ -4240,7 +4240,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, - int mode, uint32_t flags, uint32_t lkid, char *lvb_in) + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs) { struct dlm_lkb *lkb; struct dlm_args args; @@ -4268,6 +4269,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (lvb_in && ua->lksb.sb_lvbptr) memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + ua->xid = ua_tmp->xid; ua->castparam = ua_tmp->castparam; ua->castaddr = ua_tmp->castaddr; ua->bastparam = ua_tmp->bastparam; @@ -4275,8 +4277,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->user_lksb = ua_tmp->user_lksb; ua->old_mode = lkb->lkb_grmode; - error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST, - ua, DLM_FAKE_USER_AST, &args); + error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, + DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args); if (error) goto out_put; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 6b5b71f0e9dd..99ab4635074e 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -38,9 +38,11 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, - uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid); + uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, - int mode, uint32_t flags, uint32_t lkid, char *lvb_in); + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c7612da5b617..37aad3fe8949 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -33,16 +33,17 @@ static const struct file_operations device_fops; struct dlm_lock_params32 { __u8 mode; __u8 namelen; - __u16 flags; + __u16 unused; + __u32 flags; __u32 lkid; __u32 parent; - + __u64 xid; + __u64 timeout; __u32 castparam; __u32 castaddr; __u32 bastparam; __u32 bastaddr; __u32 lksb; - char lvb[DLM_USER_LVB_LEN]; char name[0]; }; @@ -68,6 +69,7 @@ struct dlm_lksb32 { }; struct dlm_lock_result32 { + __u32 version[3]; __u32 length; __u32 user_astaddr; __u32 user_astparam; @@ -102,6 +104,8 @@ static void compat_input(struct dlm_write_request *kb, kb->i.lock.flags = kb32->i.lock.flags; kb->i.lock.lkid = kb32->i.lock.lkid; kb->i.lock.parent = kb32->i.lock.parent; + kb->i.lock.xid = kb32->i.lock.xid; + kb->i.lock.timeout = kb32->i.lock.timeout; kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; @@ -115,6 +119,10 @@ static void compat_input(struct dlm_write_request *kb, static void compat_output(struct dlm_lock_result *res, struct dlm_lock_result32 *res32) { + res32->version[0] = res->version[0]; + res32->version[1] = res->version[1]; + res32->version[2] = res->version[2]; + res32->user_astaddr = (__u32)(long)res->user_astaddr; res32->user_astparam = (__u32)(long)res->user_astparam; res32->user_lksb = (__u32)(long)res->user_lksb; @@ -252,16 +260,18 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->castaddr = params->castaddr; ua->bastparam = params->bastparam; ua->bastaddr = params->bastaddr; + ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) error = dlm_user_convert(ls, ua, params->mode, params->flags, - params->lkid, params->lvb); + params->lkid, params->lvb, + (unsigned long) params->timeout); else { error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, - params->parent); + (unsigned long) params->timeout); if (!error) error = ua->lksb.sb_lkid; } @@ -641,6 +651,9 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, int struct_len; memset(&result, 0, sizeof(struct dlm_lock_result)); + result.version[0] = DLM_DEVICE_VERSION_MAJOR; + result.version[1] = DLM_DEVICE_VERSION_MINOR; + result.version[2] = DLM_DEVICE_VERSION_PATCH; memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); result.user_lksb = ua->user_lksb; @@ -699,6 +712,20 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, return error; } +static int copy_version_to_user(char __user *buf, size_t count) +{ + struct dlm_device_version ver; + + memset(&ver, 0, sizeof(struct dlm_device_version)); + ver.version[0] = DLM_DEVICE_VERSION_MAJOR; + ver.version[1] = DLM_DEVICE_VERSION_MINOR; + ver.version[2] = DLM_DEVICE_VERSION_PATCH; + + if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) + return -EFAULT; + return sizeof(struct dlm_device_version); +} + /* a read returns a single ast described in a struct dlm_lock_result */ static ssize_t device_read(struct file *file, char __user *buf, size_t count, @@ -710,6 +737,16 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, DECLARE_WAITQUEUE(wait, current); int error, type=0, bmode=0, removed = 0; + if (count == sizeof(struct dlm_device_version)) { + error = copy_version_to_user(buf, count); + return error; + } + + if (!proc) { + log_print("non-version read from control device %zu", count); + return -EINVAL; + } + #ifdef CONFIG_COMPAT if (count < sizeof(struct dlm_lock_result32)) #else @@ -747,11 +784,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } } - if (list_empty(&proc->asts)) { - spin_unlock(&proc->asts_spin); - return -EAGAIN; - } - /* there may be both completion and blocking asts to return for the lkb, don't remove lkb from asts list unless no asts remain */ @@ -823,6 +855,7 @@ static const struct file_operations device_fops = { static const struct file_operations ctl_device_fops = { .open = ctl_device_open, .release = ctl_device_close, + .read = device_read, .write = device_write, .owner = THIS_MODULE, }; -- cgit v1.2.1 From c85d65e91430db94ae9ce0cf38b56e496658b642 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 09:01:26 -0500 Subject: [DLM] cancel in conversion deadlock [4/6] When conversion deadlock is detected, cancel the conversion and return EDEADLK to the application. This is a new default behavior where before the dlm would allow the deadlock to exist indefinately. The DLM_LKF_NODLCKWT flag can now be used in a conversion to prevent the dlm from performing conversion deadlock detection/cancelation on it. The DLM_LKF_CONVDEADLK flag can continue to be used as before to tell the dlm to demote the granted mode of the lock being converted if it gets into a conversion deadlock. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lock.c | 193 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index ad3797a37942..3c4d570477b4 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1408,10 +1408,8 @@ static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) * queue for one resource. The granted mode of each lock blocks the requested * mode of the other lock." * - * Part 2: if the granted mode of lkb is preventing the first lkb in the - * convert queue from being granted, then demote lkb (set grmode to NL). - * This second form requires that we check for conv-deadlk even when - * now == 0 in _can_be_granted(). + * Part 2: if the granted mode of lkb is preventing an earlier lkb in the + * convert queue from being granted, then deadlk/demote lkb. * * Example: * Granted Queue: empty @@ -1420,41 +1418,52 @@ static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) * * The first lock can't be granted because of the granted mode of the second * lock and the second lock can't be granted because it's not first in the - * list. We demote the granted mode of the second lock (the lkb passed to this - * function). + * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we + * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK + * flag set and return DEMOTED in the lksb flags. * - * After the resolution, the "grant pending" function needs to go back and try - * to grant locks on the convert queue again since the first lock can now be - * granted. + * Originally, this function detected conv-deadlk in a more limited scope: + * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or + * - if lkb1 was the first entry in the queue (not just earlier), and was + * blocked by the granted mode of lkb2, and there was nothing on the + * granted queue preventing lkb1 from being granted immediately, i.e. + * lkb2 was the only thing preventing lkb1 from being granted. + * + * That second condition meant we'd only say there was conv-deadlk if + * resolving it (by demotion) would lead to the first lock on the convert + * queue being granted right away. It allowed conversion deadlocks to exist + * between locks on the convert queue while they couldn't be granted anyway. + * + * Now, we detect and take action on conversion deadlocks immediately when + * they're created, even if they may not be immediately consequential. If + * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted + * mode that would prevent lkb1's conversion from being granted, we do a + * deadlk/demote on lkb2 right away and don't let it onto the convert queue. + * I think this means that the lkb_is_ahead condition below should always + * be zero, i.e. there will never be conv-deadlk between two locks that are + * both already on the convert queue. */ -static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb) +static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) { - struct dlm_lkb *this, *first = NULL, *self = NULL; + struct dlm_lkb *lkb1; + int lkb_is_ahead = 0; - list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { - if (!first) - first = this; - if (this == lkb) { - self = lkb; + list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) { + if (lkb1 == lkb2) { + lkb_is_ahead = 1; continue; } - if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) - return 1; - } - - /* if lkb is on the convert queue and is preventing the first - from being granted, then there's deadlock and we demote lkb. - multiple converting locks may need to do this before the first - converting lock can be granted. */ - - if (self && self != first) { - if (!modes_compat(lkb, first) && - !queue_conflict(&rsb->res_grantqueue, first)) - return 1; + if (!lkb_is_ahead) { + if (!modes_compat(lkb2, lkb1)) + return 1; + } else { + if (!modes_compat(lkb2, lkb1) && + !modes_compat(lkb1, lkb2)) + return 1; + } } - return 0; } @@ -1583,42 +1592,57 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) if (!now && !conv && list_empty(&r->res_convertqueue) && first_in_list(lkb, &r->res_waitqueue)) return 1; - out: - /* - * The following, enabled by CONVDEADLK, departs from VMS. - */ - - if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) && - conversion_deadlock_detect(r, lkb)) { - lkb->lkb_grmode = DLM_LOCK_NL; - lkb->lkb_sbflags |= DLM_SBF_DEMOTED; - } - return 0; } -/* - * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a - * simple way to provide a big optimization to applications that can use them. - */ - -static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int *err) { - uint32_t flags = lkb->lkb_exflags; int rv; int8_t alt = 0, rqmode = lkb->lkb_rqmode; + int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV); + + if (err) + *err = 0; rv = _can_be_granted(r, lkb, now); if (rv) goto out; - if (lkb->lkb_sbflags & DLM_SBF_DEMOTED) + /* + * The CONVDEADLK flag is non-standard and tells the dlm to resolve + * conversion deadlocks by demoting grmode to NL, otherwise the dlm + * cancels one of the locks. + */ + + if (is_convert && can_be_queued(lkb) && + conversion_deadlock_detect(r, lkb)) { + if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_sbflags |= DLM_SBF_DEMOTED; + } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + if (err) + *err = -EDEADLK; + else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); + } + } goto out; + } - if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR) + /* + * The ALTPR and ALTCW flags are non-standard and tell the dlm to try + * to grant a request in a mode other than the normal rqmode. It's a + * simple way to provide a big optimization to applications that can + * use them. + */ + + if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR)) alt = DLM_LOCK_PR; - else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW) + else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW)) alt = DLM_LOCK_CW; if (alt) { @@ -1633,10 +1657,20 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) return rv; } +/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock + for locks pending on the convert list. Once verified (watch for these + log_prints), we should be able to just call _can_be_granted() and not + bother with the demote/deadlk cases here (and there's no easy way to deal + with a deadlk here, we'd have to generate something like grant_lock with + the deadlk error.) */ + +/* returns the highest requested mode of all blocked conversions */ + static int grant_pending_convert(struct dlm_rsb *r, int high) { struct dlm_lkb *lkb, *s; int hi, demoted, quit, grant_restart, demote_restart; + int deadlk; quit = 0; restart: @@ -1646,14 +1680,29 @@ static int grant_pending_convert(struct dlm_rsb *r, int high) list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { demoted = is_demoted(lkb); - if (can_be_granted(r, lkb, 0)) { + deadlk = 0; + + if (can_be_granted(r, lkb, 0, &deadlk)) { grant_lock_pending(r, lkb); grant_restart = 1; - } else { - hi = max_t(int, lkb->lkb_rqmode, hi); - if (!demoted && is_demoted(lkb)) - demote_restart = 1; + continue; } + + if (!demoted && is_demoted(lkb)) { + log_print("WARN: pending demoted %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + demote_restart = 1; + continue; + } + + if (deadlk) { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + dlm_dump_rsb(r); + continue; + } + + hi = max_t(int, lkb->lkb_rqmode, hi); } if (grant_restart) @@ -1671,7 +1720,7 @@ static int grant_pending_wait(struct dlm_rsb *r, int high) struct dlm_lkb *lkb, *s; list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { - if (can_be_granted(r, lkb, 0)) + if (can_be_granted(r, lkb, 0, NULL)) grant_lock_pending(r, lkb); else high = max_t(int, lkb->lkb_rqmode, high); @@ -2121,7 +2170,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error = 0; - if (can_be_granted(r, lkb, 1)) { + if (can_be_granted(r, lkb, 1, NULL)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); goto out; @@ -2147,16 +2196,32 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error = 0; + int deadlk = 0; /* changing an existing lock may allow others to be granted */ - if (can_be_granted(r, lkb, 1)) { + if (can_be_granted(r, lkb, 1, &deadlk)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); grant_pending_locks(r); goto out; } + /* can_be_granted() detected that this lock would block in a conversion + deadlock, so we leave it on the granted queue and return EDEADLK in + the ast for the convert. */ + + if (deadlk) { + /* it's left on the granted queue */ + log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s", + lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status, + lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name); + revert_lock(r, lkb); + queue_cast(r, lkb, -EDEADLK); + error = -EDEADLK; + goto out; + } + /* is_demoted() means the can_be_granted() above set the grmode to NL, and left us on the granted queue. This auto-demotion (due to CONVDEADLK) might mean other locks, and/or this lock, are @@ -2438,7 +2503,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, out_put: if (convert || error) __put_lkb(ls, lkb); - if (error == -EAGAIN) + if (error == -EAGAIN || error == -EDEADLK) error = 0; out: dlm_unlock_recovery(ls); @@ -3312,6 +3377,12 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, queue_cast(r, lkb, -EAGAIN); break; + case -EDEADLK: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -EDEADLK); + break; + case -EINPROGRESS: /* convert was queued on remote master */ receive_flags_reply(lkb, ms); @@ -4284,7 +4355,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, error = convert_lock(ls, lkb, &args); - if (error == -EINPROGRESS || error == -EAGAIN) + if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) error = 0; out_put: dlm_put_lkb(lkb); -- cgit v1.2.1 From 79d72b54483bf81b9f9de0dd555c710ac7267986 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 09:02:20 -0500 Subject: [DLM] fix new_lockspace error exit [5/6] Fix the error path when exiting new_lockspace(). It was kfree'ing the lockspace struct at the end, but that's only valid if it exits before kobject_register occured. After kobject_register we have to let the kobject do the freeing. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lockspace.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 339a204d7479..a3a50e67e4dd 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -400,6 +400,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, { struct dlm_ls *ls; int i, size, error = -ENOMEM; + int do_unreg = 0; if (namelen > DLM_LOCKSPACE_LEN) return -EINVAL; @@ -525,32 +526,34 @@ static int new_lockspace(char *name, int namelen, void **lockspace, error = dlm_recoverd_start(ls); if (error) { log_error(ls, "can't start dlm_recoverd %d", error); - goto out_rcomfree; + goto out_delist; } - dlm_create_debug_file(ls); - error = kobject_setup(ls); if (error) - goto out_del; + goto out_stop; error = kobject_register(&ls->ls_kobj); if (error) - goto out_del; + goto out_stop; + + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; error = do_uevent(ls, 1); if (error) - goto out_unreg; + goto out_stop; + + dlm_create_debug_file(ls); + + log_debug(ls, "join complete"); *lockspace = ls; return 0; - out_unreg: - kobject_unregister(&ls->ls_kobj); - out_del: - dlm_delete_debug_file(ls); + out_stop: dlm_recoverd_stop(ls); - out_rcomfree: + out_delist: spin_lock(&lslist_lock); list_del(&ls->ls_list); spin_unlock(&lslist_lock); @@ -562,7 +565,10 @@ static int new_lockspace(char *name, int namelen, void **lockspace, out_rsbfree: kfree(ls->ls_rsbtbl); out_lsfree: - kfree(ls); + if (do_unreg) + kobject_unregister(&ls->ls_kobj); + else + kfree(ls); out: module_put(THIS_MODULE); return error; @@ -708,7 +714,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members_gone(ls); kfree(ls->ls_node_array); kobject_unregister(&ls->ls_kobj); - /* The ls structure will be freed when the kobject is done with */ + /* The ls structure will be freed when the kobject is done with */ mutex_lock(&ls_lock); ls_count--; -- cgit v1.2.1 From 8b0e7b2cf35aa827ed5efb508c1879481b970496 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 09:03:35 -0500 Subject: [DLM] wait for config check during join [6/6] Joining the lockspace should wait for the initial round of inter-node config checks to complete before returning. This way, if there's a configuration mismatch between the joining node and the existing nodes, the join can fail and return an error to the application. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/dlm_internal.h | 2 ++ fs/dlm/lockspace.c | 30 ++++++++++++++++++++++++++++++ fs/dlm/member.c | 6 ++++++ fs/dlm/rcom.c | 4 ++-- 4 files changed, 40 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index a8d6e993697c..03ba6c4fd5c2 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -472,6 +472,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; + struct completion ls_members_done; + int ls_members_result; struct miscdevice ls_device; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a3a50e67e4dd..c8f0c15ac166 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -197,13 +197,24 @@ static int do_uevent(struct dlm_ls *ls, int in) else kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + + /* dlm_controld will see the uevent, do the necessary group management + and then write to sysfs to wake us */ + error = wait_event_interruptible(ls->ls_uevent_wait, test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + + log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); + if (error) goto out; error = ls->ls_uevent_result; out: + if (error) + log_error(ls, "group %s failed %d %d", in ? "join" : "leave", + error, ls->ls_uevent_result); return error; } @@ -490,6 +501,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; + init_completion(&ls->ls_members_done); + ls->ls_members_result = -1; ls->ls_recoverd_task = NULL; mutex_init(&ls->ls_recoverd_active); @@ -540,10 +553,21 @@ static int new_lockspace(char *name, int namelen, void **lockspace, /* let kobject handle freeing of ls if there's an error */ do_unreg = 1; + /* This uevent triggers dlm_controld in userspace to add us to the + group of nodes that are members of this lockspace (managed by the + cluster infrastructure.) Once it's done that, it tells us who the + current lockspace members are (via configfs) and then tells the + lockspace to start running (via sysfs) in dlm_ls_start(). */ + error = do_uevent(ls, 1); if (error) goto out_stop; + wait_for_completion(&ls->ls_members_done); + error = ls->ls_members_result; + if (error) + goto out_members; + dlm_create_debug_file(ls); log_debug(ls, "join complete"); @@ -551,6 +575,10 @@ static int new_lockspace(char *name, int namelen, void **lockspace, *lockspace = ls; return 0; + out_members: + do_uevent(ls, 0); + dlm_clear_members(ls); + kfree(ls->ls_node_array); out_stop: dlm_recoverd_stop(ls); out_delist: @@ -588,6 +616,8 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace, error = new_lockspace(name, namelen, lockspace, flags, lvblen); if (!error) ls_count++; + else if (!ls_count) + threads_stop(); out: mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index f08faec3d854..073599dced2a 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -233,6 +233,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); + if (!error || error == -EPROTO) { + /* new_lockspace() may be waiting to know if the config + is good or bad */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); + } if (error) goto out; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 6bfbd6153809..f71c23542f0f 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -90,7 +90,7 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) log_error(ls, "version mismatch: %x nodeid %d: %x", DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, rc->rc_header.h_version); - return -EINVAL; + return -EPROTO; } if (rf->rf_lvblen != ls->ls_lvblen || @@ -98,7 +98,7 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", ls->ls_lvblen, ls->ls_exflags, nodeid, rf->rf_lvblen, rf->rf_lsflags); - return -EINVAL; + return -EPROTO; } return 0; } -- cgit v1.2.1 From 639aca417d91ebba1077a6084e4423af1c1dd811 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 May 2007 16:02:57 -0500 Subject: [DLM] fix compile breakage In the rush to get the previous patch set sent, a compilation bug I fixed shortly before sending somehow got clobbered, probably by a missed quilt refresh or something. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 3c4d570477b4..b47e6fd0172c 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1098,8 +1098,8 @@ void dlm_scan_timeout(struct dlm_ls *ls) } if (do_cancel) { - log_debug("timeout cancel %x node %d %s", lkb->lkb_id, - lkb->lkb_nodeid, r->res_name); + log_debug(r->res_ls, "timeout cancel %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; del_timeout(lkb); -- cgit v1.2.1 From b3cab7b9a34a6e65c1ca8f80fb57b256d57e8555 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 29 May 2007 11:14:21 +0100 Subject: [DLM] Compile fix A one liner fix which got missed from the earlier patches. Signed-off-by: Steven Whitehouse Cc: Fabio Massimo Di Nitto Cc: David Teigland --- fs/dlm/lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b47e6fd0172c..2f8a5a700cc0 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1098,7 +1098,7 @@ void dlm_scan_timeout(struct dlm_ls *ls) } if (do_cancel) { - log_debug(r->res_ls, "timeout cancel %x node %d %s", + log_debug(ls, "timeout cancel %x node %d %s", lkb->lkb_id, lkb->lkb_nodeid, r->res_name); lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; -- cgit v1.2.1 From 84d8cd69a8e7f1c9962f46bc79850c9f1f663806 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 29 May 2007 08:44:23 -0500 Subject: [DLM] timeout fixes Various fixes related to the new timeout feature: - add_timeout() missed setting TIMEWARN flag on lkb's when the TIMEOUT flag was already set - clear_proc_locks should remove a dead process's locks from the timeout list - the end-of-life calculation for user locks needs to consider that ETIMEDOUT is equivalent to -DLM_ECANCEL - make initial default timewarn_cs config value visible in configfs - change bit position of TIMEOUT_CANCEL flag so it's not copied to a remote master node - set timestamp on remote lkb's so a lock dump will display the time they've been waiting Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/config.c | 1 + fs/dlm/dlm_internal.h | 2 +- fs/dlm/lock.c | 13 +++++++------ fs/dlm/netlink.c | 2 -- fs/dlm/user.c | 49 ++++++++++++++++++++++++++++++------------------- 5 files changed, 39 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 2909abf1bbc3..1b59fa56a599 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -433,6 +433,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_toss_secs = dlm_config.ci_toss_secs; cl->cl_scan_secs = dlm_config.ci_scan_secs; cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; space_list = &sps->ss_group; comm_list = &cms->cs_group; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 03ba6c4fd5c2..a7435a8df35e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -215,9 +215,9 @@ struct dlm_args { #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 #define DLM_IFL_WATCH_TIMEWARN 0x00400000 +#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 -#define DLM_IFL_TIMEOUT_CANCEL 0x00000004 struct dlm_lkb { struct dlm_rsb *lkb_resource; /* the rsb */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2f8a5a700cc0..df91578145d1 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1010,17 +1010,18 @@ static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - if (is_master_copy(lkb)) + if (is_master_copy(lkb)) { + lkb->lkb_timestamp = jiffies; return; - - if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) - goto add_it; + } if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; goto add_it; } + if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) + goto add_it; return; add_it: @@ -3510,8 +3511,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) case -DLM_ECANCEL: receive_flags_reply(lkb, ms); revert_lock_pc(r, lkb); - if (ms->m_result) - queue_cast(r, lkb, -DLM_ECANCEL); + queue_cast(r, lkb, -DLM_ECANCEL); break; case 0: break; @@ -4534,6 +4534,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) lkb = del_proc_lock(ls, proc); if (!lkb) break; + del_timeout(lkb); if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) orphan_proc_lock(ls, lkb); else diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 804b32cd22c1..863b87d0dc71 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -133,8 +133,6 @@ void dlm_timeout_warn(struct dlm_lkb *lkb) size_t size; int rv; - log_debug(lkb->lkb_resource->res_ls, "timeout_warn %x", lkb->lkb_id); - size = nla_total_size(sizeof(struct dlm_lock_data)) + nla_total_size(0); /* why this? */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 37aad3fe8949..329da1b5285f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -138,6 +138,35 @@ static void compat_output(struct dlm_lock_result *res, } #endif +/* Figure out if this lock is at the end of its life and no longer + available for the application to use. The lkb still exists until + the final ast is read. A lock becomes EOL in three situations: + 1. a noqueue request fails with EAGAIN + 2. an unlock completes with EUNLOCK + 3. a cancel of a waiting request completes with ECANCEL/EDEADLK + An EOL lock needs to be removed from the process's list of locks. + And we can't allow any new operation on an EOL lock. This is + not related to the lifetime of the lkb struct which is managed + entirely by refcount. */ + +static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) +{ + switch (sb_status) { + case -DLM_EUNLOCK: + return 1; + case -DLM_ECANCEL: + case -ETIMEDOUT: + if (lkb->lkb_grmode == DLM_LOCK_IV) + return 1; + break; + case -EAGAIN: + if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) + return 1; + break; + } + return 0; +} + /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ @@ -184,25 +213,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) log_debug(ls, "ast overlap %x status %x %x", lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags); - /* Figure out if this lock is at the end of its life and no longer - available for the application to use. The lkb still exists until - the final ast is read. A lock becomes EOL in three situations: - 1. a noqueue request fails with EAGAIN - 2. an unlock completes with EUNLOCK - 3. a cancel of a waiting request completes with ECANCEL - An EOL lock needs to be removed from the process's list of locks. - And we can't allow any new operation on an EOL lock. This is - not related to the lifetime of the lkb struct which is managed - entirely by refcount. */ - - if (type == AST_COMP && - lkb->lkb_grmode == DLM_LOCK_IV && - ua->lksb.sb_status == -EAGAIN) - eol = 1; - else if (ua->lksb.sb_status == -DLM_EUNLOCK || - (ua->lksb.sb_status == -DLM_ECANCEL && - lkb->lkb_grmode == DLM_LOCK_IV)) - eol = 1; + eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); if (eol) { lkb->lkb_ast_type &= ~AST_BAST; lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; -- cgit v1.2.1 From 8b4021fa436f7c76a2299e6d85d4d4a619724e9a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 29 May 2007 08:46:00 -0500 Subject: [DLM] canceling deadlocked lock Add a function that can be used through libdlm by a system daemon to cancel another process's deadlocked lock. A completion ast with EDEADLK is returned to the process waiting for the lock. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/dlm_internal.h | 1 + fs/dlm/lock.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lock.h | 1 + fs/dlm/user.c | 25 ++++++++++++++++++++++++ 4 files changed, 80 insertions(+) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index a7435a8df35e..a006fa59e7da 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -216,6 +216,7 @@ struct dlm_args { #define DLM_IFL_ENDOFLIFE 0x00200000 #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index df91578145d1..de943afacb37 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -300,6 +300,11 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) rv = -ETIMEDOUT; } + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; + rv = -EDEADLK; + } + lkb->lkb_lksb->sb_status = rv; lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; @@ -4450,6 +4455,54 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, return error; } +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + struct dlm_rsb *r; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = (struct dlm_user_args *)lkb->lkb_astparam; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, &args); + if (error) + goto out_r; + lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL; + + error = _cancel_lock(r, lkb); + out_r: + unlock_rsb(r); + put_rsb(r); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + /* lkb's that are removed from the waiters list by revert are just left on the orphans list with the granted orphan locks, to be freed by purge */ diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 99ab4635074e..1720313c22df 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -49,6 +49,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid); int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, int nodeid, int pid); +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); static inline int is_master(struct dlm_rsb *r) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 329da1b5285f..6438941ab1f8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -156,6 +156,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) return 1; case -DLM_ECANCEL: case -ETIMEDOUT: + case -EDEADLK: if (lkb->lkb_grmode == DLM_LOCK_IV) return 1; break; @@ -320,6 +321,22 @@ static int device_user_unlock(struct dlm_user_proc *proc, return error; } +static int device_user_deadlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_deadlock(ls, params->flags, params->lkid); + + dlm_put_lockspace(ls); + return error; +} + static int create_misc_device(struct dlm_ls *ls, char *name) { int error, len; @@ -545,6 +562,14 @@ static ssize_t device_write(struct file *file, const char __user *buf, error = device_user_unlock(proc, &kbuf->i.lock); break; + case DLM_USER_DEADLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_deadlock(proc, &kbuf->i.lock); + break; + case DLM_USER_CREATE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); -- cgit v1.2.1 From 9dd592d70be0db6fa8e4e19d7642cfaa424b200e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 29 May 2007 08:47:04 -0500 Subject: [DLM] dumping master locks Add a new debugfs file that dumps a compact list of mastered locks. This will be used by a userland daemon to collect state for deadlock detection. Also, for the existing function that prints all lock state, lock the rsb before going through the lock lists since they can be changing in the course of normal dlm activity. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/debug_fs.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/dlm/dlm_internal.h | 1 + 2 files changed, 163 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 9e27a1675794..184be9870c63 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -27,6 +27,8 @@ static struct dentry *dlm_root; struct rsb_iter { int entry; + int master; + int header; struct dlm_ls *ls; struct list_head *next; struct dlm_rsb *rsb; @@ -86,6 +88,8 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) struct dlm_lkb *lkb; int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; + lock_rsb(res); + seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); for (i = 0; i < res->res_length; i++) { if (isprint(res->res_name[i])) @@ -152,6 +156,59 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) seq_printf(s, "\n"); } out: + unlock_rsb(res); + return 0; +} + +static void print_master_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) +{ + struct dlm_user_args *ua; + unsigned int waiting = 0; + uint64_t xid = 0; + + if (lkb->lkb_flags & DLM_IFL_USER) { + ua = (struct dlm_user_args *) lkb->lkb_astparam; + if (ua) + xid = ua->xid; + } + + if (lkb->lkb_timestamp) + waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); + + /* id nodeid remid pid xid flags sts grmode rqmode time_ms len name */ + + seq_printf(s, "%x %d %x %u %llu %x %d %d %d %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + waiting, + r->res_length, + r->res_name); +} + +static int print_master_resource(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + print_master_lock(s, lkb, r); + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + print_master_lock(s, lkb, r); + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + print_master_lock(s, lkb, r); + + unlock_rsb(r); return 0; } @@ -209,7 +266,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) { struct rsb_iter *ri; - ri = kmalloc(sizeof *ri, GFP_KERNEL); + ri = kzalloc(sizeof *ri, GFP_KERNEL); if (!ri) return NULL; @@ -267,7 +324,17 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr) { struct rsb_iter *ri = iter_ptr; - print_resource(ri->rsb, file); + if (ri->master) { + if (ri->header) { + seq_printf(file, "id nodeid remid pid xid flags sts " + "grmode rqmode time_ms len name\n"); + ri->header = 0; + } + if (is_master(ri->rsb)) + print_master_resource(ri->rsb, file); + } else { + print_resource(ri->rsb, file); + } return 0; } @@ -302,6 +369,83 @@ static const struct file_operations rsb_fops = { .release = seq_release }; +/* + * Dump master lock state + */ + +static struct rsb_iter *master_iter_init(struct dlm_ls *ls, loff_t *pos) +{ + struct rsb_iter *ri; + + ri = kzalloc(sizeof *ri, GFP_KERNEL); + if (!ri) + return NULL; + + ri->ls = ls; + ri->entry = 0; + ri->next = NULL; + ri->master = 1; + + if (*pos == 0) + ri->header = 1; + + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + + return ri; +} + +static void *master_seq_start(struct seq_file *file, loff_t *pos) +{ + struct rsb_iter *ri; + loff_t n = *pos; + + ri = master_iter_init(file->private, pos); + if (!ri) + return NULL; + + while (n--) { + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + } + + return ri; +} + +static struct seq_operations master_seq_ops = { + .start = master_seq_start, + .next = rsb_seq_next, + .stop = rsb_seq_stop, + .show = rsb_seq_show, +}; + +static int master_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &master_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations master_fops = { + .owner = THIS_MODULE, + .open = master_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + /* * dump lkb's on the ls_waiters list */ @@ -369,6 +513,20 @@ int dlm_create_debug_file(struct dlm_ls *ls) return -ENOMEM; } + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_master", ls->ls_name); + + ls->ls_debug_master_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &master_fops); + if (!ls->ls_debug_master_dentry) { + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_rsb_dentry); + return -ENOMEM; + } + return 0; } @@ -378,6 +536,8 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_rsb_dentry); if (ls->ls_debug_waiters_dentry) debugfs_remove(ls->ls_debug_waiters_dentry); + if (ls->ls_debug_master_dentry) + debugfs_remove(ls->ls_debug_master_dentry); } int dlm_register_debugfs(void) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index a006fa59e7da..f2c85493c0c6 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -470,6 +470,7 @@ struct dlm_ls { struct dentry *ls_debug_rsb_dentry; /* debugfs */ struct dentry *ls_debug_waiters_dentry; /* debugfs */ + struct dentry *ls_debug_master_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; -- cgit v1.2.1 From 0b7cac0fb0e541a7f54d0ba55b31d829ce3dd899 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 29 May 2007 08:47:51 -0500 Subject: [DLM] show default protocol Display the initial value of the "protocol" config value in configfs. The default value has always been 0 in the past anyway, so it's always appeared to be correct. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/config.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 1b59fa56a599..5069b2cb5a1f 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -433,6 +433,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_toss_secs = dlm_config.ci_toss_secs; cl->cl_scan_secs = dlm_config.ci_scan_secs; cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; space_list = &sps->ss_group; -- cgit v1.2.1 From 1990e917651d58a3c5155d0491431c09e29e385b Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Thu, 31 May 2007 17:52:02 -0500 Subject: [GFS2] Quotas non-functional - fix another bug This patch fixes a bug where gfs2 was writing update quota usage information to the wrong location in the quota file. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/ondisk.c | 10 ++++++++++ fs/gfs2/quota.c | 11 +++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c index cd4cf055c37d..a5b05ea3d4c7 100644 --- a/fs/gfs2/ondisk.c +++ b/fs/gfs2/ondisk.c @@ -121,6 +121,16 @@ void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf) qu->qu_value = be64_to_cpu(str->qu_value); } +void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) +{ + struct gfs2_quota *str = buf; + + str->qu_limit = cpu_to_be64(qu->qu_limit); + str->qu_warn = cpu_to_be64(qu->qu_warn); + str->qu_value = cpu_to_be64(qu->qu_value); + memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); +} + void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { const struct gfs2_dinode_host *di = &ip->i_di; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index fcd3ee2c5b96..8a58815dea08 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -573,12 +573,13 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; - unsigned offset = loc & (PAGE_CACHE_SHIFT - 1); + unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; struct buffer_head *bh; struct page *page; void *kaddr; - __be64 *ptr; + char *ptr; + struct gfs2_quota_host qp; s64 value; int err = -EIO; @@ -620,8 +621,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, kaddr = kmap_atomic(page, KM_USER0); ptr = kaddr + offset; - value = (s64)be64_to_cpu(*ptr) + change; - *ptr = cpu_to_be64(value); + gfs2_quota_in(&qp, ptr); + qp.qu_value += change; + value = qp.qu_value; + gfs2_quota_out(&qp, ptr); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); err = 0; -- cgit v1.2.1 From 89918647a445fddfe223b097e29f775dcfa81eab Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 1 Jun 2007 15:19:33 +0100 Subject: [GFS2] Make the log reserved blocks depend on block size The number of blocks which we reserve in the log at the start of each transaction needs to depends upon the block size since the overhead is related to the number of "pointers" which can be fitted into a single block. This relates to Red Hat bz #240435 Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 291415ddfe51..586923d24e6e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -262,7 +262,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve * - * Note that we never give out the last 6 blocks of the journal. Thats + * Note that we never give out the last few blocks of the journal. Thats * due to the fact that there is are a small number of header blocks * associated with each log flush. The exact number can't be known until * flush time, so we ensure that we have just enough free blocks at all @@ -274,6 +274,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { unsigned int try = 0; + unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) @@ -281,7 +282,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) mutex_lock(&sdp->sd_log_reserve_mutex); gfs2_log_lock(sdp); - while(sdp->sd_log_blks_free <= (blks + 6)) { + while(sdp->sd_log_blks_free <= (blks + reserved_blks)) { gfs2_log_unlock(sdp); gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL); -- cgit v1.2.1 From afb853fb4eec380b492a3c369f837359359c28e8 Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Fri, 1 Jun 2007 10:07:26 -0500 Subject: [DLM] fix socket shutdown This patch clears the user_data of active sockets as part of cleanup. This prevents any late-arriving data from trying to add jobs to the work queue while we are tidying up. Signed-Off-By: Patrick Caulfield Signed-Off-By: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lowcomms.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 27970a58d29b..fc0bff74c61e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -260,7 +260,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) static void lowcomms_data_ready(struct sock *sk, int count_unused) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); } @@ -268,7 +268,7 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -1400,8 +1400,11 @@ void dlm_lowcomms_stop(void) down(&connections_lock); for (i = 0; i <= max_nodeid; i++) { con = __nodeid2con(i, 0); - if (con) + if (con) { con->flags |= 0xFF; + if (con->sock) + con->sock->sk->sk_user_data = NULL; + } } up(&connections_lock); -- cgit v1.2.1 From ddf4b426aababdae4cb96326d7aeb9d119f42c50 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Fri, 1 Jun 2007 14:21:38 -0500 Subject: [GFS2] fix jdata issues This is a patch for the first three issues of RHBZ #238162 The first issue is that when you allocate a new page for a file, it will not start off uptodate. This makes sense, since you haven't written anything to that part of the file yet. Unfortunately, gfs2_pin() checks to make sure that the buffers are uptodate. The solution to this is to mark the buffers uptodate in gfs2_commit_write(), after they have been zeroed out and have the data written into them. I'm pretty confident with this fix, although it's not completely obvious that there is no problem with marking the buffers uptodate here. The second issue is simply that you can try to pin a data buffer that is already on the incore log, and thus, already pinned. This patch checks to see if this buffer is already on the log, and exits databuf_lo_add() if it is, just like buf_lo_add() does. The third issue is that gfs2_log_flush() doesn't do it's block accounting correctly. Both metadata and journaled data are logged, but gfs2_log_flush() only compares the number of metadata blocks with the number of blocks to commit to the ondisk journal. This patch also counts the journaled data blocks. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 2 +- fs/gfs2/lops.c | 2 ++ fs/gfs2/ops_address.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 586923d24e6e..1fb846fc545e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -566,7 +566,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) INIT_LIST_HEAD(&ai->ai_ail1_list); INIT_LIST_HEAD(&ai->ai_ail2_list); - gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf + sdp->sd_log_num_jdata == sdp->sd_log_commited_buf); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f82d84d05d23..3e971f25120d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -475,6 +475,8 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) tr->tr_num_buf++; list_add(&bd->bd_list_tr, &tr->tr_list_buf); gfs2_log_unlock(sdp); + if (!list_empty(&le->le_list)) + return; gfs2_pin(sdp, bd->bd_bh); tr->tr_num_buf_new++; } else { diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index fb84478e1df6..ac5659521386 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -50,6 +50,8 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, end = start + bsize; if (end <= from || start >= to) continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); gfs2_trans_add_bh(ip->i_gl, bh, 0); } } -- cgit v1.2.1 From bb8d8a6f54c1c84d7c74623491bab043b36a38c5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 1 Jun 2007 14:11:58 +0100 Subject: [GFS2] Fix sign problem in quota/statfs and cleanup _host structures This patch fixes some sign issues which were accidentally introduced into the quota & statfs code during the endianess annotation process. Also included is a general clean up which moves all of the _host structures out of gfs2_ondisk.h (where they should not have been to start with) and into the places where they are actually used (often only one place). Also those _host structures which are not required any more are removed entirely (which is the eventual plan for all of them). The conversion routines from ondisk.c are also moved into the places where they are actually used, which for almost every one, was just one single place, so all those are now static functions. This also cleans up the end of gfs2_ondisk.h which no longer needs the #ifdef __KERNEL__. The net result is a reduction of about 100 lines of code, many functions now marked static plus the bug fixes as mentioned above. For good measure I ran the code through sparse after making these changes to check that there are no warnings generated. This fixes Red Hat bz #239686 Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 2 +- fs/gfs2/bmap.c | 4 +- fs/gfs2/dir.c | 2 +- fs/gfs2/eattr.c | 6 +- fs/gfs2/incore.h | 63 ++++++++++++- fs/gfs2/inode.c | 83 ++++++++++++++++- fs/gfs2/inode.h | 10 +++ fs/gfs2/ondisk.c | 246 --------------------------------------------------- fs/gfs2/ops_export.c | 10 ++- fs/gfs2/ops_export.h | 22 ----- fs/gfs2/ops_fstype.c | 13 +-- fs/gfs2/ops_fstype.h | 1 + fs/gfs2/ops_inode.c | 4 +- fs/gfs2/ops_vm.c | 2 +- fs/gfs2/quota.c | 42 ++++++++- fs/gfs2/recovery.c | 22 ++++- fs/gfs2/rgrp.c | 111 +++++++++++++++-------- fs/gfs2/super.c | 77 ++++++++++++---- fs/gfs2/super.h | 2 +- fs/gfs2/util.c | 2 +- 20 files changed, 372 insertions(+), 352 deletions(-) delete mode 100644 fs/gfs2/ondisk.c delete mode 100644 fs/gfs2/ops_export.h (limited to 'fs') diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index e3f1ada643ac..04ad0caebedb 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ - mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e76a887a89b2..b784cf3c6482 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -718,7 +718,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_ri.ri_length; + rg_blocks += rgd->rd_length; } error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); @@ -824,7 +824,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) goto out_gunlock_q; error = gfs2_trans_begin(sdp, - sdp->sd_max_height + al->al_rgd->rd_ri.ri_length + + sdp->sd_max_height + al->al_rgd->rd_length + RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9cdd71cef59c..2f154049b59d 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1897,7 +1897,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_ri.ri_length; + rg_blocks += rgd->rd_length; } error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 5b83ca6acab1..40e1d37112e6 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -254,7 +254,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE + + error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE + RES_EATTR + RES_STATFS + RES_QUOTA, blks); if (error) goto out_gunlock; @@ -700,7 +700,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + al->al_rgd->rd_ri.ri_length + + blks + al->al_rgd->rd_length + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_ri.ri_length; + rg_blocks += rgd->rd_length; } error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b2079fcd2513..e5069b912d5e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -28,6 +28,14 @@ struct gfs2_sbd; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); +struct gfs2_log_header_host { + u64 lh_sequence; /* Sequence number of this transaction */ + u32 lh_flags; /* GFS2_LOG_HEAD_... */ + u32 lh_tail; /* Block number of log tail */ + u32 lh_blkno; + u32 lh_hash; +}; + /* * Structure of operations that are associated with each * type of element in the log. @@ -60,12 +68,23 @@ struct gfs2_bitmap { u32 bi_len; }; +struct gfs2_rgrp_host { + u32 rg_flags; + u32 rg_free; + u32 rg_dinodes; + u64 rg_igeneration; +}; + struct gfs2_rgrpd { struct list_head rd_list; /* Link with superblock */ struct list_head rd_list_mru; struct list_head rd_recent; /* Recently used rgrps */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ - struct gfs2_rindex_host rd_ri; + u64 rd_addr; /* grp block disk address */ + u64 rd_data0; /* first data location */ + u32 rd_length; /* length of rgrp header in fs blocks */ + u32 rd_data; /* num of data blocks in rgrp */ + u32 rd_bitbytes; /* number of bytes in data bitmaps */ struct gfs2_rgrp_host rd_rg; u64 rd_rg_vn; struct gfs2_bitmap *rd_bits; @@ -211,6 +230,20 @@ enum { GIF_SW_PAGED = 3, }; +struct gfs2_dinode_host { + u64 di_size; /* number of bytes in file */ + u64 di_blocks; /* number of blocks in file */ + u64 di_goal_meta; /* rgrp to alloc from next */ + u64 di_goal_data; /* data block goal */ + u64 di_generation; /* generation number for NFS */ + u32 di_flags; /* GFS2_DIF_... */ + u16 di_height; /* height of metadata */ + /* These only apply to directories */ + u16 di_depth; /* Number of bits in the table */ + u32 di_entries; /* The number of entries in the directory */ + u64 di_eattr; /* extended attribute block number */ +}; + struct gfs2_inode { struct inode i_inode; u64 i_no_addr; @@ -346,6 +379,12 @@ struct gfs2_jdesc { unsigned int jd_blocks; }; +struct gfs2_statfs_change_host { + s64 sc_total; + s64 sc_free; + s64 sc_dinodes; +}; + #define GFS2_GLOCKD_DEFAULT 1 #define GFS2_GLOCKD_MAX 16 @@ -418,6 +457,28 @@ enum { #define GFS2_FSNAME_LEN 256 +struct gfs2_inum_host { + u64 no_formal_ino; + u64 no_addr; +}; + +struct gfs2_sb_host { + u32 sb_magic; + u32 sb_type; + u32 sb_format; + + u32 sb_fs_format; + u32 sb_multihost_format; + u32 sb_bsize; + u32 sb_bsize_shift; + + struct gfs2_inum_host sb_master_dir; + struct gfs2_inum_host sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct super_block *sd_vfs_meta; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 58f5a67e1c35..a31a4b80ba3c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -38,6 +38,11 @@ #include "trans.h" #include "util.h" +struct gfs2_inum_range_host { + u64 ir_start; + u64 ir_length; +}; + static int iget_test(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); @@ -402,6 +407,22 @@ out: return inode ? inode : ERR_PTR(error); } +static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) +{ + const struct gfs2_inum_range *str = buf; + + ir->ir_start = be64_to_cpu(str->ir_start); + ir->ir_length = be64_to_cpu(str->ir_length); +} + +static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) +{ + struct gfs2_inum_range *str = buf; + + str->ir_start = cpu_to_be64(ir->ir_start); + str->ir_length = cpu_to_be64(ir->ir_length); +} + static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) { struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); @@ -741,7 +762,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_ri.ri_length + + al->al_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -1234,3 +1255,63 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) return error; } +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + const struct gfs2_dinode_host *di = &ip->i_di; + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.__pad0 = 0; + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_header.__pad1 = 0; + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(ip->i_inode.i_uid); + str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(di->di_size); + str->di_blocks = cpu_to_be64(di->di_blocks); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(di->di_goal_meta); + str->di_goal_data = cpu_to_be64(di->di_goal_data); + str->di_generation = cpu_to_be64(di->di_generation); + + str->di_flags = cpu_to_be32(di->di_flags); + str->di_height = cpu_to_be16(di->di_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(di->di_depth); + str->di_entries = cpu_to_be32(di->di_entries); + + str->di_eattr = cpu_to_be64(di->di_eattr); +} + +void gfs2_dinode_print(const struct gfs2_inode *ip) +{ + const struct gfs2_dinode_host *di = &ip->i_di; + + printk(KERN_INFO " no_formal_ino = %llu\n", + (unsigned long long)ip->i_no_formal_ino); + printk(KERN_INFO " no_addr = %llu\n", + (unsigned long long)ip->i_no_addr); + printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); + printk(KERN_INFO " di_blocks = %llu\n", + (unsigned long long)di->di_blocks); + printk(KERN_INFO " di_goal_meta = %llu\n", + (unsigned long long)di->di_goal_meta); + printk(KERN_INFO " di_goal_data = %llu\n", + (unsigned long long)di->di_goal_data); + printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); + printk(KERN_INFO " di_height = %u\n", di->di_height); + printk(KERN_INFO " di_depth = %u\n", di->di_depth); + printk(KERN_INFO " di_entries = %u\n", di->di_entries); + printk(KERN_INFO " di_eattr = %llu\n", + (unsigned long long)di->di_eattr); +} + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 05fc095d8540..35375fc43fa3 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -38,6 +38,14 @@ static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; } +static inline void gfs2_inum_out(const struct gfs2_inode *ip, + struct gfs2_dirent *dent) +{ + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); +} + + void gfs2_inode_attr_in(struct gfs2_inode *ip); struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned type); struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); @@ -59,6 +67,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); int gfs2_glock_nq_atime(struct gfs2_holder *gh); int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +void gfs2_dinode_print(const struct gfs2_inode *ip); #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c deleted file mode 100644 index a5b05ea3d4c7..000000000000 --- a/fs/gfs2/ondisk.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include - -#include "gfs2.h" -#include -#include -#include "incore.h" - -#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \ - struct->member); - -/* - * gfs2_xxx_in - read in an xxx struct - * first arg: the cpu-order structure - * buf: the disk-order buffer - * - * gfs2_xxx_out - write out an xxx struct - * first arg: the cpu-order structure - * buf: the disk-order buffer - * - * gfs2_xxx_print - print out an xxx struct - * first arg: the cpu-order structure - */ - -void gfs2_inum_out(const struct gfs2_inode *ip, struct gfs2_dirent *dent) -{ - dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); -} - -static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf) -{ - const struct gfs2_meta_header *str = buf; - - mh->mh_magic = be32_to_cpu(str->mh_magic); - mh->mh_type = be32_to_cpu(str->mh_type); - mh->mh_format = be32_to_cpu(str->mh_format); -} - -void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) -{ - const struct gfs2_sb *str = buf; - - gfs2_meta_header_in(&sb->sb_header, buf); - - sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); - sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); - sb->sb_bsize = be32_to_cpu(str->sb_bsize); - sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); - sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); - sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); - sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); - sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); - - memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); - memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); -} - -void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf) -{ - const struct gfs2_rindex *str = buf; - - ri->ri_addr = be64_to_cpu(str->ri_addr); - ri->ri_length = be32_to_cpu(str->ri_length); - ri->ri_data0 = be64_to_cpu(str->ri_data0); - ri->ri_data = be32_to_cpu(str->ri_data); - ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes); - -} - -void gfs2_rindex_print(const struct gfs2_rindex_host *ri) -{ - printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr); - pv(ri, ri_length, "%u"); - - printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0); - pv(ri, ri_data, "%u"); - - pv(ri, ri_bitbytes, "%u"); -} - -void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) -{ - const struct gfs2_rgrp *str = buf; - - rg->rg_flags = be32_to_cpu(str->rg_flags); - rg->rg_free = be32_to_cpu(str->rg_free); - rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); - rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); -} - -void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) -{ - struct gfs2_rgrp *str = buf; - - str->rg_flags = cpu_to_be32(rg->rg_flags); - str->rg_free = cpu_to_be32(rg->rg_free); - str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); - str->__pad = cpu_to_be32(0); - str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); - memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); -} - -void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf) -{ - const struct gfs2_quota *str = buf; - - qu->qu_limit = be64_to_cpu(str->qu_limit); - qu->qu_warn = be64_to_cpu(str->qu_warn); - qu->qu_value = be64_to_cpu(str->qu_value); -} - -void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) -{ - struct gfs2_quota *str = buf; - - str->qu_limit = cpu_to_be64(qu->qu_limit); - str->qu_warn = cpu_to_be64(qu->qu_warn); - str->qu_value = cpu_to_be64(qu->qu_value); - memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); -} - -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) -{ - const struct gfs2_dinode_host *di = &ip->i_di; - struct gfs2_dinode *str = buf; - - str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); - str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); - str->di_header.__pad0 = 0; - str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); - str->di_header.__pad1 = 0; - str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(ip->i_inode.i_uid); - str->di_gid = cpu_to_be32(ip->i_inode.i_gid); - str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(di->di_size); - str->di_blocks = cpu_to_be64(di->di_blocks); - str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); - - str->di_goal_meta = cpu_to_be64(di->di_goal_meta); - str->di_goal_data = cpu_to_be64(di->di_goal_data); - str->di_generation = cpu_to_be64(di->di_generation); - - str->di_flags = cpu_to_be32(di->di_flags); - str->di_height = cpu_to_be16(di->di_height); - str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && - !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? - GFS2_FORMAT_DE : 0); - str->di_depth = cpu_to_be16(di->di_depth); - str->di_entries = cpu_to_be32(di->di_entries); - - str->di_eattr = cpu_to_be64(di->di_eattr); -} - -void gfs2_dinode_print(const struct gfs2_inode *ip) -{ - const struct gfs2_dinode_host *di = &ip->i_di; - - printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)ip->i_no_formal_ino); - printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); - - printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); - printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks); - printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta); - printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data); - - pv(di, di_flags, "0x%.8X"); - pv(di, di_height, "%u"); - - pv(di, di_depth, "%u"); - pv(di, di_entries, "%u"); - - printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr); -} - -void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) -{ - const struct gfs2_log_header *str = buf; - - gfs2_meta_header_in(&lh->lh_header, buf); - lh->lh_sequence = be64_to_cpu(str->lh_sequence); - lh->lh_flags = be32_to_cpu(str->lh_flags); - lh->lh_tail = be32_to_cpu(str->lh_tail); - lh->lh_blkno = be32_to_cpu(str->lh_blkno); - lh->lh_hash = be32_to_cpu(str->lh_hash); -} - -void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) -{ - const struct gfs2_inum_range *str = buf; - - ir->ir_start = be64_to_cpu(str->ir_start); - ir->ir_length = be64_to_cpu(str->ir_length); -} - -void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) -{ - struct gfs2_inum_range *str = buf; - - str->ir_start = cpu_to_be64(ir->ir_start); - str->ir_length = cpu_to_be64(ir->ir_length); -} - -void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) -{ - const struct gfs2_statfs_change *str = buf; - - sc->sc_total = be64_to_cpu(str->sc_total); - sc->sc_free = be64_to_cpu(str->sc_free); - sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); -} - -void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) -{ - struct gfs2_statfs_change *str = buf; - - str->sc_total = cpu_to_be64(sc->sc_total); - str->sc_free = cpu_to_be64(sc->sc_free); - str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); -} - -void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = be32_to_cpu(str->qc_id); -} - diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 51a8a14deb29..d07230ee5fc0 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -22,10 +22,18 @@ #include "glops.h" #include "inode.h" #include "ops_dentry.h" -#include "ops_export.h" +#include "ops_fstype.h" #include "rgrp.h" #include "util.h" +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 10 + +struct gfs2_fh_obj { + struct gfs2_inum_host this; + u32 imode; +}; + static struct dentry *gfs2_decode_fh(struct super_block *sb, __u32 *p, int fh_len, diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h deleted file mode 100644 index f925a955b3b8..000000000000 --- a/fs/gfs2/ops_export.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_EXPORT_DOT_H__ -#define __OPS_EXPORT_DOT_H__ - -#define GFS2_SMALL_FH_SIZE 4 -#define GFS2_LARGE_FH_SIZE 10 - -extern struct export_operations gfs2_export_ops; -struct gfs2_fh_obj { - struct gfs2_inum_host this; - __u32 imode; -}; - -#endif /* __OPS_EXPORT_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c682371717ff..0443e255173d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -27,7 +27,6 @@ #include "inode.h" #include "lm.h" #include "mount.h" -#include "ops_export.h" #include "ops_fstype.h" #include "ops_super.h" #include "recovery.h" @@ -116,7 +115,6 @@ static void init_vfs(struct super_block *sb, unsigned noatime) static int init_names(struct gfs2_sbd *sdp, int silent) { - struct page *page; char *proto, *table; int error = 0; @@ -126,14 +124,9 @@ static int init_names(struct gfs2_sbd *sdp, int silent) /* Try to autodetect */ if (!proto[0] || !table[0]) { - struct gfs2_sb *sb; - page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); - if (!page) - return -ENOBUFS; - sb = kmap(page); - gfs2_sb_in(&sdp->sd_sb, sb); - kunmap(page); - __free_page(page); + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (error) + return error; error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); if (error) diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h index 7cc2c296271b..407029b3b2b3 100644 --- a/fs/gfs2/ops_fstype.h +++ b/fs/gfs2/ops_fstype.h @@ -14,5 +14,6 @@ extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; +extern struct export_operations gfs2_export_ops; #endif /* __OPS_FSTYPE_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f8ecfec4064b..919a661e4f79 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -206,7 +206,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_ri.ri_length + + al->al_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -711,7 +711,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_ri.ri_length + + al->al_rgd->rd_length + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index aa0dbd2aac1b..404b7cc9f8c4 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -66,7 +66,7 @@ static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length + + error = gfs2_trans_begin(sdp, al->al_rgd->rd_length + ind_blocks + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8a58815dea08..6e546ee8f3d4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -66,6 +66,18 @@ #define QUOTA_USER 1 #define QUOTA_GROUP 0 +struct gfs2_quota_host { + u64 qu_limit; + u64 qu_warn; + s64 qu_value; +}; + +struct gfs2_quota_change_host { + u64 qc_change; + u32 qc_flags; /* GFS2_QCF_... */ + u32 qc_id; +}; + static u64 qd2offset(struct gfs2_quota_data *qd) { u64 offset; @@ -561,6 +573,25 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) mutex_unlock(&sdp->sd_quota_mutex); } +static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf) +{ + const struct gfs2_quota *str = buf; + + qu->qu_limit = be64_to_cpu(str->qu_limit); + qu->qu_warn = be64_to_cpu(str->qu_warn); + qu->qu_value = be64_to_cpu(str->qu_value); +} + +static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) +{ + struct gfs2_quota *str = buf; + + str->qu_limit = cpu_to_be64(qu->qu_limit); + str->qu_warn = cpu_to_be64(qu->qu_warn); + str->qu_value = cpu_to_be64(qu->qu_value); + memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); +} + /** * gfs2_adjust_quota * @@ -694,7 +725,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; error = gfs2_trans_begin(sdp, - al->al_rgd->rd_ri.ri_length + + al->al_rgd->rd_length + num_qd * data_blocks + nalloc * ind_blocks + RES_DINODE + num_qd + @@ -1055,6 +1086,15 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) return error; } +static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) +{ + const struct gfs2_quota_change *str = buf; + + qc->qc_change = be64_to_cpu(str->qc_change); + qc->qc_flags = be32_to_cpu(str->qc_flags); + qc->qc_id = be32_to_cpu(str->qc_id); +} + int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 8bc182c7e2ef..5ada38c99a2c 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -116,6 +116,22 @@ void gfs2_revoke_clean(struct gfs2_sbd *sdp) } } +static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) +{ + const struct gfs2_log_header *str = buf; + + if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH)) + return 1; + + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); + return 0; +} + /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -147,12 +163,10 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, sizeof(u32)); hash = crc32_le(hash, (unsigned char const *)¬hing, sizeof(nothing)); hash ^= (u32)~0; - gfs2_log_header_in(&lh, bh->b_data); + error = gfs2_log_header_in(&lh, bh->b_data); brelse(bh); - if (lh.lh_header.mh_magic != GFS2_MAGIC || - lh.lh_header.mh_type != GFS2_METATYPE_LH || - lh.lh_blkno != blk || lh.lh_hash != hash) + if (error || lh.lh_blkno != blk || lh.lh_hash != hash) return 1; *head = lh; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 30eb428065c5..027f6ec5b0d9 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -204,7 +204,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_bitmap *bi = NULL; - u32 length = rgd->rd_ri.ri_length; + u32 length = rgd->rd_length; u32 count[4], tmp; int buf, x; @@ -227,7 +227,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) return; } - tmp = rgd->rd_ri.ri_data - + tmp = rgd->rd_data - rgd->rd_rg.rg_free - rgd->rd_rg.rg_dinodes; if (count[1] + count[2] != tmp) { @@ -253,10 +253,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } -static inline int rgrp_contains_block(struct gfs2_rindex_host *ri, u64 block) +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) { - u64 first = ri->ri_data0; - u64 last = first + ri->ri_data; + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; return first <= block && block < last; } @@ -275,7 +275,7 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) spin_lock(&sdp->sd_rindex_spin); list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(&rgd->rd_ri, blk)) { + if (rgrp_contains_block(rgd, blk)) { list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); spin_unlock(&sdp->sd_rindex_spin); return rgd; @@ -354,6 +354,15 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) mutex_unlock(&sdp->sd_rindex_mutex); } +static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) +{ + printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); + printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); + printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); +} + /** * gfs2_compute_bitstructs - Compute the bitmap sizes * @rgd: The resource group descriptor @@ -367,7 +376,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_bitmap *bi; - u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */ + u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */ u32 bytes_left, bytes; int x; @@ -378,7 +387,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) if (!rgd->rd_bits) return -ENOMEM; - bytes_left = rgd->rd_ri.ri_bitbytes; + bytes_left = rgd->rd_bitbytes; for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; @@ -399,14 +408,14 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) } else if (x + 1 == length) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_meta_header); - bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; /* other blocks */ } else { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); bi->bi_offset = sizeof(struct gfs2_meta_header); - bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; } @@ -418,9 +427,9 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) return -EIO; } bi = rgd->rd_bits + (length - 1); - if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) { + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) { if (gfs2_consist_rgrpd(rgd)) { - gfs2_rindex_print(&rgd->rd_ri); + gfs2_rindex_print(rgd); fs_err(sdp, "start=%u len=%u offset=%u\n", bi->bi_start, bi->bi_len, bi->bi_offset); } @@ -431,6 +440,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) } /** + * gfs2_ri_total - Total up the file system space, according to the rindex. * */ @@ -439,7 +449,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) u64 total_data = 0; struct inode *inode = sdp->sd_rindex; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_rindex_host ri; char buf[sizeof(struct gfs2_rindex)]; struct file_ra_state ra_state; int error, rgrps; @@ -455,13 +464,23 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) sizeof(struct gfs2_rindex)); if (error != sizeof(struct gfs2_rindex)) break; - gfs2_rindex_in(&ri, buf); - total_data += ri.ri_data; + total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); } mutex_unlock(&sdp->sd_rindex_mutex); return total_data; } +static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rindex *str = buf; + + rgd->rd_addr = be64_to_cpu(str->ri_addr); + rgd->rd_length = be32_to_cpu(str->ri_length); + rgd->rd_data0 = be64_to_cpu(str->ri_data0); + rgd->rd_data = be32_to_cpu(str->ri_data); + rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); +} + /** * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode @@ -500,12 +519,12 @@ static int read_rindex_entry(struct gfs2_inode *ip, list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - gfs2_rindex_in(&rgd->rd_ri, buf); + gfs2_rindex_in(rgd, buf); error = compute_bitstructs(rgd); if (error) return error; - error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, + error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) return error; @@ -626,6 +645,28 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) return error; } +static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + rg->rg_flags = be32_to_cpu(str->rg_flags); + rg->rg_free = be32_to_cpu(str->rg_free); + rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); + rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); +} + +static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) +{ + struct gfs2_rgrp *str = buf; + + str->rg_flags = cpu_to_be32(rg->rg_flags); + str->rg_free = cpu_to_be32(rg->rg_free); + str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + /** * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in @@ -640,7 +681,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; - unsigned int length = rgd->rd_ri.ri_length; + unsigned int length = rgd->rd_length; struct gfs2_bitmap *bi; unsigned int x, y; int error; @@ -658,7 +699,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh); + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); if (error) goto fail; } @@ -720,7 +761,7 @@ void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; - int x, length = rgd->rd_ri.ri_length; + int x, length = rgd->rd_length; spin_lock(&sdp->sd_rindex_spin); gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); @@ -743,7 +784,7 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_ri.ri_length; + unsigned int length = rgd->rd_length; unsigned int x; for (x = 0; x < length; x++) { @@ -826,7 +867,7 @@ static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, goto first; list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd->rd_ri.ri_addr == rglast) + if (rgd->rd_addr == rglast) goto out; } @@ -1037,7 +1078,7 @@ static int get_local_rgrp(struct gfs2_inode *ip) } out: - ip->i_last_rg_alloc = rgd->rd_ri.ri_addr; + ip->i_last_rg_alloc = rgd->rd_addr; if (begin) { recent_rgrp_add(rgd); @@ -1128,8 +1169,8 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) unsigned int buf; unsigned char type; - length = rgd->rd_ri.ri_length; - rgrp_block = block - rgd->rd_ri.ri_data0; + length = rgd->rd_length; + rgrp_block = block - rgd->rd_data0; for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; @@ -1171,7 +1212,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char old_state, unsigned char new_state) { struct gfs2_bitmap *bi = NULL; - u32 length = rgd->rd_ri.ri_length; + u32 length = rgd->rd_length; u32 blk = 0; unsigned int buf, x; @@ -1247,9 +1288,9 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, return NULL; } - length = rgd->rd_ri.ri_length; + length = rgd->rd_length; - rgrp_blk = bstart - rgd->rd_ri.ri_data0; + rgrp_blk = bstart - rgd->rd_data0; while (blen--) { for (buf = 0; buf < length; buf++) { @@ -1293,15 +1334,15 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip) u32 goal, blk; u64 block; - if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data)) - goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0; + if (rgrp_contains_block(rgd, ip->i_di.di_goal_data)) + goal = ip->i_di.di_goal_data - rgd->rd_data0; else goal = rgd->rd_last_alloc_data; blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); rgd->rd_last_alloc_data = blk; - block = rgd->rd_ri.ri_data0 + blk; + block = rgd->rd_data0 + blk; ip->i_di.di_goal_data = block; gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); @@ -1337,15 +1378,15 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip) u32 goal, blk; u64 block; - if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta)) - goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0; + if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta)) + goal = ip->i_di.di_goal_meta - rgd->rd_data0; else goal = rgd->rd_last_alloc_meta; blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); rgd->rd_last_alloc_meta = blk; - block = rgd->rd_ri.ri_data0 + blk; + block = rgd->rd_data0 + blk; ip->i_di.di_goal_meta = block; gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); @@ -1387,7 +1428,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) rgd->rd_last_alloc_meta = blk; - block = rgd->rd_ri.ri_data0 + blk; + block = rgd->rd_data0 + blk; gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); rgd->rd_rg.rg_free--; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index faccffd19907..f916b9740c75 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -95,8 +95,8 @@ int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) { unsigned int x; - if (sb->sb_header.mh_magic != GFS2_MAGIC || - sb->sb_header.mh_type != GFS2_METATYPE_SB) { + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { if (!silent) printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); return -EINVAL; @@ -174,10 +174,31 @@ static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error) return 0; } +static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) +{ + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); +} + /** * gfs2_read_super - Read the gfs2 super block from disk - * @sb: The VFS super block + * @sdp: The GFS2 super block * @sector: The location of the super block + * @error: The error code to return * * This uses the bio functions to read the super block from disk * because we want to be 100% sure that we never read cached data. @@ -189,17 +210,19 @@ static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error) * the master directory (contains pointers to journals etc) and the * root directory. * - * Returns: A page containing the sb or NULL + * Returns: 0 on success or error */ -struct page *gfs2_read_super(struct super_block *sb, sector_t sector) +int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) { + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; struct page *page; struct bio *bio; page = alloc_page(GFP_KERNEL); if (unlikely(!page)) - return NULL; + return -ENOBUFS; ClearPageUptodate(page); ClearPageDirty(page); @@ -208,7 +231,7 @@ struct page *gfs2_read_super(struct super_block *sb, sector_t sector) bio = bio_alloc(GFP_KERNEL, 1); if (unlikely(!bio)) { __free_page(page); - return NULL; + return -ENOBUFS; } bio->bi_sector = sector * (sb->s_blocksize >> 9); @@ -222,9 +245,13 @@ struct page *gfs2_read_super(struct super_block *sb, sector_t sector) bio_put(bio); if (!PageUptodate(page)) { __free_page(page); - return NULL; + return -EIO; } - return page; + p = kmap(page); + gfs2_sb_in(&sdp->sd_sb, p); + kunmap(page); + __free_page(page); + return 0; } /** @@ -241,19 +268,13 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) u32 tmp_blocks; unsigned int x; int error; - struct page *page; - char *sb; - page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); - if (!page) { + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (error) { if (!silent) fs_err(sdp, "can't read superblock\n"); - return -EIO; + return error; } - sb = kmap(page); - gfs2_sb_in(&sdp->sd_sb, sb); - kunmap(page); - __free_page(page); error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); if (error) @@ -593,6 +614,24 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } +static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free = be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + int gfs2_statfs_init(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -772,7 +811,7 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, struct gfs2_statfs_change_host *sc) { gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_ri.ri_data; + sc->sc_total += rgd->rd_data; sc->sc_free += rgd->rd_rg.rg_free; sc->sc_dinodes += rgd->rd_rg.rg_dinodes; return 0; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index e590b2df11dc..60a870e430be 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -16,7 +16,7 @@ void gfs2_tune_init(struct gfs2_tune *gt); int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); -struct page *gfs2_read_super(struct super_block *sb, sector_t sector); +int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 3f5edc54e80a..424a0774eda8 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -137,7 +137,7 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, "GFS2: fsid=%s: RG = %llu\n" "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr, + sdp->sd_fsname, (unsigned long long)rgd->rd_addr, sdp->sd_fsname, function, file, line); return rv; } -- cgit v1.2.1 From 4bd91ba18198eee42c39d4c334c825d1a0a4b445 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 5 Jun 2007 09:39:18 +0100 Subject: [GFS2] Add nanosecond timestamp feature This adds a nanosecond timestamp feature to the GFS2 filesystem. Due to the way that the on-disk format works, older filesystems will just appear to have this field set to zero. When mounted by an older version of GFS2, the filesystem will simply ignore the extra fields so that it will again appear to have whole second resolution, so that its trivially backward compatible. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 10 +++++----- fs/gfs2/dir.c | 10 +++++----- fs/gfs2/eattr.c | 8 ++++---- fs/gfs2/inode.c | 30 +++++++++++++++++++----------- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/ops_inode.c | 2 +- 6 files changed, 35 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index b784cf3c6482..d16044cb023a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -772,7 +772,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_free_data(ip, bstart, blen); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -847,7 +847,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) } ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) @@ -958,7 +958,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (gfs2_is_stuffed(ip)) { ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); @@ -970,7 +970,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (!error) { ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1043,7 +1043,7 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2f154049b59d..f793e31a050e 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_di.di_size < offset + size) ip->i_di.di_size = offset + size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -228,7 +228,7 @@ out: if (ip->i_di.di_size < offset + copied) ip->i_di.di_size = offset + copied; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1622,7 +1622,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, break; gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_di.di_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; @@ -1708,7 +1708,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) gfs2_consist_inode(dip); gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_di.di_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(dip, bh->b_data); brelse(bh); mark_inode_dirty(&dip->i_inode); @@ -1756,7 +1756,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_trans_add_bh(dip->i_gl, bh, 1); } - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(dip, bh->b_data); brelse(bh); return 0; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 40e1d37112e6..2a7435b5c4dc 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -300,7 +300,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -717,7 +717,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -852,7 +852,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1133,7 +1133,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a31a4b80ba3c..3ef0f051d076 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -178,11 +178,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) di->di_blocks = be64_to_cpu(str->di_blocks); gfs2_set_inode_blocks(&ip->i_inode); ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); - ip->i_inode.i_atime.tv_nsec = 0; + ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - ip->i_inode.i_mtime.tv_nsec = 0; + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - ip->i_inode.i_ctime.tv_nsec = 0; + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); di->di_goal_meta = be64_to_cpu(str->di_goal_meta); di->di_goal_data = be64_to_cpu(str->di_goal_data); @@ -317,7 +317,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff) else drop_nlink(&ip->i_inode); - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -648,6 +648,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; + struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(gl, inum->no_addr); gfs2_trans_add_bh(gl, dibh, 1); @@ -663,7 +664,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_nlink = 0; di->di_size = 0; di->di_blocks = cpu_to_be64(1); - di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds()); + di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); di->di_major = cpu_to_be32(MAJOR(dev)); di->di_minor = cpu_to_be32(MINOR(dev)); di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); @@ -693,6 +694,9 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_entries = 0; memset(&di->__pad4, 0, sizeof(di->__pad4)); di->di_eattr = 0; + di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); brelse(dibh); @@ -1135,10 +1139,11 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = gl->gl_object; - s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum); + s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum); unsigned int state; int flags; int error; + struct timespec tv = CURRENT_TIME; if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || @@ -1156,8 +1161,7 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh) (sdp->sd_vfs->s_flags & MS_RDONLY)) return 0; - curtime = get_seconds(); - if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) { + if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { gfs2_glock_dq(gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, gh); @@ -1168,8 +1172,8 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh) /* Verify that atime hasn't been updated while we were trying to get exclusive lock. */ - curtime = get_seconds(); - if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) { + tv = CURRENT_TIME; + if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { struct buffer_head *dibh; struct gfs2_dinode *di; @@ -1183,11 +1187,12 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh) if (error) goto fail_end_trans; - ip->i_inode.i_atime.tv_sec = curtime; + ip->i_inode.i_atime = tv; gfs2_trans_add_bh(ip->i_gl, dibh, 1); di = (struct gfs2_dinode *)dibh->b_data; di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); brelse(dibh); gfs2_trans_end(sdp); @@ -1290,6 +1295,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_entries = cpu_to_be32(di->di_entries); str->di_eattr = cpu_to_be64(di->di_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); } void gfs2_dinode_print(const struct gfs2_inode *ip) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0443e255173d..b46727275e58 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -104,6 +104,7 @@ static void init_vfs(struct super_block *sb, unsigned noatime) sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; + sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 919a661e4f79..9cc07f442ef5 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -757,7 +757,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out_end_trans; - ip->i_inode.i_ctime = CURRENT_TIME_SEC; + ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -- cgit v1.2.1 From 292e539e9386823df8aab556f3da09667f78da8c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Jun 2007 17:36:38 -0400 Subject: [DLM] fix reference counting This is a fix for the patch 021d2ff3a08019260a1dc002793c92d6bf18afb6 I left off a dlm_hold_rsb which causes the box to panic if you try to use debugfs. This patch fixes the problem. Sorry about that, Signed-off-by: Josef Bacik Signed-off-by: Steven Whitehouse --- fs/dlm/debug_fs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 184be9870c63..9f5de37706a9 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -250,6 +250,7 @@ static int rsb_iter_next(struct rsb_iter *ri) goto top; } ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); + dlm_hold_rsb(ri->rsb); read_unlock(&ls->ls_rsbtbl[i].lock); dlm_put_rsb(old); } -- cgit v1.2.1 From 44f487a5536a3afd96a9f571de24c36559e9ae82 Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 6 Jun 2007 09:21:22 -0500 Subject: [DLM] variable allocation Add a new flag, DLM_LSFL_FS, to be used when a file system creates a lockspace. This flag causes the dlm to use GFP_NOFS for allocations instead of GFP_KERNEL. (This updated version of the patch uses gfp_t for ls_allocation.) Signed-Off-By: Patrick Caulfield Signed-Off-By: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/dlm_internal.h | 1 + fs/dlm/lock.c | 2 +- fs/dlm/lockspace.c | 5 +++++ fs/dlm/rcom.c | 9 +++++---- fs/gfs2/locking/dlm/mount.c | 2 +- 5 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index f2c85493c0c6..8ac081882c78 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -463,6 +463,7 @@ struct dlm_ls { int ls_low_nodeid; int ls_total_weight; int *ls_node_array; + gfp_t ls_allocation; struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index de943afacb37..b455919c1998 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2594,7 +2594,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into lowcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); if (!mh) return -ENOBUFS; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c8f0c15ac166..6802653473d1 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -444,6 +444,11 @@ static int new_lockspace(char *name, int namelen, void **lockspace, set_bit(LSFL_TIMEWARN, &ls->ls_flags); ls->ls_exflags = (flags & ~DLM_LSFL_TIMEWARN); + if (flags & DLM_LSFL_FS) + ls->ls_allocation = GFP_NOFS; + else + ls->ls_allocation = GFP_KERNEL; + size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f71c23542f0f..e3a1527cbdbe 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, char *mb; int mb_len = sizeof(struct dlm_rcom) + len; - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); if (!mh) { log_print("create_rcom to %d type %d len %d ENOBUFS", to_nodeid, type, len); @@ -386,7 +386,8 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_recover_process_copy(ls, rc_in); } -static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +static int send_ls_not_ready(struct dlm_ls *ls, int nodeid, + struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct rcom_config *rf; @@ -394,7 +395,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) char *mb; int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); - mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb); + mh = dlm_lowcomms_get_buffer(nodeid, mb_len, ls->ls_allocation, &mb); if (!mh) return -ENOBUFS; memset(mb, 0, mb_len); @@ -464,7 +465,7 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) log_print("lockspace %x from %d type %x not found", hd->h_lockspace, nodeid, rc->rc_type); if (rc->rc_type == DLM_RCOM_STATUS) - send_ls_not_ready(nodeid, rc); + send_ls_not_ready(ls, nodeid, rc); return; } diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 1d8faa3da8af..41c5b04caaba 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -147,7 +147,7 @@ static int gdlm_mount(char *table_name, char *host_data, error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), &ls->dlm_lockspace, - nodir ? DLM_LSFL_NODIR : 0, + DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), GDLM_LVB_SIZE); if (error) { log_error("dlm_new_lockspace error %d", error); -- cgit v1.2.1 From ffed8ab342e39b8b5f4d5c94c37a708e225ffcd8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 7 Jun 2007 11:29:35 +0100 Subject: [GFS2] Fix typo in rename of directories A typo caused us to pass a NULL pointer when renaming directories. It was accidentally introduced in: [GFS2] Clean up inode number handling Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 9cc07f442ef5..84051b997a43 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -749,7 +749,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_mvino(ip, &name, nip, DT_DIR); + error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); if (error) goto out_end_trans; } else { -- cgit v1.2.1 From e1cc86037b689a82cdb2df50c32fa8cf9d6b6c3a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 7 Jun 2007 11:47:52 +0100 Subject: [GFS2] Fix bug in error path of inode This fixes a bug in the ordering of operations in the error path of createi. Its not valid to do an iput() when holding the inode's glock since the iput() will (in this case) result in delete_inode() being called which needs to grab the lock itself. This was causing the recursive lock checking code to trigger. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3ef0f051d076..87505f7eb745 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -857,7 +857,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev) { - struct inode *inode; + struct inode *inode = NULL; struct gfs2_inode *dip = ghs->gh_gl->gl_object; struct inode *dir = &dip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); @@ -900,28 +900,28 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, error = gfs2_inode_refresh(GFS2_I(inode)); if (error) - goto fail_iput; + goto fail_gunlock2; error = gfs2_acl_create(dip, GFS2_I(inode)); if (error) - goto fail_iput; + goto fail_gunlock2; error = gfs2_security_init(dip, GFS2_I(inode)); if (error) - goto fail_iput; + goto fail_gunlock2; error = link_dinode(dip, name, GFS2_I(inode)); if (error) - goto fail_iput; + goto fail_gunlock2; if (!inode) return ERR_PTR(-ENOMEM); return inode; -fail_iput: - iput(inode); fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); + if (inode) + iput(inode); fail_gunlock: gfs2_glock_dq(ghs); fail: -- cgit v1.2.1 From b35997d4482ed24b43a5951c5b021d224b24293c Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Thu, 7 Jun 2007 09:10:01 -0500 Subject: [GFS2] Can't mount GFS2 file system on AoE device This patch fixes bug 243131: Can't mount GFS2 file system on AoE device. When using AoE devices with lock_nolock, there is no locking table, so gfs2 (and gfs1) uses the superblock s_id. This turns out to be the device name in some cases. In the case of AoE, the device contains a slash, (e.g. "etherd/e1.1p2") which is an invalid character when we try to register the table in sysfs. This patch replaces the "/" with underscore. Rather than add a new variable to the stack, I'm just reusing a (char *) variable that's no longer used: table. This code has been tested on the failing system using a RHEL5 patch. The upstream code was tested by using gfs2_tool sb to interject a "/" into the table name of a clustered gfs2 file system. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b46727275e58..dae1d7142fe5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -145,6 +145,9 @@ static int init_names(struct gfs2_sbd *sdp, int silent) snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); + while ((table = strchr(sdp->sd_table_name, '/'))) + *table = '_'; + out: return error; } -- cgit v1.2.1 From c8cdf479377462315d6b4f56379f8ac989b0ef29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 8 Jun 2007 10:05:33 +0100 Subject: [GFS2] Recovery for lost unlinked inodes Under certain circumstances its possible (though rather unlikely) that inodes which were unlinked by one node while still open on another might get "lost" in the sense that they don't get deallocated if the node which held the inode open crashed before it was unlinked. This patch adds the recovery code which allows automatic deallocation of the inode if its found during block allocation (the sensible time to look for such inodes since we are scanning the rgrp's bitmaps anyway at this time, so it adds no overhead to do this). Since the inode will have had its i_nlink set to zero, all we need to trigger recovery is a lookup and an iput(), and the normal deallocation code takes care of the rest. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 ++ fs/gfs2/inode.c | 50 ++++++++++++++++++++++---------- fs/gfs2/rgrp.c | 87 +++++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 107 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e5069b912d5e..c7c6ec0f17c7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -95,6 +95,8 @@ struct gfs2_rgrpd { u32 rd_last_alloc_data; u32 rd_last_alloc_meta; struct gfs2_sbd *rd_sbd; + unsigned long rd_flags; +#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */ }; enum gfs2_state_bits { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 87505f7eb745..cacdb0dfe577 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -98,22 +98,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned in if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - umode_t mode = DT2IF(type); + umode_t mode; inode->i_private = ip; - inode->i_mode = mode; - - if (S_ISREG(mode)) { - inode->i_op = &gfs2_file_iops; - inode->i_fop = &gfs2_file_fops; - inode->i_mapping->a_ops = &gfs2_file_aops; - } else if (S_ISDIR(mode)) { - inode->i_op = &gfs2_dir_iops; - inode->i_fop = &gfs2_dir_fops; - } else if (S_ISLNK(mode)) { - inode->i_op = &gfs2_symlink_iops; - } else { - inode->i_op = &gfs2_dev_iops; - } error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) @@ -130,10 +116,44 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned in goto fail_iopen; gfs2_glock_put(io_gl); + + /* + * We must read the inode in order to work out its type in + * this case. Note that this doesn't happen often as we normally + * know the type beforehand. This code path only occurs during + * unlinked inode recovery (where it is safe to do this glock, + * which is not true in the general case). + */ + inode->i_mode = mode = DT2IF(type); + if (type == DT_UNKNOWN) { + struct gfs2_holder gh; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (unlikely(error)) + goto fail_glock; + /* Inode is now uptodate */ + mode = inode->i_mode; + gfs2_glock_dq_uninit(&gh); + } + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + inode->i_fop = &gfs2_file_fops; + inode->i_mapping->a_ops = &gfs2_file_aops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_dev_iops; + } + unlock_new_inode(inode); } return inode; +fail_glock: + gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: gfs2_glock_put(io_gl); fail_put: diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 027f6ec5b0d9..fd3fd9074779 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -28,6 +28,7 @@ #include "ops_file.h" #include "util.h" #include "log.h" +#include "inode.h" #define BFITNOENT ((u32)~0) @@ -50,6 +51,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state); + /** * gfs2_setbit - Set a bit in the bitmaps * @buffer: the buffer that holds the bitmaps @@ -531,6 +535,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + rgd->rd_flags |= GFS2_RDF_CHECK; return error; } @@ -845,6 +850,37 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) return ret; } +/** + * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes + * @rgd: The rgrp + * + * Returns: The inode, if one has been found + */ + +static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) +{ + struct inode *inode; + u32 goal = 0; + u64 ino; + + for(;;) { + goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, + GFS2_BLKST_UNLINKED); + if (goal == 0) + return 0; + ino = goal + rgd->rd_data0; + if (ino <= *last_unlinked) + continue; + *last_unlinked = ino; + inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, ino, DT_UNKNOWN); + if (!IS_ERR(inode)) + return inode; + } + + rgd->rd_flags &= ~GFS2_RDF_CHECK; + return NULL; +} + /** * recent_rgrp_first - get first RG from "recent" list * @sdp: The GFS2 superblock @@ -1006,8 +1042,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Returns: errno */ -static int get_local_rgrp(struct gfs2_inode *ip) +static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { + struct inode *inode = NULL; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = &ip->i_alloc; @@ -1027,7 +1064,11 @@ static int get_local_rgrp(struct gfs2_inode *ip) case 0: if (try_rgrp_fit(rgd, al)) goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + inode = try_rgrp_unlink(rgd, last_unlinked); gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (inode) + return inode; rgd = recent_rgrp_next(rgd, 1); break; @@ -1036,7 +1077,7 @@ static int get_local_rgrp(struct gfs2_inode *ip) break; default: - return error; + return ERR_PTR(error); } } @@ -1051,7 +1092,11 @@ static int get_local_rgrp(struct gfs2_inode *ip) case 0: if (try_rgrp_fit(rgd, al)) goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + inode = try_rgrp_unlink(rgd, last_unlinked); gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (inode) + return inode; break; case GLR_TRYFAILED: @@ -1059,7 +1104,7 @@ static int get_local_rgrp(struct gfs2_inode *ip) break; default: - return error; + return ERR_PTR(error); } rgd = gfs2_rgrpd_get_next(rgd); @@ -1068,7 +1113,7 @@ static int get_local_rgrp(struct gfs2_inode *ip) if (rgd == begin) { if (++loops >= 3) - return -ENOSPC; + return ERR_PTR(-ENOSPC); if (!skipped) loops++; flags = 0; @@ -1088,7 +1133,7 @@ out: forward_rgrp_set(sdp, rgd); } - return 0; + return NULL; } /** @@ -1102,11 +1147,14 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = &ip->i_alloc; + struct inode *inode; int error = 0; + u64 last_unlinked = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; +try_again: /* We need to hold the rindex unless the inode we're using is the rindex itself, in which case it's already held. */ if (ip != GFS2_I(sdp->sd_rindex)) @@ -1117,11 +1165,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) if (error) return error; - error = get_local_rgrp(ip); - if (error) { + inode = get_local_rgrp(ip, &last_unlinked); + if (inode) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - return error; + if (IS_ERR(inode)) + return PTR_ERR(inode); + iput(inode); + gfs2_log_flush(sdp, NULL); + goto try_again; } al->al_file = file; @@ -1209,7 +1261,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) */ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state) + unsigned char old_state, unsigned char new_state) { struct gfs2_bitmap *bi = NULL; u32 length = rgd->rd_length; @@ -1250,17 +1302,18 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, goal = 0; } - if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length)) - blk = 0; + if (old_state != new_state) { + gfs2_assert_withdraw(rgd->rd_sbd, blk != BFITNOENT); - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, blk, new_state); - if (bi->bi_clone) - gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len, blk, new_state); + if (bi->bi_clone) + gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, + bi->bi_len, blk, new_state); + } - return bi->bi_start * GFS2_NBBY + blk; + return (blk == BFITNOENT) ? 0 : (bi->bi_start * GFS2_NBBY) + blk; } /** -- cgit v1.2.1 From 037bcbb7564e35aef937c54799550cd27735aac6 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Fri, 8 Jun 2007 16:42:14 -0700 Subject: [GFS2] gfs2_lookupi() uninitialised var fix fs/gfs2/inode.c: In function 'gfs2_lookupi': fs/gfs2/inode.c:392: warning: 'error' may be used uninitialized in this function Looks like a real bug to me. Cc: Steven Whitehouse Signed-off-by: Andrew Morton Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index cacdb0dfe577..366235d6a5b5 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -389,7 +389,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct super_block *sb = dir->i_sb; struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_holder d_gh; - int error; + int error = 0; struct inode *inode = NULL; int unlock = 0; -- cgit v1.2.1 From d88101d4d82ea09433692af30618c3b7afb7da02 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 8 Jun 2007 16:00:22 -0500 Subject: [GFS2] set plock owner in GETLK info Set the owner field in the plock info sent to userspace for GETLK. Without this, gfs_controld won't correctly see when the GETLK from a process matches one of the process's existing locks. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/plock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c index f82495e18c2d..1dc76805dd45 100644 --- a/fs/gfs2/locking/dlm/plock.c +++ b/fs/gfs2/locking/dlm/plock.c @@ -242,7 +242,7 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name, op->info.number = name->ln_number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - + op->info.owner = (__u64)(long) fl->fl_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); -- cgit v1.2.1 From a7a2ff8a951ab373732116e7c31e2e1fe025d5e0 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 8 Jun 2007 17:01:40 -0500 Subject: [GFS2] return conflicts for GETLK We weren't returning the correct result when GETLK found a conflict, which is indicated by userspace passing back a 1. Signed-off-by: Abhijith Das Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/plock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c index 1dc76805dd45..fba1f1d87e4f 100644 --- a/fs/gfs2/locking/dlm/plock.c +++ b/fs/gfs2/locking/dlm/plock.c @@ -254,16 +254,20 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name, } spin_unlock(&ops_lock); + /* info.rv from userspace is 1 for conflict, 0 for no-conflict, + -ENOENT if there are no locks on the file */ + rv = op->info.rv; fl->fl_type = F_UNLCK; if (rv == -ENOENT) rv = 0; - else if (rv == 0 && op->info.pid != fl->fl_pid) { + else if (rv > 0) { fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; fl->fl_pid = op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; + rv = 0; } kfree(op); -- cgit v1.2.1 From d93cfa9884354dac2d8ccd894594a43e0b962b6f Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Mon, 11 Jun 2007 08:22:32 +0100 Subject: [GFS2] Fix deallocation issues There were two issues during deallocation of unlinked inodes. The first was relating to the use of a "try" lock which in the case of the inode lock wasn't trying hard enough to deallocate in all circumstances (now changed to a normal glock) and in the case of the iopen lock didn't wait for the demotion of the shared lock before attempting to get the exclusive lock, and thereby sometimes (timing dependent) not completing the deallocation when it should have done. The second issue related to the lack of a way to invalidate dcache entries on remote nodes (now fixed by this patch) which meant that unlinks were taking a long time to return disk space to the fs. By adding some code to invalidate the dcache entries across the cluster for unlinked inodes, that is now fixed. This patch was written jointly by Abhijith Das and Steven Whitehouse. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- fs/gfs2/glock.h | 1 + fs/gfs2/inode.c | 1 + fs/gfs2/ops_super.c | 8 +++++--- 4 files changed, 48 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b3ed58551e74..384cae623ed3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -422,11 +422,11 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb(); + smp_mb__after_clear_bit(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); } -static int holder_wait(void *word) +static int just_schedule(void *word) { schedule(); return 0; @@ -435,7 +435,20 @@ static int holder_wait(void *word) static void wait_on_holder(struct gfs2_holder *gh) { might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +static void wait_on_demote(struct gfs2_glock *gl) +{ + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); } /** @@ -528,7 +541,7 @@ static int rq_demote(struct gfs2_glock *gl) if (gl->gl_state == gl->gl_demote_state || gl->gl_state == LM_ST_UNLOCKED) { - clear_bit(GLF_DEMOTE, &gl->gl_flags); + gfs2_demote_wake(gl); return 0; } set_bit(GLF_LOCK, &gl->gl_flags); @@ -666,12 +679,22 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl) * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ -static void handle_callback(struct gfs2_glock *gl, unsigned int state) +static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote) { spin_lock(&gl->gl_spin); if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; + if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && + gl->gl_object) { + struct inode *inode = igrab(gl->gl_object); + spin_unlock(&gl->gl_spin); + if (inode) { + d_prune_aliases(inode); + iput(inode); + } + return; + } } else if (gl->gl_demote_state != LM_ST_UNLOCKED) { gl->gl_demote_state = state; } @@ -740,7 +763,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) if (ret & LM_OUT_CANCELED) op_done = 0; else - clear_bit(GLF_DEMOTE, &gl->gl_flags); + gfs2_demote_wake(gl); } else { spin_lock(&gl->gl_spin); list_del_init(&gh->gh_list); @@ -848,7 +871,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) gfs2_assert_warn(sdp, !ret); state_change(gl, LM_ST_UNLOCKED); - clear_bit(GLF_DEMOTE, &gl->gl_flags); + gfs2_demote_wake(gl); if (glops->go_inval) glops->go_inval(gl, DIO_METADATA); @@ -1174,7 +1197,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) const struct gfs2_glock_operations *glops = gl->gl_ops; if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED); + handle_callback(gl, LM_ST_UNLOCKED, 0); gfs2_glmutex_lock(gl); @@ -1196,6 +1219,13 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_unlock(&gl->gl_spin); } +void gfs2_glock_dq_wait(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + gfs2_glock_dq(gh); + wait_on_demote(gl); +} + /** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure @@ -1456,7 +1486,7 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, if (!gl) return; - handle_callback(gl, state); + handle_callback(gl, state, 1); spin_lock(&gl->gl_spin); run_queue(gl); @@ -1596,7 +1626,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) if (gfs2_glmutex_trylock(gl)) { if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED); + handle_callback(gl, LM_ST_UNLOCKED, 0); gfs2_glmutex_unlock(gl); } @@ -1709,7 +1739,7 @@ static void clear_glock(struct gfs2_glock *gl) if (gfs2_glmutex_trylock(gl)) { if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED); + handle_callback(gl, LM_ST_UNLOCKED, 0); gfs2_glmutex_unlock(gl); } } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b3e152db70c8..7721ca3fff9e 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -87,6 +87,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh); int gfs2_glock_poll(struct gfs2_holder *gh); int gfs2_glock_wait(struct gfs2_holder *gh); void gfs2_glock_dq(struct gfs2_holder *gh); +void gfs2_glock_dq_wait(struct gfs2_holder *gh); void gfs2_glock_dq_uninit(struct gfs2_holder *gh); int gfs2_glock_nq_num(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 366235d6a5b5..792d64f69cc2 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -114,6 +114,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned in error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_iopen; + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 485ce3d49923..603d940f1159 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -326,8 +326,10 @@ static void gfs2_clear_inode(struct inode *inode) gfs2_glock_schedule_for_reclaim(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } } } @@ -422,13 +424,13 @@ static void gfs2_delete_inode(struct inode *inode) if (!inode->i_private) goto out; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &gh); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (unlikely(error)) { gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } - gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); if (error) -- cgit v1.2.1 From fad59c1390045b5adb7c7249ec4e77e0f868aca5 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 11 Jun 2007 10:47:18 -0500 Subject: [DLM] don't require FS flag on all nodes Mask off the recently added DLM_LSFL_FS flag when setting the exflags. This way all the nodes in the lockspace aren't required to have the FS flag set, since we later check that exflags matches among all nodes. Signed-off-by: Patrick Caulfield Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/lockspace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6802653473d1..1dc72105ab12 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -438,17 +438,18 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_count = 0; ls->ls_flags = 0; - /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have TIMEWARN active */ if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); - ls->ls_exflags = (flags & ~DLM_LSFL_TIMEWARN); if (flags & DLM_LSFL_FS) ls->ls_allocation = GFP_NOFS; else ls->ls_allocation = GFP_KERNEL; + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have TIMEWARN or FS set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS)); + size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; -- cgit v1.2.1 From 8fb68595d508fd30ec90939572484b263600376c Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Tue, 12 Jun 2007 11:24:36 -0500 Subject: [GFS2] Journaled file write/unstuff bug This patch is for bugzilla bug 283162, which uncovered a number of bugs pertaining to writing to files that have the journaled bit on. These bugs happen most often when writing to the meta_fs because the files are always journaled. So operations like gfs2_grow were particularly vulnerable, although many of the problems could be recreated with normal files after setting the journaled bit on. The problems fixed are: -GFS2 wasn't ever writing unstuffed journaled data blocks to their in-place location on disk. Now it does. -If you unmounted too quickly after doing IO to a journaled file, GFS2 was crashing because you would discard a buffer whose bufdata was still on the active items list. GFS2 now deals with this gracefully. -GFS2 was losing track of the bufdata for journaled data blocks, and it wasn't getting freed, causing an error when you tried to unmount the module. GFS2 now frees all the bufdata structures. -There was a memory corruption occurring because GFS2 wrote twice as many log entries for journaled buffers. -It was occasionally trying to write journal headers in buffers that weren't currently mapped. Signed-off-by: Bob Peterson Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 15 ++++++++++++++- fs/gfs2/lops.c | 4 +++- fs/gfs2/ops_address.c | 26 +++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 1fb846fc545e..fbdc0dc9923e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -83,6 +83,11 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) gfs2_assert(sdp, bd->bd_ail == ai); + if (!bh){ + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } + if (!buffer_busy(bh)) { if (!buffer_uptodate(bh)) { gfs2_log_unlock(sdp); @@ -125,6 +130,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl bd_ail_st_list) { bh = bd->bd_bh; + if (!bh){ + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } + gfs2_assert(sdp, bd->bd_ail == ai); if (buffer_busy(bh)) { @@ -227,7 +237,10 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) list_del(&bd->bd_ail_st_list); list_del(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); - brelse(bd->bd_bh); + if (bd->bd_bh) + brelse(bd->bd_bh); + else + kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3e971f25120d..df6bceea379a 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -607,7 +607,8 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) if (unlikely(magic != 0)) set_buffer_escaped(bh1); gfs2_log_lock(sdp); - if (n++ > num) + n += 2; + if (n >= num) break; } else if (!bh1) { total_dbuf--; @@ -624,6 +625,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); if (bh) { + set_buffer_mapped(bh); set_buffer_dirty(bh); ll_rw_block(WRITE, 1, &bh); bh = NULL; diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index ac5659521386..9ab35a9ee75a 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -137,7 +137,9 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) return 0; /* don't care */ } - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) { + if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) && + PageChecked(page)) { + ClearPageChecked(page); error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); if (error) goto out_ignore; @@ -573,6 +575,23 @@ fail_nounlock: return error; } +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + SetPageChecked(page); + return __set_page_dirty_buffers(page); +} + /** * gfs2_bmap - Block map function * @mapping: Address space info @@ -609,6 +628,8 @@ static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd) { bd->bd_bh = NULL; bh->b_private = NULL; + if (!bd->bd_ail && list_empty(&bd->bd_le.le_list)) + kmem_cache_free(gfs2_bufdata_cachep, bd); } gfs2_log_unlock(sdp); @@ -629,6 +650,8 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset) unsigned int curr_off = 0; BUG_ON(!PageLocked(page)); + if (offset == 0) + ClearPageChecked(page); if (!page_has_buffers(page)) return; @@ -841,6 +864,7 @@ const struct address_space_operations gfs2_file_aops = { .sync_page = block_sync_page, .prepare_write = gfs2_prepare_write, .commit_write = gfs2_commit_write, + .set_page_dirty = gfs2_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, -- cgit v1.2.1 From c4201214cbf10636e2c1ab9131573f735b42c8d4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 14 Jun 2007 16:39:13 +0100 Subject: [GFS2] Remove bogus '\0' in rgrp.c Not sure how it slipped in, but we don't want it anyway. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fd3fd9074779..36c523d487a7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -444,7 +444,6 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) } /** - * gfs2_ri_total - Total up the file system space, according to the rindex. * */ -- cgit v1.2.1 From 2840501ac822c5bf712f67b4b02640e16e145a29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 18 Jun 2007 16:31:42 +0100 Subject: [GFS2] Use zero_user_page() in stuffed_readpage() As suggested by Robert P. J. Day Signed-off-by: Steven Whitehouse Cc: Robert P. J. Day --- fs/gfs2/ops_address.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 9ab35a9ee75a..26c888890c24 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -208,11 +208,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * so we need to supply one here. It doesn't happen often. */ if (unlikely(page->index)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, PAGE_CACHE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); - SetPageUptodate(page); + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); return 0; } -- cgit v1.2.1 From 2332c4435bb733b5cd4f612ee57532bd8fde4c1c Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Mon, 18 Jun 2007 14:50:20 -0500 Subject: [GFS2] assertion failure after writing to journaled file, umount This patch passes all my nasty tests that were causing the code to fail under one circumstance or another. Here is a complete summary of all changes from today's git tree, in order of appearance: 1. There are now separate variables for metadata buffer accounting. 2. Variable sd_log_num_hdrs is no longer needed, since the header accounting is taken care of by the reserve/refund sequence. 3. Fixed a tiny grammatical problem in a comment. 4. Added a new function "calc_reserved" to calculate the reserved log space. This isn't entirely necessary, but it has two benefits: First, it simplifies the gfs2_log_refund function greatly. Second, it allows for easier debugging because I could sprinkle the code with calls to this function to make sure the accounting is proper (by adding asserts and printks) at strategic point of the code. 5. In log_pull_tail there apparently was a kludge to fix up the accounting based on a "pull" parameter. The buffer accounting is now done properly, so the kludge was removed. 6. File sync operations were making a call to gfs2_log_flush that writes another journal header. Since that header was unplanned for (reserved) by the reserve/refund sequence, the free space had to be decremented so that when log_pull_tail gets called, the free space is be adjusted properly. (Did I hear you call that a kludge? well, maybe, but a lot more justifiable than the one I removed). 7. In the gfs2_log_shutdown code, it optionally syncs the log by specifying the PULL parameter to log_write_header. I'm not sure this is necessary anymore. It just seems to me there could be cases where shutdown is called while there are outstanding log buffers. 8. In the (data)buf_lo_before_commit functions, I changed some offset values from being calculated on the fly to being constants. That simplified some code and we might as well let the compiler do the calculation once rather than redoing those cycles at run time. 9. This version has my rewritten databuf_lo_add function. This version is much more like its predecessor, buf_lo_add, which makes it easier to understand. Again, this might not be necessary, but it seems as if this one works as well as the previous one, maybe even better, so I decided to leave it in. 10. In databuf_lo_before_commit, a previous data corruption problem was caused by going off the end of the buffer. The proper solution is to have the proper limit in place, rather than stopping earlier. (Thus my previous attempt to fix it is wrong). If you don't wrap the buffer, you're stopping too early and that causes more log buffer accounting problems. 11. In lops.h there are two new (previously mentioned) constants for figuring out the data offset for the journal buffers. 12. There are also two new functions, buf_limit and databuf_limit to calculate how many entries will fit in the buffer. 13. In function gfs2_meta_wipe, it needs to distinguish between pinned metadata buffers and journaled data buffers for proper journal buffer accounting. It can't use the JDATA gfs2_inode flag because it's sometimes passed the "real" inode and sometimes the "metadata inode" and the inode flags will be random bits in a metadata gfs2_inode. It needs to base its decision on which was passed in. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 4 ++- fs/gfs2/log.c | 100 +++++++++++++++++++++++++++++++++++++++++------------- fs/gfs2/lops.c | 53 +++++++++++++---------------- fs/gfs2/lops.h | 23 +++++++++++++ fs/gfs2/meta_io.c | 8 ++++- 5 files changed, 132 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c7c6ec0f17c7..170ba93829c0 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -354,7 +354,9 @@ struct gfs2_trans { unsigned int tr_num_buf; unsigned int tr_num_buf_new; + unsigned int tr_num_databuf_new; unsigned int tr_num_buf_rm; + unsigned int tr_num_databuf_rm; struct list_head tr_list_buf; unsigned int tr_num_revoke; @@ -599,6 +601,7 @@ struct gfs2_sbd { unsigned int sd_log_blks_reserved; unsigned int sd_log_commited_buf; + unsigned int sd_log_commited_databuf; unsigned int sd_log_commited_revoke; unsigned int sd_log_num_gl; @@ -607,7 +610,6 @@ struct gfs2_sbd { unsigned int sd_log_num_rg; unsigned int sd_log_num_databuf; unsigned int sd_log_num_jdata; - unsigned int sd_log_num_hdrs; struct list_head sd_log_le_gl; struct list_head sd_log_le_buf; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index fbdc0dc9923e..8fcfb784f906 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -276,7 +276,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * @blks: The number of blocks to reserve * * Note that we never give out the last few blocks of the journal. Thats - * due to the fact that there is are a small number of header blocks + * due to the fact that there is a small number of header blocks * associated with each log flush. The exact number can't be known until * flush time, so we ensure that we have just enough free blocks at all * times to avoid running out during a log flush. @@ -371,6 +371,58 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer return dist; } +/** + * calc_reserved - Calculate the number of blocks to reserve when + * refunding a transaction's unused buffers. + * @sdp: The GFS2 superblock + * + * This is complex. We need to reserve room for all our currently used + * metadata buffers (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data buffers for journaled files (e.g. files in the + * meta_fs like rindex, or files for which chattr +j was done.) + * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush + * will count it as free space (sd_log_blks_free) and corruption will follow. + * + * We can have metadata bufs and jdata bufs in the same journal. So each + * type gets its own log header, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one header + * in cases where we have more buffers than will fit on a journal page. + * Metadata journal entries take up half the space of journaled buffer entries. + * Thus, metadata entries have buf_limit (502) and journaled buffers have + * databuf_limit (251) before they cause a wrap around. + * + * Also, we need to reserve blocks for revoke journal entries and one for an + * overall header for the lot. + * + * Returns: the number of blocks reserved + */ +static unsigned int calc_reserved(struct gfs2_sbd *sdp) +{ + unsigned int reserved = 0; + unsigned int mbuf_limit, metabufhdrs_needed; + unsigned int dbuf_limit, databufhdrs_needed; + unsigned int revokes = 0; + + mbuf_limit = buf_limit(sdp); + metabufhdrs_needed = (sdp->sd_log_commited_buf + + (mbuf_limit - 1)) / mbuf_limit; + dbuf_limit = databuf_limit(sdp); + databufhdrs_needed = (sdp->sd_log_commited_databuf + + (dbuf_limit - 1)) / dbuf_limit; + + if (sdp->sd_log_commited_revoke) + revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); + + reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + + sdp->sd_log_commited_databuf + databufhdrs_needed + + revokes; + /* One for the overall header */ + if (reserved) + reserved++; + return reserved; +} + static unsigned int current_tail(struct gfs2_sbd *sdp) { struct gfs2_ail *ai; @@ -461,14 +513,14 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, return bh; } -static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull) +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) { unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); ail2_empty(sdp, new_tail); gfs2_log_lock(sdp); - sdp->sd_log_blks_free += dist - (pull ? 1 : 0); + sdp->sd_log_blks_free += dist; gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); gfs2_log_unlock(sdp); @@ -518,7 +570,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) brelse(bh); if (sdp->sd_log_tail != tail) - log_pull_tail(sdp, tail, pull); + log_pull_tail(sdp, tail); else gfs2_assert_withdraw(sdp, !pull); @@ -579,7 +631,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) INIT_LIST_HEAD(&ai->ai_ail1_list); INIT_LIST_HEAD(&ai->ai_ail2_list); - gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf + sdp->sd_log_num_jdata == sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_buf + sdp->sd_log_num_jdata == + sdp->sd_log_commited_buf + + sdp->sd_log_commited_databuf); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); @@ -590,16 +645,19 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) lops_before_commit(sdp); if (!list_empty(&sdp->sd_log_flush_list)) log_flush_commit(sdp); - else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle) + else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ + gfs2_log_lock(sdp); + sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */ + gfs2_log_unlock(sdp); log_write_header(sdp, 0, PULL); + } lops_after_commit(sdp, ai); gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; - sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs; sdp->sd_log_blks_reserved = 0; sdp->sd_log_commited_buf = 0; - sdp->sd_log_num_hdrs = 0; + sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; if (!list_empty(&ai->ai_ail1_list)) { @@ -616,32 +674,26 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - unsigned int reserved = 0; + unsigned int reserved; unsigned int old; gfs2_log_lock(sdp); sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; - gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0); + sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - + tr->tr_num_databuf_rm; + gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || + (((int)sdp->sd_log_commited_databuf) >= 0)); sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); - - if (sdp->sd_log_commited_buf) - reserved += sdp->sd_log_commited_buf; - if (sdp->sd_log_commited_revoke) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, - sizeof(u64)); - if (reserved) - reserved++; - + reserved = calc_reserved(sdp); old = sdp->sd_log_blks_free; sdp->sd_log_blks_free += tr->tr_reserved - (reserved - sdp->sd_log_blks_reserved); gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old); - gfs2_assert_withdraw(sdp, - sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks + - sdp->sd_log_num_hdrs); + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= + sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; @@ -687,13 +739,13 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0); + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, + (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index df6bceea379a..dd810ad68cf0 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,6 +17,7 @@ #include "gfs2.h" #include "incore.h" +#include "inode.h" #include "glock.h" #include "log.h" #include "lops.h" @@ -117,15 +118,13 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) struct gfs2_log_descriptor *ld; struct gfs2_bufdata *bd1 = NULL, *bd2; unsigned int total = sdp->sd_log_num_buf; - unsigned int offset = sizeof(struct gfs2_log_descriptor); + unsigned int offset = BUF_OFFSET; unsigned int limit; unsigned int num; unsigned n; __be64 *ptr; - offset += sizeof(__be64) - 1; - offset &= ~(sizeof(__be64) - 1); - limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + limit = buf_limit(sdp); /* for 4k blocks, limit = 503 */ bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); @@ -134,7 +133,6 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) if (total > limit) num = limit; bh = gfs2_log_get_buf(sdp); - sdp->sd_log_num_hdrs++; ld = (struct gfs2_log_descriptor *)bh->b_data; ptr = (__be64 *)(bh->b_data + offset); ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -469,27 +467,26 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) struct gfs2_inode *ip = GFS2_I(mapping->host); gfs2_log_lock(sdp); - tr->tr_touched = 1; - if (list_empty(&bd->bd_list_tr) && - (ip->i_di.di_flags & GFS2_DIF_JDATA)) { - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); - gfs2_log_unlock(sdp); - if (!list_empty(&le->le_list)) - return; - gfs2_pin(sdp, bd->bd_bh); - tr->tr_num_buf_new++; - } else { + if (!list_empty(&bd->bd_list_tr)) { gfs2_log_unlock(sdp); + return; } + tr->tr_touched = 1; + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_log_unlock(sdp); + if (!list_empty(&le->le_list)) + return; + gfs2_trans_add_gl(bd->bd_gl); - gfs2_log_lock(sdp); - if (list_empty(&le->le_list)) { - if (ip->i_di.di_flags & GFS2_DIF_JDATA) - sdp->sd_log_num_jdata++; - sdp->sd_log_num_databuf++; - list_add(&le->le_list, &sdp->sd_log_le_databuf); + if (gfs2_is_jdata(ip)) { + sdp->sd_log_num_jdata++; + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; } + sdp->sd_log_num_databuf++; + gfs2_log_lock(sdp); + list_add(&le->le_list, &sdp->sd_log_le_databuf); gfs2_log_unlock(sdp); } @@ -522,7 +519,6 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) LIST_HEAD(started); struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt; struct buffer_head *bh = NULL,*bh1 = NULL; - unsigned int offset = sizeof(struct gfs2_log_descriptor); struct gfs2_log_descriptor *ld; unsigned int limit; unsigned int total_dbuf = sdp->sd_log_num_databuf; @@ -530,9 +526,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) unsigned int num, n; __be64 *ptr = NULL; - offset += 2*sizeof(__be64) - 1; - offset &= ~(2*sizeof(__be64) - 1); - limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + limit = databuf_limit(sdp); /* * Start writing ordered buffers, write journaled buffers @@ -583,10 +577,10 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); if (!bh) { bh = gfs2_log_get_buf(sdp); - sdp->sd_log_num_hdrs++; ld = (struct gfs2_log_descriptor *) bh->b_data; - ptr = (__be64 *)(bh->b_data + offset); + ptr = (__be64 *)(bh->b_data + + DATABUF_OFFSET); ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); ld->ld_header.mh_type = @@ -607,8 +601,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) if (unlikely(magic != 0)) set_buffer_escaped(bh1); gfs2_log_lock(sdp); - n += 2; - if (n >= num) + if (++n >= num) break; } else if (!bh1) { total_dbuf--; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 965bc65c7c64..41a00df75587 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -13,6 +13,13 @@ #include #include "incore.h" +#define BUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ + ~(sizeof(__be64) - 1)) +#define DATABUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ + ~(2 * sizeof(__be64) - 1)) + extern const struct gfs2_log_operations gfs2_glock_lops; extern const struct gfs2_log_operations gfs2_buf_lops; extern const struct gfs2_log_operations gfs2_revoke_lops; @@ -21,6 +28,22 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; +static inline unsigned int buf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); + return limit; +} + +static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); + return limit; +} + static inline void lops_init_le(struct gfs2_log_element *le, const struct gfs2_log_operations *lops) { diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index e62d4f620c58..8da343b34ae7 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -387,12 +387,18 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) if (test_clear_buffer_pinned(bh)) { struct gfs2_trans *tr = current->journal_info; + struct gfs2_inode *bh_ip = + GFS2_I(bh->b_page->mapping->host); + gfs2_log_lock(sdp); list_del_init(&bd->bd_le.le_list); gfs2_assert_warn(sdp, sdp->sd_log_num_buf); sdp->sd_log_num_buf--; gfs2_log_unlock(sdp); - tr->tr_num_buf_rm++; + if (bh_ip->i_inode.i_private != NULL) + tr->tr_num_databuf_rm++; + else + tr->tr_num_buf_rm++; brelse(bh); } if (bd) { -- cgit v1.2.1 From eaf5bd3cac92126e5825c6ebc10bee0fba35d555 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 Jun 2007 15:38:17 +0100 Subject: [GFS2] Simplify multiple glock aquisition There is a bug in the code which acquires multiple glocks where if the initial out-of-order attempt fails part way though we can land up trying to acquire the wrong number of glocks. This is part of the fix for red hat bz #239737. The other part of the bz doesn't apply to upstream kernels since it was fixed by: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=d3717bdf8f08a0e1039158c8bab2c24d20f492b6 Since the out-of-order code doesn't appear to add anything to the performance of GFS2, this patch just removed it rather than trying to fix it. It should be much easier to see whats going on here now. In addition, we don't allocate any memory unless we are using a lot of glocks (which is a relatively uncommon case). Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 64 ++++++++++++--------------------------------------------- 1 file changed, 13 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 384cae623ed3..3f0974e1afef 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1327,10 +1327,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * - * Figure out how big an impact this function has. Either: - * 1) Replace this code with code that calls gfs2_glock_prefetch() - * 2) Forget async stuff and just call nq_m_sync() - * 3) Leave it like it is * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) @@ -1338,62 +1334,28 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) { - int *e; - unsigned int x; - int borked = 0, serious = 0; + struct gfs2_holder *tmp[4]; + struct gfs2_holder **pph = tmp; int error = 0; - if (!num_gh) + switch(num_gh) { + case 0: return 0; - - if (num_gh == 1) { + case 1: ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); return gfs2_glock_nq(ghs); - } - - e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL); - if (!e) - return -ENOMEM; - - for (x = 0; x < num_gh; x++) { - ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC; - error = gfs2_glock_nq(&ghs[x]); - if (error) { - borked = 1; - serious = error; - num_gh = x; + default: + if (num_gh <= 4) break; - } + pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS); + if (!pph) + return -ENOMEM; } - for (x = 0; x < num_gh; x++) { - error = e[x] = glock_wait_internal(&ghs[x]); - if (error) { - borked = 1; - if (error != GLR_TRYFAILED && error != GLR_CANCELED) - serious = error; - } - } - - if (!borked) { - kfree(e); - return 0; - } - - for (x = 0; x < num_gh; x++) - if (!e[x]) - gfs2_glock_dq(&ghs[x]); - - if (serious) - error = serious; - else { - for (x = 0; x < num_gh; x++) - gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags, - &ghs[x]); - error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e); - } + error = nq_m_sync(num_gh, ghs, pph); - kfree(e); + if (pph != tmp) + kfree(pph); return error; } -- cgit v1.2.1 From 773ed1a044adc868036dee1722b8bca6ce5923e2 Mon Sep 17 00:00:00 2001 From: Robert Peterson Date: Wed, 20 Jun 2007 08:34:06 -0500 Subject: [GFS2] Addendum to the journaled file/unmount patch This patch is an addendum to the previous journaled file/unmount patch. It fixes a problem discovered during testing. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index dd810ad68cf0..aff70f0698fd 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -472,8 +472,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) return; } tr->tr_touched = 1; - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); + if (gfs2_is_jdata(ip)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + } gfs2_log_unlock(sdp); if (!list_empty(&le->le_list)) return; -- cgit v1.2.1 From 1875f2f31b3955dff8c3712a56ae61836c8b90fe Mon Sep 17 00:00:00 2001 From: "S. Wendy Cheng" Date: Mon, 25 Jun 2007 21:14:31 -0400 Subject: [GFS2] Fix gfs2_block_truncate_page err return Code segment inside gfs2_block_truncate_page() doesn't set the return code correctly. This causes NFSD erroneously returns EIO back to client with setattr procedure call (truncate error). Signed-off-by: S. Wendy Cheng Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d16044cb023a..cd805a66880d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -927,6 +927,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; + err = 0; } if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) -- cgit v1.2.1 From 97d848365e603def43c69e160937f073bf9cf02e Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 27 Jun 2007 11:36:23 +0100 Subject: [DLM] Telnet to port 21064 can stop all lockspaces This patch fixes Red Hat bz#245892 Opening a tcp connection from a cluster member to another cluster member targeting the dlm port it is enough to stop every dlm operation in the cluster. This means that GFS and rgmanager will hang. Signed-Off-By: Patrick Caulfield Signed-off-by: Steven Whitehouse --- fs/dlm/lowcomms.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index fc0bff74c61e..73d44f57e24a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -720,11 +720,17 @@ static int tcp_accept_from_sock(struct connection *con) INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } + else { + printk("Extra connection from node %d attempted\n", nodeid); + result = -EAGAIN; + up_write(&newcon->sock_sem); + goto accept_err; } - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); - addcon = othercon; } else { newsock->sk->sk_user_data = newcon; -- cgit v1.2.1 From 090ffaa55dacea774af9ee378d09e47fb7cea9ff Mon Sep 17 00:00:00 2001 From: Wendy Cheng Date: Wed, 27 Jun 2007 11:00:03 -0400 Subject: [GFS2] inode size inconsistency This should have been part of the NFS patch #1 but somehow I missed it when packaging the patches. It is not a critical issue as the others (I hope). RHEL 5.1 31.el5 kernel runs fine without this change. Our truncate code is chopped into two parts, one for vfs inode changes (in vmtruncate()) and one of gfs inode (in gfs2_truncatei()). These two operatons are, unfortunately, not atomic. So it could happens that vmtruncate() succeeds (inode->i_size is changed) but gfs2_truncatei fails (say kernel temporarily out of memory). This would leave gfs inode i_di.di_size out of sync with vfs inode i_size. It will later confuse gfs2_commit_write() if a write is issued. Last time I checked, it will cause file corruption. Signed-off-by: S. Wendy Cheng Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 84051b997a43..911c115b5c6c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -903,8 +903,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr) } error = gfs2_truncatei(ip, attr->ia_size); - if (error) - return error; + if (error && (inode->i_size != ip->i_di.di_size)) + i_size_write(inode, ip->i_di.di_size); return error; } -- cgit v1.2.1 From 569a7b6c2e8965ff4908003b925757703a3d649c Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 27 Jun 2007 10:15:56 -0500 Subject: [GFS2] remounting w/o acl option leaves acls enabled This patch is for bugzilla bug #245663. This crosswrites a fix from gfs1 (bz #210369) so that the mount options are reset properly upon remount. This was tested on system trin-10. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/mount.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index 4864659555d4..6f006a804db3 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -82,20 +82,19 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) char *options, *o, *v; int error = 0; - if (!remount) { - /* If someone preloaded options, use those instead */ - spin_lock(&gfs2_sys_margs_lock); - if (gfs2_sys_margs) { - data = gfs2_sys_margs; - gfs2_sys_margs = NULL; - } - spin_unlock(&gfs2_sys_margs_lock); - - /* Set some defaults */ - args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; - args->ar_quota = GFS2_QUOTA_DEFAULT; - args->ar_data = GFS2_DATA_DEFAULT; + /* If someone preloaded options, use those instead */ + spin_lock(&gfs2_sys_margs_lock); + if (!remount && gfs2_sys_margs) { + data = gfs2_sys_margs; + gfs2_sys_margs = NULL; } + spin_unlock(&gfs2_sys_margs_lock); + + /* Set some defaults */ + memset(args, 0, sizeof(struct gfs2_args)); + args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; /* Split the options into tokens with the "," character and process them */ -- cgit v1.2.1 From b3657629249eba0b3b61ff964d6e1539b469d117 Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Wed, 27 Jun 2007 11:06:19 -0500 Subject: [GFS2] System won't suspend with GFS2 file system mounted The kernel threads in gfs2, namely gfs2_scand, gfs2_logd, gfs2_quotad, gfs2_glockd, gfs2_recoverd weren't doing anything when the suspend mechanism was trying to freeze them. I put in calls to refrigerator() in the loops for all the daemons and suspend works as expected. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/daemon.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c index 683cb5bda870..3548d9f31e0d 100644 --- a/fs/gfs2/daemon.c +++ b/fs/gfs2/daemon.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -49,6 +50,8 @@ int gfs2_scand(void *data) while (!kthread_should_stop()) { gfs2_scand_internal(sdp); t = gfs2_tune_get(sdp, gt_scand_secs) * HZ; + if (freezing(current)) + refrigerator(); schedule_timeout_interruptible(t); } @@ -74,6 +77,8 @@ int gfs2_glockd(void *data) wait_event_interruptible(sdp->sd_reclaim_wq, (atomic_read(&sdp->sd_reclaim_count) || kthread_should_stop())); + if (freezing(current)) + refrigerator(); } return 0; @@ -93,6 +98,8 @@ int gfs2_recoverd(void *data) while (!kthread_should_stop()) { gfs2_check_journals(sdp); t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; + if (freezing(current)) + refrigerator(); schedule_timeout_interruptible(t); } @@ -141,6 +148,8 @@ int gfs2_logd(void *data) } t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + if (freezing(current)) + refrigerator(); schedule_timeout_interruptible(t); } @@ -191,6 +200,8 @@ int gfs2_quotad(void *data) gfs2_quota_scan(sdp); t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; + if (freezing(current)) + refrigerator(); schedule_timeout_interruptible(t); } -- cgit v1.2.1 From f4fadb23ca49abd2f1387a0b7e78b385ebc760ce Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Wed, 27 Jun 2007 14:43:37 -0700 Subject: [GFS2] git-gfs2-nmw-build-fix Cc: Steven Whitehouse Signed-off-by: Andrew Morton Signed-off-by: Steven Whitehouse --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 73d44f57e24a..0553a6158dcb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -728,7 +728,7 @@ static int tcp_accept_from_sock(struct connection *con) else { printk("Extra connection from node %d attempted\n", nodeid); result = -EAGAIN; - up_write(&newcon->sock_sem); + mutex_unlock(&newcon->sock_mutex); goto accept_err; } } -- cgit v1.2.1 From bb9bcf061660661c57ddcf31337529f82414b937 Mon Sep 17 00:00:00 2001 From: Wendy Cheng Date: Wed, 27 Jun 2007 17:07:08 -0400 Subject: [GFS2] Obtaining no_formal_ino from directory entry GFS2 lookup code doesn't ask for inode shared glock. This implies during in-memory inode creation for existing file, GFS2 will not disk-read in the inode contents. This leaves no_formal_ino un-initialized during lookup time. The un-initialized no_formal_ino is subsequently encoded into file handle. Clients will get ESTALE error whenever it tries to access these files. Signed-off-by: S. Wendy Cheng Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 7 ++++--- fs/gfs2/inode.c | 10 ++++++++-- fs/gfs2/inode.h | 3 ++- fs/gfs2/ops_export.c | 4 +++- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/rgrp.c | 11 ++++++----- 6 files changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index f793e31a050e..2beb2f401aa2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1498,9 +1498,10 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) if (dent) { if (IS_ERR(dent)) return ERR_PTR(PTR_ERR(dent)); - inode = gfs2_inode_lookup(dir->i_sb, - be64_to_cpu(dent->de_inum.no_addr), - be16_to_cpu(dent->de_type)); + inode = gfs2_inode_lookup(dir->i_sb, + be16_to_cpu(dent->de_type), + be64_to_cpu(dent->de_inum.no_addr), + be64_to_cpu(dent->de_inum.no_formal_ino)); brelse(bh); return inode; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 792d64f69cc2..26aaf54959d9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -86,7 +86,10 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned int type) +struct inode *gfs2_inode_lookup(struct super_block *sb, + unsigned int type, + u64 no_addr, + u64 no_formal_ino) { struct inode *inode = gfs2_iget(sb, no_addr); struct gfs2_inode *ip = GFS2_I(inode); @@ -100,6 +103,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned in struct gfs2_sbd *sdp = GFS2_SB(inode); umode_t mode; inode->i_private = ip; + ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) @@ -915,7 +919,9 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - inode = gfs2_inode_lookup(dir->i_sb, inum.no_addr, IF2DT(mode)); + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), + inum.no_addr, + inum.no_formal_ino); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 35375fc43fa3..3268a2fed672 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -47,7 +47,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, void gfs2_inode_attr_in(struct gfs2_inode *ip); -struct inode *gfs2_inode_lookup(struct super_block *sb, u64 no_addr, unsigned type); +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino); struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index d07230ee5fc0..0fe14478a54d 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -245,7 +245,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) gfs2_glock_dq_uninit(&rgd_gh); gfs2_glock_dq_uninit(&ri_gh); - inode = gfs2_inode_lookup(sb, inum->no_addr, fh_obj->imode); + inode = gfs2_inode_lookup(sb, fh_obj->imode, + inum->no_addr, + inum->no_formal_ino); if (!inode) goto fail; if (IS_ERR(inode)) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index dae1d7142fe5..cf5aa5050548 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -236,7 +236,7 @@ fail: static inline struct inode *gfs2_lookup_root(struct super_block *sb, u64 no_addr) { - return gfs2_inode_lookup(sb, no_addr, DT_DIR); + return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); } static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 36c523d487a7..7fb74484af63 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -860,18 +860,19 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) { struct inode *inode; u32 goal = 0; - u64 ino; + u64 no_addr; for(;;) { goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, GFS2_BLKST_UNLINKED); if (goal == 0) return 0; - ino = goal + rgd->rd_data0; - if (ino <= *last_unlinked) + no_addr = goal + rgd->rd_data0; + if (no_addr <= *last_unlinked) continue; - *last_unlinked = ino; - inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, ino, DT_UNKNOWN); + *last_unlinked = no_addr; + inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, + no_addr, 0); if (!IS_ERR(inode)) return inode; } -- cgit v1.2.1 From 35dcc52e3a916184b145fd840250244b81004200 Mon Sep 17 00:00:00 2001 From: Wendy Cheng Date: Wed, 27 Jun 2007 17:07:53 -0400 Subject: [GFS2] Remove i_mode passing from NFS File Handle GFS2 has been passing i_mode within NFS File Handle. Other than the wrong assumption that there is always room for this extra 16 bit value, the current gfs2_get_dentry doesn't really need the i_mode to work correctly. Note that GFS2 NFS code does go thru the same lookup code path as direct file access route (where the mode is obtained from name lookup) but gfs2_get_dentry() is coded for different purpose. It is not used during lookup time. It is part of the file access procedure call. When the call is invoked, if on-disk inode is not in-memory, it has to be read-in. This makes i_mode passing a useless overhead. Signed-off-by: S. Wendy Cheng Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 54 +++++++++++++++++++++++++++++++++++----------------- fs/gfs2/inode.h | 1 + fs/gfs2/ops_export.c | 38 +++++++++++++++--------------------- fs/gfs2/rgrp.c | 2 +- 4 files changed, 54 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 26aaf54959d9..34f7bcdea1e9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -77,6 +77,36 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } +/** + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). This has caused issues + * with NFS code path since its get_dentry routine doesn't have the relevant + * directory entry when gfs2_inode_lookup() is invoked. Part of the code + * segment inside gfs2_inode_lookup code needs to get moved around. + * + * Clean up I_LOCK and I_NEW as well. + **/ + +void gfs2_set_iop(struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + inode->i_fop = &gfs2_file_fops; + inode->i_mapping->a_ops = &gfs2_file_aops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_dev_iops; + } + + unlock_new_inode(inode); +} + /** * gfs2_inode_lookup - Lookup an inode * @sb: The super block @@ -101,7 +131,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - umode_t mode; inode->i_private = ip; ip->i_no_formal_ino = no_formal_ino; @@ -122,6 +151,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, gfs2_glock_put(io_gl); + if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) + goto gfs2_nfsbypass; + + inode->i_mode = DT2IF(type); + /* * We must read the inode in order to work out its type in * this case. Note that this doesn't happen often as we normally @@ -129,33 +163,19 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, * unlinked inode recovery (where it is safe to do this glock, * which is not true in the general case). */ - inode->i_mode = mode = DT2IF(type); if (type == DT_UNKNOWN) { struct gfs2_holder gh; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (unlikely(error)) goto fail_glock; /* Inode is now uptodate */ - mode = inode->i_mode; gfs2_glock_dq_uninit(&gh); } - if (S_ISREG(mode)) { - inode->i_op = &gfs2_file_iops; - inode->i_fop = &gfs2_file_fops; - inode->i_mapping->a_ops = &gfs2_file_aops; - } else if (S_ISDIR(mode)) { - inode->i_op = &gfs2_dir_iops; - inode->i_fop = &gfs2_dir_fops; - } else if (S_ISLNK(mode)) { - inode->i_op = &gfs2_symlink_iops; - } else { - inode->i_op = &gfs2_dev_iops; - } - - unlock_new_inode(inode); + gfs2_set_iop(inode); } +gfs2_nfsbypass: return inode; fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 3268a2fed672..4517ac82c01c 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -47,6 +47,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, void gfs2_inode_attr_in(struct gfs2_inode *ip); +void gfs2_set_iop(struct inode *inode); struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 0fe14478a54d..e317db2a5548 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -27,12 +27,7 @@ #include "util.h" #define GFS2_SMALL_FH_SIZE 4 -#define GFS2_LARGE_FH_SIZE 10 - -struct gfs2_fh_obj { - struct gfs2_inum_host this; - u32 imode; -}; +#define GFS2_LARGE_FH_SIZE 8 static struct dentry *gfs2_decode_fh(struct super_block *sb, __u32 *p, @@ -43,11 +38,8 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb, void *context) { __be32 *fh = (__force __be32 *)p; - struct gfs2_fh_obj fh_obj; - struct gfs2_inum_host *this, parent; + struct gfs2_inum_host inum, parent; - this = &fh_obj.this; - fh_obj.imode = DT_UNKNOWN; memset(&parent, 0, sizeof(struct gfs2_inum)); switch (fh_len) { @@ -56,18 +48,17 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb, parent.no_formal_ino |= be32_to_cpu(fh[5]); parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; parent.no_addr |= be32_to_cpu(fh[7]); - fh_obj.imode = be32_to_cpu(fh[8]); case GFS2_SMALL_FH_SIZE: - this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; - this->no_formal_ino |= be32_to_cpu(fh[1]); - this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32; - this->no_addr |= be32_to_cpu(fh[3]); + inum.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + inum.no_formal_ino |= be32_to_cpu(fh[1]); + inum.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + inum.no_addr |= be32_to_cpu(fh[3]); break; default: return NULL; } - return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent, + return gfs2_export_ops.find_exported_dentry(sb, &inum, &parent, acceptable, context); } @@ -102,9 +93,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); fh[6] = cpu_to_be32(ip->i_no_addr >> 32); fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); - - fh[8] = cpu_to_be32(inode->i_mode); - fh[9] = 0; /* pad to double word */ *len = GFS2_LARGE_FH_SIZE; iput(inode); @@ -201,8 +189,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj; - struct gfs2_inum_host *inum = &fh_obj->this; + struct gfs2_inum_host *inum = inum_obj; struct gfs2_holder i_gh, ri_gh, rgd_gh; struct gfs2_rgrpd *rgd; struct inode *inode; @@ -245,9 +232,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) gfs2_glock_dq_uninit(&rgd_gh); gfs2_glock_dq_uninit(&ri_gh); - inode = gfs2_inode_lookup(sb, fh_obj->imode, + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, - inum->no_formal_ino); + 0); if (!inode) goto fail; if (IS_ERR(inode)) { @@ -260,6 +247,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) iput(inode); goto fail; } + + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { iput(inode); goto fail; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7fb74484af63..e4e040625153 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -872,7 +872,7 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) continue; *last_unlinked = no_addr; inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, - no_addr, 0); + no_addr, -1); if (!IS_ERR(inode)) return inode; } -- cgit v1.2.1 From ac90a2552500996c529d5f0ddc16a9bf60bf670d Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 6 Jul 2007 09:47:08 -0500 Subject: [DLM] dump more lock values Add two more output fields (lkb_flags and rsb nodeid) to the new debugfs file that dumps one lock per line. Also, dump all locks instead of just mastered locks. Accordingly, use a suffix of _locks instead of _master. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/debug_fs.c | 82 ++++++++++++++++++++++++++------------------------- fs/dlm/dlm_internal.h | 2 +- 2 files changed, 43 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 9f5de37706a9..12c3bfd5e660 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -27,7 +27,7 @@ static struct dentry *dlm_root; struct rsb_iter { int entry; - int master; + int locks; int header; struct dlm_ls *ls; struct list_head *next; @@ -60,8 +60,8 @@ static char *print_lockmode(int mode) } } -static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *res) +static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) { seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); @@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) /* Print the locks attached to this resource */ seq_printf(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) - print_lock(s, lkb, res); + print_resource_lock(s, lkb, res); seq_printf(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) - print_lock(s, lkb, res); + print_resource_lock(s, lkb, res); seq_printf(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) - print_lock(s, lkb, res); + print_resource_lock(s, lkb, res); if (list_empty(&res->res_lookup)) goto out; @@ -160,8 +160,7 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) return 0; } -static void print_master_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *r) +static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) { struct dlm_user_args *ua; unsigned int waiting = 0; @@ -176,37 +175,40 @@ static void print_master_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_timestamp) waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); - /* id nodeid remid pid xid flags sts grmode rqmode time_ms len name */ + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms + r_nodeid r_len r_name */ - seq_printf(s, "%x %d %x %u %llu %x %d %d %d %u %d \"%s\"\n", + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_remid, lkb->lkb_ownpid, (unsigned long long)xid, lkb->lkb_exflags, + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, waiting, + r->res_nodeid, r->res_length, r->res_name); } -static int print_master_resource(struct dlm_rsb *r, struct seq_file *s) +static int print_locks(struct dlm_rsb *r, struct seq_file *s) { struct dlm_lkb *lkb; lock_rsb(r); list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) - print_master_lock(s, lkb, r); + print_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) - print_master_lock(s, lkb, r); + print_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) - print_master_lock(s, lkb, r); + print_lock(s, lkb, r); unlock_rsb(r); return 0; @@ -325,14 +327,14 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr) { struct rsb_iter *ri = iter_ptr; - if (ri->master) { + if (ri->locks) { if (ri->header) { - seq_printf(file, "id nodeid remid pid xid flags sts " - "grmode rqmode time_ms len name\n"); + seq_printf(file, "id nodeid remid pid xid exflags flags " + "sts grmode rqmode time_ms r_nodeid " + "r_len r_name\n"); ri->header = 0; } - if (is_master(ri->rsb)) - print_master_resource(ri->rsb, file); + print_locks(ri->rsb, file); } else { print_resource(ri->rsb, file); } @@ -371,10 +373,10 @@ static const struct file_operations rsb_fops = { }; /* - * Dump master lock state + * Dump state in compact per-lock listing */ -static struct rsb_iter *master_iter_init(struct dlm_ls *ls, loff_t *pos) +static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) { struct rsb_iter *ri; @@ -385,7 +387,7 @@ static struct rsb_iter *master_iter_init(struct dlm_ls *ls, loff_t *pos) ri->ls = ls; ri->entry = 0; ri->next = NULL; - ri->master = 1; + ri->locks = 1; if (*pos == 0) ri->header = 1; @@ -398,12 +400,12 @@ static struct rsb_iter *master_iter_init(struct dlm_ls *ls, loff_t *pos) return ri; } -static void *master_seq_start(struct seq_file *file, loff_t *pos) +static void *locks_seq_start(struct seq_file *file, loff_t *pos) { struct rsb_iter *ri; loff_t n = *pos; - ri = master_iter_init(file->private, pos); + ri = locks_iter_init(file->private, pos); if (!ri) return NULL; @@ -417,19 +419,19 @@ static void *master_seq_start(struct seq_file *file, loff_t *pos) return ri; } -static struct seq_operations master_seq_ops = { - .start = master_seq_start, +static struct seq_operations locks_seq_ops = { + .start = locks_seq_start, .next = rsb_seq_next, .stop = rsb_seq_stop, .show = rsb_seq_show, }; -static int master_open(struct inode *inode, struct file *file) +static int locks_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; - ret = seq_open(file, &master_seq_ops); + ret = seq_open(file, &locks_seq_ops); if (ret) return ret; @@ -439,9 +441,9 @@ static int master_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations master_fops = { +static const struct file_operations locks_fops = { .owner = THIS_MODULE, - .open = master_open, + .open = locks_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -515,14 +517,14 @@ int dlm_create_debug_file(struct dlm_ls *ls) } memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_master", ls->ls_name); - - ls->ls_debug_master_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, - dlm_root, - ls, - &master_fops); - if (!ls->ls_debug_master_dentry) { + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + + ls->ls_debug_locks_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &locks_fops); + if (!ls->ls_debug_locks_dentry) { debugfs_remove(ls->ls_debug_waiters_dentry); debugfs_remove(ls->ls_debug_rsb_dentry); return -ENOMEM; @@ -537,8 +539,8 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_rsb_dentry); if (ls->ls_debug_waiters_dentry) debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_master_dentry) - debugfs_remove(ls->ls_debug_master_dentry); + if (ls->ls_debug_locks_dentry) + debugfs_remove(ls->ls_debug_locks_dentry); } int dlm_register_debugfs(void) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 8ac081882c78..74901e981e10 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -471,7 +471,7 @@ struct dlm_ls { struct dentry *ls_debug_rsb_dentry; /* debugfs */ struct dentry *ls_debug_waiters_dentry; /* debugfs */ - struct dentry *ls_debug_master_dentry; /* debugfs */ + struct dentry *ls_debug_locks_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; -- cgit v1.2.1 From a0a24741cac414aba5918e9939afafa70c37f952 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 9 Jul 2007 15:43:07 +0100 Subject: [GFS2] Small fixes to logging code This reverts part of an earlier patch which tried to reclaim gfs2_bufdata structures too early and resulted in a "use after free" case (this bit from me). Also a change to not write out log headers unless we really need to (in the case of flushing nothing we don't need a header) from Bob. Signed-off-by: Steven Whitehouse Signed-off-by: Bob Peterson --- fs/gfs2/log.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 8fcfb784f906..f49a12e24086 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -237,10 +237,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) list_del(&bd->bd_ail_st_list); list_del(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); - if (bd->bd_bh) - brelse(bd->bd_bh); - else - kmem_cache_free(gfs2_bufdata_cachep, bd); + brelse(bd->bd_bh); } } @@ -583,6 +580,7 @@ static void log_flush_commit(struct gfs2_sbd *sdp) struct list_head *head = &sdp->sd_log_flush_list; struct gfs2_log_buf *lb; struct buffer_head *bh; + int flushcount = 0; while (!list_empty(head)) { lb = list_entry(head->next, struct gfs2_log_buf, lb_list); @@ -599,9 +597,20 @@ static void log_flush_commit(struct gfs2_sbd *sdp) } else brelse(bh); kfree(lb); + flushcount++; } - log_write_header(sdp, 0, 0); + /* If nothing was journaled, the header is unplanned and unwanted. */ + if (flushcount) { + log_write_header(sdp, 0, 0); + } else { + unsigned int tail; + tail = current_tail(sdp); + + gfs2_ail1_empty(sdp, 0); + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); + } } /** -- cgit v1.2.1 From 62480d13d5d1812176e969a47e2db78a5398d02e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jul 2007 18:51:59 +0200 Subject: sched: remove the SleepAVG field remove the SleepAVG field from /proc//status, as with the removal of the sleep-average code this value no longer makes sense. Signed-off-by: Ingo Molnar --- fs/proc/array.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 74f30e0c0381..3df644313f9b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer) rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +172,6 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, -- cgit v1.2.1 From b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jul 2007 18:51:59 +0200 Subject: sched: make use of precise accounting for /proc task stats make use of CFS's precise accounting to drive /proc//stat statistics. this code was co-authored by: Balbir Singh Dmitry Adamushko Ingo Molnar Signed-off-by: Ingo Molnar Signed-off-by: Dmitry Adamushko --- fs/proc/array.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3df644313f9b..98e78e2f18d6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -310,6 +310,41 @@ int proc_pid_status(struct task_struct *task, char * buffer) return buffer - orig; } +static clock_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + return utime; +} + +static clock_t task_stime(struct task_struct *p) +{ + clock_t stime = cputime_to_clock_t(p->stime); + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p); + + return stime; +} + + static int do_task_stat(struct task_struct *task, char * buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -324,7 +359,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime, utime, stime; + cputime_t cutime, cstime; + clock_t utime, stime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -342,7 +378,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; + cutime = cstime = cputime_zero; + utime = stime = 0; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -368,15 +405,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); + utime += task_utime(t); + stime += task_stime(t); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime = cputime_add(utime, sig->utime); - stime = cputime_add(stime, sig->stime); + utime += cputime_to_clock_t(sig->utime); + stime += cputime_to_clock_t(sig->stime); } sid = signal_session(sig); @@ -392,8 +429,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task->utime; - stime = task->stime; + utime = task_utime(task); + stime = task_stime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -424,8 +461,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) cmin_flt, maj_flt, cmaj_flt, - cputime_to_clock_t(utime), - cputime_to_clock_t(stime), + utime, + stime, cputime_to_clock_t(cutime), cputime_to_clock_t(cstime), priority, -- cgit v1.2.1 From 172ba844a8851c3edd13c0a979cdf46bd5e3cc1a Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 9 Jul 2007 18:52:00 +0200 Subject: sched: update delay-accounting to use CFS's precise stats update delay-accounting to use CFS's precise stats. Signed-off-by: Ingo Molnar --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a5fa1fdafc4e..0f40e820c7fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) */ static int proc_pid_schedstat(struct task_struct *task, char *buffer) { - return sprintf(buffer, "%lu %lu %lu\n", + return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, task->sched_info.pcnt); -- cgit v1.2.1 From 43ae34cb4cd650d1eb4460a8253a8e747ba052ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jul 2007 18:52:00 +0200 Subject: sched: scheduler debugging, core scheduler debugging core: implement /proc/sched_debug and /proc//sched files for scheduler debugging. Signed-off-by: Ingo Molnar --- fs/proc/base.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f40e820c7fd..46ea5d56e1bb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -929,6 +929,69 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tgid_stat), INF("statm", S_IRUGO, pid_statm), @@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_stuff[] = { INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tid_stat), INF("statm", S_IRUGO, pid_statm), -- cgit v1.2.1 From 72d3a38ee083a96c09032e608a4c7e047ce26760 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 9 Jul 2007 09:40:44 +0200 Subject: unexport bio_{,un}map_user bio_{,un}map_user no longer have any modular users. Signed-off-by: Adrian Bunk Signed-off-by: Jens Axboe --- fs/bio.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 093345f00128..33e46340a766 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1223,8 +1223,6 @@ EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); -EXPORT_SYMBOL(bio_map_user); -EXPORT_SYMBOL(bio_unmap_user); EXPORT_SYMBOL(bio_map_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); -- cgit v1.2.1 From c66ab6fa705e1b2887a6d9246b798bdc526839e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 21:17:17 +0200 Subject: splice: abstract out actor data For direct splicing (or private splicing), the output may not be a file. So abstract out the handling into a specified actor function and put the data in the splice_desc structure earlier, so we can build on top of that. This is the first step in better splice handling for drivers, and also for implementing vmsplice _to_ user memory. Signed-off-by: Jens Axboe --- fs/ocfs2/file.c | 11 +++++-- fs/splice.c | 99 ++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 78 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ac6c96431bbc..93565c03d315 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1636,9 +1636,14 @@ static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, int ret, err; struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; - - ret = __splice_from_pipe(pipe, out, ppos, len, flags, - ocfs2_splice_write_actor); + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .file = out, + }; + + ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); if (ret > 0) { *ppos += ret; diff --git a/fs/splice.c b/fs/splice.c index e7d7080de2f9..68f6328236a6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -668,31 +668,24 @@ out_ret: * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags, splice_actor *actor) +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { int ret, do_wakeup, err; - struct splice_desc sd; ret = 0; do_wakeup = 0; - sd.total_len = len; - sd.flags = flags; - sd.file = out; - sd.pos = *ppos; - for (;;) { if (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; const struct pipe_buf_operations *ops = buf->ops; - sd.len = buf->len; - if (sd.len > sd.total_len) - sd.len = sd.total_len; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - err = actor(pipe, buf, &sd); + err = actor(pipe, buf, sd); if (err <= 0) { if (!ret && err != -ENODATA) ret = err; @@ -704,10 +697,10 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, buf->offset += err; buf->len -= err; - sd.len -= err; - sd.pos += err; - sd.total_len -= err; - if (sd.len) + sd->len -= err; + sd->pos += err; + sd->total_len -= err; + if (sd->len) continue; if (!buf->len) { @@ -719,7 +712,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, do_wakeup = 1; } - if (!sd.total_len) + if (!sd->total_len) break; } @@ -732,7 +725,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (sd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -772,6 +765,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, { ssize_t ret; struct inode *inode = out->f_mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .file = out, + }; /* * The actor worker might be calling ->prepare_write and @@ -780,7 +779,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * pipe->inode, we have to order lock acquiry here. */ inode_double_lock(inode, pipe->inode); - ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor); + ret = __splice_from_pipe(pipe, &sd, actor); inode_double_unlock(inode, pipe->inode); return ret; @@ -804,6 +803,12 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .file = out, + }; ssize_t ret; int err; @@ -811,7 +816,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, if (unlikely(err)) return err; - ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + ret = __splice_from_pipe(pipe, &sd, pipe_to_file); if (ret > 0) { unsigned long nr_pages; @@ -956,14 +961,17 @@ static long do_splice_to(struct file *in, loff_t *ppos, return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) +/* + * Splices from an input file to an actor, using a 'direct' pipe. + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) { struct pipe_inode_info *pipe; long ret, bytes; - loff_t out_off; umode_t i_mode; - int i; + size_t len; + int i, flags; /* * We require the input being a regular file, as we don't want to @@ -999,7 +1007,13 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, */ ret = 0; bytes = 0; - out_off = 0; + len = sd->total_len; + flags = sd->flags; + + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; while (len) { size_t read_len, max_read_len; @@ -1009,19 +1023,19 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, ppos, pipe, max_read_len, flags); + ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; read_len = ret; + sd->total_len = read_len; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, &out_off, read_len, - flags & ~SPLICE_F_NONBLOCK); + ret = actor(pipe, sd); if (unlikely(ret < 0)) goto out_release; @@ -1066,6 +1080,33 @@ out_release: return bytes; return ret; + +} +EXPORT_SYMBOL(splice_direct_to_actor); + +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->file; + + return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); +} + +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .file = out, + }; + size_t ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + *ppos = sd.pos; + return ret; } /* -- cgit v1.2.1 From 6a14b90bb6bc7cd83e2a444bf457a2ea645cbfe7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Jun 2007 13:08:55 +0200 Subject: vmsplice: add vmsplice-to-user support A bit of a cheat, it actually just copies the data to userspace. But this makes the interface nice and symmetric and enables people to build on splice, with room for future improvement in performance. Signed-off-by: Jens Axboe --- fs/ocfs2/file.c | 2 +- fs/splice.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 151 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 93565c03d315..222f108ee454 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1640,7 +1640,7 @@ static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); diff --git a/fs/splice.c b/fs/splice.c index 68f6328236a6..13846f723d72 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -528,7 +528,7 @@ EXPORT_SYMBOL(generic_file_splice_read); static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; loff_t pos = sd->pos; int ret, more; @@ -566,7 +566,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; unsigned int offset, this_len; struct page *page; @@ -769,7 +769,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; /* @@ -807,7 +807,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; ssize_t ret; int err; @@ -1087,7 +1087,7 @@ EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); } @@ -1100,7 +1100,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; size_t ret; @@ -1289,28 +1289,131 @@ static int get_iovec_page_array(const struct iovec __user *iov, return error; } +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + +out: + if (ret > 0) + sd->u.userptr += ret; + buf->ops->unmap(pipe, buf, src); + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = pipe_info(file->f_path.dentry->d_inode); + if (!pipe) + return -EBADF; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (!ret) + ret = error; + + return ret; +} + /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. - * - * Note that vmsplice only supports splicing _from_ user memory to a pipe, - * not the other way around. Splicing from user memory is a simple operation - * that can be supported without any funky alignment restrictions or nasty - * vm tricks. We simply map in the user memory and fill them into a pipe. - * The reverse isn't quite as easy, though. There are two possible solutions - * for that: - * - * - memcpy() the data internally, at which point we might as well just - * do a regular read() on the buffer anyway. - * - Lots of nasty vm tricks, that are neither fast nor flexible (it - * has restriction limitations on both ends of the pipe). - * - * Alas, it isn't here. - * */ -static long do_vmsplice(struct file *file, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct page *pages[PIPE_BUFFERS]; @@ -1325,10 +1428,6 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - if (unlikely(nr_segs > UIO_MAXIOV)) - return -EINVAL; - else if (unlikely(!nr_segs)) - return 0; spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, flags & SPLICE_F_GIFT); @@ -1338,6 +1437,22 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, return splice_to_pipe(pipe, &spd); } +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { @@ -1345,11 +1460,18 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, long error; int fput; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + error = -EBADF; file = fget_light(fd, &fput); if (file) { if (file->f_mode & FMODE_WRITE) - error = do_vmsplice(file, iov, nr_segs, flags); + error = vmsplice_to_pipe(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) + error = vmsplice_to_user(file, iov, nr_segs, flags); fput_light(file, fput); } -- cgit v1.2.1 From 534f2aaa6ab07cd71164180bc958a7dcde41db11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Jun 2007 14:52:37 +0200 Subject: sys_sendfile: switch to using ->splice_read, if available This patch makes sendfile prefer to use ->splice_read(), if it's available in the file_operations structure. Signed-off-by: Jens Axboe --- fs/read_write.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 4d03008f015b..47da8a4a0fbe 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "read_write.h" #include @@ -25,7 +26,7 @@ const struct file_operations generic_ro_fops = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_readonly_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); @@ -708,7 +709,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, struct inode * in_inode, * out_inode; loff_t pos; ssize_t retval; - int fput_needed_in, fput_needed_out; + int fput_needed_in, fput_needed_out, fl; /* * Get input file, and verify that it is ok.. @@ -723,7 +724,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, in_inode = in_file->f_path.dentry->d_inode; if (!in_inode) goto fput_in; - if (!in_file->f_op || !in_file->f_op->sendfile) + if (!in_file->f_op || (!in_file->f_op->sendfile && + !in_file->f_op->splice_read)) goto fput_in; retval = -ESPIPE; if (!ppos) @@ -776,7 +778,21 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count = max - pos; } - retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); + if (in_file->f_op->splice_read) { + fl = 0; +#if 0 + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. + */ + if (in_file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; +#endif + retval = do_splice_direct(in_file, ppos, out_file, count, fl); + } else + retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); if (retval > 0) { add_rchar(current, retval); -- cgit v1.2.1 From 5ffc4ef45b3b0a57872f631b4e4ceb8ace0d7496 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Jun 2007 11:49:19 +0200 Subject: sendfile: remove .sendfile from filesystems that use generic_file_sendfile() They can use generic_file_splice_read() instead. Since sys_sendfile() now prefers that, there should be no change in behaviour. Signed-off-by: Jens Axboe --- fs/adfs/file.c | 2 +- fs/affs/file.c | 2 +- fs/afs/file.c | 2 +- fs/bfs/file.c | 2 +- fs/block_dev.c | 1 - fs/cifs/cifsfs.c | 8 ++++---- fs/coda/file.c | 11 ++++++----- fs/ecryptfs/file.c | 15 ++++++++------- fs/ext2/file.c | 1 - fs/ext3/file.c | 1 - fs/ext4/file.c | 1 - fs/fat/file.c | 2 +- fs/fuse/file.c | 4 ++-- fs/gfs2/ops_file.c | 1 - fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/file.c | 2 +- fs/jffs2/file.c | 2 +- fs/jfs/file.c | 1 - fs/minix/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/ocfs2/file.c | 1 - fs/qnx4/file.c | 2 +- fs/ramfs/file-mmu.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/reiserfs/file.c | 1 - fs/smbfs/file.c | 9 +++++---- fs/sysv/file.c | 2 +- fs/udf/file.c | 2 +- fs/ufs/file.c | 2 +- fs/xfs/linux-2.6/xfs_file.c | 26 -------------------------- fs/xfs/linux-2.6/xfs_linux.h | 1 - fs/xfs/linux-2.6/xfs_lrw.c | 44 -------------------------------------------- fs/xfs/linux-2.6/xfs_lrw.h | 3 --- fs/xfs/linux-2.6/xfs_vnode.h | 6 ------ fs/xfs/xfs_vnodeops.c | 3 --- 37 files changed, 43 insertions(+), 131 deletions(-) (limited to 'fs') diff --git a/fs/adfs/file.c b/fs/adfs/file.c index f544a2855923..36e381c6a99a 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -33,7 +33,7 @@ const struct file_operations adfs_file_operations = { .fsync = file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations adfs_file_inode_operations = { diff --git a/fs/affs/file.c b/fs/affs/file.c index c8796906f584..c314a35f0918 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -35,7 +35,7 @@ const struct file_operations affs_file_operations = { .open = affs_file_open, .release = affs_file_release, .fsync = file_fsync, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations affs_file_inode_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 9c0e721d9fc2..aede7eb66dd4 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -32,7 +32,7 @@ const struct file_operations afs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = afs_file_write, .mmap = generic_file_readonly_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .fsync = afs_fsync, }; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ef4d1fa04e65..24310e9ee05a 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -24,7 +24,7 @@ const struct file_operations bfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb) diff --git a/fs/block_dev.c b/fs/block_dev.c index ea1480a16f51..b3e9bfa748cf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1346,7 +1346,6 @@ const struct file_operations def_blk_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .sendfile = generic_file_sendfile, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7c04752b76cb..8b0cbf4a4ad0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -616,7 +616,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .llseek = cifs_llseek, #ifdef CONFIG_CIFS_POSIX .ioctl = cifs_ioctl, @@ -637,7 +637,7 @@ const struct file_operations cifs_file_direct_ops = { .lock = cifs_lock, .fsync = cifs_fsync, .flush = cifs_flush, - .sendfile = generic_file_sendfile, /* BB removeme BB */ + .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ @@ -656,7 +656,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .llseek = cifs_llseek, #ifdef CONFIG_CIFS_POSIX .ioctl = cifs_ioctl, @@ -676,7 +676,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .release = cifs_close, .fsync = cifs_fsync, .flush = cifs_flush, - .sendfile = generic_file_sendfile, /* BB removeme BB */ + .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 5ef2b609ec7d..99dbe866816d 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -47,8 +47,9 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p } static ssize_t -coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count, - read_actor_t actor, void *target) +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) { struct coda_file_info *cfi; struct file *host_file; @@ -57,10 +58,10 @@ coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count, BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->sendfile) + if (!host_file->f_op || !host_file->f_op->splice_read) return -EINVAL; - return host_file->f_op->sendfile(host_file, ppos, count, actor, target); + return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); } static ssize_t @@ -295,6 +296,6 @@ const struct file_operations coda_file_operations = { .flush = coda_flush, .release = coda_release, .fsync = coda_fsync, - .sendfile = coda_file_sendfile, + .splice_read = coda_file_splice_read, }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 59288d817078..94f456fe4d9b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -338,16 +338,17 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) return rc; } -static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos, - size_t count, read_actor_t actor, void *target) +static ssize_t ecryptfs_splice_read(struct file *file, loff_t * ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) { struct file *lower_file = NULL; int rc = -EINVAL; lower_file = ecryptfs_file_to_lower(file); - if (lower_file->f_op && lower_file->f_op->sendfile) - rc = lower_file->f_op->sendfile(lower_file, ppos, count, - actor, target); + if (lower_file->f_op && lower_file->f_op->splice_read) + rc = lower_file->f_op->splice_read(lower_file, ppos, pipe, + count, flags); return rc; } @@ -364,7 +365,7 @@ const struct file_operations ecryptfs_dir_fops = { .release = ecryptfs_release, .fsync = ecryptfs_fsync, .fasync = ecryptfs_fasync, - .sendfile = ecryptfs_sendfile, + .splice_read = ecryptfs_splice_read, }; const struct file_operations ecryptfs_main_fops = { @@ -381,7 +382,7 @@ const struct file_operations ecryptfs_main_fops = { .release = ecryptfs_release, .fsync = ecryptfs_fsync, .fasync = ecryptfs_fasync, - .sendfile = ecryptfs_sendfile, + .splice_read = ecryptfs_splice_read, }; static int diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 566d4e2d3852..072a1909b2bc 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -53,7 +53,6 @@ const struct file_operations ext2_file_operations = { .open = generic_file_open, .release = ext2_release_file, .fsync = ext2_sync_file, - .sendfile = generic_file_sendfile, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 1e6f13864536..acc4913d3019 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -120,7 +120,6 @@ const struct file_operations ext3_file_operations = { .open = generic_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, - .sendfile = generic_file_sendfile, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3c6c1fd2be90..d4c8186aed64 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -120,7 +120,6 @@ const struct file_operations ext4_file_operations = { .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, - .sendfile = generic_file_sendfile, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/fat/file.c b/fs/fat/file.c index 55d3c7461c5b..69a83b59dce8 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -134,7 +134,7 @@ const struct file_operations fat_file_operations = { .release = fat_file_release, .ioctl = fat_generic_ioctl, .fsync = file_fsync, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; static int fat_cont_expand(struct inode *inode, loff_t size) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index adf7995232b8..f79de7c8cdfa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -802,7 +802,7 @@ static const struct file_operations fuse_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -814,7 +814,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = fuse_file_lock, - /* no mmap and sendfile */ + /* no mmap and splice_read */ }; static const struct address_space_operations fuse_file_aops = { diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 064df8804582..7dc3be108204 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -635,7 +635,6 @@ const struct file_operations gfs2_file_fops = { .release = gfs2_close, .fsync = gfs2_fsync, .lock = gfs2_lock, - .sendfile = generic_file_sendfile, .flock = gfs2_flock, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9a934db0bd8a..bc835f272a6e 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -607,7 +607,7 @@ static const struct file_operations hfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .fsync = file_fsync, .open = hfs_file_open, .release = hfs_file_release, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 45dab5d6cc10..409ce5429c91 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -288,7 +288,7 @@ static const struct file_operations hfsplus_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .fsync = file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 8286491dbf31..c77862032e84 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -390,7 +390,7 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, .read = do_sync_read, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .write = do_sync_write, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b4eafc0f1e54..5b53e5c5d8df 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -129,7 +129,7 @@ const struct file_operations hpfs_file_ops = .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 99871279a1ed..c2530197be0c 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -47,7 +47,7 @@ const struct file_operations jffs2_file_operations = .ioctl = jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, - .sendfile = generic_file_sendfile + .splice_read = generic_file_splice_read, }; /* jffs2_file_inode_operations */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index f7f8eff19b7b..87eb93694af7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -108,7 +108,6 @@ const struct file_operations jfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .fsync = jfs_fsync, diff --git a/fs/minix/file.c b/fs/minix/file.c index f92baa1d7570..17765f697e50 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -23,7 +23,7 @@ const struct file_operations minix_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .fsync = minix_sync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations minix_file_inode_operations = { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7ed56390b582..ffcc504a1667 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2276,7 +2276,7 @@ const struct file_operations ntfs_file_ops = { mounted filesystem. */ .mmap = generic_file_mmap, /* Mmap file. */ .open = ntfs_file_open, /* Open file. */ - .sendfile = generic_file_sendfile, /* Zero-copy data send with + .splice_read = generic_file_splice_read /* Zero-copy data send with the data source being on the ntfs partition. We do not need to care about the diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 222f108ee454..ed1ffa70cc38 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1822,7 +1822,6 @@ const struct inode_operations ocfs2_special_file_iops = { const struct file_operations ocfs2_fops = { .read = do_sync_read, .write = do_sync_write, - .sendfile = generic_file_sendfile, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index 44649981bbc8..867f42b02035 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -25,7 +25,7 @@ const struct file_operations qnx4_file_operations = .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, #ifdef CONFIG_QNX4FS_RW .write = do_sync_write, .aio_write = generic_file_aio_write, diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 2f14774a124f..97bdc0b2f9d2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -41,7 +41,7 @@ const struct file_operations ramfs_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .fsync = simple_sync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5d258c40a2fd..cad2b7ace630 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .fsync = simple_sync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 9e451a68580f..30eebfb1b2d8 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1531,7 +1531,6 @@ const struct file_operations reiserfs_file_operations = { .open = generic_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, - .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index aea3f8aa54c0..c5d78a7e492b 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -262,8 +262,9 @@ out: } static ssize_t -smb_file_sendfile(struct file *file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) +smb_file_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) { struct dentry *dentry = file->f_path.dentry; ssize_t status; @@ -277,7 +278,7 @@ smb_file_sendfile(struct file *file, loff_t *ppos, DENTRY_PATH(dentry), status); goto out; } - status = generic_file_sendfile(file, ppos, count, actor, target); + status = generic_file_splice_read(file, ppos, pipe, count, flags); out: return status; } @@ -416,7 +417,7 @@ const struct file_operations smb_file_operations = .open = smb_file_open, .release = smb_file_release, .fsync = smb_fsync, - .sendfile = smb_file_sendfile, + .splice_read = smb_file_splice_read, }; const struct inode_operations smb_file_inode_operations = diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 0732ddb9020b..589be21d884e 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -27,7 +27,7 @@ const struct file_operations sysv_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .fsync = sysv_sync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations sysv_file_inode_operations = { diff --git a/fs/udf/file.c b/fs/udf/file.c index 51b5764685e7..df070bee8d4f 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -261,7 +261,7 @@ const struct file_operations udf_file_operations = { .aio_write = udf_file_aio_write, .release = udf_release_file, .fsync = udf_fsync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; const struct inode_operations udf_file_inode_operations = { diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 1e096323bad4..6705d74c6d2d 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -60,5 +60,5 @@ const struct file_operations ufs_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .fsync = ufs_sync_file, - .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, }; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index cb51dc961355..8c43cd2e237a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -123,30 +123,6 @@ xfs_file_aio_write_invis( return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); } -STATIC ssize_t -xfs_file_sendfile( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) -{ - return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode), - filp, pos, 0, count, actor, target, NULL); -} - -STATIC ssize_t -xfs_file_sendfile_invis( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) -{ - return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode), - filp, pos, IO_INVIS, count, actor, target, NULL); -} - STATIC ssize_t xfs_file_splice_read( struct file *infilp, @@ -452,7 +428,6 @@ const struct file_operations xfs_file_operations = { .write = do_sync_write, .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, - .sendfile = xfs_file_sendfile, .splice_read = xfs_file_splice_read, .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, @@ -475,7 +450,6 @@ const struct file_operations xfs_invis_file_operations = { .write = do_sync_write, .aio_read = xfs_file_aio_read_invis, .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile_invis, .splice_read = xfs_file_splice_read_invis, .splice_write = xfs_file_splice_write_invis, .unlocked_ioctl = xfs_file_ioctl_invis, diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 715adad7dd4d..af24a457d3a3 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -101,7 +101,6 @@ * Feature macros (disable/enable) */ #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ -#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ #define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index ed90403f0ee7..765ec16a6e39 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -286,50 +286,6 @@ xfs_read( return ret; } -ssize_t -xfs_sendfile( - bhv_desc_t *bdp, - struct file *filp, - loff_t *offset, - int ioflags, - size_t count, - read_actor_t actor, - void *target, - cred_t *credp) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_READ; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - *offset, count, - FILP_DELAY_FLAG(filp), &locktype); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, - (void *)(unsigned long)target, count, *offset, ioflags); - ret = generic_file_sendfile(filp, offset, count, actor, target); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - ssize_t xfs_splice_read( bhv_desc_t *bdp, diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 7ac51b1d2161..7c60a1eed88b 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -90,9 +90,6 @@ extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, const struct iovec *, unsigned int, loff_t *, int, struct cred *); -extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index d1b2d01843d1..013048a92643 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -139,9 +139,6 @@ typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, const struct iovec *, unsigned int, loff_t *, int, struct cred *); -typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); @@ -206,7 +203,6 @@ typedef struct bhv_vnodeops { vop_close_t vop_close; vop_read_t vop_read; vop_write_t vop_write; - vop_sendfile_t vop_sendfile; vop_splice_read_t vop_splice_read; vop_splice_write_t vop_splice_write; vop_ioctl_t vop_ioctl; @@ -254,8 +250,6 @@ typedef struct bhv_vnodeops { VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) #define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \ VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) -#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr) \ - VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr) #define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr) \ VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) #define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr) \ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de17aed578f0..70bc82f65311 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -4680,9 +4680,6 @@ bhv_vnodeops_t xfs_vnodeops = { .vop_open = xfs_open, .vop_close = xfs_close, .vop_read = xfs_read, -#ifdef HAVE_SENDFILE - .vop_sendfile = xfs_sendfile, -#endif #ifdef HAVE_SPLICE .vop_splice_read = xfs_splice_read, .vop_splice_write = xfs_splice_write, -- cgit v1.2.1 From f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Jun 2007 11:51:43 +0200 Subject: sendfile: convert nfs to using splice_read() Acked-by: Trond Myklebust Signed-off-by: Jens Axboe --- fs/nfs/file.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9eb8eb4e4a08..8689b736fdd9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -41,7 +41,9 @@ static int nfs_file_open(struct inode *, struct file *); static int nfs_file_release(struct inode *, struct file *); static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); static int nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); +static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags); static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, @@ -65,7 +67,7 @@ const struct file_operations nfs_file_operations = { .fsync = nfs_fsync, .lock = nfs_lock, .flock = nfs_flock, - .sendfile = nfs_file_sendfile, + .splice_read = nfs_file_splice_read, .check_flags = nfs_check_flags, }; @@ -224,20 +226,21 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, - read_actor_t actor, void *target) +nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; ssize_t res; - dfprintk(VFS, "nfs: sendfile(%s/%s, %lu@%Lu)\n", + dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) - res = generic_file_sendfile(filp, ppos, count, actor, target); + res = generic_file_splice_read(filp, ppos, pipe, count, flags); return res; } -- cgit v1.2.1 From cf8208d0eabd1d5d2625ec02a175a294c3f30d36 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 21:22:14 +0200 Subject: sendfile: convert nfsd to splice_direct_to_actor() Signed-off-by: Jens Axboe --- fs/nfsd/vfs.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7e6aa245b5d5..15471a9efe0e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -801,26 +801,32 @@ found: } /* - * Grab and keep cached pages assosiated with a file in the svc_rqst - * so that they can be passed to the netowork sendmsg/sendpage routines - * directrly. They will be released after the sending has completed. + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. */ static int -nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size) +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - unsigned long count = desc->count; - struct svc_rqst *rqstp = desc->arg.data; + struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_respages + rqstp->rq_resused; + struct page *page = buf->page; + size_t size; + int ret; + + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; - if (size > count) - size = count; + size = sd->len; if (rqstp->rq_res.page_len == 0) { get_page(page); put_page(*pp); *pp = page; rqstp->rq_resused++; - rqstp->rq_res.page_base = offset; + rqstp->rq_res.page_base = buf->offset; rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { get_page(page); @@ -832,11 +838,15 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset } else rqstp->rq_res.page_len += size; - desc->count = count - size; - desc->written += size; return size; } +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -861,10 +871,15 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (ra && ra->p_set) file->f_ra = ra->p_ra; - if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { - rqstp->rq_resused = 1; - host_err = file->f_op->sendfile(file, &offset, *count, - nfsd_read_actor, rqstp); + if (file->f_op->splice_read && rqstp->rq_splice_ok) { + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { oldfs = get_fs(); set_fs(KERNEL_DS); -- cgit v1.2.1 From d6b29d7cee064f28ca097e906de7453541351095 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Jun 2007 09:59:47 +0200 Subject: splice: divorce the splice structure/function definitions from the pipe header We need to move even more stuff into the header so that folks can use the splice_to_pipe() implementation instead of open-coding a lot of pipe knowledge (see relay implementation), so move to our own header file finally. Signed-off-by: Jens Axboe --- fs/nfsd/vfs.c | 2 +- fs/ocfs2/file.c | 2 +- fs/read_write.c | 2 +- fs/splice.c | 26 +++++--------------------- 4 files changed, 8 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15471a9efe0e..8176fbf5c006 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed1ffa70cc38..44c2e2afa465 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 47da8a4a0fbe..2527cf035b0e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "read_write.h" #include diff --git a/fs/splice.c b/fs/splice.c index 13846f723d72..bea9f1581ca0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -29,22 +29,6 @@ #include #include -struct partial_page { - unsigned int offset; - unsigned int len; -}; - -/* - * Passed to splice_to_pipe - */ -struct splice_pipe_desc { - struct page **pages; /* page map */ - struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ - unsigned int flags; /* splice flags */ - const struct pipe_buf_operations *ops;/* ops associated with output pipe */ -}; - /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the @@ -170,11 +154,11 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { }; /* - * Pipe output worker. This sets up our pipe format with the page cache - * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). + * Pipe output worker. This fills a pipe with the information contained + * from splice_pipe_desc(). */ -static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; -- cgit v1.2.1 From 497f9625c2bbd6a8525fb2eedb22a382a6a8253c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Jun 2007 12:00:45 +0200 Subject: pipe: allow passing around of ops private pointer relay needs this for proper consumption handling, and the network receive support needs it as well to lookup the sk_buff on pipe release. Signed-off-by: Jens Axboe --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index bea9f1581ca0..00850e56280d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; if (spd->flags & SPLICE_F_GIFT) buf->flags |= PIPE_BUF_FLAG_GIFT; -- cgit v1.2.1 From d6f517568f9f5c26e7404a336c7289d5b4b293ec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Jun 2007 10:25:05 +0200 Subject: sendfile: remove bad_sendfile() from bad_file_ops do_sendfile() prefers splice over sendfile, so it should not trigger (directly, at least). Signed-off-by: Jens Axboe --- fs/bad_inode.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 329ee473eede..521ff7caadbd 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -114,12 +114,6 @@ static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) return -EIO; } -static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - return -EIO; -} - static ssize_t bad_file_sendpage(struct file *file, struct page *page, int off, size_t len, loff_t *pos, int more) { @@ -182,7 +176,6 @@ static const struct file_operations bad_file_ops = .aio_fsync = bad_file_aio_fsync, .fasync = bad_file_fasync, .lock = bad_file_lock, - .sendfile = bad_file_sendfile, .sendpage = bad_file_sendpage, .get_unmapped_area = bad_file_get_unmapped_area, .check_flags = bad_file_check_flags, -- cgit v1.2.1 From 932cc6d4f7c35bbf70bce8cc865b6033ff49c9c0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Jun 2007 13:10:21 +0200 Subject: splice: completely document external interface with kerneldoc Also add fs/splice.c as a kerneldoc target with a smaller blurb that should be expanded to better explain the overview of splice. Signed-off-by: Jens Axboe --- fs/splice.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 00850e56280d..d257d666358b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -153,9 +153,16 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -/* - * Pipe output worker. This fills a pipe with the information contained - * from splice_pipe_desc(). +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tupples, a long with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. + * */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) @@ -280,11 +287,6 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); - /* - * Now fill in the holes: - */ - error = 0; - /* * Lookup the (hopefully) full range of pages we need. */ @@ -292,8 +294,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, /* * If find_get_pages_contig() returned fewer pages than we needed, - * allocate the rest. + * allocate the rest and fill in the holes. */ + error = 0; index += spd.nr_pages; while (spd.nr_pages < nr_pages) { /* @@ -455,11 +458,16 @@ fill_it: /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from + * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * - * Will read pages from given file and fill them into a pipe. + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -648,10 +656,18 @@ out_ret: return ret; } -/* - * Pipe input worker. Most of this logic works like a regular pipe, the - * key here is the 'actor' worker passed in that actually moves the data - * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) @@ -744,6 +760,20 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, } EXPORT_SYMBOL(__splice_from_pipe); +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the input and output inodes, + * otherwise it's identical to __splice_from_pipe(). + * + */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) @@ -774,12 +804,14 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * generic_file_splice_write_nolock - generic_file_splice_write without mutexes * @pipe: pipe info * @out: file to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. The caller is responsible + * for acquiring i_mutex on both inodes. * */ ssize_t @@ -831,11 +863,13 @@ EXPORT_SYMBOL(generic_file_splice_write_nolock); * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. * */ ssize_t @@ -886,13 +920,15 @@ EXPORT_SYMBOL(generic_file_splice_write); /** * generic_splice_sendpage - splice data from a pipe to a socket - * @inode: pipe inode + * @pipe: pipe to splice from * @out: socket to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will send @len bytes from the pipe to a network socket. No data copying - * is involved. + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, @@ -946,8 +982,18 @@ static long do_splice_to(struct file *in, loff_t *ppos, return in->f_op->splice_read(in, ppos, pipe, len, flags); } -/* - * Splices from an input file to an actor, using a 'direct' pipe. +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the life time of + * that process. + * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) @@ -1077,6 +1123,21 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); } +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, size_t len, unsigned int flags) { -- cgit v1.2.1 From d054fe3d10cc1f9aec01378c38caa32dffdd0090 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 15 Jun 2007 08:16:22 +0200 Subject: xip sendfile removal This patch removes xip_file_sendfile, the sendfile implementation for xip without replacement. Those customers that use xip on s390 are not using sendfile() as far as we know, and so far s390 is the only platform this could potentially be used on so far. Having sendfile is not a popular feature for execute in place file systems, however we have a working implementation of splice_read() based on fs/splice.c if anyone asks for it. At this point in time, it does not seem preferable to merge splice_read() for xip because it causes extra maintenence effort due to code duplication and it requires struct page behind the xip memory segment. We'd like to get rid of that in favor of supporting flash based embedded platforms (Monta Vista work) soon. Signed-off-by: Carsten Otte Signed-off-by: Jens Axboe --- fs/ext2/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 072a1909b2bc..04afeecaaef3 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -70,7 +70,6 @@ const struct file_operations ext2_xip_file_operations = { .open = generic_file_open, .release = ext2_release_file, .fsync = ext2_sync_file, - .sendfile = xip_file_sendfile, }; #endif -- cgit v1.2.1 From d96e6e71647846e0dab097efd9b8bf3a3a556dca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Jun 2007 12:18:52 +0200 Subject: Remove remnants of sendfile() There are now zero users of .sendfile() in the kernel, so kill it from the file_operations structure and in do_sendfile(). Signed-off-by: Jens Axboe --- fs/read_write.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 2527cf035b0e..507ddff48a9a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -724,8 +724,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, in_inode = in_file->f_path.dentry->d_inode; if (!in_inode) goto fput_in; - if (!in_file->f_op || (!in_file->f_op->sendfile && - !in_file->f_op->splice_read)) + if (!in_file->f_op || !in_file->f_op->splice_read) goto fput_in; retval = -ESPIPE; if (!ppos) @@ -778,21 +777,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count = max - pos; } - if (in_file->f_op->splice_read) { - fl = 0; + fl = 0; #if 0 - /* - * We need to debate whether we can enable this or not. The - * man page documents EAGAIN return for the output at least, - * and the application is arguably buggy if it doesn't expect - * EAGAIN on a non-blocking file descriptor. - */ - if (in_file->f_flags & O_NONBLOCK) - fl = SPLICE_F_NONBLOCK; + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. + */ + if (in_file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in_file, ppos, out_file, count, fl); - } else - retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); + retval = do_splice_direct(in_file, ppos, out_file, count, fl); if (retval > 0) { add_rchar(current, retval); -- cgit v1.2.1 From cac36bb06efe4880234524e117e0e712b10b1f16 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Jun 2007 13:10:48 +0200 Subject: pipe: change the ->pin() operation to ->confirm() The name 'pin' was badly chosen, it doesn't pin a pipe buffer in the most commonly used sense in the kernel. So change the name to 'confirm', after debating this issue with Hugh Dickins a bit. A good return from ->confirm() means that the buffer is really there, and that the contents are good. Signed-off-by: Jens Axboe --- fs/nfsd/vfs.c | 2 +- fs/ocfs2/file.c | 4 ++-- fs/pipe.c | 9 +++++---- fs/splice.c | 14 +++++++------- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8176fbf5c006..8604e35bd48e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -815,7 +815,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size_t size; int ret; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 44c2e2afa465..4979b6675717 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1583,7 +1583,7 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, ssize_t copied = 0; struct ocfs2_splice_write_priv sp; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (ret) goto out; @@ -1604,7 +1604,7 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, * might enter ocfs2_buffered_write_cluster() more * than once, so keep track of our progress here. */ - copied = ocfs2_buffered_write_cluster(sd->file, + copied = ocfs2_buffered_write_cluster(sd->u.file, (loff_t)sd->pos + total, count, ocfs2_map_and_write_splice_data, diff --git a/fs/pipe.c b/fs/pipe.c index 3a89592bdf57..3694af10dd2c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -203,7 +203,8 @@ void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) page_cache_get(buf->page); } -int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) +int generic_pipe_buf_confirm(struct pipe_inode_info *info, + struct pipe_buffer *buf) { return 0; } @@ -212,7 +213,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -252,7 +253,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = ops->pin(pipe, buf); + error = ops->confirm(pipe, buf); if (error) { if (!ret) error = ret; @@ -373,7 +374,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int error, atomic = 1; void *addr; - error = ops->pin(pipe, buf); + error = ops->confirm(pipe, buf); if (error) goto out; diff --git a/fs/splice.c b/fs/splice.c index d257d666358b..c804121601b0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -85,8 +85,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -127,7 +127,7 @@ static const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = page_cache_pipe_buf_pin, + .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -147,7 +147,7 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -525,7 +525,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, loff_t pos = sd->pos; int ret, more; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; @@ -569,7 +569,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * make sure the data in this buffer is uptodate */ - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; @@ -1341,7 +1341,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, char *src; int ret; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; -- cgit v1.2.1 From 0845718dafea3e16041d270c256e8516acf4e13d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 20:51:32 +0200 Subject: pipe: add documentation and comments As per Andrew Mortons request, here's a set of documentation for the generic pipe_buf_operations hooks, the pipe, and pipe_buffer structures. Signed-off-by: Jens Axboe --- fs/pipe.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/splice.c | 4 ++++ 2 files changed, 64 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3694af10dd2c..d007830d9c87 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -164,6 +164,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } +/** + * generic_pipe_buf_map - virtually map a pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be mapped + * @atomic: whether to use an atomic map + * + * Description: + * This function returns a kernel virtual address mapping for the + * passed in @pipe_buffer. If @atomic is set, an atomic map is provided + * and the caller has to be careful not to fault before calling + * the unmap function. + * + * Note that this function occupies KM_USER0 if @atomic != 0. + */ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, struct pipe_buffer *buf, int atomic) { @@ -175,6 +189,15 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, return kmap(buf->page); } +/** + * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be unmapped + * @map_data: the data that the mapping function returned + * + * Description: + * This function undoes the mapping that ->map() provided. + */ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *map_data) { @@ -185,11 +208,28 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, kunmap(buf->page); } +/** + * generic_pipe_buf_steal - attempt to take ownership of a @pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + * + * Description: + * This function attempts to steal the @struct page attached to + * @buf. If successful, this function returns 0 and returns with + * the page locked. The caller may then reuse the page for whatever + * he wishes, the typical use is insertion into a different file + * page cache. + */ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; + /* + * A reference of one is golden, that means that the owner of this + * page is the only one holding a reference to it. lock the page + * and return OK. + */ if (page_count(page) == 1) { lock_page(page); return 0; @@ -198,11 +238,30 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } -void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +/** + * generic_pipe_buf_get - get a reference to a @struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + * + * Description: + * This function grabs an extra reference to @buf. It's used in + * in the tee() system call, when we duplicate the buffers in one + * pipe into another. + */ +void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_get(buf->page); } +/** + * generic_pipe_buf_confirm - verify contents of the pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to confirm + * + * Description: + * This function does nothing, because the generic pipe code uses + * pages that are always good when inserted into the pipe. + */ int generic_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) { diff --git a/fs/splice.c b/fs/splice.c index c804121601b0..ed2ce995475c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -85,6 +85,10 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, buf->flags &= ~PIPE_BUF_FLAG_LRU; } +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { -- cgit v1.2.1 From bf1a95a225c1349846274caebdca1bb8d20d5853 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 10 Jul 2007 11:24:11 +0200 Subject: [S390] fixed cdl-format detection. CDL formated DASDs are now detected correctly even if no VOL1 label is on the disk. This prevents possible loss of data. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- fs/partitions/ibm.c | 167 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 9f7ad4244f63..1e064c4a4f86 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -45,7 +45,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) { int blocksize, offset, size,res; loff_t i_size; - dasd_information_t *info; + dasd_information2_t *info; struct hd_geometry *geo; char type[5] = {0,}; char name[7] = {0,}; @@ -64,14 +64,17 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) if (i_size == 0) goto out_exit; - if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) + info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); + if (info == NULL) goto out_exit; - if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) + geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); + if (geo == NULL) goto out_nogeo; - if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL) + label = kmalloc(sizeof(union label_t), GFP_KERNEL); + if (label == NULL) goto out_nolab; - if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || + if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) goto out_freeall; @@ -96,84 +99,108 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) res = 1; /* - * Three different types: CMS1, VOL1 and LNX1/unlabeled + * Three different formats: LDL, CDL and unformated disk + * + * identified by info->format + * + * unformated disks we do not have to care about */ - if (strncmp(type, "CMS1", 4) == 0) { - /* - * VM style CMS1 labeled disk - */ - if (label->cms.disk_offset != 0) { - printk("CMS1/%8s(MDSK):", name); - /* disk is reserved minidisk */ - blocksize = label->cms.block_size; - offset = label->cms.disk_offset; - size = (label->cms.block_count - 1) * (blocksize >> 9); + if (info->format == DASD_FORMAT_LDL) { + if (strncmp(type, "CMS1", 4) == 0) { + /* + * VM style CMS1 labeled disk + */ + if (label->cms.disk_offset != 0) { + printk("CMS1/%8s(MDSK):", name); + /* disk is reserved minidisk */ + blocksize = label->cms.block_size; + offset = label->cms.disk_offset; + size = (label->cms.block_count - 1) + * (blocksize >> 9); + } else { + printk("CMS1/%8s:", name); + offset = (info->label_block + 1); + size = i_size >> 9; + } } else { - printk("CMS1/%8s:", name); + /* + * Old style LNX1 or unlabeled disk + */ + if (strncmp(type, "LNX1", 4) == 0) + printk ("LNX1/%8s:", name); + else + printk("(nonl)"); offset = (info->label_block + 1); size = i_size >> 9; } put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); - } else if ((strncmp(type, "VOL1", 4) == 0) && - (!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { + size-offset*(blocksize >> 9)); + } else if (info->format == DASD_FORMAT_CDL) { /* - * New style VOL1 labeled disk + * New style CDL formatted disk */ unsigned int blk; int counter; - printk("VOL1/%8s:", name); - - /* get block number and read then go through format1 labels */ - blk = cchhb2blk(&label->vol.vtoc, geo) + 1; - counter = 0; - while ((data = read_dev_sector(bdev, blk*(blocksize/512), - §)) != NULL) { - struct vtoc_format1_label f1; - - memcpy(&f1, data, sizeof(struct vtoc_format1_label)); - put_dev_sector(sect); - - /* skip FMT4 / FMT5 / FMT7 labels */ - if (f1.DS1FMTID == _ascebc['4'] - || f1.DS1FMTID == _ascebc['5'] - || f1.DS1FMTID == _ascebc['7']) { - blk++; - continue; - } - - /* only FMT1 valid at this point */ - if (f1.DS1FMTID != _ascebc['1']) - break; - - /* OK, we got valid partition data */ - offset = cchh2blk(&f1.DS1EXT1.llimit, geo); - size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - - offset + geo->sectors; - if (counter >= state->limit) - break; - put_partition(state, counter + 1, - offset * (blocksize >> 9), - size * (blocksize >> 9)); - counter++; - blk++; - } - if (!data) - /* Are we not supposed to report this ? */ - goto out_readerr; - } else { /* - * Old style LNX1 or unlabeled disk + * check if VOL1 label is available + * if not, something is wrong, skipping partition detection */ - if (strncmp(type, "LNX1", 4) == 0) - printk ("LNX1/%8s:", name); - else - printk("(nonl)/%8s:", name); - offset = (info->label_block + 1); - size = i_size >> 9; - put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); + if (strncmp(type, "VOL1", 4) == 0) { + printk("VOL1/%8s:", name); + /* + * get block number and read then go through format1 + * labels + */ + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; + counter = 0; + data = read_dev_sector(bdev, blk * (blocksize/512), + §); + while (data != NULL) { + struct vtoc_format1_label f1; + + memcpy(&f1, data, + sizeof(struct vtoc_format1_label)); + put_dev_sector(sect); + + /* skip FMT4 / FMT5 / FMT7 labels */ + if (f1.DS1FMTID == _ascebc['4'] + || f1.DS1FMTID == _ascebc['5'] + || f1.DS1FMTID == _ascebc['7']) { + blk++; + data = read_dev_sector(bdev, blk * + (blocksize/512), + §); + continue; + } + + /* only FMT1 valid at this point */ + if (f1.DS1FMTID != _ascebc['1']) + break; + + /* OK, we got valid partition data */ + offset = cchh2blk(&f1.DS1EXT1.llimit, geo); + size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - + offset + geo->sectors; + if (counter >= state->limit) + break; + put_partition(state, counter + 1, + offset * (blocksize >> 9), + size * (blocksize >> 9)); + counter++; + blk++; + data = read_dev_sector(bdev, + blk * (blocksize/512), + §); + } + + if (!data) + /* Are we not supposed to report this ? */ + goto out_readerr; + } else + printk(KERN_WARNING "Warning, expected Label VOL1 not " + "found, treating as CDL formated Disk"); + } printk("\n"); -- cgit v1.2.1 From 3ebf44902f77537b5784eb5059c2b78d8b5a920a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 10 Jul 2007 12:28:27 +0100 Subject: [GFS2] Accept old format NFS filehandles On Tue, 2007-07-10 at 10:06 +0100, Christoph Hellwig wrote: > > -#define GFS2_LARGE_FH_SIZE 10 > > - > > -struct gfs2_fh_obj { > > - struct gfs2_inum_host this; > > - u32 imode; > > -}; > > +#define GFS2_LARGE_FH_SIZE 8 > > Because gfs2_decode_fh only accepts file handles with GFS2_LARGE_FH_SIZE > or GFS2_LARGE_FH_SIZE you don't accept filehandles sent out by and older > gfs version anymore. Stale filehandles because of a new kernel version > are a big no-no, so please add back code to handle the old filehandles > on the decode side. > This should fix that problem I think since its only relating to end of the fh we can just ignore that field in order to accept the older format. Signed-off-by: Steven Whitehouse Cc: Christoph Hellwig Cc: Wendy Cheng --- fs/gfs2/ops_export.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index e317db2a5548..99ea5659bc2c 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -28,6 +28,7 @@ #define GFS2_SMALL_FH_SIZE 4 #define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 static struct dentry *gfs2_decode_fh(struct super_block *sb, __u32 *p, @@ -44,6 +45,7 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb, switch (fh_len) { case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; parent.no_formal_ino |= be32_to_cpu(fh[5]); parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; -- cgit v1.2.1 From b23cdde4c6240d70bb3d2e3c4046b60d6f6c8451 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 22 Jun 2007 13:07:02 -0700 Subject: configfs: consistent attribute size The attribute store/show code currently limits attributes at PAGE_SIZE. This code comes from sysfs, where it still works that way. However, PAGE_SIZE is not constant. A 16k attribute string works on ia64 but not on x86. Really a subsystem shouldn't allow different attribute sizes based on platform. As such, limit all simple attributes to 4k. This works on all platforms, and is consistent with all current code. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/file.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3527c7c6def8..0f4b65e85245 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -33,6 +33,13 @@ #include #include "configfs_internal.h" +/* + * A simple attribute can only be 4096 characters. Why 4k? Because the + * original code limited it to PAGE_SIZE. That's a bad idea, though, + * because an attribute of 16k on ia64 won't work on x86. So we limit to + * 4k, our minimum common page size. + */ +#define SIMPLE_ATTR_SIZE 4096 struct configfs_buffer { size_t count; @@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf count = ops->show_attribute(item,attr,buffer->page); buffer->needs_read_fill = 0; - BUG_ON(count > (ssize_t)PAGE_SIZE); + BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); if (count >= 0) buffer->count = count; else @@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size if (!buffer->page) return -ENOMEM; - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; + if (count >= SIMPLE_ATTR_SIZE) + count = SIMPLE_ATTR_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, -- cgit v1.2.1 From 4c62b53454a83178676e5ecae6665447d363c7b4 Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Wed, 27 Jun 2007 16:02:14 +0530 Subject: configfs: misc cleanups 1. item.c:config_item_cleanup() is a private function (only called by config_item_release() in same file). However, it is spuriously exported in include/linux/configfs.h, so remove that export and make it static in item.c. Also, it is no longer exported / interface function, so no need to give comment for this function (the comment was stating obvious thing, anyway). 2. Kernel-doc comment format does not allow empty line between end of comment and start of function (declaration line). There were several such spurious empty lines in item.c, so fix them. fs/configfs/item.c | 15 +++------------ include/linux/configfs.h | 1 - 2 files changed, 3 insertions(+), 13 deletions(-) Signed-off-by: Satyam Sharma Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/item.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 24421209f854..b762bbeaa0be 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -62,7 +62,6 @@ void config_item_init(struct config_item * item) * dynamically allocated string that @item->ci_name points to. * Otherwise, use the static @item->ci_namebuf array. */ - int config_item_set_name(struct config_item * item, const char * fmt, ...) { int error = 0; @@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item) return item; } -/** - * config_item_cleanup - free config_item resources. - * @item: item. - */ - -void config_item_cleanup(struct config_item * item) +static void config_item_cleanup(struct config_item * item) { struct config_item_type * t = item->ci_type; struct config_group * s = item->ci_group; @@ -179,12 +173,10 @@ void config_item_put(struct config_item * item) kref_put(&item->ci_kref, config_item_release); } - /** * config_group_init - initialize a group for use * @k: group */ - void config_group_init(struct config_group *group) { config_item_init(&group->cg_item); @@ -201,8 +193,8 @@ void config_group_init(struct config_group *group) * looking for a matching config_item. If matching item is found * take a reference and return the item. */ - -struct config_item * config_group_find_obj(struct config_group * group, const char * name) +struct config_item *config_group_find_obj(struct config_group *group, + const char * name) { struct list_head * entry; struct config_item * ret = NULL; @@ -219,7 +211,6 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch return ret; } - EXPORT_SYMBOL(config_item_init); EXPORT_SYMBOL(config_group_init); EXPORT_SYMBOL(config_item_get); -- cgit v1.2.1 From 9b1d9aa4e9c5cafe73b9df21d758b50b5d75264d Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Wed, 4 Jul 2007 16:37:06 +0530 Subject: [PATCH] configfs+dlm: Separate out __CONFIGFS_ATTR into configfs.h fs/dlm/config.c contains a useful generic macro called __CONFIGFS_ATTR that is similar to sysfs' __ATTR macro that makes defining attributes easy for any user of configfs. Separate it out into configfs.h so that other users (forthcoming in dynamic netconsole patchset) can use it too. Signed-off-by: Satyam Sharma Cc: David Teigland Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/dlm/config.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 5069b2cb5a1f..e47eb42406fa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, return len; } -#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \ - .attr = { .ca_name = __stringify(_name), \ - .ca_mode = _mode, \ - .ca_owner = THIS_MODULE }, \ - .show = _read, \ - .store = _write, \ -} - #define CLUSTER_ATTR(name, check_zero) \ static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ { \ -- cgit v1.2.1 From 3fe6c5ce1176cf661dbe71fc43b627c1a742a89a Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Wed, 4 Jul 2007 16:37:16 +0530 Subject: [PATCH] configfs+dlm: Rename config_group_find_obj and state semantics clearly Configfs being based upon sysfs code, config_group_find_obj() is probably so named because of the similar kset_find_obj() in sysfs. However, "kobject"s in sysfs become "config_item"s in configfs, so let's call it config_group_find_item() instead, for sake of uniformity, and make corresponding change in the users of this function. BTW a crucial difference between kset_find_obj and config_group_find_item is in locking expectations. kset_find_obj does its locking by itself, but config_group_find_item expects the *caller* to do the locking. The reason for this: kset's have their own locks, config_group's don't but instead rely on the subsystem mutex. And, subsystem needn't necessarily be around when config_group_find_item() is called. So let's state these locking semantics explicitly, and rectify the comment, otherwise bugs could continue to occur in future, as they did in the past (refer commit d82b8191e238 in gfs2-2.6-fixes.git). [ I also took the opportunity to fix some bad whitespace and double-empty lines. --Joel ] [ Conflict in fs/dlm/config.c with commit 3168b0780d06ace875696f8a648d04d6089654e5 manually resolved. --Mark ] Signed-off-by: Satyam Sharma Cc: David Teigland Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/item.c | 18 ++++++++---------- fs/dlm/config.c | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/configfs/item.c b/fs/configfs/item.c index b762bbeaa0be..76dc4c3e5d51 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -183,27 +183,25 @@ void config_group_init(struct config_group *group) INIT_LIST_HEAD(&group->cg_children); } - /** - * config_group_find_obj - search for item in group. + * config_group_find_item - search for item in group. * @group: group we're looking in. * @name: item's name. * - * Lock group via @group->cg_subsys, and iterate over @group->cg_list, - * looking for a matching config_item. If matching item is found - * take a reference and return the item. + * Iterate over @group->cg_list, looking for a matching config_item. + * If matching item is found take a reference and return the item. + * Caller must have locked group via @group->cg_subsys->su_mtx. */ -struct config_item *config_group_find_obj(struct config_group *group, - const char * name) +struct config_item *config_group_find_item(struct config_group *group, + const char *name) { struct list_head * entry; struct config_item * ret = NULL; - /* XXX LOCKING! */ list_for_each(entry,&group->cg_children) { struct config_item * item = to_item(entry); if (config_item_name(item) && - !strcmp(config_item_name(item), name)) { + !strcmp(config_item_name(item), name)) { ret = config_item_get(item); break; } @@ -215,4 +213,4 @@ EXPORT_SYMBOL(config_item_init); EXPORT_SYMBOL(config_group_init); EXPORT_SYMBOL(config_item_get); EXPORT_SYMBOL(config_item_put); -EXPORT_SYMBOL(config_group_find_obj); +EXPORT_SYMBOL(config_group_find_item); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index e47eb42406fa..4348cb42cf17 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -752,7 +752,7 @@ static struct space *get_space(char *name) return NULL; down(&space_list->cg_subsys->su_sem); - i = config_group_find_obj(space_list, name); + i = config_group_find_item(space_list, name); up(&space_list->cg_subsys->su_sem); return to_space(i); -- cgit v1.2.1 From e6bd07aee739566803425acdbf5cdb29919164e1 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 6 Jul 2007 23:33:17 -0700 Subject: configfs: Convert subsystem semaphore to mutex Convert the su_sem member of struct configfs_subsystem to a struct mutex, as that's what it is. Also convert all the users and update Documentation/configfs.txt and Documentation/configfs_example.c accordingly. [ Conflict in fs/dlm/config.c with commit 3168b0780d06ace875696f8a648d04d6089654e5 manually resolved. --Mark ] Inspired-by: Satyam Sharma Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 16 ++++++++-------- fs/dlm/config.c | 10 +++++----- fs/ocfs2/cluster/nodemanager.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5e6e37e58f36..d3b1dbb9b5b8 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -562,7 +562,7 @@ static int populate_groups(struct config_group *group) /* * All of link_obj/unlink_obj/link_group/unlink_group require that - * subsys->su_sem is held. + * subsys->su_mutex is held. */ static void unlink_obj(struct config_item *item) @@ -783,7 +783,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); - down(&subsys->su_sem); + mutex_lock(&subsys->su_mutex); group = NULL; item = NULL; if (type->ct_group_ops->make_group) { @@ -797,7 +797,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (item) link_obj(parent_item, item); } - up(&subsys->su_sem); + mutex_unlock(&subsys->su_mutex); kfree(name); if (!item) { @@ -841,13 +841,13 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_unlink: if (ret) { /* Tear down everything we built up */ - down(&subsys->su_sem); + mutex_lock(&subsys->su_mutex); if (group) unlink_group(group); else unlink_obj(item); client_drop_item(parent_item, item); - up(&subsys->su_sem); + mutex_unlock(&subsys->su_mutex); if (module_got) module_put(owner); @@ -910,17 +910,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) if (sd->s_type & CONFIGFS_USET_DIR) { configfs_detach_group(item); - down(&subsys->su_sem); + mutex_lock(&subsys->su_mutex); unlink_group(to_config_group(item)); } else { configfs_detach_item(item); - down(&subsys->su_sem); + mutex_lock(&subsys->su_mutex); unlink_obj(item); } client_drop_item(parent_item, item); - up(&subsys->su_sem); + mutex_unlock(&subsys->su_mutex); /* Drop our reference from above */ config_item_put(item); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 4348cb42cf17..2f8e3c81bc19 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -607,7 +607,7 @@ static struct clusters clusters_root = { int dlm_config_init(void) { config_group_init(&clusters_root.subsys.su_group); - init_MUTEX(&clusters_root.subsys.su_sem); + mutex_init(&clusters_root.subsys.su_mutex); return configfs_register_subsystem(&clusters_root.subsys); } @@ -751,9 +751,9 @@ static struct space *get_space(char *name) if (!space_list) return NULL; - down(&space_list->cg_subsys->su_sem); + mutex_lock(&space_list->cg_subsys->su_mutex); i = config_group_find_item(space_list, name); - up(&space_list->cg_subsys->su_sem); + mutex_unlock(&space_list->cg_subsys->su_mutex); return to_space(i); } @@ -772,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) if (!comm_list) return NULL; - down(&clusters_root.subsys.su_sem); + mutex_lock(&clusters_root.subsys.su_mutex); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { cm = to_comm(i); @@ -792,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) break; } } - up(&clusters_root.subsys.su_sem); + mutex_unlock(&clusters_root.subsys.su_mutex); if (!found) cm = NULL; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 9f5ad0f01ce0..48b77d113cb2 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -934,7 +934,7 @@ static int __init init_o2nm(void) goto out_sysctl; config_group_init(&o2nm_cluster_group.cs_subsys.su_group); - init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); + mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); if (ret) { printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); -- cgit v1.2.1 From 6d748924b753d63a57dad130fdf11f64c27ff54b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Jun 2007 11:20:00 +0200 Subject: [PATCH] configsfs buffer: use mutex Seems copied from sysfs, but I don't see a reason here nor there to use a semaphore instead of a mutex. Convert. Signed-off-by: Johannes Berg Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/file.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 0f4b65e85245..a3658f9a082c 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -27,8 +27,8 @@ #include #include #include +#include #include -#include #include #include "configfs_internal.h" @@ -46,7 +46,7 @@ struct configfs_buffer { loff_t pos; char * page; struct configfs_item_operations * ops; - struct semaphore sem; + struct mutex mutex; int needs_read_fill; }; @@ -109,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp struct configfs_buffer * buffer = file->private_data; ssize_t retval = 0; - down(&buffer->sem); + mutex_lock(&buffer->mutex); if (buffer->needs_read_fill) { if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) goto out; @@ -119,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp retval = simple_read_from_buffer(buf, count, ppos, buffer->page, buffer->count); out: - up(&buffer->sem); + mutex_unlock(&buffer->mutex); return retval; } @@ -200,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof struct configfs_buffer * buffer = file->private_data; ssize_t len; - down(&buffer->sem); + mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) len = flush_write_buffer(file->f_path.dentry, buffer, count); if (len > 0) *ppos += len; - up(&buffer->sem); + mutex_unlock(&buffer->mutex); return len; } @@ -260,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file) error = -ENOMEM; goto Enomem; } - init_MUTEX(&buffer->sem); + mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; buffer->ops = ops; file->private_data = buffer; @@ -299,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp) if (buffer) { if (buffer->page) free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); kfree(buffer); } return 0; -- cgit v1.2.1 From 299894cc9001b09e3e9685f2709b49e7e1092ccc Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 6 Oct 2006 17:33:23 -0700 Subject: configfs: accessing item hierarchy during rmdir(2) Add a notification callback, ops->disconnect_notify(). It has the same prototype as ->drop_item(), but it will be called just before the item linkage is broken. This way, configfs users who want to do work while the object is still in the heirarchy have a chance. Client drivers will still need to config_item_put() in their ->drop_item(), if they implement it. They need do nothing in ->disconnect_notify(). They don't have to provide it if they don't care. But someone who wants to be notified before ci_parent is set to NULL can now be notified. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d3b1dbb9b5b8..125954723eb7 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -713,6 +713,28 @@ static void configfs_detach_group(struct config_item *item) configfs_detach_item(item); } +/* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + /* * Drop the initial reference from make_item()/make_group() * This function assumes that reference is held on item @@ -733,7 +755,7 @@ static void client_drop_item(struct config_item *parent_item, */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), - item); + item); else config_item_put(item); } @@ -842,11 +864,14 @@ out_unlink: if (ret) { /* Tear down everything we built up */ mutex_lock(&subsys->su_mutex); + + client_disconnect_notify(parent_item, item); if (group) unlink_group(group); else unlink_obj(item); client_drop_item(parent_item, item); + mutex_unlock(&subsys->su_mutex); if (module_got) @@ -911,11 +936,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) configfs_detach_group(item); mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); unlink_group(to_config_group(item)); } else { configfs_detach_item(item); mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); unlink_obj(item); } -- cgit v1.2.1 From 631d1febab8e546e3bb800bdfe2c212b8adf87de Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 18 Jun 2007 18:06:09 -0700 Subject: configfs: config item dependancies. Sometimes other drivers depend on particular configfs items. For example, ocfs2 mounts depend on a heartbeat region item. If that region item is removed with rmdir(2), the ocfs2 mount must BUG or go readonly. Not happy. This provides two additional API calls: configfs_depend_item() and configfs_undepend_item(). A client driver can call configfs_depend_item() on an existing item to tell configfs that it is depended on. configfs will then return -EBUSY from rmdir(2) for that item. When the item is no longer depended on, the client driver calls configfs_undepend_item() on it. These API cannot be called underneath any configfs callbacks, as they will conflict. They can block and allocate. A client driver probably shouldn't calling them of its own gumption. Rather it should be providing an API that external subsystems call. How does this work? Imagine the ocfs2 mount process. When it mounts, it asks for a heart region item. This is done via a call into the heartbeat code. Inside the heartbeat code, the region item is looked up. Here, the heartbeat code calls configfs_depend_item(). If it succeeds, then heartbeat knows the region is safe to give to ocfs2. If it fails, it was being torn down anyway, and heartbeat can gracefully pass up an error. [ Fixed some bad whitespace in configfs.txt. --Mark ] Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/configfs_internal.h | 7 +- fs/configfs/dir.c | 244 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 7b48c034b312..3b0185fdf9a4 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -29,10 +29,11 @@ struct configfs_dirent { atomic_t s_count; + int s_dependent_count; struct list_head s_sibling; struct list_head s_children; struct list_head s_links; - void * s_element; + void * s_element; int s_type; umode_t s_mode; struct dentry * s_dentry; @@ -41,8 +42,8 @@ struct configfs_dirent { #define CONFIGFS_ROOT 0x0001 #define CONFIGFS_DIR 0x0002 -#define CONFIGFS_ITEM_ATTR 0x0004 -#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 125954723eb7..2f436d4f1d6d 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry) /* Mark that we've taken i_mutex */ sd->s_type |= CONFIGFS_USET_DROPPING; + /* + * Yup, recursive. If there's a problem, blame + * deep nesting of default_groups + */ ret = configfs_detach_prep(sd->s_dentry); if (!ret) continue; @@ -760,6 +764,239 @@ static void client_drop_item(struct config_item *parent_item, config_item_put(item); } +#ifdef DEBUG +static void configfs_dump_one(struct configfs_dirent *sd, int level) +{ + printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + +#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); + type_print(CONFIGFS_ROOT); + type_print(CONFIGFS_DIR); + type_print(CONFIGFS_ITEM_ATTR); + type_print(CONFIGFS_ITEM_LINK); + type_print(CONFIGFS_USET_DIR); + type_print(CONFIGFS_USET_DEFAULT); + type_print(CONFIGFS_USET_DROPPING); +#undef type_print +} + +static int configfs_dump(struct configfs_dirent *sd, int level) +{ + struct configfs_dirent *child_sd; + int ret = 0; + + configfs_dump_one(sd, level); + + if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) + return 0; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + ret = configfs_dump(child_sd, level + 2); + if (ret) + break; + } + + return ret; +} +#endif + + +/* + * configfs_depend_item() and configfs_undepend_item() + * + * WARNING: Do not call these from a configfs callback! + * + * This describes these functions and their helpers. + * + * Allow another kernel system to depend on a config_item. If this + * happens, the item cannot go away until the dependant can live without + * it. The idea is to give client modules as simple an interface as + * possible. When a system asks them to depend on an item, they just + * call configfs_depend_item(). If the item is live and the client + * driver is in good shape, we'll happily do the work for them. + * + * Why is the locking complex? Because configfs uses the VFS to handle + * all locking, but this function is called outside the normal + * VFS->configfs path. So it must take VFS locks to prevent the + * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is + * why you can't call these functions underneath configfs callbacks. + * + * Note, btw, that this can be called at *any* time, even when a configfs + * subsystem isn't registered, or when configfs is loading or unloading. + * Just like configfs_register_subsystem(). So we take the same + * precautions. We pin the filesystem. We lock each i_mutex _in_order_ + * on our way down the tree. If we can find the target item in the + * configfs tree, it must be part of the subsystem tree as well, so we + * do not need the subsystem semaphore. Holding the i_mutex chain locks + * out mkdir() and rmdir(), who might be racing us. + */ + +/* + * configfs_depend_prep() + * + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes. This is similar but not the same to configfs_detach_prep(). + * Note that configfs_detach_prep() expects the parent to be locked when it + * is called, but we lock the parent *inside* configfs_depend_prep(). We + * do that so we can unlock it if we find nothing. + * + * Here we do a depth-first search of the dentry hierarchy looking for + * our object. We take i_mutex on each step of the way down. IT IS + * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch, + * we'll drop the i_mutex. + * + * If the target is not found, -ENOENT is bubbled up and we have released + * all locks. If the target was found, the locks will be cleared by + * configfs_depend_rollback(). + * + * This adds a requirement that all config_items be unique! + * + * This is recursive because the locking traversal is tricky. There isn't + * much on the stack, though, so folks that need this function - be careful + * about your stack! Patches will be accepted to make it iterative. + */ +static int configfs_depend_prep(struct dentry *origin, + struct config_item *target) +{ + struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + int ret = 0; + + BUG_ON(!origin || !sd); + + /* Lock this guy on the way down */ + mutex_lock(&sd->s_dentry->d_inode->i_mutex); + if (sd->s_element == target) /* Boo-yah */ + goto out; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + if (child_sd->s_type & CONFIGFS_DIR) { + ret = configfs_depend_prep(child_sd->s_dentry, + target); + if (!ret) + goto out; /* Child path boo-yah */ + } + } + + /* We looped all our children and didn't find target */ + mutex_unlock(&sd->s_dentry->d_inode->i_mutex); + ret = -ENOENT; + +out: + return ret; +} + +/* + * This is ONLY called if configfs_depend_prep() did its job. So we can + * trust the entire path from item back up to origin. + * + * We walk backwards from item, unlocking each i_mutex. We finish by + * unlocking origin. + */ +static void configfs_depend_rollback(struct dentry *origin, + struct config_item *item) +{ + struct dentry *dentry = item->ci_dentry; + + while (dentry != origin) { + mutex_unlock(&dentry->d_inode->i_mutex); + dentry = dentry->d_parent; + } + + mutex_unlock(&origin->d_inode->i_mutex); +} + +int configfs_depend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + int ret; + struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct config_item *s_item = &subsys->su_group.cg_item; + + /* + * Pin the configfs filesystem. This means we can safely access + * the root of the configfs filesystem. + */ + ret = configfs_pin_fs(); + if (ret) + return ret; + + /* + * Next, lock the root directory. We're going to check that the + * subsystem is really registered, and so we need to lock out + * configfs_[un]register_subsystem(). + */ + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + + root_sd = configfs_sb->s_root->d_fsdata; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR) { + if (p->s_element == s_item) { + subsys_sd = p; + break; + } + } + } + + if (!subsys_sd) { + ret = -ENOENT; + goto out_unlock_fs; + } + + /* Ok, now we can trust subsys/s_item */ + + /* Scan the tree, locking i_mutex recursively, return 0 if found */ + ret = configfs_depend_prep(subsys_sd->s_dentry, target); + if (ret) + goto out_unlock_fs; + + /* We hold all i_mutexes from the subsystem down to the target */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + + configfs_depend_rollback(subsys_sd->s_dentry, target); + +out_unlock_fs: + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + /* + * If we succeeded, the fs is pinned via other methods. If not, + * we're done with it anyway. So release_fs() is always right. + */ + configfs_release_fs(); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item); + +/* + * Release the dependent linkage. This is much simpler than + * configfs_depend_item() because we know that that the client driver is + * pinned, thus the subsystem is pinned, and therefore configfs is pinned. + */ +void configfs_undepend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + struct configfs_dirent *sd; + + /* + * Since we can trust everything is pinned, we just need i_mutex + * on the item. + */ + mutex_lock(&target->ci_dentry->d_inode->i_mutex); + + sd = target->ci_dentry->d_fsdata; + BUG_ON(sd->s_dependent_count < 1); + + sd->s_dependent_count -= 1; + + /* + * After this unlock, we cannot trust the item to stay alive! + * DO NOT REFERENCE item after this unlock. + */ + mutex_unlock(&target->ci_dentry->d_inode->i_mutex); +} +EXPORT_SYMBOL(configfs_undepend_item); static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { @@ -906,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) if (sd->s_type & CONFIGFS_USET_DEFAULT) return -EPERM; + /* + * Here's where we check for dependents. We're protected by + * i_mutex. + */ + if (sd->s_dependent_count) + return -EBUSY; + /* Get a working ref until we have the child */ parent_item = configfs_get_config_item(dentry->d_parent); subsys = to_config_group(parent_item)->cg_subsys; -- cgit v1.2.1 From 14829422be6d6b6721f61b1e749acf5a9cb664d8 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 14 Jun 2007 21:40:49 -0700 Subject: ocfs2: Depend on configfs heartbeat items. ocfs2 mounts require a heartbeat region. Use the new configfs_depend_item() facility to actually depend on them so they can't go away from under us. First, teach cluster/nodemanager.c to depend an item on the o2cb subsystem. Then teach o2hb_register_callbacks to take a UUID and depend on the appropriate region. Finally, teach all users of o2hb to pass a UUID or NULL if they don't require a pin. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 64 ++++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/cluster/heartbeat.h | 6 ++-- fs/ocfs2/cluster/nodemanager.c | 10 +++++++ fs/ocfs2/cluster/nodemanager.h | 3 ++ fs/ocfs2/cluster/tcp.c | 8 +++--- fs/ocfs2/dlm/dlmdomain.c | 8 +++--- fs/ocfs2/heartbeat.c | 10 +++---- 7 files changed, 92 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 979113479c66..e331f4cb2c81 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1665,7 +1665,56 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, } EXPORT_SYMBOL_GPL(o2hb_setup_callback); -int o2hb_register_callback(struct o2hb_callback_func *hc) +static struct o2hb_region *o2hb_find_region(const char *region_uuid) +{ + struct o2hb_region *p, *reg = NULL; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { + if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { + reg = p; + break; + } + } + + return reg; +} + +static int o2hb_region_get(const char *region_uuid) +{ + int ret = 0; + struct o2hb_region *reg; + + spin_lock(&o2hb_live_lock); + + reg = o2hb_find_region(region_uuid); + if (!reg) + ret = -ENOENT; + spin_unlock(&o2hb_live_lock); + + if (!ret) + ret = o2nm_depend_item(®->hr_item); + + return ret; +} + +static void o2hb_region_put(const char *region_uuid) +{ + struct o2hb_region *reg; + + spin_lock(&o2hb_live_lock); + + reg = o2hb_find_region(region_uuid); + + spin_unlock(&o2hb_live_lock); + + if (reg) + o2nm_undepend_item(®->hr_item); +} + +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc) { struct o2hb_callback_func *tmp; struct list_head *iter; @@ -1681,6 +1730,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc) goto out; } + if (region_uuid) { + ret = o2hb_region_get(region_uuid); + if (ret) + goto out; + } + down_write(&o2hb_callback_sem); list_for_each(iter, &hbcall->list) { @@ -1702,16 +1757,21 @@ out: } EXPORT_SYMBOL_GPL(o2hb_register_callback); -void o2hb_unregister_callback(struct o2hb_callback_func *hc) +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc) { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", __builtin_return_address(0), hc); + /* XXX Can this happen _with_ a region reference? */ if (list_empty(&hc->hc_item)) return; + if (region_uuid) + o2hb_region_put(region_uuid); + down_write(&o2hb_callback_sem); list_del_init(&hc->hc_item); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index cc6d40b39771..35397dd5ecdb 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, o2hb_cb_func *func, void *data, int priority); -int o2hb_register_callback(struct o2hb_callback_func *hc); -void o2hb_unregister_callback(struct o2hb_callback_func *hc); +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_init(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 48b77d113cb2..eab46d8a7c8c 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -900,6 +900,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +int o2nm_depend_item(struct config_item *item) +{ + return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); +} + +void o2nm_undepend_item(struct config_item *item) +{ + configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); +} + static void __exit exit_o2nm(void) { if (ocfs2_table_header) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 070522138ae2..55ae1a00d735 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -77,4 +77,7 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); void o2nm_node_get(struct o2nm_node *node); void o2nm_node_put(struct o2nm_node *node); +int o2nm_depend_item(struct config_item *item); +void o2nm_undepend_item(struct config_item *item); + #endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0b229a9c7952..d58c7dddb853 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1638,8 +1638,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, void o2net_unregister_hb_callbacks(void) { - o2hb_unregister_callback(&o2net_hb_up); - o2hb_unregister_callback(&o2net_hb_down); + o2hb_unregister_callback(NULL, &o2net_hb_up); + o2hb_unregister_callback(NULL, &o2net_hb_down); } int o2net_register_hb_callbacks(void) @@ -1651,9 +1651,9 @@ int o2net_register_hb_callbacks(void) o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); - ret = o2hb_register_callback(&o2net_hb_up); + ret = o2hb_register_callback(NULL, &o2net_hb_up); if (ret == 0) - ret = o2hb_register_callback(&o2net_hb_down); + ret = o2hb_register_callback(NULL, &o2net_hb_down); if (ret) o2net_unregister_hb_callbacks(); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index d836b98dd99a..6954565b8ccb 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1128,8 +1128,8 @@ bail: static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) { - o2hb_unregister_callback(&dlm->dlm_hb_up); - o2hb_unregister_callback(&dlm->dlm_hb_down); + o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); + o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); o2net_unregister_handler_list(&dlm->dlm_domain_handlers); } @@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); - status = o2hb_register_callback(&dlm->dlm_hb_down); + status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); if (status) goto bail; o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); - status = o2hb_register_callback(&dlm->dlm_hb_up); + status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); if (status) goto bail; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index b25ef63781ba..352eb4a13f98 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) if (ocfs2_mount_local(osb)) return 0; - status = o2hb_register_callback(&osb->osb_hb_down); + status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down); if (status < 0) { mlog_errno(status); goto bail; } - status = o2hb_register_callback(&osb->osb_hb_up); + status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up); if (status < 0) { mlog_errno(status); - o2hb_unregister_callback(&osb->osb_hb_down); + o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); } bail: @@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) if (ocfs2_mount_local(osb)) return; - o2hb_unregister_callback(&osb->osb_hb_down); - o2hb_unregister_callback(&osb->osb_hb_up); + o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); + o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up); } void ocfs2_stop_heartbeat(struct ocfs2_super *osb) -- cgit v1.2.1 From 16c6a4f24de2933b26477ad5dfb71f518220d641 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 19 Jun 2007 11:34:03 -0700 Subject: ocfs2: live heartbeat depends on the local node configuration Removing the local node configuration out from underneath a running heartbeat is "bad". Provide an API in the ocfs2 nodemanager to request a configfs dependancy on the local node, then use it in heartbeat. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 17 ++++++++++++++--- fs/ocfs2/cluster/nodemanager.c | 30 ++++++++++++++++++++++++++++++ fs/ocfs2/cluster/nodemanager.h | 2 ++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index e331f4cb2c81..2877d468f115 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1693,9 +1693,18 @@ static int o2hb_region_get(const char *region_uuid) ret = -ENOENT; spin_unlock(&o2hb_live_lock); - if (!ret) - ret = o2nm_depend_item(®->hr_item); + if (ret) + goto out; + + ret = o2nm_depend_this_node(); + if (ret) + goto out; + ret = o2nm_depend_item(®->hr_item); + if (ret) + o2nm_undepend_this_node(); + +out: return ret; } @@ -1709,8 +1718,10 @@ static void o2hb_region_put(const char *region_uuid) spin_unlock(&o2hb_live_lock); - if (reg) + if (reg) { o2nm_undepend_item(®->hr_item); + o2nm_undepend_this_node(); + } } int o2hb_register_callback(const char *region_uuid, diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index eab46d8a7c8c..af2070da308b 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -910,6 +910,36 @@ void o2nm_undepend_item(struct config_item *item) configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); } +int o2nm_depend_this_node(void) +{ + int ret = 0; + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + if (!local_node) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_depend_item(&local_node->nd_item); + o2nm_node_put(local_node); + +out: + return ret; +} + +void o2nm_undepend_this_node(void) +{ + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + BUG_ON(!local_node); + + o2nm_undepend_item(&local_node->nd_item); + o2nm_node_put(local_node); +} + + static void __exit exit_o2nm(void) { if (ocfs2_table_header) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 55ae1a00d735..7c860361b8dd 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -79,5 +79,7 @@ void o2nm_node_put(struct o2nm_node *node); int o2nm_depend_item(struct config_item *item); void o2nm_undepend_item(struct config_item *item); +int o2nm_depend_this_node(void); +void o2nm_undepend_this_node(void); #endif /* O2CLUSTER_NODEMANAGER_H */ -- cgit v1.2.1 From e6df3a663a5d1ee68aeae7f007197f272700d9cc Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 6 Feb 2007 15:45:39 -0800 Subject: ocfs2: Wake up a starting region if it gets killed in the background. Tell o2cb_region_dev_write() to wake up if rmdir(2) happens on the heartbeat region while it is starting up. Then o2hb_region_dev_write() can check to see if it is alive and act accordingly. This prevents a hang (not being woken) and a crash (if it's woken by a signal). Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 2877d468f115..2bd7f788cf34 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = wait_event_interruptible(o2hb_steady_queue, atomic_read(®->hr_steady_iterations) == 0); if (ret) { + /* We got interrupted (hello ptrace!). Clean up */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; reg->hr_task = NULL; @@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; } - ret = count; + /* Ok, we were woken. Make sure it wasn't by drop_item() */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + ret = count; + else + ret = -EIO; + out: if (filp) fput(filp); @@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (hb_task) kthread_stop(hb_task); + /* + * If we're racing a dev_write(), we need to wake them. They will + * check reg->hr_task + */ + if (atomic_read(®->hr_steady_iterations) != 0) { + atomic_set(®->hr_steady_iterations, 0); + wake_up(&o2hb_steady_queue); + } + config_item_put(item); } -- cgit v1.2.1 From 800deef3f6f87fee3a2e89cf7237a1f20c1a78d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 May 2007 16:03:13 +0200 Subject: [PATCH] ocfs2: use list_for_each_entry where benefical Signed-off-by: Christoph Hellwig Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.c | 13 +++----- fs/ocfs2/dlm/dlmmaster.c | 40 +++++++----------------- fs/ocfs2/dlm/dlmrecovery.c | 77 +++++++++++++++------------------------------- fs/ocfs2/dlmglue.c | 6 ++-- fs/ocfs2/extent_map.c | 10 ++---- fs/ocfs2/journal.c | 6 ++-- 6 files changed, 47 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d58c7dddb853..f0bdfd944c44 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -261,14 +261,12 @@ out: static void o2net_complete_nodes_nsw(struct o2net_node *nn) { - struct list_head *iter, *tmp; + struct o2net_status_wait *nsw, *tmp; unsigned int num_kills = 0; - struct o2net_status_wait *nsw; assert_spin_locked(&nn->nn_lock); - list_for_each_safe(iter, tmp, &nn->nn_status_list) { - nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); num_kills++; } @@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler); void o2net_unregister_handler_list(struct list_head *list) { - struct list_head *pos, *n; - struct o2net_msg_handler *nmh; + struct o2net_msg_handler *nmh, *n; write_lock(&o2net_handler_lock); - list_for_each_safe(pos, n, list) { - nmh = list_entry(pos, struct o2net_msg_handler, - nh_unregister_item); + list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); rb_erase(&nmh->nh_node, &o2net_handler_tree); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6edffca99d98..65b2b9b92688 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle) static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; - struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); spin_lock(&dlm->master_lock); - list_for_each(iter, &dlm->master_list) { - mle = list_entry(iter, struct dlm_master_list_entry, list); + list_for_each_entry(mle, &dlm->master_list, list) dlm_print_one_mle(mle); - } spin_unlock(&dlm->master_lock); } int dlm_dump_all_mles(const char __user *data, unsigned int len) { - struct list_head *iter; struct dlm_ctxt *dlm; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - dlm = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(dlm, &dlm_domains, list) { mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); dlm_dump_mles(dlm); } @@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, char *name, unsigned int namelen) { struct dlm_master_list_entry *tmpmle; - struct list_head *iter; assert_spin_locked(&dlm->master_lock); - list_for_each(iter, &dlm->master_list) { - tmpmle = list_entry(iter, struct dlm_master_list_entry, list); + list_for_each_entry(tmpmle, &dlm->master_list, list) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) { struct dlm_master_list_entry *mle; - struct list_head *iter; assert_spin_locked(&dlm->spinlock); - list_for_each(iter, &dlm->mle_hb_events) { - mle = list_entry(iter, struct dlm_master_list_entry, - hb_events); + list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { if (node_up) dlm_mle_node_up(dlm, mle, NULL, idx); else @@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, int ret; int i; int count = 0; - struct list_head *queue, *iter; + struct list_head *queue; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, ret = 0; queue = &res->granted; for (i = 0; i < 3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { ++count; if (lock->ml.node == dlm->node_num) { mlog(0, "found a lock owned by this node still " @@ -2923,18 +2912,16 @@ again: static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - struct list_head *iter, *iter2; struct list_head *queue = &res->granted; int i, bit; - struct dlm_lock *lock; + struct dlm_lock *lock, *next; assert_spin_locked(&res->spinlock); BUG_ON(res->owner == dlm->node_num); for (i=0; i<3; i++) { - list_for_each_safe(iter, iter2, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, queue, list) { if (lock->ml.node != dlm->node_num) { mlog(0, "putting lock for node %u\n", lock->ml.node); @@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, { int i; struct list_head *queue = &res->granted; - struct list_head *iter; struct dlm_lock *lock; int nodenum; @@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (i=0; i<3; i++) { - list_for_each(iter, queue) { + list_for_each_entry(lock, queue, list) { /* up to the caller to make sure this node * is alive */ - lock = list_entry (iter, struct dlm_lock, list); if (lock->ml.node != dlm->node_num) { spin_unlock(&res->spinlock); return lock->ml.node; @@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter, *iter2; - struct dlm_master_list_entry *mle; + struct dlm_master_list_entry *mle, *next; struct dlm_lock_resource *res; unsigned int hash; @@ -3245,9 +3229,7 @@ top: /* clean the master list */ spin_lock(&dlm->master_lock); - list_for_each_safe(iter, iter2, &dlm->master_list) { - mle = list_entry(iter, struct dlm_master_list_entry, list); - + list_for_each_entry_safe(mle, next, &dlm->master_list, list) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 671c4ed58ee2..74d276ec276f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work) struct dlm_ctxt *dlm = container_of(work, struct dlm_ctxt, dispatched_work); LIST_HEAD(tmp_list); - struct list_head *iter, *iter2; - struct dlm_work_item *item; + struct dlm_work_item *item, *next; dlm_workfunc_t *workfunc; int tot=0; @@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work) list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); - list_for_each_safe(iter, iter2, &tmp_list) { + list_for_each_entry(item, &tmp_list, list) { tot++; } mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); - list_for_each_safe(iter, iter2, &tmp_list) { - item = list_entry(iter, struct dlm_work_item, list); + list_for_each_entry_safe(item, next, &tmp_list, list) { workfunc = item->func; list_del_init(&item->list); @@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) { int status = 0; struct dlm_reco_node_data *ndata; - struct list_head *iter; int all_nodes_done; int destroy = 0; int pass = 0; @@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) /* safe to access the node data list without a lock, since this * process is the only one to change the list */ - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); ndata->state = DLM_RECO_NODE_DATA_REQUESTING; @@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) * done, or if anyone died */ all_nodes_done = 1; spin_lock(&dlm_reco_state_lock); - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); - + list_for_each_entry(ndata, &dlm->reco.node_data, list) { mlog(0, "checking recovery state of node %u\n", ndata->node_num); switch (ndata->state) { @@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter, *iter2; - struct dlm_reco_node_data *ndata; + struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); spin_lock(&dlm_reco_state_lock); list_splice_init(&dlm->reco.node_data, &tmplist); spin_unlock(&dlm_reco_state_lock); - list_for_each_safe(iter, iter2, &tmplist) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry_safe(ndata, next, &tmplist, list) { list_del_init(&ndata->list); kfree(ndata); } @@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) struct dlm_lock_resource *res; struct dlm_ctxt *dlm; LIST_HEAD(resources); - struct list_head *iter; int ret; u8 dead_node, reco_master; int skip_all_done = 0; @@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) /* any errors returned will be due to the new_master dying, * the dlm_reco_thread should detect this */ - list_for_each(iter, &resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry(res, &resources, recovering) { ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); if (ret < 0) { @@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, { struct dlm_ctxt *dlm = data; struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; - struct list_head *iter; struct dlm_reco_node_data *ndata = NULL; int ret = -EINVAL; @@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { if (ndata->node_num != done->node_idx) continue; @@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, struct list_head *list, u8 dead_node) { - struct dlm_lock_resource *res; - struct list_head *iter, *iter2; + struct dlm_lock_resource *res, *next; struct dlm_lock *lock; spin_lock(&dlm->spinlock); - list_for_each_safe(iter, iter2, &dlm->reco.resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, @@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_migratable_lockres *mres, u8 send_to, u8 flags) { - struct list_head *queue, *iter; + struct list_head *queue; int total_locks, i; u64 mig_cookie = 0; struct dlm_lock *lock; @@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, total_locks = 0; for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); - + list_for_each_entry(lock, queue, list) { /* add another lock. */ total_locks++; if (!dlm_add_lock_to_array(lock, mres, i)) @@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct list_head *iter; struct dlm_lock *lock = NULL; u8 from = O2NM_MAX_NODES; unsigned int added = 0; @@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie != ml->cookie) lock = NULL; else @@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int i; - struct list_head *queue, *iter, *iter2; - struct dlm_lock *lock; + struct list_head *queue; + struct dlm_lock *lock, *next; res->state |= DLM_LOCK_RES_RECOVERING; if (!list_empty(&res->recovering)) { @@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* find any pending locks and put them back on proper list */ for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each_safe(iter, iter2, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, queue, list) { dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ @@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, u8 dead_node, u8 new_master) { int i; - struct list_head *iter, *iter2; struct hlist_node *hash_iter; struct hlist_head *bucket; - - struct dlm_lock_resource *res; + struct dlm_lock_resource *res, *next; mlog_entry_void(); assert_spin_locked(&dlm->spinlock); - list_for_each_safe(iter, iter2, &dlm->reco.resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { if (res->owner == dead_node) { list_del_init(&res->recovering); spin_lock(&res->spinlock); @@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 dead_node) { - struct list_head *iter, *queue; + struct list_head *queue; struct dlm_lock *lock; int blank_lvb = 0, local = 0; int i; @@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { if (lock->ml.node == search_node) { if (dlm_lvb_needs_invalidation(lock, local)) { /* zero the lksb lvb and lockres lvb */ @@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, static void dlm_free_dead_locks(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 dead_node) { - struct list_head *iter, *tmpiter; - struct dlm_lock *lock; + struct dlm_lock *lock, *next; unsigned int freed = 0; /* this node is the lockres master: @@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); /* TODO: check pending_asts, pending_basts here */ - list_for_each_safe(iter, tmpiter, &res->granted) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->granted, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); freed++; } } - list_for_each_safe(iter, tmpiter, &res->converting) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->converting, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); freed++; } } - list_for_each_safe(iter, tmpiter, &res->blocked) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->blocked, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index d1bd305ef0d7..f71250ed166f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level) static void lockres_set_flags(struct ocfs2_lock_res *lockres, unsigned long newflags) { - struct list_head *pos, *tmp; - struct ocfs2_mask_waiter *mw; + struct ocfs2_mask_waiter *mw, *tmp; assert_spin_locked(&lockres->l_lock); lockres->l_flags = newflags; - list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { - mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); + list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) continue; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index ba2b2ab1c6e4..e23e416ca74c 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, */ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - struct list_head *p, *n; - struct ocfs2_extent_map_item *emi; + struct ocfs2_extent_map_item *emi, *n; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_extent_map *em = &oi->ip_extent_map; LIST_HEAD(tmp_list); unsigned int range; spin_lock(&oi->ip_lock); - list_for_each_safe(p, n, &em->em_list) { - emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); - + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { if (emi->ei_cpos >= cpos) { /* Full truncate of this record. */ list_move(&emi->ei_list, &tmp_list); @@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) } spin_unlock(&oi->ip_lock); - list_for_each_safe(p, n, &tmp_list) { - emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { list_del(&emi->ei_list); kfree(emi); } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index dc1188081720..dbfb20bb27ea 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work) container_of(work, struct ocfs2_journal, j_recovery_work); struct ocfs2_super *osb = journal->j_osb; struct ocfs2_dinode *la_dinode, *tl_dinode; - struct ocfs2_la_recovery_item *item; - struct list_head *p, *n; + struct ocfs2_la_recovery_item *item, *n; LIST_HEAD(tmp_la_list); mlog_entry_void(); @@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work) list_splice_init(&journal->j_la_cleanups, &tmp_la_list); spin_unlock(&journal->j_lock); - list_for_each_safe(p, n, &tmp_la_list) { - item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); + list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { list_del_init(&item->lri_list); mlog(0, "Complete recovery for slot %d\n", item->lri_slot); -- cgit v1.2.1 From 5fb0f7f010ba07e373c30c3e99b0efd868c6c977 Mon Sep 17 00:00:00 2001 From: Shani Moideen Date: Mon, 11 Jun 2007 09:38:19 +0530 Subject: [KJ PATCH] Replacing memset(,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c Replacing memset(,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c Signed-off-by: Shani Moideen Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74d276ec276f..a2c33160bfd6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1155,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, u8 flags, u8 master) { /* mres here is one full page */ - memset(mres, 0, PAGE_SIZE); + clear_page(mres); mres->lockname_len = namelen; memcpy(mres->lockname, lockname, namelen); mres->num_locks = 0; -- cgit v1.2.1 From baf4661a8225d3a39622b795a8db0e6aa845c1ec Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 18 Jun 2007 17:00:24 -0700 Subject: ocfs2: Add "preferred slot" mount option ocfs2 will attempt to assign the node the slot# provided in the mount option. Failure to assign the preferred slot is not an error. This small feature can be useful for automated testing. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/slot_map.c | 12 ++++++++++-- fs/ocfs2/super.c | 23 ++++++++++++++++++++--- 3 files changed, 31 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a860633e833f..648ef8e45eaa 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -219,6 +219,7 @@ struct ocfs2_super u16 max_slots; s16 node_num; s16 slot_num; + s16 preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d8b79067dc14..af4882b62cfa 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, return ret; } -static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) { int i; s16 ret = OCFS2_INVALID_SLOT; + if (preferred >= 0 && preferred < si->si_num_slots) { + if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { + ret = preferred; + goto out; + } + } + for(i = 0; i < si->si_num_slots; i++) { if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { ret = (s16) i; break; } } +out: return ret; } @@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) if (slot == OCFS2_INVALID_SLOT) { /* if no slot yet, then just take 1st available * one. */ - slot = __ocfs2_find_empty_slot(si); + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot == OCFS2_INVALID_SLOT) { spin_unlock(&si->si_lock); mlog(ML_ERROR, "no free slots available!\n"); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 86b559c7dce9..f07718a7552b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); static int ocfs2_parse_options(struct super_block *sb, char *options, - unsigned long *mount_opt, int is_remount); + unsigned long *mount_opt, s16 *slot, + int is_remount); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); static int ocfs2_remount(struct super_block *sb, int *flags, char *data); @@ -140,6 +141,7 @@ enum { Opt_data_ordered, Opt_data_writeback, Opt_atime_quantum, + Opt_slot, Opt_err, }; @@ -154,6 +156,7 @@ static match_table_t tokens = { {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, + {Opt_slot, "preferred_slot=%u"}, {Opt_err, NULL} }; @@ -355,9 +358,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) int incompat_features; int ret = 0; unsigned long parsed_options; + s16 slot; struct ocfs2_super *osb = OCFS2_SB(sb); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) { ret = -EINVAL; goto out; } @@ -534,6 +538,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root; int status, sector_size; unsigned long parsed_opt; + s16 slot; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; @@ -541,7 +546,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog_entry("%p, %p, %i", sb, data, silent); - if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { + if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) { status = -EINVAL; goto read_super_error; } @@ -571,6 +576,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); bh = NULL; osb->s_mount_opt = parsed_opt; + osb->preferred_slot = slot; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -713,6 +719,7 @@ static struct file_system_type ocfs2_fs_type = { static int ocfs2_parse_options(struct super_block *sb, char *options, unsigned long *mount_opt, + s16 *slot, int is_remount) { int status; @@ -722,6 +729,7 @@ static int ocfs2_parse_options(struct super_block *sb, options ? options : "(none)"); *mount_opt = 0; + *slot = OCFS2_INVALID_SLOT; if (!options) { status = 1; @@ -782,6 +790,15 @@ static int ocfs2_parse_options(struct super_block *sb, else osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; break; + case Opt_slot: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + *slot = (s16)option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " -- cgit v1.2.1 From 2e89b2e48e1da09ed483f195968c9172aa95b5e2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 9 May 2007 13:40:18 -0700 Subject: ocfs2: take ip_alloc_sem during entire truncate Use of the alloc sem during truncate was too narrow - we want to protect the i_size change and page truncation against mmap now. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 3 --- fs/ocfs2/file.c | 12 +++++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 19712a7d145f..02b6e7af8edb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3631,8 +3631,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, mlog_entry_void(); - down_write(&OCFS2_I(inode)->ip_alloc_sem); - new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); @@ -3754,7 +3752,6 @@ start: goto start; bail: - up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_schedule_truncate_log_flush(osb, 1); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4979b6675717..566f9b70ec91 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -326,9 +326,6 @@ static int ocfs2_truncate_file(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); - fe = (struct ocfs2_dinode *) di_bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); @@ -363,16 +360,23 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* This forces other nodes to sync and drop their pages. Do * this even if we have a truncate without allocation change - * ocfs2 cluster sizes can be much greater than page size, so * we have to truncate them anyway. */ status = ocfs2_data_lock(inode, 1); if (status < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); + mlog_errno(status); goto bail; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking @@ -399,6 +403,8 @@ static int ocfs2_truncate_file(struct inode *inode, bail_unlock_data: ocfs2_data_unlock(inode, 1); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + bail: mlog_exit(status); -- cgit v1.2.1 From 3a307ffc2730bfa1a4dfa94537be9d412338aad2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 8 May 2007 17:47:32 -0700 Subject: ocfs2: rework ocfs2_buffered_write_cluster() Use some ideas from the new-aops patch series and turn ocfs2_buffered_write_cluster() into a 2 stage operation with the caller copying data in between. The code now understands multiple cluster writes as a result of having to deal with a full page write for greater than 4k pages. This sets us up to easily call into the write path during ->page_mkwrite(). Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 812 +++++++++++++++++++++++++++++++++----------------------- fs/ocfs2/aops.h | 56 +--- fs/ocfs2/file.c | 121 +++++---- 3 files changed, 551 insertions(+), 438 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a480b09c79b9..3e5758ebd932 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; + clear_buffer_new(bh); + /* * Ignore blocks outside of our i/o range - * they may belong to unallocated clusters. @@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * For an allocating write with cluster size >= page * size, we always write the entire page. */ - - if (buffer_new(bh)) - clear_buffer_new(bh); + if (new) + set_buffer_new(bh); if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); @@ -761,217 +762,232 @@ next_bh: return ret; } +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + /* - * This will copy user data from the buffer page in the splice - * context. - * - * For now, we ignore SPLICE_F_MOVE as that would require some extra - * communication out all the way to ocfs2_write(). + * Describe the state of a single cluster to be written to. */ -int ocfs2_map_and_write_splice_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) -{ - int ret; - unsigned int to, from, cluster_start, cluster_end; - char *src, *dst; - struct ocfs2_splice_write_priv *sp = wc->w_private; - struct pipe_buffer *buf = sp->s_buf; - unsigned long bytes, src_from; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; +}; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; - from = sp->s_offset; - src_from = sp->s_buf_offset; - bytes = wc->w_count; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; - if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); - } - to = from + bytes; + /* + * This is true if page_size > cluster_size. + * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from < cluster_start); - BUG_ON(to > cluster_end); + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + unsigned int w_num_pages; + struct page *w_target_page; - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. + */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; +}; + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + int i; + + for(i = 0; i < wc->w_num_pages; i++) { + if (wc->w_pages[i] == NULL) + continue; + + unlock_page(wc->w_pages[i]); + mark_page_accessed(wc->w_pages[i]); + page_cache_release(wc->w_pages[i]); } - src = buf->ops->map(sp->s_pipe, buf, 1); - dst = kmap_atomic(wc->w_this_page, KM_USER1); - memcpy(dst + from, src + src_from, bytes); - kunmap_atomic(wc->w_this_page, KM_USER1); - buf->ops->unmap(sp->s_pipe, buf, src); + brelse(wc->w_di_bh); + kfree(wc); +} + +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len) +{ + struct ocfs2_write_ctxt *wc; + + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; - wc->w_finished_copy = 1; + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); - *ret_from = from; - *ret_to = to; -out: + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + *wcp = wc; - return bytes ? (unsigned int)bytes : ret; + return 0; } /* - * This will copy user data from the iovec in the buffered write - * context. + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. */ -int ocfs2_map_and_write_user_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) { - int ret; - unsigned int to, from, cluster_start, cluster_end; - unsigned long bytes, src_from; - char *dst; - struct ocfs2_buffered_write_priv *bp = wc->w_private; - const struct iovec *cur_iov = bp->b_cur_iov; - char __user *buf; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int block_start, block_end; + struct buffer_head *head, *bh; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; - buf = cur_iov->iov_base + bp->b_cur_off; - src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + void *kaddr; + + start = max(from, block_start); + end = min(to, block_end); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+start, 0, end - start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } - from = wc->w_pos & (PAGE_CACHE_SIZE - 1); + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Only called when we have a failure during allocating write to write + * zero's to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from, to; + struct page *tmppage; + + ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); - /* - * This is a lot of comparisons, but it reads quite - * easily, which is important here. - */ - /* Stay within the src page */ - bytes = PAGE_SIZE - src_from; - /* Stay within the vector */ - bytes = min(bytes, - (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); - /* Stay within count */ - bytes = min(bytes, (unsigned long)wc->w_count); - /* - * For clustersize > page size, just stay within - * target page, otherwise we have to calculate pos - * within the cluster and obey the rightmost - * boundary. - */ if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); + from = wc->w_target_from; + to = wc->w_target_to; } else { - /* - * cluster size > page size is the most common - * case - we just stay within the target page - * boundary. - */ - bytes = min(bytes, PAGE_CACHE_SIZE - from); + from = 0; + to = PAGE_CACHE_SIZE; } - to = from + bytes; + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from < cluster_start); - BUG_ON(to > cluster_end); + if (ocfs2_should_order_data(inode)) + walk_page_buffers(wc->w_handle, page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; + block_commit_write(tmppage, from, to); } - - dst = kmap(wc->w_this_page); - memcpy(dst + from, bp->b_src_buf + src_from, bytes); - kunmap(wc->w_this_page); - - /* - * XXX: This is slow, but simple. The caller of - * ocfs2_buffered_write_cluster() is responsible for - * passing through the iovecs, so it's difficult to - * predict what our next step is in here after our - * initial write. A future version should be pushing - * that iovec manipulation further down. - * - * By setting this, we indicate that a copy from user - * data was done, and subsequent calls for this - * cluster will skip copying more data. - */ - wc->w_finished_copy = 1; - - *ret_from = from; - *ret_to = to; -out: - - return bytes ? (unsigned int)bytes : ret; } -/* - * Map, fill and write a page to disk. - * - * The work of copying data is done via callback. Newly allocated - * pages which don't take user data will be zero'd (set 'new' to - * indicate an allocating write) - * - * Returns a negative error code or the number of bytes copied into - * the page. - */ -static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, - u64 *p_blkno, struct page *page, - struct ocfs2_write_ctxt *wc, int new) +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) { - int ret, copied = 0; - unsigned int from = 0, to = 0; + int ret; + unsigned int map_from = 0, map_to = 0; unsigned int cluster_start, cluster_end; - unsigned int zero_from = 0, zero_to = 0; + unsigned int user_data_from = 0, user_data_to = 0; - ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); - if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index - && !wc->w_finished_copy) { - - wc->w_this_page = page; - wc->w_this_page_new = new; - ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); - if (ret < 0) { + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { mlog_errno(ret); goto out; } - copied = ret; - - zero_from = from; - zero_to = to; + user_data_from = map_from; + user_data_to = map_to; if (new) { - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; } + + wc->w_target_from = map_from; + wc->w_target_to = map_to; } else { /* * If we haven't allocated the new page yet, we @@ -980,11 +996,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, */ BUG_ON(!new); - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, 1); + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -1003,108 +1019,84 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, */ if (new && !PageUptodate(page)) ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), - wc->w_cpos, zero_from, zero_to); + cpos, user_data_from, user_data_to); flush_dcache_page(page); - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); - if (ret < 0) - mlog_errno(ret); - } - - /* - * We don't use generic_commit_write() because we need to - * handle our own i_size update. - */ - ret = block_commit_write(page, from, to); - if (ret) - mlog_errno(ret); out: - - return copied ? copied : ret; + return ret; } /* - * Do the actual write of some data into an inode. Optionally allocate - * in order to fulfill the write. - * - * cpos is the logical cluster offset within the file to write at - * - * 'phys' is the physical mapping of that offset. a 'phys' value of - * zero indicates that allocation is required. In this case, data_ac - * and meta_ac should be valid (meta_ac can be null if metadata - * allocation isn't required). + * This function will only grab one clusters worth of pages. */ -static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, - struct buffer_head *di_bh, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_write_ctxt *wc) +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, int new) { - int ret, i, numpages = 1, new; - unsigned int copied = 0; - u32 tmp_pos; - u64 v_blkno, p_blkno; - struct address_space *mapping = file->f_mapping; + int ret = 0, i; + unsigned long start, target_index, index; struct inode *inode = mapping->host; - unsigned long index, start; - struct page **cpages; - new = phys == 0 ? 1 : 0; + target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one * page. Otherwise, we'll need a whole clusters worth. */ - if (new) - numpages = ocfs2_pages_per_cluster(inode->i_sb); - - cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); - if (!cpages) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - - /* - * Fill our page array first. That way we've grabbed enough so - * that we can zero and flush if we error after adding the - * extent. - */ if (new) { - start = ocfs2_align_clusters_to_page_index(inode->i_sb, - wc->w_cpos); - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); } else { - start = wc->w_pos >> PAGE_CACHE_SHIFT; - v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; + wc->w_num_pages = 1; + start = target_index; } - for(i = 0; i < numpages; i++) { + for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); - if (!cpages[i]) { + wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); + if (!wc->w_pages[i]) { ret = -ENOMEM; mlog_errno(ret); goto out; } + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; } +out: + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + + new = phys == 0 ? 1 : 0; if (new) { + u32 tmp_pos; + /* * This is safe to call with the page locks - it won't take * any additional semaphores or cluster locks. */ - tmp_pos = wc->w_cpos; + tmp_pos = cpos; ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, di_bh, handle, - data_ac, meta_ac, NULL); + &tmp_pos, 1, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1121,103 +1113,132 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, mlog_errno(ret); goto out; } + + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + } else { + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; } + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, NULL); if (ret < 0) { - - /* - * XXX: Should we go readonly here? - */ - - mlog_errno(ret); + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); goto out; } BUG_ON(p_blkno == 0); - for(i = 0; i < numpages; i++) { - ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], - wc, new); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; - copied += ret; + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, new); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + tmpret = ret; + } } + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + out: - for(i = 0; i < numpages; i++) { - unlock_page(cpages[i]); - mark_page_accessed(cpages[i]); - page_cache_release(cpages[i]); - } - kfree(cpages); - return copied ? copied : ret; + return ret; } -static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, - struct ocfs2_super *osb, loff_t pos, - size_t count, ocfs2_page_writer *cb, - void *cb_priv) +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) { - wc->w_count = count; - wc->w_pos = pos; - wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; - wc->w_finished_copy = 0; + struct ocfs2_write_cluster_desc *desc; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) - wc->w_large_pages = 1; - else - wc->w_large_pages = 0; + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ - wc->w_write_data_page = cb; - wc->w_private = cb_priv; + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they are holes or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_new) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_new) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } } -/* - * Write a cluster to an inode. The cluster may not be allocated yet, - * in which case it will be. This only exists for buffered writes - - * O_DIRECT takes a more "traditional" path through the kernel. - * - * The caller is responsible for incrementing pos, written counts, etc - * - * For file systems that don't support sparse files, pre-allocation - * and page zeroing up until cpos should be done prior to this - * function call. - * - * Callers should be holding i_sem, and the rw cluster lock. - * - * Returns the number of user bytes written, or less than zero for - * error. - */ -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, - size_t count, ocfs2_page_writer *actor, - void *priv) +int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; - ssize_t written = 0; - u32 phys; - struct inode *inode = file->f_mapping->host; + int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int num_clusters = 0, clusters_to_alloc = 0; + u32 phys = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; - struct ocfs2_write_ctxt wc; + struct ocfs2_write_cluster_desc *desc; - ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len); + if (ret) { + mlog_errno(ret); + return ret; + } - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1); if (ret) { mlog_errno(ret); goto out; } - di = (struct ocfs2_dinode *)di_bh->b_data; + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; /* * Take alloc sem here to prevent concurrent lookups. That way @@ -1228,23 +1249,60 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); - if (ret) { - mlog_errno(ret); - goto out_meta; + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, NULL); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + clusters_to_alloc++; + } + + num_clusters--; } - /* phys == 0 means that allocation is required. */ - if (phys == 0) { - ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc > 0) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greately over-estimates + * the work to be done. + */ + ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out_meta; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); + credits = ocfs2_calc_extend_credits(inode->i_sb, di, + clusters_to_alloc); + } + ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); + ret = ocfs2_data_lock(inode, 1); if (ret) { mlog_errno(ret); @@ -1258,36 +1316,50 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, goto out_data; } - written = ocfs2_write(file, phys, handle, di_bh, data_ac, - meta_ac, &wc); - if (written < 0) { - ret = written; + wc->w_handle = handle; + + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. + */ + ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { mlog_errno(ret); goto out_commit; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + clusters_to_alloc); if (ret) { mlog_errno(ret); goto out_commit; } - pos += written; - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, + meta_ac, wc, desc->c_cpos, pos, len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } } - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; out_commit: ocfs2_commit_trans(osb, handle); @@ -1299,13 +1371,85 @@ out_meta: ocfs2_meta_unlock(inode, 1); out: - brelse(di_bh); + ocfs2_free_write_ctxt(wc); + if (data_ac) ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); + return ret; +} + +int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. + */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (ocfs2_should_order_data(inode)) + walk_page_buffers(wc->w_handle, page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); + + block_commit_write(tmppage, from, to); + } + + pos += copied; + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ocfs2_journal_dirty(handle, wc->w_di_bh); + + ocfs2_commit_trans(osb, handle); + ocfs2_data_unlock(inode, 1); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 1); + ocfs2_free_write_ctxt(wc); - return written ? written : ret; + return copied; } const struct address_space_operations ocfs2_aops = { diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 45821d479b5a..bdcdd1ae63a9 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -42,57 +42,13 @@ int walk_page_buffers( handle_t *handle, int (*fn)( handle_t *handle, struct buffer_head *bh)); -struct ocfs2_write_ctxt; -typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, - u64 *, unsigned int *, unsigned int *); +int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, - size_t count, ocfs2_page_writer *actor, - void *priv); - -struct ocfs2_write_ctxt { - size_t w_count; - loff_t w_pos; - u32 w_cpos; - unsigned int w_finished_copy; - - /* This is true if page_size > cluster_size */ - unsigned int w_large_pages; - - /* Filler callback and private data */ - ocfs2_page_writer *w_write_data_page; - void *w_private; - - /* Only valid for the filler callback */ - struct page *w_this_page; - unsigned int w_this_page_new; -}; - -struct ocfs2_buffered_write_priv { - char *b_src_buf; - const struct iovec *b_cur_iov; /* Current iovec */ - size_t b_cur_off; /* Offset in the - * current iovec */ -}; -int ocfs2_map_and_write_user_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, - u64 *p_blkno, - unsigned int *ret_from, - unsigned int *ret_to); - -struct ocfs2_splice_write_priv { - struct splice_desc *s_sd; - struct pipe_buffer *s_buf; - struct pipe_inode_info *s_pipe; - /* Neither offset value is ever larger than one page */ - unsigned int s_offset; - unsigned int s_buf_offset; -}; -int ocfs2_map_and_write_splice_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, - u64 *p_blkno, - unsigned int *ret_from, - unsigned int *ret_to); +int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 566f9b70ec91..4c850d00c269 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1335,15 +1335,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) *basep = base; } -static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, +static struct page * ocfs2_get_write_source(char **ret_src_buf, const struct iovec *cur_iov, size_t iov_offset) { int ret; - char *buf; + char *buf = cur_iov->iov_base + iov_offset; struct page *src_page = NULL; + unsigned long off; - buf = cur_iov->iov_base + iov_offset; + off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; if (!segment_eq(get_fs(), KERNEL_DS)) { /* @@ -1355,18 +1356,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp (unsigned long)buf & PAGE_CACHE_MASK, 1, 0, 0, &src_page, NULL); if (ret == 1) - bp->b_src_buf = kmap(src_page); + *ret_src_buf = kmap(src_page) + off; else src_page = ERR_PTR(-EFAULT); } else { - bp->b_src_buf = buf; + *ret_src_buf = buf; } return src_page; } -static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, - struct page *page) +static void ocfs2_put_write_source(struct page *page) { if (page) { kunmap(page); @@ -1382,10 +1382,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, { int ret = 0; ssize_t copied, total = 0; - size_t iov_offset = 0; + size_t iov_offset = 0, bytes; + loff_t pos; const struct iovec *cur_iov = iov; - struct ocfs2_buffered_write_priv bp; - struct page *page; + struct page *user_page, *page; + char *buf, *dst; + void *fsdata; /* * handle partial DIO write. Adjust cur_iov if needed. @@ -1393,21 +1395,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); do { - bp.b_cur_off = iov_offset; - bp.b_cur_iov = cur_iov; + pos = *ppos; - page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); + if (IS_ERR(user_page)) { + ret = PTR_ERR(user_page); goto out; } - copied = ocfs2_buffered_write_cluster(file, *ppos, count, - ocfs2_map_and_write_user_data, - &bp); + /* Stay within our page boundaries */ + bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), + (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); + /* Stay within the vector boundary */ + bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); + /* Stay within count */ + bytes = min(bytes, count); + + page = NULL; + ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, + &page, &fsdata); + if (ret) { + mlog_errno(ret); + goto out; + } - ocfs2_put_write_source(&bp, page); + dst = kmap_atomic(page, KM_USER0); + memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); + kunmap_atomic(dst, KM_USER0); + flush_dcache_page(page); + ocfs2_put_write_source(user_page); + copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, + bytes, page, fsdata); if (copied < 0) { mlog_errno(copied); ret = copied; @@ -1415,7 +1434,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, } total += copied; - *ppos = *ppos + copied; + *ppos = pos + copied; count -= copied; ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); @@ -1585,52 +1604,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - int ret, count, total = 0; + int ret, count; ssize_t copied = 0; - struct ocfs2_splice_write_priv sp; + struct file *file = sd->u.file; + unsigned int offset; + struct page *page = NULL; + void *fsdata; + char *src, *dst; ret = buf->ops->confirm(pipe, buf); if (ret) goto out; - sp.s_sd = sd; - sp.s_buf = buf; - sp.s_pipe = pipe; - sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; - sp.s_buf_offset = buf->offset; - + offset = sd->pos & ~PAGE_CACHE_MASK; count = sd->len; - if (count + sp.s_offset > PAGE_CACHE_SIZE) - count = PAGE_CACHE_SIZE - sp.s_offset; + if (count + offset > PAGE_CACHE_SIZE) + count = PAGE_CACHE_SIZE - offset; - do { - /* - * splice wants us to copy up to one page at a - * time. For pagesize > cluster size, this means we - * might enter ocfs2_buffered_write_cluster() more - * than once, so keep track of our progress here. - */ - copied = ocfs2_buffered_write_cluster(sd->u.file, - (loff_t)sd->pos + total, - count, - ocfs2_map_and_write_splice_data, - &sp); - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; - } + ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, + &page, &fsdata); + if (ret) { + mlog_errno(ret); + goto out; + } - count -= copied; - sp.s_offset += copied; - sp.s_buf_offset += copied; - total += copied; - } while (count); + src = buf->ops->map(pipe, buf, 1); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + buf->offset, count); + kunmap_atomic(page, KM_USER1); + buf->ops->unmap(pipe, buf, src); - ret = 0; + copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, + page, fsdata); + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } out: - return total ? total : ret; + return copied ? copied : ret; } static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, -- cgit v1.2.1 From 607d44aa3fa6f40b0facaf1028886ed362b92682 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 9 May 2007 15:14:45 -0700 Subject: ocfs2: factor out write aops into nolock variants ocfs2_mkwrite() will want this so that it can add some mmap specific checks before asking for a write. Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 120 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3e5758ebd932..fc723fb9c981 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -849,7 +849,7 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, struct ocfs2_super *osb, loff_t pos, - unsigned len) + unsigned len, struct buffer_head *di_bh) { struct ocfs2_write_ctxt *wc; @@ -859,6 +859,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, wc->w_cpos = pos >> osb->s_clustersize_bits; wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); + get_bh(di_bh); + wc->w_di_bh = di_bh; if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) wc->w_large_pages = 1; @@ -1211,9 +1213,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, } } -int ocfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh) { int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int num_clusters = 0, clusters_to_alloc = 0; @@ -1227,28 +1230,14 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, handle_t *handle; struct ocfs2_write_cluster_desc *desc; - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); return ret; } - ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1); - if (ret) { - mlog_errno(ret); - goto out; - } di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - /* - * Take alloc sem here to prevent concurrent lookups. That way - * the mapping, zeroing and tree manipulation within - * ocfs2_write() will be safe against ->readpage(). This - * should also serve to lock out allocation from a shared - * writeable region. - */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - for (i = 0; i < wc->w_clen; i++) { desc = &wc->w_desc[i]; desc->c_cpos = wc->w_cpos + i; @@ -1258,7 +1247,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, &num_clusters, NULL); if (ret) { mlog_errno(ret); - goto out_meta; + goto out; } } else if (phys) { /* @@ -1293,7 +1282,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, &data_ac, &meta_ac); if (ret) { mlog_errno(ret); - goto out_meta; + goto out; } credits = ocfs2_calc_extend_credits(inode->i_sb, di, @@ -1303,17 +1292,11 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); - ret = ocfs2_data_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto out_meta; - } - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_data; + goto out; } wc->w_handle = handle; @@ -1363,13 +1346,6 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, out_commit: ocfs2_commit_trans(osb, handle); -out_data: - ocfs2_data_unlock(inode, 1); - -out_meta: - up_write(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 1); - out: ocfs2_free_write_ctxt(wc); @@ -1380,9 +1356,60 @@ out: return ret; } -int ocfs2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_data_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + fsdata, di_bh); + if (ret) { + mlog_errno(ret); + goto out_fail_data; + } + + brelse(di_bh); + + return 0; + +out_fail_data: + ocfs2_data_unlock(inode, 1); +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_meta_unlock(inode, 1); + + return ret; +} + +static int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int i; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); @@ -1444,12 +1471,25 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping, ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); + ocfs2_free_write_ctxt(wc); + + return copied; +} + +int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ocfs2_data_unlock(inode, 1); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_meta_unlock(inode, 1); - ocfs2_free_write_ctxt(wc); - return copied; + return ret; } const struct address_space_operations ocfs2_aops = { -- cgit v1.2.1 From 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 9 May 2007 15:16:19 -0700 Subject: ocfs2: shared writeable mmap Implement cluster consistent shared writeable mappings using the ->page_mkwrite() callback. Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 56 ++++++++++++++----- fs/ocfs2/aops.h | 9 +++ fs/ocfs2/file.c | 7 +++ fs/ocfs2/mmap.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 200 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index fc723fb9c981..b8869fd0884f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1034,7 +1034,8 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new) + u32 cpos, loff_t user_pos, int new, + struct page *mmap_page) { int ret = 0, i; unsigned long start, target_index, index; @@ -1058,11 +1059,36 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); - if (!wc->w_pages[i]) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + if (mmap_page->mapping != mapping) { + unlock_page(mmap_page); + /* + * Sanity check - the locking in + * ocfs2_pagemkwrite() should ensure + * that this code doesn't trigger. + */ + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } } if (index == target_index) @@ -1213,10 +1239,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, } } -static int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh) +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) { int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int num_clusters = 0, clusters_to_alloc = 0; @@ -1318,7 +1344,7 @@ static int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc); + clusters_to_alloc, mmap_page); if (ret) { mlog_errno(ret); goto out_commit; @@ -1386,7 +1412,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, } ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, - fsdata, di_bh); + fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail_data; @@ -1407,9 +1433,9 @@ out_fail: return ret; } -static int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int i; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index bdcdd1ae63a9..389579bd64e3 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -50,6 +50,15 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page); + /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4c850d00c269..a80f31776d94 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1001,6 +1001,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } + /* + * This will intentionally not wind up calling vmtruncate(), + * since all the work for a size change has been done above. + * Otherwise, we could get into problems with truncate as + * ip_alloc_sem is used there to protect against i_size + * changes. + */ status = inode_setattr(inode, attr); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af01158b39f5..d79aa12137d2 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -37,11 +37,29 @@ #include "ocfs2.h" +#include "aops.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "mmap.h" +static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) +{ + /* The best way to deal with signals in the vm path is + * to block them upfront, rather than allowing the + * locking paths to return -ERESTARTSYS. */ + sigfillset(blocked); + + /* We should technically never get a bad return value + * from sigprocmask */ + return sigprocmask(SIG_BLOCK, blocked, oldset); +} + +static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) +{ + return sigprocmask(SIG_SETMASK, oldset, NULL); +} + static struct page *ocfs2_nopage(struct vm_area_struct * area, unsigned long address, int *type) @@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, type); - /* The best way to deal with signals in this path is - * to block them upfront, rather than allowing the - * locking paths to return -ERESTARTSYS. */ - sigfillset(&blocked); - - /* We should technically never get a bad ret return - * from sigprocmask */ - ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); if (ret < 0) { mlog_errno(ret); goto out; @@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, page = filemap_nopage(area, address, type); - ret = sigprocmask(SIG_SETMASK, &oldset, NULL); + ret = ocfs2_vm_op_unblock_sigs(&oldset); if (ret < 0) mlog_errno(ret); out: @@ -76,28 +87,136 @@ out: return page; } -static struct vm_operations_struct ocfs2_file_vm_ops = { - .nopage = ocfs2_nopage, -}; +static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, + struct page *page) +{ + int ret; + struct address_space *mapping = inode->i_mapping; + loff_t pos = page->index << PAGE_CACHE_SHIFT; + unsigned int len = PAGE_CACHE_SIZE; + pgoff_t last_index; + struct page *locked_page = NULL; + void *fsdata; + loff_t size = i_size_read(inode); -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) + /* + * Another node might have truncated while we were waiting on + * cluster locks. + */ + last_index = size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) { + ret = -EINVAL; + goto out; + } + + /* + * The i_size check above doesn't catch the case where nodes + * truncated and then re-extended the file. We'll re-check the + * page mapping after taking the page lock inside of + * ocfs2_write_begin_nolock(). + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EINVAL; + goto out; + } + + /* + * Call ocfs2_write_begin() and ocfs2_write_end() to take + * advantage of the allocation code there. We pass a write + * length of the whole page (chopped to i_size) to make sure + * the whole thing is allocated. + * + * Since we know the page is up to date, we don't have to + * worry about ocfs2_write_begin() skipping some buffer reads + * because the "write" would invalidate their data. + */ + if (page->index == last_index) + len = size & ~PAGE_CACHE_MASK; + + ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + &fsdata, di_bh, page); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + BUG_ON(ret != len); + ret = 0; +out: + return ret; +} + +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) { - int ret = 0, lock_level = 0; - struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct buffer_head *di_bh = NULL; + sigset_t blocked, oldset; + int ret, ret2; + + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* + * The cluster locks taken will block a truncate from another + * node. Taking the data lock will also ensure that we don't + * attempt page truncation as part of a downconvert. + */ + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } /* - * Only support shared writeable mmap for local mounts which - * don't know about holes. + * The alloc sem should be enough to serialize with + * ocfs2_truncate_file() changing i_size as well as any thread + * modifying the inode btree. */ - if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && - ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && - ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { - mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); - /* This is -EINVAL because generic_file_readonly_mmap - * returns it in a similar situation. */ - return -EINVAL; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_data_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_meta_unlock; } + ret = __ocfs2_page_mkwrite(inode, di_bh, page); + + ocfs2_data_unlock(inode, 1); + +out_meta_unlock: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_meta_unlock(inode, 1); + +out: + ret2 = ocfs2_vm_op_unblock_sigs(&oldset); + if (ret2 < 0) + mlog_errno(ret2); + + return ret; +} + +static struct vm_operations_struct ocfs2_file_vm_ops = { + .nopage = ocfs2_nopage, + .page_mkwrite = ocfs2_page_mkwrite, +}; + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0, lock_level = 0; + ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, file->f_vfsmnt, &lock_level); if (ret < 0) { -- cgit v1.2.1 From bce997682fe3121516f5a20cf7bad2e6029ba018 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 18 Jun 2007 11:12:36 -0700 Subject: ocfs2: harden buffer check during mapping of page blocks We don't want to submit buffer_new blocks for read i/o. This actually won't happen right now because those requests during an allocating write are all nicely aligned. It's probably a good idea to provide an explicit check though. Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b8869fd0884f..e8d16ae12ef0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -712,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && - (block_start < from || block_end > to)) { + !buffer_new(bh) && + (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; } -- cgit v1.2.1 From 2b604351bc99b4e4504758cbac369b660b71de0b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 22 Jun 2007 15:45:27 -0700 Subject: ocfs2: simplify deallocation locking Deallocation of suballocator blocks, most notably extent blocks, might involve multiple suballocator inodes. The locking for this can get extremely complicated, especially when the suballocator inodes to delete from aren't known until deep within an unrelated codepath. Implement a simple scheme for recording the blocks to be unlinked so that the actual deallocation can be done in a context which won't deadlock. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/alloc.h | 19 +++++ fs/ocfs2/suballoc.c | 27 ++----- fs/ocfs2/suballoc.h | 13 ++++ 4 files changed, 242 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 02b6e7af8edb..873bb99fc2ff 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2957,6 +2957,210 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) return status; } +/* + * Delayed de-allocation of suballocator blocks. + * + * Some sets of block de-allocations might involve multiple suballocator inodes. + * + * The locking for this can get extremely complicated, especially when + * the suballocator inodes to delete from aren't known until deep + * within an unrelated codepath. + * + * ocfs2_extent_block structures are a good example of this - an inode + * btree could have been grown by any number of nodes each allocating + * out of their own suballoc inode. + * + * These structures allow the delay of block de-allocation until a + * later time, when locking of multiple cluster inodes won't cause + * deadlock. + */ + +/* + * Describes a single block free from a suballocator + */ +struct ocfs2_cached_block_free { + struct ocfs2_cached_block_free *free_next; + u64 free_blk; + unsigned int free_bit; +}; + +struct ocfs2_per_slot_free_list { + struct ocfs2_per_slot_free_list *f_next_suballocator; + int f_inode_type; + int f_slot; + struct ocfs2_cached_block_free *f_first; +}; + +static int ocfs2_free_cached_items(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) +{ + int ret; + u64 bg_blkno; + handle_t *handle; + struct inode *inode; + struct buffer_head *di_bh = NULL; + struct ocfs2_cached_block_free *tmp; + + inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&inode->i_mutex); + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + while (head) { + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); + mlog(0, "Free bit: (bit %u, blkno %llu)\n", + head->free_bit, (unsigned long long)head->free_blk); + + ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, + head->free_bit, bg_blkno, 1); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + tmp = head; + head = head->free_next; + kfree(tmp); + } + +out_journal: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_meta_unlock(inode, 1); + brelse(di_bh); +out_mutex: + mutex_unlock(&inode->i_mutex); + iput(inode); +out: + while(head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + int ret = 0, ret2; + struct ocfs2_per_slot_free_list *fl; + + if (!ctxt) + return 0; + + while (ctxt->c_first_suballocator) { + fl = ctxt->c_first_suballocator; + + if (fl->f_first) { + mlog(0, "Free items: (type %u, slot %d)\n", + fl->f_inode_type, fl->f_slot); + ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, + fl->f_slot, fl->f_first); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + } + + ctxt->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + + return ret; +} + +static struct ocfs2_per_slot_free_list * +ocfs2_find_per_slot_free_list(int type, + int slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == slot) + return fl; + + fl = fl->f_next_suballocator; + } + + fl = kmalloc(sizeof(*fl), GFP_NOFS); + if (fl) { + fl->f_inode_type = type; + fl->f_slot = slot; + fl->f_first = NULL; + fl->f_next_suballocator = ctxt->c_first_suballocator; + + ctxt->c_first_suballocator = fl; + } + return fl; +} + +static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 blkno, + unsigned int bit) +{ + int ret; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *item; + + fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); + if (fl == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + item = kmalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", + type, slot, bit, (unsigned long long)blkno); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = fl->f_first; + + fl->f_first = item; + + ret = 0; +out: + return ret; +} + /* This function will figure out whether the currently last extent * block will be deleted, and if it will, what the new last extent * block will be so we can update his h_next_leaf_blk field, as well diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fbcb5934a081..01db0adc2150 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -63,6 +63,25 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, struct ocfs2_dinode *tl_copy); +/* + * Process local structure which describes the block unlinks done + * during an operation. This is populated via + * ocfs2_cache_block_dealloc(). + * + * ocfs2_run_deallocs() should be called after the potentially + * de-allocating routines. No journal handles should be open, and most + * locks should have been dropped. + */ +struct ocfs2_cached_dealloc_ctxt { + struct ocfs2_per_slot_free_list *c_first_suballocator; +}; +static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) +{ + c->c_first_suballocator = NULL; +} +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt); + struct ocfs2_truncate_context { struct inode *tc_ext_alloc_inode; struct buffer_head *tc_ext_alloc_bh; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index e3437626d183..6788f2f1a667 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle, u16 chain); static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, u32 wanted); -static int ocfs2_free_suballoc_bits(handle_t *handle, - struct inode *alloc_inode, - struct buffer_head *alloc_bh, - unsigned int start_bit, - u64 bg_blkno, - unsigned int count); -static inline u64 ocfs2_which_suballoc_group(u64 block, - unsigned int bit); static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, u64 bg_blkno, u16 bg_bit_off); @@ -1626,12 +1618,12 @@ bail: /* * expects the suballoc inode to already be locked. */ -static int ocfs2_free_suballoc_bits(handle_t *handle, - struct inode *alloc_inode, - struct buffer_head *alloc_bh, - unsigned int start_bit, - u64 bg_blkno, - unsigned int count) +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) { int status = 0; u32 tmp_used; @@ -1703,13 +1695,6 @@ bail: return status; } -static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) -{ - u64 group = block - (u64) bit; - - return group; -} - int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 1a3c94cb9250..7bc4819db4db 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,12 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, u32 *cluster_start, u32 *num_clusters); +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, @@ -100,6 +106,13 @@ int ocfs2_free_clusters(handle_t *handle, u64 start_blk, unsigned int num_clusters); +static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, u64 bg_blkno) { -- cgit v1.2.1 From 59a5e416d1ab543a5248a2b34d83202c4d55d132 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 22 Jun 2007 15:52:36 -0700 Subject: ocfs2: plug truncate into cached dealloc routines Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 102 ++++++++++++++-------------------------------------- fs/ocfs2/alloc.h | 3 +- fs/ocfs2/aops.c | 1 + fs/ocfs2/suballoc.c | 13 ------- fs/ocfs2/suballoc.h | 4 --- 5 files changed, 29 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 873bb99fc2ff..26e867087e95 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -50,6 +50,8 @@ #include "buffer_head_io.h" static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb); /* * Structures which describe a path through a btree, and functions to @@ -3161,6 +3163,15 @@ out: return ret; } +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb) +{ + return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_blkno), + le16_to_cpu(eb->h_suballoc_bit)); +} + /* This function will figure out whether the currently last extent * block will be deleted, and if it will, what the new last extent * block will be so we can update his h_next_leaf_blk field, as well @@ -3442,27 +3453,10 @@ delete: BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); - if (le16_to_cpu(eb->h_suballoc_slot) == 0) { - /* - * This code only understands how to - * lock the suballocator in slot 0, - * which is fine because allocation is - * only ever done out of that - * suballocator too. A future version - * might change that however, so avoid - * a free if we don't know how to - * handle it. This way an fs incompat - * bit will not be necessary. - */ - ret = ocfs2_free_extent_block(handle, - tc->tc_ext_alloc_inode, - tc->tc_ext_alloc_bh, - eb); - - /* An error here is not fatal. */ - if (ret < 0) - mlog_errno(ret); - } + ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); + /* An error here is not fatal. */ + if (ret < 0) + mlog_errno(ret); } else { deleted_eb = 0; } @@ -3965,6 +3959,8 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); + ocfs2_run_deallocs(osb, &tc->tc_dealloc); + ocfs2_free_path(path); /* This will drop the ext_alloc cluster lock for us */ @@ -3975,23 +3971,18 @@ bail: } /* - * Expects the inode to already be locked. This will figure out which - * inodes need to be locked and will put them on the returned truncate - * context. + * Expects the inode to already be locked. */ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, struct ocfs2_truncate_context **tc) { - int status, metadata_delete, i; + int status; unsigned int new_i_clusters; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; struct buffer_head *last_eb_bh = NULL; - struct inode *ext_alloc_inode = NULL; - struct buffer_head *ext_alloc_bh = NULL; mlog_entry_void(); @@ -4011,12 +4002,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); - metadata_delete = 0; if (fe->id2.i_list.l_tree_depth) { - /* If we have a tree, then the truncate may result in - * metadata deletes. Figure this out from the - * rightmost leaf block.*/ status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), &last_eb_bh, OCFS2_BH_CACHED, inode); if (status < 0) { @@ -4031,43 +4019,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, status = -EIO; goto bail; } - el = &(eb->h_list); - - i = 0; - if (ocfs2_is_empty_extent(&el->l_recs[0])) - i = 1; - /* - * XXX: Should we check that next_free_rec contains - * the extent? - */ - if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) - metadata_delete = 1; } (*tc)->tc_last_eb_bh = last_eb_bh; - if (metadata_delete) { - mlog(0, "Will have to delete metadata for this trunc. " - "locking allocator.\n"); - ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); - if (!ext_alloc_inode) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - mutex_lock(&ext_alloc_inode->i_mutex); - (*tc)->tc_ext_alloc_inode = ext_alloc_inode; - - status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - (*tc)->tc_ext_alloc_bh = ext_alloc_bh; - (*tc)->tc_ext_alloc_locked = 1; - } - status = 0; bail: if (status < 0) { @@ -4081,16 +4036,13 @@ bail: static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) { - if (tc->tc_ext_alloc_inode) { - if (tc->tc_ext_alloc_locked) - ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); - - mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); - iput(tc->tc_ext_alloc_inode); - } - - if (tc->tc_ext_alloc_bh) - brelse(tc->tc_ext_alloc_bh); + /* + * The caller is responsible for completing deallocation + * before freeing the context. + */ + if (tc->tc_dealloc.c_first_suballocator != NULL) + mlog(ML_NOTICE, + "Truncate completion has non-empty dealloc context\n"); if (tc->tc_last_eb_bh) brelse(tc->tc_last_eb_bh); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 01db0adc2150..cb02e53b593c 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -83,8 +83,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt); struct ocfs2_truncate_context { - struct inode *tc_ext_alloc_inode; - struct buffer_head *tc_ext_alloc_bh; + struct ocfs2_cached_dealloc_ctxt tc_dealloc; int tc_ext_alloc_locked; /* is it cluster locked? */ /* these get destroyed once it's passed to ocfs2_commit_truncate. */ struct buffer_head *tc_last_eb_bh; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e8d16ae12ef0..510bf84c9cf5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1498,6 +1498,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); + ocfs2_free_write_ctxt(wc); return copied; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6788f2f1a667..82bf12f887a6 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1708,19 +1708,6 @@ int ocfs2_free_dinode(handle_t *handle, inode_alloc_bh, bit, bg_blkno, 1); } -int ocfs2_free_extent_block(handle_t *handle, - struct inode *eb_alloc_inode, - struct buffer_head *eb_alloc_bh, - struct ocfs2_extent_block *eb) -{ - u64 blk = le64_to_cpu(eb->h_blkno); - u16 bit = le16_to_cpu(eb->h_suballoc_bit); - u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); - - return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, - bit, bg_blkno, 1); -} - int ocfs2_free_clusters(handle_t *handle, struct inode *bitmap_inode, struct buffer_head *bitmap_bh, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 7bc4819db4db..f212dc01a84b 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -96,10 +96,6 @@ int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, struct ocfs2_dinode *di); -int ocfs2_free_extent_block(handle_t *handle, - struct inode *eb_alloc_inode, - struct buffer_head *eb_alloc_bh, - struct ocfs2_extent_block *eb); int ocfs2_free_clusters(handle_t *handle, struct inode *bitmap_inode, struct buffer_head *bitmap_bh, -- cgit v1.2.1 From 1f6697d072e6fd0b332a4301c21060dcb89bd623 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 25 Jun 2007 14:53:33 -0700 Subject: ocfs2: use all extent block suballocators Now that we have a method to deallocate blocks from them, each node should allocate extent blocks from their local suballocator file. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 6 ------ fs/ocfs2/suballoc.c | 6 ------ 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 26e867087e95..b5327b4cdb34 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -386,13 +386,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); eb->h_blkno = cpu_to_le64(first_blkno); eb->h_fs_generation = cpu_to_le32(osb->fs_generation); - -#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS - /* we always use slot zero's suballocator */ - eb->h_suballoc_slot = 0; -#else eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); -#endif eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 82bf12f887a6..d9c5c9fcb30f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -488,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); (*ac)->ac_which = OCFS2_AC_USE_META; - -#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS - slot = 0; -#else slot = osb->slot_num; -#endif - (*ac)->ac_group_search = ocfs2_block_group_search; status = ocfs2_reserve_suballoc_bits(osb, (*ac), -- cgit v1.2.1 From c3afcbb34426a9291e4c038540129053a72c3cd8 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 29 May 2007 14:28:51 -0700 Subject: ocfs2: abstract btree growing calls The top level calls and logic for growing a tree can easily be abstracted out of ocfs2_insert_extent() into a seperate function - ocfs2_grow_tree(). This allows future code to easily grow btrees when needed. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 119 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b5327b4cdb34..a02d026cb0e4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -824,6 +824,74 @@ bail: return status; } +/* + * Grow a b-tree so that it has more records. + * + * We might shift the tree depth in which case existing paths should + * be considered invalid. + * + * Tree depth after the grow is returned via *final_depth. + */ +static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, + struct buffer_head *di_bh, int *final_depth, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, shift; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *bh = NULL; + + BUG_ON(meta_ac == NULL); + + shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + if (shift < 0) { + ret = shift; + mlog_errno(ret); + goto out; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + BUG_ON(bh); + mlog(0, "need to shift tree depth (current = %d)\n", depth); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, + meta_ac, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + depth++; + /* Special case: we have room now if we shifted from + * tree_depth 0 */ + if (depth == 1) + goto out; + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + mlog(0, "add branch. bh = %p\n", bh); + ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (final_depth) + *final_depth = depth; + brelse(bh); + return ret; +} + /* * This is only valid for leaf nodes, which are the only ones that can * have empty extents anyway. @@ -2325,7 +2393,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, u32 new_clusters, struct ocfs2_alloc_context *meta_ac) { - int status, shift; + int status; struct buffer_head *last_eb_bh = NULL; struct buffer_head *bh = NULL; struct ocfs2_insert_type insert = {0, }; @@ -2360,55 +2428,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, insert.ins_appending, insert.ins_contig, insert.ins_contig_index, insert.ins_free_records, insert.ins_tree_depth); - /* - * Avoid growing the tree unless we're out of records and the - * insert type requres one. - */ - if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) - goto out_add; - - shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); - if (shift < 0) { - status = shift; - mlog_errno(status); - goto bail; - } - - /* We traveled all the way to the bottom of the allocation tree - * and didn't find room for any more extents - we need to add - * another tree level */ - if (shift) { - BUG_ON(bh); - mlog(0, "need to shift tree depth " - "(current = %d)\n", insert.ins_tree_depth); - - /* ocfs2_shift_tree_depth will return us a buffer with - * the new extent block (so we can pass that to - * ocfs2_add_branch). */ - status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, - meta_ac, &bh); - if (status < 0) { + if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { + status = ocfs2_grow_tree(inode, handle, fe_bh, + &insert.ins_tree_depth, last_eb_bh, + meta_ac); + if (status) { mlog_errno(status); goto bail; } - insert.ins_tree_depth++; - /* Special case: we have room now if we shifted from - * tree_depth 0 */ - if (insert.ins_tree_depth == 1) - goto out_add; - } - - /* call ocfs2_add_branch to add the final part of the tree with - * the new data. */ - mlog(0, "add branch. bh = %p\n", bh); - status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, - meta_ac); - if (status < 0) { - mlog_errno(status); - goto bail; } -out_add: /* Finally, we can add clusters. This might rotate the tree for us. */ status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); if (status < 0) -- cgit v1.2.1 From 328d5752e1259dfb29b7e65f6c2d145fddbaa750 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 18 Jun 2007 10:48:04 -0700 Subject: ocfs2: btree changes for unwritten extents Writes to a region marked as unwritten might result in a record split or merge. We can support splits by making minor changes to the existing insert code. Merges require left rotations which mostly re-use right rotation support functions. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 2043 ++++++++++++++++++++++++++++++++++++++++++++----- fs/ocfs2/alloc.h | 6 + fs/ocfs2/endian.h | 5 + fs/ocfs2/extent_map.c | 31 - fs/ocfs2/ocfs2.h | 13 + fs/ocfs2/ocfs2_fs.h | 7 +- 6 files changed, 1892 insertions(+), 213 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a02d026cb0e4..0db6a1f724e1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -118,6 +118,31 @@ static void ocfs2_free_path(struct ocfs2_path *path) } } +/* + * All the elements of src into dest. After this call, src could be freed + * without affecting dest. + * + * Both paths should have the same root. Any non-root elements of dest + * will be freed. + */ +static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_el(dest) != path_root_el(src)); + + ocfs2_reinit_path(dest, 1); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + if (dest->p_node[i].bh) + get_bh(dest->p_node[i].bh); + } +} + /* * Make the *dest path the same as src and re-initialize src path to * have a root only. @@ -214,10 +239,41 @@ out: return ret; } +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. + */ +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); + + rec_end = rec_start + clusters; + + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } + } + + return ret; +} + enum ocfs2_contig_type { CONTIG_NONE = 0, CONTIG_LEFT, - CONTIG_RIGHT + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, }; @@ -255,6 +311,14 @@ static enum ocfs2_contig_type { u64 blkno = le64_to_cpu(insert_rec->e_blkno); + /* + * Refuse to coalesce extent records with different flag + * fields - we don't want to mix unwritten extents with user + * data. + */ + if (ext->e_flags != insert_rec->e_flags) + return CONTIG_NONE; + if (ocfs2_extents_adjacent(ext, insert_rec) && ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) return CONTIG_RIGHT; @@ -279,7 +343,14 @@ enum ocfs2_append_type { APPEND_TAIL, }; +enum ocfs2_split_type { + SPLIT_NONE = 0, + SPLIT_LEFT, + SPLIT_RIGHT, +}; + struct ocfs2_insert_type { + enum ocfs2_split_type ins_split; enum ocfs2_append_type ins_appending; enum ocfs2_contig_type ins_contig; int ins_contig_index; @@ -287,6 +358,13 @@ struct ocfs2_insert_type { int ins_tree_depth; }; +struct ocfs2_merge_ctxt { + enum ocfs2_contig_type c_contig_type; + int c_has_empty_extent; + int c_split_covers_rec; + int c_used_tail_recs; +}; + /* * How many free extents have we got before we need more meta data? */ @@ -457,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, + struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int status, new_blocks, i; @@ -472,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_entry_void(); - BUG_ON(!last_eb_bh); + BUG_ON(!last_eb_bh || !*last_eb_bh); fe = (struct ocfs2_dinode *) fe_bh->b_data; @@ -503,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -564,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access(handle, inode, last_eb_bh, + status = ocfs2_journal_access(handle, inode, *last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -597,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * next_leaf on the previously last-extent-block. */ fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); - status = ocfs2_journal_dirty(handle, last_eb_bh); + status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); status = ocfs2_journal_dirty(handle, fe_bh); @@ -612,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); } + /* + * Some callers want to track the rightmost leaf so pass it + * back here. + */ + brelse(*last_eb_bh); + get_bh(new_eb_bhs[0]); + *last_eb_bh = new_eb_bhs[0]; + status = 0; bail: if (new_eb_bhs) { @@ -831,10 +917,12 @@ bail: * be considered invalid. * * Tree depth after the grow is returned via *final_depth. + * + * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, int *final_depth, - struct buffer_head *last_eb_bh, + struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; @@ -869,10 +957,21 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, goto out; } depth++; - /* Special case: we have room now if we shifted from - * tree_depth 0 */ - if (depth == 1) + if (depth == 1) { + /* + * Special case: we have room now if we shifted from + * tree_depth 0, so no more work needs to be done. + * + * We won't be calling add_branch, so pass + * back *last_eb_bh as the new leaf. At depth + * zero, it should always be null so there's + * no reason to brelse. + */ + BUG_ON(*last_eb_bh); + get_bh(bh); + *last_eb_bh = bh; goto out; + } } /* call ocfs2_add_branch to add the final part of the tree with @@ -998,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, } +static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) +{ + int size, num_recs = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(num_recs == 0); + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + num_recs--; + size = num_recs * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[0], &el->l_recs[1], size); + memset(&el->l_recs[num_recs], 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(num_recs); + } +} + /* * Create an empty extent record . * @@ -1275,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); + left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); + } left_clusters -= le32_to_cpu(left_rec->e_cpos); left_rec->e_int_clusters = cpu_to_le32(left_clusters); @@ -1595,10 +1714,16 @@ out: return ret; } +/* + * Extend the transaction by enough credits to complete the rotation, + * and still leave at least the original number of credits allocated + * to this transaction. + */ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + int op_credits, struct ocfs2_path *path) { - int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; if (handle->h_buffer_credits < credits) return ocfs2_extend_trans(handle, credits); @@ -1632,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, return 0; } +static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + unsigned int range; + struct ocfs2_extent_rec *rec; + + if (next_free == 0) + return 0; + + rec = &el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + /* Empty list. */ + if (next_free == 1) + return 0; + rec = &el->l_recs[1]; + } + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + return 1; + return 0; +} + /* * Rotate all the records in a btree right one record, starting at insert_cpos. * @@ -1650,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, */ static int ocfs2_rotate_tree_right(struct inode *inode, handle_t *handle, + enum ocfs2_split_type split, u32 insert_cpos, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start; + int ret, start, orig_credits = handle->h_buffer_credits; u32 cpos; struct ocfs2_path *left_path = NULL; @@ -1721,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode, (unsigned long long) path_leaf_bh(left_path)->b_blocknr); - if (ocfs2_rotate_requires_path_adjustment(left_path, + if (split == SPLIT_NONE && + ocfs2_rotate_requires_path_adjustment(left_path, insert_cpos)) { - mlog(0, "Path adjustment required\n"); /* * We've rotated the tree as much as we @@ -1751,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, right_path->p_tree_depth); ret = ocfs2_extend_rotate_transaction(handle, start, - right_path); + orig_credits, right_path); if (ret) { mlog_errno(ret); goto out; @@ -1764,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } + if (split != SPLIT_NONE && + ocfs2_leftmost_rec_contains(path_leaf_el(right_path), + insert_cpos)) { + /* + * A rotate moves the rightmost left leaf + * record over to the leftmost right leaf + * slot. If we're doing an extent split + * instead of a real insert, then we have to + * check that the extent to be split wasn't + * just moved over. If it was, then we can + * exit here, passing left_path back - + * ocfs2_split_extent() is smart enough to + * search both leaves. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + /* * There is no need to re-read the next right path * as we know that it'll be our current left @@ -1786,206 +1953,1248 @@ out_ret_path: return ret; } -/* - * Do the final bits of extent record insertion at the target leaf - * list. If this leaf is part of an allocation tree, it is assumed - * that the tree above has been prepared. - */ -static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, - struct ocfs2_extent_list *el, - struct ocfs2_insert_type *insert, - struct inode *inode) +static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) { - int i = insert->ins_contig_index; - unsigned int range; + int i, idx; struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + u32 range; - BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + /* Path should always be rightmost. */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + BUG_ON(eb->h_next_leaf_blk != 0ULL); - /* - * Contiguous insert - either left or right. - */ - if (insert->ins_contig != CONTIG_NONE) { - rec = &el->l_recs[i]; - if (insert->ins_contig == CONTIG_LEFT) { - rec->e_blkno = insert_rec->e_blkno; - rec->e_cpos = insert_rec->e_cpos; - } - le16_add_cpu(&rec->e_leaf_clusters, - le16_to_cpu(insert_rec->e_leaf_clusters)); - return; - } + el = &eb->h_list; + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); - /* - * Handle insert into an empty leaf. - */ - if (le16_to_cpu(el->l_next_free_rec) == 0 || - ((le16_to_cpu(el->l_next_free_rec) == 1) && - ocfs2_is_empty_extent(&el->l_recs[0]))) { - el->l_recs[0] = *insert_rec; - el->l_next_free_rec = cpu_to_le16(1); - return; - } + for (i = 0; i < path->p_tree_depth; i++) { + el = path->p_node[i].el; + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; - /* - * Appending insert. - */ - if (insert->ins_appending == APPEND_TAIL) { - i = le16_to_cpu(el->l_next_free_rec) - 1; - rec = &el->l_recs[i]; - range = le32_to_cpu(rec->e_cpos) - + le16_to_cpu(rec->e_leaf_clusters); - BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + rec->e_int_clusters = cpu_to_le32(range); + le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); - mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count), - "inode %lu, depth %u, count %u, next free %u, " - "rec.cpos %u, rec.clusters %u, " - "insert.cpos %u, insert.clusters %u\n", - inode->i_ino, - le16_to_cpu(el->l_tree_depth), - le16_to_cpu(el->l_count), - le16_to_cpu(el->l_next_free_rec), - le32_to_cpu(el->l_recs[i].e_cpos), - le16_to_cpu(el->l_recs[i].e_leaf_clusters), - le32_to_cpu(insert_rec->e_cpos), - le16_to_cpu(insert_rec->e_leaf_clusters)); - i++; - el->l_recs[i] = *insert_rec; - le16_add_cpu(&el->l_next_free_rec, 1); - return; + ocfs2_journal_dirty(handle, path->p_node[i].bh); } - - /* - * Ok, we have to rotate. - * - * At this point, it is safe to assume that inserting into an - * empty leaf and appending to a leaf have both been handled - * above. - * - * This leaf needs to have space, either by the empty 1st - * extent record, or by virtue of an l_next_rec < l_count. - */ - ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) +static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path *path, int unlink_start) { - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); + int ret, i; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh; + + for(i = unlink_start; i < path_num_items(path); i++) { + bh = path->p_node[i].bh; + + eb = (struct ocfs2_extent_block *)bh->b_data; + /* + * Not all nodes might have had their final count + * decremented by the caller - handle this here. + */ + el = &eb->h_list; + if (le16_to_cpu(el->l_next_free_rec) > 1) { + mlog(ML_ERROR, + "Inode %llu, attempted to remove extent block " + "%llu with %u records\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(el->l_next_free_rec)); + + ocfs2_journal_dirty(handle, bh); + ocfs2_remove_from_cache(inode, bh); + continue; + } + + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_journal_dirty(handle, bh); + + ret = ocfs2_cache_extent_block_free(dealloc, eb); + if (ret) + mlog_errno(ret); + + ocfs2_remove_from_cache(inode, bh); + } } -static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, - struct ocfs2_extent_rec *insert_rec, - struct ocfs2_path *right_path, - struct ocfs2_path **ret_left_path) +static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc) { - int ret, i, next_free; - struct buffer_head *bh; + int i; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; struct ocfs2_extent_list *el; - struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_block *eb; - *ret_left_path = NULL; + el = path_leaf_el(left_path); - /* - * This shouldn't happen for non-trees. The extent rec cluster - * count manipulation below only works for interior nodes. - */ - BUG_ON(right_path->p_tree_depth == 0); + eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; - /* - * If our appending insert is at the leftmost edge of a leaf, - * then we might need to update the rightmost records of the - * neighboring path. - */ - el = path_leaf_el(right_path); - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0 || - (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { - u32 left_cpos; + for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + if (root_el->l_recs[i].e_blkno == eb->h_blkno) + break; - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &left_cpos); - if (ret) { - mlog_errno(ret); - goto out; - } + BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); - mlog(0, "Append may need a left path update. cpos: %u, " - "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), - left_cpos); + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&root_el->l_next_free_rec, -1); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + eb->h_next_leaf_blk = 0; + + ocfs2_journal_dirty(handle, root_bh); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + ocfs2_unlink_path(inode, handle, dealloc, right_path, + subtree_index + 1); +} + +static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int *deleted) +{ + int ret, i, del_right_subtree = 0, right_has_empty = 0; + struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; + struct ocfs2_extent_block *eb; + + *deleted = 0; + + right_leaf_el = path_leaf_el(right_path); + left_leaf_el = path_leaf_el(left_path); + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) + return 0; + eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; + if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { /* - * No need to worry if the append is already in the - * leftmost leaf. + * It's legal for us to proceed if the right leaf is + * the rightmost one and it has an empty extent. There + * are two cases to handle - whether the leaf will be + * empty after removal or not. If the leaf isn't empty + * then just remove the empty extent up front. The + * next block will handle empty leaves by flagging + * them for unlink. + * + * Non rightmost leaves will throw -EAGAIN and the + * caller can manually move the subtree and retry. */ - if (left_cpos) { - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); - if (!left_path) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - ret = ocfs2_find_path(inode, left_path, left_cpos); + if (eb->h_next_leaf_blk != 0ULL) + return -EAGAIN; + + if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { + ret = ocfs2_journal_access(handle, inode, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - /* - * ocfs2_insert_path() will pass the left_path to the - * journal for us. - */ - } + ocfs2_remove_empty_extent(right_leaf_el); + } else + right_has_empty = 1; } - ret = ocfs2_journal_access_path(inode, handle, right_path); - if (ret) { - mlog_errno(ret); - goto out; - } + if (eb->h_next_leaf_blk == 0ULL && + le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { + /* + * We have to update i_last_eb_blk during the meta + * data delete. + */ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + del_right_subtree = 1; + } + + /* + * Getting here with an empty extent in the right path implies + * that it's the rightmost path and will be deleted. + */ + BUG_ON(right_has_empty && !del_right_subtree); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!right_has_empty) { + /* + * Only do this if we're moving a real + * record. Otherwise, the action is delayed until + * after removal of the right path in which case we + * can do a simple shift to remove the empty extent. + */ + ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); + memset(&right_leaf_el->l_recs[0], 0, + sizeof(struct ocfs2_extent_rec)); + } + if (eb->h_next_leaf_blk == 0ULL) { + /* + * Move recs over to get rid of empty extent, decrease + * next_free. This is allowed to remove the last + * extent in our leaf (setting l_next_free_rec to + * zero) - the delete code below won't care. + */ + ocfs2_remove_empty_extent(right_leaf_el); + } + + ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + if (ret) + mlog_errno(ret); + ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + if (ret) + mlog_errno(ret); + + if (del_right_subtree) { + ocfs2_unlink_subtree(inode, handle, left_path, right_path, + subtree_index, dealloc); + ocfs2_update_edge_lengths(inode, handle, left_path); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + di->i_last_eb_blk = eb->h_blkno; + + /* + * Removal of the extent in the left leaf was skipped + * above so we could delete the right path + * 1st. + */ + if (right_has_empty) + ocfs2_remove_empty_extent(left_leaf_el); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + + *deleted = 1; + } else + ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the right of the current one. + * + * Will return zero if the path passed in is already the rightmost path. + * + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + if (path->p_tree_depth == 0) + return 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + int next_free; + + el = path->p_node[i].el; + + /* + * Find the extent record just after the one in our + * path. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == (next_free - 1)) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the rightmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The rightmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, + handle_t *handle, + struct buffer_head *bh, + struct ocfs2_extent_list *el) +{ + int ret; + + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(el); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int __ocfs2_rotate_tree_left(struct inode *inode, + handle_t *handle, int orig_credits, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path **empty_extent_path) +{ + int ret, subtree_root, deleted; + u32 right_cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_path *right_path = NULL; + + BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + + *empty_extent_path = NULL; + + ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_cp_path(left_path, path); + + right_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (right_cpos) { + ret = ocfs2_find_path(inode, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(inode, left_path, + right_path); + + mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", + subtree_root, + (unsigned long long) + right_path->p_node[subtree_root].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + orig_credits, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_left(inode, handle, left_path, + right_path, subtree_root, + dealloc, &deleted); + if (ret == -EAGAIN) { + /* + * The rotation has to temporarily stop due to + * the right subtree having an empty + * extent. Pass it back to the caller for a + * fixup. + */ + *empty_extent_path = right_path; + right_path = NULL; + goto out; + } + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The subtree rotate might have removed records on + * the rightmost edge. If so, then rotation is + * complete. + */ + if (deleted) + break; + + ocfs2_mv_path(left_path, right_path); + + ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(right_path); + ocfs2_free_path(left_path); + + return ret; +} + +static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, subtree_index; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + /* + * XXX: This code assumes that the root is an inode, which is + * true for now but may change as tree code gets generic. + */ + di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto out; + } + + /* + * There's two ways we handle this depending on + * whether path is the only existing one. + */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos) { + /* + * We have a path to the left of this one - it needs + * an update too. + */ + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + + ocfs2_unlink_subtree(inode, handle, left_path, path, + subtree_index, dealloc); + ocfs2_update_edge_lengths(inode, handle, left_path); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + di->i_last_eb_blk = eb->h_blkno; + } else { + /* + * 'path' is also the leftmost path which + * means it must be the only one. This gets + * handled differently because we want to + * revert the inode back to having extents + * in-line. + */ + ocfs2_unlink_path(inode, handle, dealloc, path, 1); + + el = &di->id2.i_list; + el->l_tree_depth = 0; + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + di->i_last_eb_blk = 0; + } + + ocfs2_journal_dirty(handle, path_root_bh(path)); + +out: + ocfs2_free_path(left_path); + return ret; +} + +/* + * Left rotation of btree records. + * + * In many ways, this is (unsurprisingly) the opposite of right + * rotation. We start at some non-rightmost path containing an empty + * extent in the leaf block. The code works its way to the rightmost + * path by rotating records to the left in every subtree. + * + * This is used by any code which reduces the number of extent records + * in a leaf. After removal, an empty record should be placed in the + * leftmost list position. + * + * This won't handle a length update of the rightmost path records if + * the rightmost tree leaf record is removed so the caller is + * responsible for detecting and correcting that. + */ +static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, orig_credits = handle->h_buffer_credits; + struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + el = path_leaf_el(path); + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + if (path->p_tree_depth == 0) { +rightmost_no_delete: + /* + * In-inode extents. This is trivially handled, so do + * it up front. + */ + ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, + path_leaf_bh(path), + path_leaf_el(path)); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Handle rightmost branch now. There's several cases: + * 1) simple rotation leaving records in there. That's trivial. + * 2) rotation requiring a branch delete - there's no more + * records left. Two cases of this: + * a) There are branches to the left. + * b) This is also the leftmost (the only) branch. + * + * 1) is handled via ocfs2_rotate_rightmost_leaf_left() + * 2a) we need the left branch so that we can update it with the unlink + * 2b) we need to bring the inode back to inline extents. + */ + + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + el = &eb->h_list; + if (eb->h_next_leaf_blk == 0) { + /* + * This gets a bit tricky if we're going to delete the + * rightmost path. Get the other cases out of the way + * 1st. + */ + if (le16_to_cpu(el->l_next_free_rec) > 1) + goto rightmost_no_delete; + + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto out; + } + + /* + * XXX: The caller can not trust "path" any more after + * this as it will have been deleted. What do we do? + * + * In theory the rotate-for-merge code will never get + * here because it'll always ask for a rotate in a + * nonempty list. + */ + + ret = ocfs2_remove_rightmost_path(inode, handle, path, + dealloc); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Now we can loop, remembering the path we get from -EAGAIN + * and restarting from there. + */ +try_rotate: + ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, + dealloc, &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + while (ret == -EAGAIN) { + tmp_path = restart_path; + restart_path = NULL; + + ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, + tmp_path, dealloc, + &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + ocfs2_free_path(tmp_path); + tmp_path = NULL; + + if (ret == 0) + goto try_rotate; + } + +out: + ocfs2_free_path(tmp_path); + ocfs2_free_path(restart_path); + return ret; +} + +static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, + int index) +{ + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + unsigned int size; + + if (rec->e_leaf_clusters == 0) { + /* + * We consumed all of the merged-from record. An empty + * extent cannot exist anywhere but the 1st array + * position, so move things over if the merged-from + * record doesn't occupy that position. + * + * This creates a new empty extent so the caller + * should be smart enough to have removed any existing + * ones. + */ + if (index > 0) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + size = index * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[1], &el->l_recs[0], size); + } + + /* + * Always memset - the caller doesn't check whether it + * created an empty extent, so there could be junk in + * the other fields. + */ + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + } +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the beginning of the record at index + 1. + */ +static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, + handle_t *handle, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_extent_list *el, int index) +{ + int ret; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + + BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); + + left_rec = &el->l_recs[index]; + right_rec = &el->l_recs[index + 1]; + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); + + le32_add_cpu(&right_rec->e_cpos, -split_clusters); + le64_add_cpu(&right_rec->e_blkno, + -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); + + ocfs2_cleanup_merge(el, index); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the tail of the record at index - 1. + */ +static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, + handle_t *handle, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_extent_list *el, int index) +{ + int ret, has_empty_extent = 0; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + + BUG_ON(index <= 0); + + left_rec = &el->l_recs[index - 1]; + right_rec = &el->l_recs[index]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (has_empty_extent && index == 1) { + /* + * The easy case - we can just plop the record right in. + */ + *left_rec = *split_rec; + + has_empty_extent = 0; + } else { + le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); + } + + le32_add_cpu(&right_rec->e_cpos, split_clusters); + le64_add_cpu(&right_rec->e_blkno, + ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); + + ocfs2_cleanup_merge(el, index); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_try_to_merge_extent(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_merge_ctxt *ctxt) + +{ + int ret = 0, delete_tail_recs = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + + BUG_ON(ctxt->c_contig_type == CONTIG_NONE); + + if (ctxt->c_split_covers_rec) { + delete_tail_recs++; + + if (ctxt->c_contig_type == CONTIG_LEFTRIGHT || + ctxt->c_has_empty_extent) + delete_tail_recs++; + + if (ctxt->c_has_empty_extent) { + /* + * The merge code will need to create an empty + * extent to take the place of the newly + * emptied slot. Remove any pre-existing empty + * extents - having more than one in a leaf is + * illegal. + */ + ret = ocfs2_rotate_tree_left(inode, handle, left_path, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + split_index--; + rec = &el->l_recs[split_index]; + } + } + + if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { + /* + * Left-right contig implies this. + */ + BUG_ON(!ctxt->c_split_covers_rec); + BUG_ON(split_index == 0); + + /* + * Since the leftright insert always covers the entire + * extent, this call will delete the insert record + * entirely, resulting in an empty extent record added to + * the extent block. + * + * Since the adding of an empty extent shifts + * everything back to the right, there's no need to + * update split_index here. + */ + ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), + handle, split_rec, el, split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We can only get this from logic error above. + */ + BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + + /* + * The left merge left us with an empty extent, remove + * it. + */ + ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + split_index--; + rec = &el->l_recs[split_index]; + + /* + * Note that we don't pass split_rec here on purpose - + * we've merged it into the left side. + */ + ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), + handle, rec, el, split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + + ret = ocfs2_rotate_tree_left(inode, handle, left_path, + dealloc); + /* + * Error from this last rotate is not critical, so + * print but don't bubble it up. + */ + if (ret) + mlog_errno(ret); + ret = 0; + } else { + /* + * Merge a record to the left or right. + * + * 'contig_type' is relative to the existing record, + * so for example, if we're "right contig", it's to + * the record on the left (hence the left merge). + */ + if (ctxt->c_contig_type == CONTIG_RIGHT) { + ret = ocfs2_merge_rec_left(inode, + path_leaf_bh(left_path), + handle, split_rec, el, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_merge_rec_right(inode, + path_leaf_bh(left_path), + handle, split_rec, el, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (ctxt->c_split_covers_rec) { + /* + * The merge may have left an empty extent in + * our leaf. Try to rotate it away. + */ + ret = ocfs2_rotate_tree_left(inode, handle, left_path, + dealloc); + if (ret) + mlog_errno(ret); + ret = 0; + } + } + +out: + return ret; +} + +static void ocfs2_subtract_from_rec(struct super_block *sb, + enum ocfs2_split_type split, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *split_rec) +{ + u64 len_blocks; + + len_blocks = ocfs2_clusters_to_blocks(sb, + le16_to_cpu(split_rec->e_leaf_clusters)); + + if (split == SPLIT_LEFT) { + /* + * Region is on the left edge of the existing + * record. + */ + le32_add_cpu(&rec->e_cpos, + le16_to_cpu(split_rec->e_leaf_clusters)); + le64_add_cpu(&rec->e_blkno, len_blocks); + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } else { + /* + * Region is on the right edge of the existing + * record. + */ + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert, + struct inode *inode) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (insert->ins_split != SPLIT_NONE) { + i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); + BUG_ON(i == -1); + rec = &el->l_recs[i]; + ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, + insert_rec); + goto rotate; + } + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "inode %lu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + +rotate: + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. + */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static inline void ocfs2_update_dinode_clusters(struct inode *inode, + struct ocfs2_dinode *di, + u32 clusters) +{ + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static void ocfs2_adjust_rightmost_records(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + /* + * Update everything except the leaf block. + */ + for (i = 0; i < path->p_tree_depth; i++) { + bh = path->p_node[i].bh; + el = path->p_node[i].el; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %llu has a bad extent list", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EIO; + return; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + + } +} + +static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, next_free; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); - el = path_root_el(right_path); - bh = path_root_bh(right_path); - i = 0; - while (1) { - struct ocfs2_extent_rec *rec; + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - ret = -EIO; + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &left_cpos); + if (ret) { + mlog_errno(ret); goto out; } - rec = &el->l_recs[next_free - 1]; + mlog(0, "Append may need a left path update. cpos: %u, " + "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), + left_cpos); - rec->e_int_clusters = insert_rec->e_cpos; - le32_add_cpu(&rec->e_int_clusters, - le16_to_cpu(insert_rec->e_leaf_clusters)); - le32_add_cpu(&rec->e_int_clusters, - -le32_to_cpu(rec->e_cpos)); + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } - /* Don't touch the leaf node */ - if (++i >= right_path->p_tree_depth) - break; + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. + */ + } + } - bh = right_path->p_node[i].bh; - el = right_path->p_node[i].el; + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; } + ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); + *ret_left_path = left_path; ret = 0; out: @@ -1995,6 +3204,83 @@ out: return ret; } +static void ocfs2_split_record(struct inode *inode, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *split_rec, + enum ocfs2_split_type split) +{ + int index; + u32 cpos = le32_to_cpu(split_rec->e_cpos); + struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; + struct ocfs2_extent_rec *rec, *tmprec; + + right_el = path_leaf_el(right_path);; + if (left_path) + left_el = path_leaf_el(left_path); + + el = right_el; + insert_el = right_el; + index = ocfs2_search_extent_list(el, cpos); + if (index != -1) { + if (index == 0 && left_path) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + + /* + * This typically means that the record + * started in the left path but moved to the + * right as a result of rotation. We either + * move the existing record to the left, or we + * do the later insert there. + * + * In this case, the left path should always + * exist as the rotate code will have passed + * it back for a post-insert update. + */ + + if (split == SPLIT_LEFT) { + /* + * It's a left split. Since we know + * that the rotate code gave us an + * empty extent in the left path, we + * can just do the insert there. + */ + insert_el = left_el; + } else { + /* + * Right split - we have to move the + * existing record over to the left + * leaf. The insert will be into the + * newly created empty extent in the + * right leaf. + */ + tmprec = &right_el->l_recs[index]; + ocfs2_rotate_leaf(left_el, tmprec); + el = left_el; + + memset(tmprec, 0, sizeof(*tmprec)); + index = ocfs2_search_extent_list(left_el, cpos); + BUG_ON(index == -1); + } + } + } else { + BUG_ON(!left_path); + BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); + /* + * Left path is easy - we can just allow the insert to + * happen. + */ + el = left_el; + insert_el = left_el; + index = ocfs2_search_extent_list(el, cpos); + BUG_ON(index == -1); + } + + rec = &el->l_recs[index]; + ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); + ocfs2_rotate_leaf(insert_el, split_rec); +} + /* * This function only does inserts on an allocation b-tree. For dinode * lists, ocfs2_insert_at_leaf() is called directly. @@ -2012,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode, { int ret, subtree_index; struct buffer_head *leaf_bh = path_leaf_bh(right_path); - struct ocfs2_extent_list *el; /* * Pass both paths to the journal. The majority of inserts @@ -2048,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode, } } - el = path_leaf_el(right_path); + if (insert->ins_split != SPLIT_NONE) { + /* + * We could call ocfs2_insert_at_leaf() for some types + * of splits, but it's easier to just let one seperate + * function sort it all out. + */ + ocfs2_split_record(inode, left_path, right_path, + insert_rec, insert->ins_split); + } else + ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), + insert, inode); - ocfs2_insert_at_leaf(insert_rec, el, insert, inode); ret = ocfs2_journal_dirty(handle, leaf_bh); if (ret) mlog_errno(ret); @@ -2139,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * can wind up skipping both of these two special cases... */ if (rotate) { - ret = ocfs2_rotate_tree_right(inode, handle, + ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, le32_to_cpu(insert_rec->e_cpos), right_path, &left_path); if (ret) { @@ -2164,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode, } out_update_clusters: - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + if (type->ins_split == SPLIT_NONE) + ocfs2_update_dinode_clusters(inode, di, + le16_to_cpu(insert_rec->e_leaf_clusters)); ret = ocfs2_journal_dirty(handle, di_bh); if (ret) @@ -2178,6 +3473,44 @@ out: return ret; } +static enum ocfs2_contig_type +ocfs2_figure_merge_contig_type(struct inode *inode, + struct ocfs2_extent_list *el, int index, + struct ocfs2_extent_rec *split_rec) +{ + struct ocfs2_extent_rec *rec; + enum ocfs2_contig_type ret = CONTIG_NONE; + + /* + * We're careful to check for an empty extent record here - + * the merge code will know what to do if it sees one. + */ + + if (index > 0) { + rec = &el->l_recs[index - 1]; + if (index == 1 && ocfs2_is_empty_extent(rec)) { + if (split_rec->e_cpos == el->l_recs[index].e_cpos) + ret = CONTIG_RIGHT; + } else { + ret = ocfs2_extent_contig(inode, rec, split_rec); + } + } + + if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { + enum ocfs2_contig_type contig_type; + + rec = &el->l_recs[index + 1]; + contig_type = ocfs2_extent_contig(inode, rec, split_rec); + + if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) + ret = CONTIG_LEFTRIGHT; + else if (ret == CONTIG_NONE) + ret = contig_type; + } + + return ret; +} + static void ocfs2_figure_contig_type(struct inode *inode, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, @@ -2269,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, struct ocfs2_path *path = NULL; struct buffer_head *bh = NULL; + insert->ins_split = SPLIT_NONE; + el = &di->id2.i_list; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); @@ -2430,7 +3765,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { status = ocfs2_grow_tree(inode, handle, fe_bh, - &insert.ins_tree_depth, last_eb_bh, + &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { mlog_errno(status); @@ -2456,6 +3791,352 @@ bail: return status; } +static void ocfs2_make_right_split_rec(struct super_block *sb, + struct ocfs2_extent_rec *split_rec, + u32 cpos, + struct ocfs2_extent_rec *rec) +{ + u32 rec_cpos = le32_to_cpu(rec->e_cpos); + u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); + + memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); + + split_rec->e_cpos = cpu_to_le32(cpos); + split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); + + split_rec->e_blkno = rec->e_blkno; + le64_add_cpu(&split_rec->e_blkno, + ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); + + split_rec->e_flags = rec->e_flags; +} + +static int ocfs2_split_and_insert(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct buffer_head *di_bh, + struct buffer_head **last_eb_bh, + int split_index, + struct ocfs2_extent_rec *orig_split_rec, + struct ocfs2_alloc_context *meta_ac) +{ + int ret = 0, depth; + unsigned int insert_range, rec_range, do_leftright = 0; + struct ocfs2_extent_rec tmprec; + struct ocfs2_extent_list *rightmost_el; + struct ocfs2_extent_rec rec; + struct ocfs2_extent_rec split_rec = *orig_split_rec; + struct ocfs2_insert_type insert; + struct ocfs2_extent_block *eb; + struct ocfs2_dinode *di; + +leftright: + /* + * Store a copy of the record on the stack - it might move + * around as the tree is manipulated below. + */ + rec = path_leaf_el(path)->l_recs[split_index]; + + di = (struct ocfs2_dinode *)di_bh->b_data; + rightmost_el = &di->id2.i_list; + + depth = le16_to_cpu(rightmost_el->l_tree_depth); + if (depth) { + BUG_ON(!(*last_eb_bh)); + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + rightmost_el = &eb->h_list; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + int old_depth = depth; + + ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (old_depth != depth) { + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + rightmost_el = &eb->h_list; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_free_records = le16_to_cpu(rightmost_el->l_count) + - le16_to_cpu(rightmost_el->l_next_free_rec); + insert.ins_tree_depth = depth; + + insert_range = le32_to_cpu(split_rec.e_cpos) + + le16_to_cpu(split_rec.e_leaf_clusters); + rec_range = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + + if (split_rec.e_cpos == rec.e_cpos) { + insert.ins_split = SPLIT_LEFT; + } else if (insert_range == rec_range) { + insert.ins_split = SPLIT_RIGHT; + } else { + /* + * Left/right split. We fake this as a right split + * first and then make a second pass as a left split. + */ + insert.ins_split = SPLIT_RIGHT; + + ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, + &rec); + + split_rec = tmprec; + + BUG_ON(do_leftright); + do_leftright = 1; + } + + ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, + &insert); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (do_leftright == 1) { + u32 cpos; + struct ocfs2_extent_list *el; + + do_leftright++; + split_rec = *orig_split_rec; + + ocfs2_reinit_path(path, 1); + + cpos = le32_to_cpu(split_rec.e_cpos); + ret = ocfs2_find_path(inode, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + split_index = ocfs2_search_extent_list(el, cpos); + goto leftright; + } +out: + + return ret; +} + +/* + * Mark part or all of the extent record at split_index in the leaf + * pointed to by path as written. This removes the unwritten + * extent flag. + * + * Care is taken to handle contiguousness so as to not grow the tree. + * + * meta_ac is not strictly necessary - we only truly need it if growth + * of the tree is required. All other cases will degrade into a less + * optimal tree layout. + * + * last_eb_bh should be the rightmost leaf block for any inode with a + * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * + * This code is optimized for readability - several passes might be + * made over certain portions of the tree. All of those blocks will + * have been brought into cache (and pinned via the journal), so the + * extra overhead is not expressed in terms of disk reads. + */ +static int __ocfs2_mark_extent_written(struct inode *inode, + struct buffer_head *di_bh, + handle_t *handle, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct buffer_head *eb_bh, *last_eb_bh = NULL; + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + struct ocfs2_merge_ctxt ctxt; + struct ocfs2_extent_list *rightmost_el; + + if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || + ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < + (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + eb_bh = path_leaf_bh(path); + ret = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, + split_index, + split_rec); + + /* + * The core merge / split code wants to know how much room is + * left in this inodes allocation tree, so we pass the + * rightmost extent list. + */ + if (path->p_tree_depth) { + struct ocfs2_extent_block *eb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_exit(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EROFS; + goto out; + } + + rightmost_el = &eb->h_list; + } else + rightmost_el = path_root_el(path); + + ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec); + if (ctxt.c_used_tail_recs > 0 && + ocfs2_is_empty_extent(&rightmost_el->l_recs[0])) + ctxt.c_used_tail_recs--; + + if (rec->e_cpos == split_rec->e_cpos && + rec->e_leaf_clusters == split_rec->e_leaf_clusters) + ctxt.c_split_covers_rec = 1; + else + ctxt.c_split_covers_rec = 0; + + ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); + + mlog(0, "index: %d, contig: %u, used_tail_recs: %u, " + "has_empty: %u, split_covers: %u\n", split_index, + ctxt.c_contig_type, ctxt.c_used_tail_recs, + ctxt.c_has_empty_extent, ctxt.c_split_covers_rec); + + if (ctxt.c_contig_type == CONTIG_NONE) { + if (ctxt.c_split_covers_rec) + el->l_recs[split_index] = *split_rec; + else + ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + &last_eb_bh, split_index, + split_rec, meta_ac); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_try_to_merge_extent(inode, handle, path, + split_index, split_rec, + dealloc, &ctxt); + if (ret) + mlog_errno(ret); + } + + ocfs2_journal_dirty(handle, eb_bh); + +out: + brelse(last_eb_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as written for len clusters. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); + struct ocfs2_extent_rec split_rec; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el; + + mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", + inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. + */ + ocfs2_extent_map_trunc(inode, 0); + + left_path = ocfs2_new_inode_path(di_bh); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + el = path_leaf_el(left_path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + ret = -EROFS; + goto out; + } + + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); + split_rec.e_cpos = cpu_to_le32(cpos); + split_rec.e_leaf_clusters = cpu_to_le16(len); + split_rec.e_blkno = cpu_to_le64(start_blkno); + split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; + split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; + + ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, + index, &split_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + return ret; +} + static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index cb02e53b593c..d3acf45225c2 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -35,6 +35,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, u64 start_blk, u32 new_clusters, struct ocfs2_alloc_context *meta_ac); +struct ocfs2_cached_dealloc_ctxt; +int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_dinode *fe); @@ -102,6 +107,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, u32 cpos, struct buffer_head **leaf_bh); +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); /* * Helper function to look at the # of clusters in an extent record. diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h index f226b2207628..ff257628af16 100644 --- a/fs/ocfs2/endian.h +++ b/fs/ocfs2/endian.h @@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val) *var = cpu_to_le32(le32_to_cpu(*var) + val); } +static inline void le64_add_cpu(__le64 *var, u64 val) +{ + *var = cpu_to_le64(le64_to_cpu(*var) + val); +} + static inline void le32_and_cpu(__le32 *var, u32 val) { *var = cpu_to_le32(le32_to_cpu(*var) & val); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e23e416ca74c..03c1d365c78b 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -373,37 +373,6 @@ out: return ret; } -/* - * Return the index of the extent record which contains cluster #v_cluster. - * -1 is returned if it was not found. - * - * Should work fine on interior and exterior nodes. - */ -static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, - u32 v_cluster) -{ - int ret = -1; - int i; - struct ocfs2_extent_rec *rec; - u32 rec_end, rec_start, clusters; - - for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - - rec_start = le32_to_cpu(rec->e_cpos); - clusters = ocfs2_rec_clusters(el, rec); - - rec_end = rec_start + clusters; - - if (v_cluster >= rec_start && v_cluster < rec_end) { - ret = i; - break; - } - } - - return ret; -} - int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 648ef8e45eaa..5cc90a40b3c5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -306,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) +{ + /* + * Support for sparse files is a pre-requisite + */ + if (!ocfs2_sparse_alloc(osb)) + return 0; + + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index f0d9eb08547a..c20a74b99d87 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -88,7 +88,7 @@ #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) -#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 +#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* * Heartbeat-only devices are missing journals and other files. The @@ -116,6 +116,11 @@ */ #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 +/* + * Unwritten extents support. + */ +#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ -- cgit v1.2.1 From 0d172baa5586071ae0ae0c07356a378fdbedecdb Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 14 May 2007 18:09:54 -0700 Subject: ocfs2: small cleanup of ocfs2_write_begin_nolock() We can easily seperate out the write descriptor setup and manipulation into helper functions. Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 108 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 510bf84c9cf5..077583b50391 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1188,6 +1188,31 @@ out: return ret; } +static int ocfs2_write_cluster_by_desc(struct address_space *mapping, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len) +{ + int ret, i; + struct ocfs2_write_cluster_desc *desc; + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, + meta_ac, wc, desc->c_cpos, pos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = 0; +out: + return ret; +} + /* * ocfs2_write_end() wants to know which parts of the target page it * should complete the write on. It's easiest to compute them ahead of @@ -1240,30 +1265,19 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, } } -int ocfs2_write_begin_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page) +/* + * Populate each single-cluster write descriptor in the write context + * with information about the i/o to be done. + */ +static int ocfs2_populate_write_desc(struct inode *inode, + struct ocfs2_write_ctxt *wc, + unsigned int *clusters_to_alloc) { - int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int num_clusters = 0, clusters_to_alloc = 0; - u32 phys = 0; - struct ocfs2_write_ctxt *wc; - struct inode *inode = mapping->host; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; - handle_t *handle; + int ret; struct ocfs2_write_cluster_desc *desc; - - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); - if (ret) { - mlog_errno(ret); - return ret; - } - - di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + unsigned int num_clusters = 0; + u32 phys = 0; + int i; for (i = 0; i < wc->w_clen; i++) { desc = &wc->w_desc[i]; @@ -1287,12 +1301,46 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; - clusters_to_alloc++; + *clusters_to_alloc = *clusters_to_alloc + 1; } num_clusters--; } + ret = 0; +out: + return ret; +} + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + /* * We set w_target_from, w_target_to here so that * ocfs2_write_end() knows which range in the target page to @@ -1351,15 +1399,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, goto out_commit; } - for (i = 0; i < wc->w_clen; i++) { - desc = &wc->w_desc[i]; - - ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, - meta_ac, wc, desc->c_cpos, pos, len); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, + len); + if (ret) { + mlog_errno(ret); + goto out_commit; } if (data_ac) -- cgit v1.2.1 From b27b7cbcf12a1bfff1ed68a73ddd7d11edc20daf Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 18 Jun 2007 11:22:56 -0700 Subject: ocfs2: support writing of unwritten extents Update the write code to detect when the user is asking to write to an unwritten extent. Like writing to a hole, we must zero the region between the write and the cluster boundaries. Most of the existing cluster zeroing logic can be re-used with some additional checks for the unwritten flag on extent records. Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 94 +++++++++++++++++++++++++++++++++++++++++++++------------ fs/ocfs2/file.c | 14 ++++++--- fs/ocfs2/file.h | 2 +- 3 files changed, 84 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 077583b50391..8af923316d22 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -782,8 +782,14 @@ struct ocfs2_write_cluster_desc { * filled. */ unsigned c_new; + unsigned c_unwritten; }; +static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) +{ + return d->c_new || d->c_unwritten; +} + struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; @@ -829,6 +835,8 @@ struct ocfs2_write_ctxt { handle_t *w_handle; struct buffer_head *w_di_bh; + + struct ocfs2_cached_dealloc_ctxt w_dealloc; }; static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) @@ -868,6 +876,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, else wc->w_large_pages = 0; + ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + *wcp = wc; return 0; @@ -1103,16 +1113,19 @@ out: * Prepare a single cluster for write one cluster into the file. */ static int ocfs2_write_cluster(struct address_space *mapping, - u32 phys, struct ocfs2_alloc_context *data_ac, + u32 phys, unsigned int unwritten, + struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new; + int ret, i, new, should_zero = 0; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; new = phys == 0 ? 1 : 0; + if (new || unwritten) + should_zero = 1; if (new) { u32 tmp_pos; @@ -1142,11 +1155,20 @@ static int ocfs2_write_cluster(struct address_space *mapping, mlog_errno(ret); goto out; } + } else if (unwritten) { + ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + wc->w_handle, cpos, 1, phys, + meta_ac, &wc->w_dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + if (should_zero) v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); - } else { + else v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; - } /* * The only reason this should fail is due to an inability to @@ -1169,7 +1191,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, wc->w_pages[i], cpos, - user_pos, user_len, new); + user_pos, user_len, + should_zero); if (tmpret) { mlog_errno(tmpret); if (ret == 0) @@ -1200,8 +1223,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, for (i = 0; i < wc->w_clen; i++) { desc = &wc->w_desc[i]; - ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, - meta_ac, wc, desc->c_cpos, pos, len); + ret = ocfs2_write_cluster(mapping, desc->c_phys, + desc->c_unwritten, data_ac, meta_ac, + wc, desc->c_cpos, pos, len); if (ret) { mlog_errno(ret); goto out; @@ -1242,19 +1266,19 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, if (wc->w_large_pages) { /* * We only care about the 1st and last cluster within - * our range and whether they are holes or not. Either + * our range and whether they should be zero'd or not. Either * value may be extended out to the start/end of a * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (desc->c_new) + if (ocfs2_should_zero_cluster(desc)) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (desc->c_new) + if (ocfs2_should_zero_cluster(desc)) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1268,28 +1292,52 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, /* * Populate each single-cluster write descriptor in the write context * with information about the i/o to be done. + * + * Returns the number of clusters that will have to be allocated, as + * well as a worst case estimate of the number of extent records that + * would have to be created during a write to an unwritten region. */ static int ocfs2_populate_write_desc(struct inode *inode, struct ocfs2_write_ctxt *wc, - unsigned int *clusters_to_alloc) + unsigned int *clusters_to_alloc, + unsigned int *extents_to_split) { int ret; struct ocfs2_write_cluster_desc *desc; unsigned int num_clusters = 0; + unsigned int ext_flags = 0; u32 phys = 0; int i; + *clusters_to_alloc = 0; + *extents_to_split = 0; + for (i = 0; i < wc->w_clen; i++) { desc = &wc->w_desc[i]; desc->c_cpos = wc->w_cpos + i; if (num_clusters == 0) { + /* + * Need to look up the next extent record. + */ ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, - &num_clusters, NULL); + &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); goto out; } + + /* + * Assume worst case - that we're writing in + * the middle of the extent. + * + * We can assume that the write proceeds from + * left to right, in which case the extent + * insert code is smart enough to coalesce the + * next splits into the previous records created. + */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + *extents_to_split = *extents_to_split + 2; } else if (phys) { /* * Only increment phys if it doesn't describe @@ -1303,6 +1351,8 @@ static int ocfs2_populate_write_desc(struct inode *inode, desc->c_new = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } + if (ext_flags & OCFS2_EXT_UNWRITTEN) + desc->c_unwritten = 1; num_clusters--; } @@ -1318,7 +1368,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct buffer_head *di_bh, struct page *mmap_page) { int ret, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int clusters_to_alloc = 0; + unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1333,7 +1383,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, return ret; } - ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc); + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + &extents_to_split); if (ret) { mlog_errno(ret); goto out; @@ -1347,14 +1398,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * write out. An allocation requires that we write the entire * cluster range. */ - if (clusters_to_alloc > 0) { + if (clusters_to_alloc || extents_to_split) { /* * XXX: We are stretching the limits of - * ocfs2_lock_allocators(). It greately over-estimates + * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - &data_ac, &meta_ac); + extents_to_split, &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -1365,7 +1416,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); + ocfs2_set_target_boundaries(osb, wc, pos, len, + clusters_to_alloc + extents_to_split); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1393,7 +1445,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc, mmap_page); + clusters_to_alloc + extents_to_split, + mmap_page); if (ret) { mlog_errno(ret); goto out_commit; @@ -1538,11 +1591,12 @@ int ocfs2_write_end_nolock(struct address_space *mapping, inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); + ocfs2_run_deallocs(osb, &wc->w_dealloc); + ocfs2_free_write_ctxt(wc); return copied; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a80f31776d94..6745086da6fd 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -527,20 +527,21 @@ leave: * understand sparse inodes. */ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, + u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac) { int ret, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); *meta_ac = NULL; *data_ac = NULL; mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u\n", + "clusters_to_add = %u, extents_to_split = %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add); + le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); num_free_extents = ocfs2_num_free_extents(osb, inode, di); if (num_free_extents < 0) { @@ -558,9 +559,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, * * Most of the time we'll only be seeing this 1 cluster at a time * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. */ if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); if (ret < 0) { if (ret != -ENOSPC) @@ -641,7 +645,7 @@ restart_all: down_write(&OCFS2_I(inode)->ip_alloc_sem); drop_alloc_sem = 1; - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, + status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, &meta_ac); if (status) { mlog_errno(status); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index a4dd1fa1822b..54df3c4bd2fd 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -47,7 +47,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason); int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, + u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -- cgit v1.2.1 From 2ae99a60374f360ba07037ebbf33d19b89ac43a6 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 9 Mar 2007 16:43:28 -0800 Subject: ocfs2: Support creation of unwritten extents This can now be trivially supported with re-use of our existing extend code. ocfs2_allocate_unwritten_extents() takes a start offset and a byte length and iterates over the inode, adding extents (marked as unwritten) until len is reached. Existing extents are skipped over. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 2 + fs/ocfs2/alloc.h | 1 + fs/ocfs2/aops.c | 2 +- fs/ocfs2/dir.c | 2 +- fs/ocfs2/file.c | 119 ++++++++++++++++++++++++++++++++++++++++++++----------- fs/ocfs2/file.h | 5 ++- fs/ocfs2/namei.c | 2 +- 7 files changed, 105 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0db6a1f724e1..2e3898316468 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3726,6 +3726,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, u32 cpos, u64 start_blk, u32 new_clusters, + u8 flags, struct ocfs2_alloc_context *meta_ac) { int status; @@ -3749,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); + rec.e_flags = flags; status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, &insert); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index d3acf45225c2..e3284f3eb6b9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -34,6 +34,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, u32 cpos, u64 start_blk, u32 new_clusters, + u8 flags, struct ocfs2_alloc_context *meta_ac); struct ocfs2_cached_dealloc_ctxt; int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8af923316d22..ec8b606b30e1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1136,7 +1136,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, */ tmp_pos = cpos; ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, wc->w_di_bh, + &tmp_pos, 1, 0, wc->w_di_bh, wc->w_handle, data_ac, meta_ac, NULL); /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c441ef1f2bad..0d5fdde959c8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb, u32 offset = OCFS2_I(dir)->ip_clusters; status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, - 1, parent_fe_bh, handle, + 1, 0, parent_fe_bh, handle, data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6745086da6fd..3e21ad9a6dde 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -425,6 +425,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, u32 *logical_offset, u32 clusters_to_add, + int mark_unwritten, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_alloc_context *data_ac, @@ -437,9 +438,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; u64 block; + u8 flags = 0; BUG_ON(!clusters_to_add); + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + free_extents = ocfs2_num_free_extents(osb, inode, fe); if (free_extents < 0) { status = free_extents; @@ -489,7 +494,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_insert_extent(osb, handle, inode, fe_bh, *logical_offset, block, num_bits, - meta_ac); + flags, meta_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -522,9 +527,11 @@ leave: * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. * - * Called from ocfs2_extend_allocation() for file systems which don't - * support holes, and from ocfs2_write() for file systems which - * understand sparse inodes. + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). */ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, u32 clusters_to_add, u32 extents_to_split, @@ -595,14 +602,13 @@ out: return ret; } -static int ocfs2_extend_allocation(struct inode *inode, - u32 clusters_to_add) +static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; - int drop_alloc_sem = 0; int credits; - u32 prev_clusters, logical_start; + u32 prev_clusters; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; handle_t *handle = NULL; @@ -617,7 +623,7 @@ static int ocfs2_extend_allocation(struct inode *inode, * This function only exists for file systems which don't * support holes. */ - BUG_ON(ocfs2_sparse_alloc(osb)); + BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, OCFS2_BH_CACHED, inode); @@ -633,18 +639,9 @@ static int ocfs2_extend_allocation(struct inode *inode, goto leave; } - logical_start = OCFS2_I(inode)->ip_clusters; - restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - /* blocks peope in read/write from reading our allocation - * until we're done changing it. We depend on i_mutex to block - * other extend/truncate calls while we're here. Ordering wrt - * start_trans is important here -- always do it before! */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - drop_alloc_sem = 1; - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, &meta_ac); if (status) { @@ -678,6 +675,7 @@ restarted_transaction: inode, &logical_start, clusters_to_add, + mark_unwritten, bh, handle, data_ac, @@ -730,10 +728,6 @@ restarted_transaction: OCFS2_I(inode)->ip_clusters, i_size_read(inode)); leave: - if (drop_alloc_sem) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - drop_alloc_sem = 0; - } if (handle) { ocfs2_commit_trans(osb, handle); handle = NULL; @@ -759,6 +753,25 @@ leave: return status; } +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + int ret; + + /* + * The alloc sem blocks peope in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add, + mark_unwritten); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + return ret; +} + /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->prepare_write() and @@ -900,7 +913,9 @@ static int ocfs2_extend_file(struct inode *inode, } if (clusters_to_add) { - ret = ocfs2_extend_allocation(inode, clusters_to_add); + ret = ocfs2_extend_allocation(inode, + OCFS2_I(inode)->ip_clusters, + clusters_to_add, 0); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -1176,6 +1191,64 @@ out: return ret; } +/* + * Allocate enough extents to cover the region starting at byte offset + * start for len bytes. Existing extents are skipped, any extents + * added are marked as "unwritten". + */ +static int ocfs2_allocate_unwritten_extents(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u32 cpos, phys_cpos, clusters, alloc_size; + + /* + * We consider both start and len to be inclusive. + */ + cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); + clusters -= cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hole or existing extent len can be arbitrary, so + * cap it to our own allocation request. + */ + if (alloc_size > clusters) + alloc_size = clusters; + + if (phys_cpos) { + /* + * We already have an allocation at this + * region so we can safely skip it. + */ + goto next; + } + + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +next: + cpos += alloc_size; + clusters -= alloc_size; + } + + ret = 0; +out: + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 54df3c4bd2fd..79115c92dc30 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -39,13 +39,14 @@ enum ocfs2_alloc_restarted { }; int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, - u32 *cluster_start, + u32 *logical_offset, u32 clusters_to_add, + int mark_unwritten, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason); + enum ocfs2_alloc_restarted *reason_ret); int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 36289e6295ce..d430fdab16e9 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, + status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, new_fe_bh, handle, data_ac, NULL, NULL); -- cgit v1.2.1 From d0c7d7082ee1ec4f95ee57bf86ed39d1a27c4037 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 3 Jul 2007 13:27:22 -0700 Subject: ocfs2: btree support for removal of arbirtrary extents Add code to the btree paths to support the removal of arbitrary regions within an existing extent. With proper higher level support this can be used to "punch holes" in a file. Truncate (a special case of hole punching) could also be converted to use these methods. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 367 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2e3898316468..fac1adb3880a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4139,6 +4139,373 @@ out: return ret; } +static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, struct ocfs2_path *path, + int index, u32 new_range, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, depth, credits = handle->h_buffer_credits; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *rightmost_el, *el; + struct ocfs2_extent_rec split_rec; + struct ocfs2_extent_rec *rec; + struct ocfs2_insert_type insert; + + /* + * Setup the record to split before we grow the tree. + */ + el = path_leaf_el(path); + rec = &el->l_recs[index]; + ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); + + depth = path->p_tree_depth; + if (depth > 0) { + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_leaf_el(path); + + credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + int old_depth = depth; + + ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (old_depth != depth) { + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_split = SPLIT_RIGHT; + insert.ins_free_records = le16_to_cpu(rightmost_el->l_count) + - le16_to_cpu(rightmost_el->l_next_free_rec); + insert.ins_tree_depth = depth; + + ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + if (ret) + mlog_errno(ret); + +out: + brelse(last_eb_bh); + return ret; +} + +static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, + struct ocfs2_path *path, int index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u32 cpos, u32 len) +{ + int ret; + u32 left_cpos, rec_range, trunc_range; + int wants_rotate = 0, is_rightmost_tree_rec = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_block *eb; + + if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + index--; + } + + if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && + path->p_tree_depth) { + /* + * Check whether this is the rightmost tree record. If + * we remove all of this record or part of its right + * edge then an update of the record lengths above it + * will be required. + */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + if (eb->h_next_leaf_blk == 0) + is_rightmost_tree_rec = 1; + } + + rec = &el->l_recs[index]; + if (index == 0 && path->p_tree_depth && + le32_to_cpu(rec->e_cpos) == cpos) { + /* + * Changing the leftmost offset (via partial or whole + * record truncate) of an interior (or rightmost) path + * means we have to update the subtree that is formed + * by this leaf and the one to it's left. + * + * There are two cases we can skip: + * 1) Path is the leftmost one in our inode tree. + * 2) The leaf is rightmost and will be empty after + * we remove the extent record - the rotate code + * knows how to update the newly formed edge. + */ + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, + &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } + + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { + int next_free; + + memset(rec, 0, sizeof(*rec)); + ocfs2_cleanup_merge(el, index); + wants_rotate = 1; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (is_rightmost_tree_rec && next_free > 1) { + /* + * We skip the edge update if this path will + * be deleted by the rotate code. + */ + rec = &el->l_recs[next_free - 1]; + ocfs2_adjust_rightmost_records(inode, handle, path, + rec); + } + } else if (le32_to_cpu(rec->e_cpos) == cpos) { + /* Remove leftmost portion of the record. */ + le32_add_cpu(&rec->e_cpos, len); + le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); + le16_add_cpu(&rec->e_leaf_clusters, -len); + } else if (rec_range == trunc_range) { + /* Remove rightmost portion of the record */ + le16_add_cpu(&rec->e_leaf_clusters, -len); + if (is_rightmost_tree_rec) + ocfs2_adjust_rightmost_records(inode, handle, path, rec); + } else { + /* Caller should have trapped this. */ + mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, + le32_to_cpu(rec->e_cpos), + le16_to_cpu(rec->e_leaf_clusters), cpos, len); + BUG(); + } + + if (left_path) { + int subtree_index; + + subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + ocfs2_complete_edge_insert(inode, handle, left_path, path, + subtree_index); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + ocfs2_free_path(left_path); + return ret; +} + +static int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 len, handle_t *handle, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + u32 rec_range, trunc_range; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_path *path; + + ocfs2_extent_map_trunc(inode, 0); + + path = ocfs2_new_inode_path(di_bh); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + ret = -EROFS; + goto out; + } + + /* + * We have 3 cases of extent removal: + * 1) Range covers the entire extent rec + * 2) Range begins or ends on one edge of the extent rec + * 3) Range is in the middle of the extent rec (no shared edges) + * + * For case 1 we remove the extent rec and left rotate to + * fill the hole. + * + * For case 2 we just shrink the existing extent rec, with a + * tree update if the shrinking edge is also the edge of an + * extent block. + * + * For case 3 we do a right split to turn the extent rec into + * something case 2 can handle. + */ + rec = &el->l_recs[index]; + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); + + mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " + "(cpos %u, len %u)\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, + le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); + + if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { + ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, + cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + trunc_range, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The split could have manipulated the tree enough to + * move the record location, so we have to look for it again. + */ + ocfs2_reinit_path(path, 1); + + ret = ocfs2_find_path(inode, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu: split at cpos %u lost record.", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos); + ret = -EROFS; + goto out; + } + + /* + * Double check our values here. If anything is fishy, + * it's easier to catch it at the top level. + */ + rec = &el->l_recs[index]; + rec_range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (rec_range != trunc_range) { + ocfs2_error(inode->i_sb, + "Inode %llu: error after split at cpos %u" + "trunc len %u, existing record is (%u,%u)", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, + cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(path); + return ret; +} + static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; -- cgit v1.2.1 From 35edec1d52c075975991471d624b33b9336226f2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 6 Jul 2007 14:41:18 -0700 Subject: ocfs2: update truncate handling of partial clusters The partial cluster zeroing code used during truncate usually assumes that the rightmost byte in the range to be zeroed lies on a cluster boundary. This makes sense for truncate, but punching holes might require zeroing on non-aligned rightmost boundaries. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 72 +++++++++++++++++++++++--------------------------------- fs/ocfs2/alloc.h | 4 ++-- fs/ocfs2/file.c | 5 +++- 3 files changed, 35 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fac1adb3880a..df186d2e8248 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5668,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) return ocfs2_journal_dirty_data(handle, bh); } -static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, - struct page **pages, int numpages, - u64 phys, handle_t *handle) +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, + loff_t end, struct page **pages, + int numpages, u64 phys, handle_t *handle) { int i, ret, partial = 0; void *kaddr; @@ -5683,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, if (numpages == 0) goto out; - from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ - if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { - /* - * Since 'from' has been capped to a value below page - * size, this calculation won't be able to overflow - * 'to' - */ - to = ocfs2_align_bytes_to_clusters(sb, from); - - /* - * The truncate tail in this case should never contain - * more than one page at maximum. The loop below also - * assumes this. - */ - BUG_ON(numpages != 1); - } - + to = PAGE_CACHE_SIZE; for(i = 0; i < numpages; i++) { page = pages[i]; + from = start & (PAGE_CACHE_SIZE - 1); + if ((end >> PAGE_CACHE_SHIFT) == page->index) + to = end & (PAGE_CACHE_SIZE - 1); + BUG_ON(from > PAGE_CACHE_SIZE); BUG_ON(to > PAGE_CACHE_SIZE); @@ -5739,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, flush_dcache_page(page); - /* - * Every page after the 1st one should be completely zero'd. - */ - from = 0; + start = (page->index + 1) << PAGE_CACHE_SHIFT; } out: if (pages) { @@ -5755,24 +5740,26 @@ out: } } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, - int *num, u64 *phys) +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num, u64 *phys) { int i, numpages = 0, ret = 0; - unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; unsigned int ext_flags; struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned long index; - u64 next_cluster_bytes; + loff_t last_page_bytes; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + BUG_ON(start > end); - /* Cluster boundary, so we don't need to grab any pages. */ - if ((isize & (csize - 1)) == 0) + if (start == end) goto out; - ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits, phys, NULL, &ext_flags); if (ret) { mlog_errno(ret); @@ -5788,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page * if (ext_flags & OCFS2_EXT_UNWRITTEN) goto out; - next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); - index = isize >> PAGE_CACHE_SHIFT; + last_page_bytes = PAGE_ALIGN(end); + index = start >> PAGE_CACHE_SHIFT; do { pages[numpages] = grab_cache_page(mapping, index); if (!pages[numpages]) { @@ -5800,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page * numpages++; index++; - } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); + } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); out: if (ret != 0) { @@ -5829,11 +5816,10 @@ out: * otherwise block_write_full_page() will skip writeout of pages past * i_size. The new_i_size parameter is passed for this reason. */ -int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, - u64 new_i_size) +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end) { int ret, numpages; - loff_t endbyte; struct page **pages = NULL; u64 phys; @@ -5852,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); + ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, + &numpages, &phys); if (ret) { mlog_errno(ret); goto out; @@ -5861,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, if (numpages == 0) goto out; - ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, - handle); + ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, + numpages, phys, handle); /* * Initiate writeout of the pages we zero'd here. We don't * wait on them - the truncate_inode_pages() call later will * do that for us. */ - endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); - ret = do_sync_mapping_range(inode->i_mapping, new_i_size, - endbyte - 1, SYNC_FILE_RANGE_WRITE); + ret = do_sync_mapping_range(inode->i_mapping, range_start, + range_end - 1, SYNC_FILE_RANGE_WRITE); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index e3284f3eb6b9..752ef860873d 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -95,8 +95,8 @@ struct ocfs2_truncate_context { struct buffer_head *tc_last_eb_bh; }; -int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, - u64 new_i_size); +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end); int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3e21ad9a6dde..f0a6b1330a6e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, int status; handle_t *handle; struct ocfs2_dinode *di; + u64 cluster_bytes; mlog_entry_void(); @@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, /* * Do this before setting i_size. */ - status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); + cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, + cluster_bytes); if (status) { mlog_errno(status); goto out_commit; -- cgit v1.2.1 From 063c4561f52a74de686fe0ff2f96f4f54c9fecd2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 3 Jul 2007 13:34:11 -0700 Subject: ocfs2: support for removing file regions Provide an internal interface for the removal of arbitrary file regions. ocfs2_remove_inode_range() takes a byte range within a file and will remove existing extents within that range. Partial clusters will be zeroed so that any read from within the region will return zeros. Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 20 ++--- fs/ocfs2/alloc.h | 10 +++ fs/ocfs2/file.c | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/journal.h | 2 + 4 files changed, 262 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index df186d2e8248..f5e11f4fa952 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4373,10 +4373,10 @@ out: return ret; } -static int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, - u32 cpos, u32 len, handle_t *handle, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 len, handle_t *handle, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, index; u32 rec_range, trunc_range; @@ -4506,7 +4506,7 @@ out: return ret; } -static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; struct ocfs2_dinode *di; @@ -4539,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, return current_tail == new_start; } -static int ocfs2_truncate_log_append(struct ocfs2_super *osb, - handle_t *handle, - u64 start_blk, - unsigned int num_clusters) +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters) { int status, index; unsigned int start_cluster, tl_count; @@ -4698,7 +4698,7 @@ bail: } /* Expects you to already be holding tl_inode->i_mutex */ -static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 752ef860873d..990df48ae8d3 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -41,6 +41,10 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 len, handle_t *handle, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_dinode *fe); @@ -68,6 +72,12 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, struct ocfs2_dinode **tl_copy); int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, struct ocfs2_dinode *tl_copy); +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb); +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters); +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); /* * Process local structure which describes the block unlinks done diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f0a6b1330a6e..11f7cf9f2511 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -541,12 +541,15 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac) { - int ret, num_free_extents; + int ret = 0, num_free_extents; unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); *meta_ac = NULL; - *data_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " "clusters_to_add = %u, extents_to_split = %u\n", @@ -583,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, } } + if (clusters_to_add == 0) + goto out; + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); if (ret < 0) { if (ret != -ENOSPC) @@ -1252,6 +1258,238 @@ out: return ret; } +static int __ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + OCFS2_I(inode)->ip_clusters -= len; + di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +/* + * Truncate a byte range, avoiding pages within partial clusters. This + * preserves those pages for the zeroing code to write to. + */ +static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, + u64 byte_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t start, end; + struct address_space *mapping = inode->i_mapping; + + start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); + end = byte_start + byte_len; + end = end & ~(osb->s_clustersize - 1); + + if (start < end) { + unmap_mapping_range(mapping, start, end - start, 0); + truncate_inode_pages_range(mapping, start, end - 1); + } +} + +static int ocfs2_zero_partial_clusters(struct inode *inode, + u64 start, u64 len) +{ + int ret = 0; + u64 tmpend, end = start + len; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int csize = osb->s_clustersize; + handle_t *handle; + + /* + * The "start" and "end" values are NOT necessarily part of + * the range whose allocation is being deleted. Rather, this + * is what the user passed in with the request. We must zero + * partial clusters here. There's no need to worry about + * physical allocation - the zeroing code knows to skip holes. + */ + mlog(0, "byte start: %llu, end: %llu\n", + (unsigned long long)start, (unsigned long long)end); + + /* + * If both edges are on a cluster boundary then there's no + * zeroing required as the region is part of the allocation to + * be truncated. + */ + if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * We want to get the byte offset of the end of the 1st cluster. + */ + tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; + + mlog(0, "1st range: start: %llu, tmpend: %llu\n", + (unsigned long long)start, (unsigned long long)tmpend); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); + if (ret) + mlog_errno(ret); + + if (tmpend < end) { + /* + * This may make start and end equal, but the zeroing + * code will skip any work in that case so there's no + * need to catch it up here. + */ + start = end & ~(osb->s_clustersize - 1); + + mlog(0, "2nd range: start: %llu, end: %llu\n", + (unsigned long long)start, (unsigned long long)end); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); + if (ret) + mlog_errno(ret); + } + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) +{ + int ret = 0; + u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (byte_len == 0) + return 0; + + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); + trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; + if (trunc_len >= trunc_start) + trunc_len -= trunc_start; + else + trunc_len = 0; + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len, trunc_start, trunc_len); + + ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos = trunc_start; + while (trunc_len) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + /* Only do work for non-holes */ + if (phys_cpos != 0) { + ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += alloc_size; + trunc_len -= alloc_size; + } + + ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3db5de4506da..ce60aab013aa 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + OCFS2_TRUNCATE_LOG_UPDATE) +#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) + /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + * bitmap block for the new bit) */ #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) -- cgit v1.2.1 From b25801038da5823bba1b5440a57ca68afc51b6bd Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 9 Mar 2007 16:53:21 -0800 Subject: ocfs2: Support xfs style space reservation ioctls We re-use the RESVSP/UNRESVSP ioctls from xfs which allow the user to allocate and deallocate regions to a file without zeroing data or changing i_size. Though renamed, the structure passed in from user is identical to struct xfs_flock64. The three fields that are actually used right now are l_whence, l_start and l_len. This should get ocfs2 immediate compatibility with userspace software using the pre-existing xfs ioctls. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ocfs2/file.h | 3 + fs/ocfs2/ioctl.c | 15 +++++ fs/ocfs2/ocfs2_fs.h | 26 ++++++++ fs/ocfs2/super.c | 4 +- fs/ocfs2/super.h | 2 + 6 files changed, 216 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 11f7cf9f2511..f04c7aa834cb 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1111,17 +1111,16 @@ out: return ret; } -static int ocfs2_write_remove_suid(struct inode *inode) +static int __ocfs2_write_remove_suid(struct inode *inode, + struct buffer_head *bh) { int ret; - struct buffer_head *bh = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di; mlog_entry("(Inode %llu, mode 0%o)\n", - (unsigned long long)oi->ip_blkno, inode->i_mode); + (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (handle == NULL) { @@ -1130,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode) goto out; } - ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); - if (ret < 0) { - mlog_errno(ret); - goto out_trans; - } - ret = ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); - goto out_bh; + goto out_trans; } inode->i_mode &= ~S_ISUID; @@ -1153,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) mlog_errno(ret); -out_bh: - brelse(bh); + out_trans: ocfs2_commit_trans(osb, handle); out: @@ -1200,6 +1192,25 @@ out: return ret; } +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_write_remove_suid(inode, bh); +out: + brelse(bh); + return ret; +} + /* * Allocate enough extents to cover the region starting at byte offset * start for len bytes. Existing extents are skipped, any extents @@ -1490,6 +1501,151 @@ out: return ret; } +/* + * Parts of this function taken from xfs_change_file_space() + */ +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr) +{ + int ret; + s64 llen; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; + unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits); + + if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && + !ocfs2_writes_unwritten_extents(osb)) + return -ENOTTY; + else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && + !ocfs2_sparse_alloc(osb)) + return -ENOTTY; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes on other nodes + */ + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_rw_unlock; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + ret = -EPERM; + goto out_meta_unlock; + } + + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + sr->l_start += file->f_pos; + break; + case 2: /*SEEK_END*/ + sr->l_start += i_size_read(inode); + break; + default: + ret = -EINVAL; + goto out_meta_unlock; + } + sr->l_whence = 0; + + llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; + + if (sr->l_start < 0 + || sr->l_start > max_off + || (sr->l_start + llen) < 0 + || (sr->l_start + llen) > max_off) { + ret = -EINVAL; + goto out_meta_unlock; + } + + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (sr->l_len <= 0) { + ret = -EINVAL; + goto out_meta_unlock; + } + } + + if (should_remove_suid(file->f_path.dentry)) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out_meta_unlock; + } + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + switch (cmd) { + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + /* + * This takes unsigned offsets, but the signed ones we + * pass have been checked against overflow above. + */ + ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, + sr->l_len); + break; + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, + sr->l_len); + break; + default: + ret = -EINVAL; + } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_meta_unlock; + } + + /* + * We update c/mtime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_meta_unlock; + } + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out_meta_unlock: + brelse(di_bh); + ocfs2_meta_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); + + mutex_unlock(&inode->i_mutex); +out: + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 79115c92dc30..36fe27f268ee 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -62,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode, int ocfs2_update_inode_atime(struct inode *inode, struct buffer_head *bh); +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr); + #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f3ad21ad9aed..bd68c3f2afbe 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -14,6 +14,7 @@ #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" +#include "file.h" #include "inode.h" #include "journal.h" @@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, { unsigned int flags; int status; + struct ocfs2_space_resv sr; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, return ocfs2_set_inode_attr(inode, flags, OCFS2_FL_MODIFIABLE); + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) + return -EFAULT; + + return ocfs2_change_file_space(filp, cmd, &sr); default: return -ENOTTY; } @@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC32_SETFLAGS: cmd = OCFS2_IOC_SETFLAGS; break; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c20a74b99d87..82f8a75b207e 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -174,6 +174,32 @@ #define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) #define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) +/* + * Space reservation / allocation / free ioctls and argument structure + * are designed to be compatible with XFS. + * + * ALLOCSP* and FREESP* are not and will never be supported, but are + * included here for completeness. + */ +struct ocfs2_space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) +#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) + /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f07718a7552b..3a5a1ed09ac9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -115,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); -static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); - static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, @@ -321,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode) /* From xfs_super.c:xfs_max_file_offset * Copyright (c) 2000-2004 Silicon Graphics, Inc. */ -static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +unsigned long long ocfs2_max_file_offset(unsigned int blockshift) { unsigned int pagefactor = 1; unsigned int bitshift = BITS_PER_LONG - 1; diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 783f5270f2a1..3b9cb3d0b008 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb, #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +unsigned long long ocfs2_max_file_offset(unsigned int blockshift); + #endif /* OCFS2_SUPER_H */ -- cgit v1.2.1 From 54c57dc3b6578356c0a428c767d4bf080254a2ee Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 20 Jun 2007 17:15:10 -0700 Subject: [PATCH] ocfs2: zero_user_page conversion Signed-off-by: Eric Sandeen Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ec8b606b30e1..84bf6e79de23 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -740,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, bh = head; block_start = 0; do { - void *kaddr; - block_end = block_start + bsize; if (block_end <= from) goto next_bh; if (block_start >= to) break; - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+block_start, 0, bh->b_size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, block_start, bh->b_size, KM_USER0); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -906,15 +901,11 @@ static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to if (block_end > from && block_start < to) { if (!PageUptodate(page)) { unsigned start, end; - void *kaddr; start = max(from, block_start); end = min(to, block_end); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+start, 0, end - start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, start, end - start, KM_USER0); set_buffer_uptodate(bh); } -- cgit v1.2.1 From bcf67e16251c42302499499b1c50f7d35622f564 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Tue, 10 Jul 2007 17:22:26 -0700 Subject: Make common helpers for seq_files that work with list_heads Many places in kernel use seq_file API to iterate over a regular list_head. The code for such iteration is identical in all the places, so it's worth introducing a common helpers. This makes code about 300 lines smaller: The first version of this patch made the helper functions static inline in the seq_file.h header. This patch moves them to the fs/seq_file.c as Andrew proposed. The vmlinux .text section sizes are as follows: 2.6.22-rc1-mm1: 0x001794d5 with the previous version: 0x00179505 with this patch: 0x00179135 The config file used was make allnoconfig with the "y" inclusion of all the possible options to make the files modified by the patch compile plus drivers I have on the test node. This patch: Many places in kernel use seq_file API to iterate over a regular list_head. The code for such iteration is identical in all the places, so it's worth introducing a common helpers. Signed-off-by: Pavel Emelianov Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 0ac22af7afe5..49194a4e6b91 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -447,3 +447,37 @@ int seq_puts(struct seq_file *m, const char *s) return -1; } EXPORT_SYMBOL(seq_puts); + +struct list_head *seq_list_start(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} + +EXPORT_SYMBOL(seq_list_start); + +struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start(head, pos - 1); +} + +EXPORT_SYMBOL(seq_list_start_head); + +struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) +{ + struct list_head *lh; + + lh = ((struct list_head *)v)->next; + ++*ppos; + return lh == head ? NULL : lh; +} + +EXPORT_SYMBOL(seq_list_next); -- cgit v1.2.1 From 070ea60214c1894c9eec86ca9aa5dff57a5ab525 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 19 May 2007 17:22:52 -0400 Subject: NFS: Clean ups in fs/nfs/direct.c Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 00eee87510fe..4c97e55e86ee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -763,10 +763,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, (unsigned long) count, (long long) pos); if (nr_segs != 1) - return -EINVAL; - - if (count < 0) goto out; + retval = -EFAULT; if (!access_ok(VERIFY_WRITE, buf, count)) goto out; @@ -814,7 +812,7 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - ssize_t retval; + ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; /* XXX: temporary */ @@ -827,7 +825,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, (unsigned long) count, (long long) pos); if (nr_segs != 1) - return -EINVAL; + goto out; retval = generic_write_checks(file, &pos, &count, 0); if (retval) -- cgit v1.2.1 From d9df8d6b38228afab519094048aa2c082b0b2cf4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 May 2007 10:22:20 -0400 Subject: NFS: Don't fail an O_DIRECT read/write if get_user_pages() returns pages There is no need to fail the entire O_DIRECT read/write just because get_user_pages() returned fewer pages than we requested. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4c97e55e86ee..f1b153ad645b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -295,9 +295,14 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo break; } if ((unsigned)result < data->npages) { - nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); - break; + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_readdata_release(data); + break; + } + bytes -= pgbase; + data->npages = result; } get_dreq(dreq); @@ -630,9 +635,14 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l break; } if ((unsigned)result < data->npages) { - nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); - break; + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_writedata_release(data); + break; + } + bytes -= pgbase; + data->npages = result; } get_dreq(dreq); -- cgit v1.2.1 From 44dd151d5c21234cc534c47d7382f5c28c3143cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 19 May 2007 11:58:03 -0400 Subject: NFS: Don't mark a written page as uptodate until it is on disk The write may fail, so we should not mark the page as uptodate until we are certain that the data has been accepted and written to disk by the server. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af344a158e01..b853959d964d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -191,8 +191,6 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, } /* Update file length */ nfs_grow_file(page, offset, count); - /* Set the PG_uptodate flag? */ - nfs_mark_uptodate(page, offset, count); nfs_unlock_request(req); return 0; } @@ -751,7 +749,12 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { + if (PageError(req->wb_page)) { + nfs_end_page_writeback(req->wb_page); + nfs_inode_remove_request(req); + } else if (!nfs_reschedule_unstable_write(req)) { + /* Set the PG_uptodate flag */ + nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); nfs_end_page_writeback(req->wb_page); nfs_inode_remove_request(req); } else @@ -1039,6 +1042,8 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) dprintk(" marked for commit\n"); goto next; } + /* Set the PG_uptodate flag? */ + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); dprintk(" OK\n"); remove_request: nfs_end_page_writeback(page); @@ -1249,6 +1254,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) * returned by the server against all stored verfs. */ if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { /* We have a match */ + /* Set the PG_uptodate flag */ + nfs_mark_uptodate(req->wb_page, req->wb_pgbase, + req->wb_bytes); nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; -- cgit v1.2.1 From de05a0cc2a2ae16eb8d8dbf88fe728ace45beb9a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 20 May 2007 13:05:05 -0400 Subject: NFS: Minor read optimisation... Since PG_uptodate may now end up getting set during the call to nfs_wb_page(), we can avoid putting a read request on the wire in those situations. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 7bd7cb95c034..c07d0d10d9ed 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -483,17 +483,19 @@ int nfs_readpage(struct file *file, struct page *page) */ error = nfs_wb_page(inode, page); if (error) - goto out_error; + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; error = -ESTALE; if (NFS_STALE(inode)) - goto out_error; + goto out_unlock; if (file == NULL) { error = -EBADF; ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (ctx == NULL) - goto out_error; + goto out_unlock; } else ctx = get_nfs_open_context((struct nfs_open_context *) file->private_data); @@ -502,8 +504,7 @@ int nfs_readpage(struct file *file, struct page *page) put_nfs_open_context(ctx); return error; - -out_error: +out_unlock: unlock_page(page); return error; } @@ -520,21 +521,32 @@ readpage_async_filler(void *data, struct page *page) struct inode *inode = page->mapping->host; struct nfs_page *new; unsigned int len; + int error; + + error = nfs_wb_page(inode, page); + if (error) + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; - nfs_wb_page(inode, page); len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); + new = nfs_create_request(desc->ctx, inode, page, 0, len); - if (IS_ERR(new)) { - SetPageError(page); - unlock_page(page); - return PTR_ERR(new); - } + if (IS_ERR(new)) + goto out_error; + if (len < PAGE_CACHE_SIZE) zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); nfs_pageio_add_request(desc->pgio, new); return 0; +out_error: + error = PTR_ERR(new); + SetPageError(page); +out_unlock: + unlock_page(page); + return error; } int nfs_readpages(struct file *filp, struct address_space *mapping, -- cgit v1.2.1 From 88be9f990fe70f0f177ef44a16a477599e91f825 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 10:42:27 -0400 Subject: NFS: Replace vfsmount and dentry in nfs_open_context with struct path Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/direct.c | 4 ++-- fs/nfs/inode.c | 10 +++++----- fs/nfs/nfs4proc.c | 6 +++--- fs/nfs/pagelist.c | 6 +++--- fs/nfs/read.c | 6 +++--- fs/nfs/write.c | 20 ++++++++++---------- 7 files changed, 27 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7f37d1bea83f..b47c156a711d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -74,7 +74,7 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); - err = nfs4_open_delegation_recall(ctx->dentry, state); + err = nfs4_open_delegation_recall(ctx->path.dentry, state); if (err >= 0) err = nfs_delegation_claim_locks(ctx, state); put_nfs_open_context(ctx); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f1b153ad645b..a5c82b6f3b45 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -266,7 +266,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = { static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) { struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = ctx->path.dentry->d_inode; size_t rsize = NFS_SERVER(inode)->rsize; unsigned int pgbase; int result; @@ -606,7 +606,7 @@ static const struct rpc_call_ops nfs_write_direct_ops = { static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync) { struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = ctx->path.dentry->d_inode; size_t wsize = NFS_SERVER(inode)->wsize; unsigned int pgbase; int result; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd9f5a836592..cc7a9064be90 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -462,8 +462,8 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { atomic_set(&ctx->count, 1); - ctx->dentry = dget(dentry); - ctx->vfsmnt = mntget(mnt); + ctx->path.dentry = dget(dentry); + ctx->path.mnt = mntget(mnt); ctx->cred = get_rpccred(cred); ctx->state = NULL; ctx->lockowner = current->files; @@ -484,7 +484,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) { if (atomic_dec_and_test(&ctx->count)) { if (!list_empty(&ctx->list)) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = ctx->path.dentry->d_inode; spin_lock(&inode->i_lock); list_del(&ctx->list); spin_unlock(&inode->i_lock); @@ -493,8 +493,8 @@ void put_nfs_open_context(struct nfs_open_context *ctx) nfs4_close_state(ctx->state, ctx->mode); if (ctx->cred != NULL) put_rpccred(ctx->cred); - dput(ctx->dentry); - mntput(ctx->vfsmnt); + dput(ctx->path.dentry); + mntput(ctx->path.mnt); kfree(ctx); } } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 648e0ac0f90e..4d641cbdbde1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -512,7 +512,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = nfs4_do_open_reclaim(sp, state, ctx->dentry); + ret = nfs4_do_open_reclaim(sp, state, ctx->path.dentry); put_nfs_open_context(ctx); return ret; } @@ -862,7 +862,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = nfs4_do_open_expired(sp, state, ctx->dentry); + ret = nfs4_do_open_expired(sp, state, ctx->path.dentry); put_nfs_open_context(ctx); return ret; } @@ -3285,7 +3285,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, sizeof(data->lsp->ls_stateid.data)); data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; - renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); } nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); out: diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index c5bb51a29e80..f8a4ba533930 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -114,7 +114,7 @@ void nfs_unlock_request(struct nfs_page *req) */ int nfs_set_page_writeback_locked(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); if (!nfs_lock_request(req)) return 0; @@ -127,7 +127,7 @@ int nfs_set_page_writeback_locked(struct nfs_page *req) */ void nfs_clear_page_writeback(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); if (req->wb_page != NULL) { spin_lock(&nfsi->req_lock); @@ -193,7 +193,7 @@ static int nfs_wait_bit_interruptible(void *word) int nfs_wait_on_request(struct nfs_page *req) { - struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->dentry->d_inode); + struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->path.dentry->d_inode); sigset_t oldmask; int ret = 0; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c07d0d10d9ed..6ae2e58ed05a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -145,8 +145,8 @@ static void nfs_readpage_release(struct nfs_page *req) unlock_page(req->wb_page); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", - req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_clear_request(req); @@ -164,7 +164,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, int flags; data->req = req; - data->inode = inode = req->wb_context->dentry->d_inode; + data->inode = inode = req->wb_context->path.dentry->d_inode; data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b853959d964d..9e7c21da864f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -407,7 +407,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); BUG_ON (!NFS_WBACK_BUSY(req)); @@ -455,7 +455,7 @@ nfs_dirty_request(struct nfs_page *req) static void nfs_mark_request_commit(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&nfsi->req_lock); @@ -789,7 +789,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, * NB: take care not to mess about with data->commit et al. */ data->req = req; - data->inode = inode = req->wb_context->dentry->d_inode; + data->inode = inode = req->wb_context->path.dentry->d_inode; data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); @@ -957,8 +957,8 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) struct page *page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); @@ -1023,8 +1023,8 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); @@ -1162,7 +1162,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, list_splice_init(head, &data->pages); first = nfs_list_entry(data->pages.next); - inode = first->wb_context->dentry->d_inode; + inode = first->wb_context->path.dentry->d_inode; data->inode = inode; data->cred = first->wb_context->cred; @@ -1239,8 +1239,8 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dprintk("NFS: commit (%s/%Ld %d@%Ld)", - req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (task->tk_status < 0) { -- cgit v1.2.1 From 539cd03a5708c9861a3e738e6f363ad743c85ddf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 11:46:42 -0400 Subject: NFSv4: Cleanup: pass the nfs_open_context to open recovery code Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/delegation.h | 2 +- fs/nfs/nfs4proc.c | 38 +++++++++++++++++++------------------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index b47c156a711d..9f17b91205cf 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -74,7 +74,7 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); - err = nfs4_open_delegation_recall(ctx->path.dentry, state); + err = nfs4_open_delegation_recall(ctx, state); if (err >= 0) err = nfs_delegation_claim_locks(ctx, state); put_nfs_open_context(ctx); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2cfd4b24c7fe..f6e42fb21afb 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -39,7 +39,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); -int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state); int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4d641cbdbde1..c83db9def0fe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -462,7 +462,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * * OPEN_RECLAIM: * reclaim state on the server after a reboot. */ -static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) +static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs_delegation *delegation = NFS_I(state->inode)->delegation; struct nfs4_opendata *opendata; @@ -478,7 +478,7 @@ static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state } delegation_type = delegation->type; } - opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); + opendata = nfs4_opendata_alloc(ctx->path.dentry, state->owner, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; @@ -490,13 +490,13 @@ static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state return status; } -static int nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) +static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_do_open_reclaim(sp, state, dentry); + err = _nfs4_do_open_reclaim(ctx, state); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -512,12 +512,12 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = nfs4_do_open_reclaim(sp, state, ctx->path.dentry); + ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; } -static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) +static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs4_state_owner *sp = state->owner; struct nfs4_opendata *opendata; @@ -525,7 +525,7 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) return 0; - opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); + opendata = nfs4_opendata_alloc(ctx->path.dentry, sp, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; @@ -536,13 +536,13 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state return ret; } -int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs4_exception exception = { }; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(state->inode); int err; do { - err = _nfs4_open_delegation_recall(dentry, state); + err = _nfs4_open_delegation_recall(ctx, state); switch (err) { case 0: return err; @@ -811,7 +811,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) * reclaim state on the server after a network partition. * Assumes caller holds the appropriate lock */ -static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) +static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { struct inode *inode = state->inode; struct nfs_delegation *delegation = NFS_I(inode)->delegation; @@ -820,34 +820,34 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st int ret; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - ret = _nfs4_do_access(inode, sp->so_cred, openflags); + ret = _nfs4_do_access(inode, ctx->cred, openflags); if (ret < 0) return ret; memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); set_bit(NFS_DELEGATED_STATE, &state->flags); return 0; } - opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL); + opendata = nfs4_opendata_alloc(ctx->path.dentry, state->owner, openflags, NULL); if (opendata == NULL) return -ENOMEM; ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) { /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(sp); - d_drop(dentry); + nfs4_drop_state_owner(state->owner); + d_drop(ctx->path.dentry); } nfs4_opendata_free(opendata); return ret; } -static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) +static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_open_expired(sp, state, dentry); + err = _nfs4_open_expired(ctx, state); if (err == -NFS4ERR_DELAY) nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -862,7 +862,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = nfs4_do_open_expired(sp, state, ctx->path.dentry); + ret = nfs4_do_open_expired(ctx, state); put_nfs_open_context(ctx); return ret; } -- cgit v1.2.1 From ad389da79f7bf9dc12dbc79c9c2740f9ed2f13d1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 12:30:00 -0400 Subject: NFSv4: Ensure asynchronous open() calls always pin the mountpoint A number of race conditions may currently ensue if the user presses ^C and then unmounts the partition while an asynchronous open() is in progress. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4proc.c | 48 +++++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c27258b5d3e1..4948ec1dd9bd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1244,7 +1244,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if (nd && (nd->flags & LOOKUP_CREATE)) + if ((nd->flags & LOOKUP_CREATE) != 0) open_flags = nd->intent.open.flags; lock_kernel(); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c83db9def0fe..895e8e649c91 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -221,7 +221,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; - struct dentry *dentry; + struct path path; struct dentry *dir; struct nfs4_state_owner *owner; struct iattr attrs; @@ -230,11 +230,11 @@ struct nfs4_opendata { int cancelled; }; -static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, +static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, struct nfs4_state_owner *sp, int flags, const struct iattr *attrs) { - struct dentry *parent = dget_parent(dentry); + struct dentry *parent = dget_parent(path->dentry); struct inode *dir = parent->d_inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *p; @@ -246,7 +246,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (p->o_arg.seqid == NULL) goto err_free; atomic_set(&p->count, 1); - p->dentry = dget(dentry); + p->path.mnt = mntget(path->mnt); + p->path.dentry = dget(path->dentry); p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); @@ -254,7 +255,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.open_flags = flags, p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id = sp->so_id; - p->o_arg.name = &dentry->d_name; + p->o_arg.name = &p->path.dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; @@ -288,7 +289,8 @@ static void nfs4_opendata_free(struct nfs4_opendata *p) nfs_free_seqid(p->o_arg.seqid); nfs4_put_state_owner(p->owner); dput(p->dir); - dput(p->dentry); + dput(p->path.dentry); + mntput(p->path.mnt); kfree(p); } } @@ -478,7 +480,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state } delegation_type = delegation->type; } - opendata = nfs4_opendata_alloc(ctx->path.dentry, state->owner, 0, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; @@ -525,7 +527,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) return 0; - opendata = nfs4_opendata_alloc(ctx->path.dentry, sp, 0, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, sp, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; @@ -827,7 +829,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s set_bit(NFS_DELEGATED_STATE, &state->flags); return 0; } - opendata = nfs4_opendata_alloc(ctx->path.dentry, state->owner, openflags, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, openflags, NULL); if (opendata == NULL) return -ENOMEM; ret = nfs4_open_recover(opendata, state); @@ -955,7 +957,7 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st /* * Returns a referenced nfs4_state */ -static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -975,7 +977,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st goto err_put_state_owner; down_read(&clp->cl_sem); status = -ENOMEM; - opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); + opendata = nfs4_opendata_alloc(path, sp, flags, sattr); if (opendata == NULL) goto err_release_rwsem; @@ -1006,14 +1008,14 @@ out_err: } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred) { struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, path, flags, sattr, cred, &res); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -1259,6 +1261,10 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, str struct dentry * nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + struct path path = { + .mnt = nd->mnt, + .dentry = dentry, + }; struct iattr attr; struct rpc_cred *cred; struct nfs4_state *state; @@ -1277,7 +1283,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); if (IS_ERR(cred)) return (struct dentry *)cred; - state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) @@ -1294,6 +1300,10 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) int nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) { + struct path path = { + .mnt = nd->mnt, + .dentry = dentry, + }; struct rpc_cred *cred; struct nfs4_state *state; @@ -1302,7 +1312,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st return PTR_ERR(cred); state = nfs4_open_delegated(dentry->d_inode, openflags, cred); if (IS_ERR(state)) - state = nfs4_do_open(dir, dentry, openflags, NULL, cred); + state = nfs4_do_open(dir, &path, openflags, NULL, cred); put_rpccred(cred); if (IS_ERR(state)) { switch (PTR_ERR(state)) { @@ -1752,6 +1762,10 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) { + struct path path = { + .mnt = nd->mnt, + .dentry = dentry, + }; struct nfs4_state *state; struct rpc_cred *cred; int status = 0; @@ -1761,7 +1775,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = PTR_ERR(cred); goto out; } - state = nfs4_do_open(dir, dentry, flags, sattr, cred); + state = nfs4_do_open(dir, &path, flags, sattr, cred); put_rpccred(cred); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -1774,7 +1788,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status == 0) nfs_setattr_update_inode(state->inode, sattr); } - if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) + if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); -- cgit v1.2.1 From 4a35bd41aff5714deb41c8f14766df3871e2e8f7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 10:31:33 -0400 Subject: NFSv4: Ensure that nfs4_do_close() doesn't race with umount nfs4_do_close() does not currently have any way to ensure that the user won't attempt to unmount the partition while the asynchronous RPC call is completing. This again may cause Oopses in nfs_update_inode(). Add a vfsmount argument to nfs4_close_state to ensure that the partition remains mounted while we're closing the file. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 19 +------------------ fs/nfs/nfs4_fs.h | 6 +++--- fs/nfs/nfs4proc.c | 35 ++++++++++++++++++++--------------- fs/nfs/nfs4state.c | 4 ++-- 4 files changed, 26 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index cc7a9064be90..23ecf0334a18 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -490,7 +490,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx) spin_unlock(&inode->i_lock); } if (ctx->state != NULL) - nfs4_close_state(ctx->state, ctx->mode); + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->path.dentry); @@ -1103,27 +1103,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ void nfs4_clear_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - /* If we are holding a delegation, return it! */ nfs_inode_return_delegation(inode); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); - /* Now clear out any remaining state */ - while (!list_empty(&nfsi->open_states)) { - struct nfs4_state *state; - - state = list_entry(nfsi->open_states.next, - struct nfs4_state, - inode_states); - dprintk("%s(%s/%Ld): found unclaimed NFSv4 state %p\n", - __FUNCTION__, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - state); - BUG_ON(atomic_read(&state->count) != 1); - nfs4_close_state(state, state->state); - } } #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cf3a17eb5c09..c97a0ad8430e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -165,7 +165,7 @@ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struc extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); @@ -196,7 +196,7 @@ extern void nfs4_put_state_owner(struct nfs4_state_owner *); extern void nfs4_drop_state_owner(struct nfs4_state_owner *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); -extern void nfs4_close_state(struct nfs4_state *, mode_t); +extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); @@ -222,7 +222,7 @@ extern struct svc_version nfs4_callback_version1; #else -#define nfs4_close_state(a, b) do { } while (0) +#define nfs4_close_state(a, b, c) do { } while (0) #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 895e8e649c91..8feaf232f2e4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -453,7 +453,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * opendata->owner->so_cred, &opendata->o_res); } - nfs4_close_state(newstate, opendata->o_arg.open_flags); + nfs4_close_state(&opendata->path, newstate, opendata->o_arg.open_flags); } if (newstate != state) return -ESTALE; @@ -603,7 +603,7 @@ static void nfs4_open_confirm_release(void *calldata) nfs_confirm_seqid(&data->owner->so_seqid, 0); state = nfs4_opendata_to_nfs4_state(data); if (state != NULL) - nfs4_close_state(state, data->o_arg.open_flags); + nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: nfs4_opendata_free(data); } @@ -706,7 +706,7 @@ static void nfs4_open_release(void *calldata) nfs_confirm_seqid(&data->owner->so_seqid, 0); state = nfs4_opendata_to_nfs4_state(data); if (state != NULL) - nfs4_close_state(state, data->o_arg.open_flags); + nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: nfs4_opendata_free(data); } @@ -1103,6 +1103,7 @@ static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, } struct nfs4_closedata { + struct path path; struct inode *inode; struct nfs4_state *state; struct nfs_closeargs arg; @@ -1119,6 +1120,8 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); + dput(calldata->path.dentry); + mntput(calldata->path.mnt); kfree(calldata); } @@ -1211,18 +1214,18 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct inode *inode, struct nfs4_state *state) +int nfs4_do_close(struct path *path, struct nfs4_state *state) { - struct nfs_server *server = NFS_SERVER(inode); + struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; int status = -ENOMEM; calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); if (calldata == NULL) goto out; - calldata->inode = inode; + calldata->inode = state->inode; calldata->state = state; - calldata->arg.fh = NFS_FH(inode); + calldata->arg.fh = NFS_FH(state->inode); calldata->arg.stateid = &state->stateid; /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); @@ -1231,6 +1234,8 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state) calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.server = server; + calldata->path.mnt = mntget(path->mnt); + calldata->path.dentry = dget(path->dentry); status = nfs4_call_async(server->client, &nfs4_close_ops, calldata); if (status == 0) @@ -1243,18 +1248,18 @@ out: return status; } -static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state) { struct file *filp; - filp = lookup_instantiate_filp(nd, dentry, NULL); + filp = lookup_instantiate_filp(nd, path->dentry, NULL); if (!IS_ERR(filp)) { struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; return 0; } - nfs4_close_state(state, nd->intent.open.flags); + nfs4_close_state(path, state, nd->intent.open.flags); return PTR_ERR(filp); } @@ -1293,7 +1298,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) res = d_add_unique(dentry, igrab(state->inode)); if (res != NULL) dentry = res; - nfs4_intent_set_file(nd, dentry, state); + nfs4_intent_set_file(nd, &path, state); return res; } @@ -1328,10 +1333,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st } } if (state->inode == dentry->d_inode) { - nfs4_intent_set_file(nd, dentry, state); + nfs4_intent_set_file(nd, &path, state); return 1; } - nfs4_close_state(state, openflags); + nfs4_close_state(&path, state, openflags); out_drop: d_drop(dentry); return 0; @@ -1789,9 +1794,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) - status = nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, &path, state); else - nfs4_close_state(state, flags); + nfs4_close_state(&path, state, flags); out: return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8ed79d5c54f9..a85138ef67ad 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -341,7 +341,7 @@ void nfs4_put_open_state(struct nfs4_state *state) /* * Close the current file. */ -void nfs4_close_state(struct nfs4_state *state, mode_t mode) +void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; @@ -375,7 +375,7 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); - if (oldstate != newstate && nfs4_do_close(inode, state) == 0) + if (oldstate != newstate && nfs4_do_close(path, state) == 0) return; nfs4_put_open_state(state); nfs4_put_state_owner(owner); -- cgit v1.2.1 From b39e625b6e75aa70e26c13f9378756bb5f2af032 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 11 Jun 2007 23:05:07 -0400 Subject: NFSv4: Clean up nfs4_call_async() Use rpc_run_task() instead of doing it ourselves. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 26 +++++++++----------------- fs/nfs/nfs4state.c | 9 +++++---- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8feaf232f2e4..3cc75445a68d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -295,18 +295,6 @@ static void nfs4_opendata_free(struct nfs4_opendata *p) } } -/* Helper for asynchronous RPC calls */ -static int nfs4_call_async(struct rpc_clnt *clnt, - const struct rpc_call_ops *tk_ops, void *calldata) -{ - struct rpc_task *task; - - if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata))) - return -ENOMEM; - rpc_execute(task); - return 0; -} - static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) { sigset_t oldset; @@ -1218,6 +1206,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; + struct nfs4_state_owner *sp = state->owner; + struct rpc_task *task; int status = -ENOMEM; calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); @@ -1237,14 +1227,16 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state) calldata->path.mnt = mntget(path->mnt); calldata->path.dentry = dget(path->dentry); - status = nfs4_call_async(server->client, &nfs4_close_ops, calldata); - if (status == 0) - goto out; - - nfs_free_seqid(calldata->arg.seqid); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; out_free_calldata: kfree(calldata); out: + nfs4_put_open_state(state); + nfs4_put_state_owner(sp); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a85138ef67ad..5d7ffbfc3483 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -375,10 +375,11 @@ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); - if (oldstate != newstate && nfs4_do_close(path, state) == 0) - return; - nfs4_put_open_state(state); - nfs4_put_state_owner(owner); + if (oldstate == newstate) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else + nfs4_do_close(path, state); } /* -- cgit v1.2.1 From a0356862bcbeb20acf64bc1a82d28a4c5bb957a7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 13:26:15 -0400 Subject: NFS: Fix nfs_reval_fsid() We don't need to revalidate the fsid on the root directory. It suffices to revalidate it on the current directory. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 ++++----- fs/nfs/inode.c | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4948ec1dd9bd..c02a7962e69d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -897,14 +897,13 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) return (nd->intent.open.flags & O_EXCL) != 0; } -static inline int nfs_reval_fsid(struct vfsmount *mnt, struct inode *dir, - struct nfs_fh *fh, struct nfs_fattr *fattr) +static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) - /* Revalidate fsid on root dir */ - return __nfs_revalidate_inode(server, mnt->mnt_root->d_inode); + /* Revalidate fsid using the parent directory */ + return __nfs_revalidate_inode(server, dir); return 0; } @@ -946,7 +945,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(error); goto out_unlock; } - error = nfs_reval_fsid(nd->mnt, dir, &fhandle, &fattr); + error = nfs_reval_fsid(dir, &fattr); if (error < 0) { res = ERR_PTR(error); goto out_unlock; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 23ecf0334a18..7bcb3dfa6179 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -961,8 +961,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) goto out_changed; server = NFS_SERVER(inode); - /* Update the fsid if and only if this is the root directory */ - if (inode == inode->i_sb->s_root->d_inode + /* Update the fsid? */ + if (S_ISDIR(inode->i_mode) && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) server->fsid = fattr->fsid; -- cgit v1.2.1 From 83d93f2229348837bf988a1048f7f38789474471 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 7 Jun 2007 09:58:08 -0400 Subject: NFS: Use GFP_HIGHUSER for page allocation in nfs_symlink() nfs_symlink() allocates a GFP_KERNEL page for the pagecache. Most pagecache pages are allocated using GFP_HIGHUSER, and there's no reason not to do that in nfs_symlink() as well. Signed-off-by: Jeff Layton --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c02a7962e69d..0f41678fd7ee 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1534,7 +1534,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym lock_kernel(); - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_HIGHUSER); if (!page) { unlock_kernel(); return -ENOMEM; -- cgit v1.2.1 From fc6ae3cf482c385a6fe87ba119d399bb85aa670b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 19:13:47 -0400 Subject: NFS: Re-enable forced umounts They disappeared some time around 2.6.18. Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ca20d3cc2609..14c7923697d2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -430,7 +430,20 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) */ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) { + struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb); + struct rpc_clnt *rpc; + shrink_submounts(vfsmnt, &nfs_automount_list); + + if (!(flags & MNT_FORCE)) + return; + /* -EIO all pending I/O */ + rpc = server->client_acl; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = server->client; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); } /* -- cgit v1.2.1 From aa53ed541a1fec78a78d02afc8b042d040cc080d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 5 Jun 2007 14:49:03 -0400 Subject: NFS4: on a O_EXCL OPEN make sure SETATTR sets the fields holding the verifier The Linux NFS4 client simply skips over the bitmask in an O_EXCL open call and so it doesn't bother to reset any fields that may be holding the verifier. This patch has us save the first two words of the bitmask (which is all the current client has #defines for). The client then later checks this bitmask and turns on the appropriate flags in the sattr->ia_verify field for the following SETATTR call. This patch only currently checks to see if the server used the atime and mtime slots for the verifier (which is what the Linux server uses for this). I'm not sure of what other fields the server could reasonably use, but adding checks for others should be trivial. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 20 ++++++++++++++++++++ fs/nfs/nfs4xdr.c | 9 +++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3cc75445a68d..fee2d14b158b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -942,6 +942,22 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st return res; } +/* + * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* + * fields corresponding to attributes that were used to store the verifier. + * Make sure we clobber those fields in the later setattr call + */ +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +{ + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + !(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + !(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; +} + /* * Returns a referenced nfs4_state */ @@ -973,6 +989,9 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct if (status != 0) goto err_opendata_free; + if (opendata->o_arg.open_flags & O_EXCL) + nfs4_exclusive_attrset(opendata, sattr); + status = -ENOMEM; state = nfs4_opendata_to_nfs4_state(opendata); if (state == NULL) @@ -1784,6 +1803,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs4_do_setattr(state->inode, &fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(state->inode, sattr); + nfs_post_op_update_inode(state->inode, &fattr); } if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) status = nfs4_intent_set_file(nd, &path, state); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8003c91ccb9a..1fcca516e6ee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3269,7 +3269,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; - uint32_t bmlen; + uint32_t savewords, bmlen, i; int status; status = decode_op_hdr(xdr, OP_OPEN); @@ -3287,7 +3287,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) goto xdr_error; READ_BUF(bmlen << 2); - p += bmlen; + savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); + for (i = 0; i < savewords; ++i) + READ32(res->attrset[i]); + for (; i < NFS4_BITMAP_SIZE; i++) + res->attrset[i] = 0; + return decode_delegation(xdr, res); xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen); -- cgit v1.2.1 From e2f032e9ef66e33089d09452892696ea97d1dca1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 19:27:00 -0400 Subject: NFS: nfs3_proc_create() should use nfs_post_op_update_inode() Also get rid of a redundant call to nfs_setattr_update_inode(). The call to nfs3_proc_setattr() already takes care of that. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 45268d6def2e..814d886b6aa4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -335,9 +335,7 @@ again: * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ status = nfs3_proc_setattr(dentry, &fattr, sattr); - if (status == 0) - nfs_setattr_update_inode(dentry->d_inode, sattr); - nfs_refresh_inode(dentry->d_inode, &fattr); + nfs_post_op_update_inode(dentry->d_inode, &fattr); dprintk("NFS reply setattr (post-create): %d\n", status); } if (status != 0) -- cgit v1.2.1 From a50f7951a31d3b976e829250853f89c9d2da32c0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2007 19:23:43 -0400 Subject: NFS: Fix an Oops in the nfs_access_cache_shrinker() The nfs_access_cache_shrinker may race with nfs_access_zap_cache(). Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0f41678fd7ee..322141f4ab48 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1743,8 +1743,8 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) struct nfs_inode *nfsi; struct nfs_access_entry *cache; - spin_lock(&nfs_access_lru_lock); restart: + spin_lock(&nfs_access_lru_lock); list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; @@ -1769,6 +1769,7 @@ remove_lru_entry: clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); } spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); iput(inode); goto restart; } -- cgit v1.2.1 From c03b40246123b2ced79e2620d1d2c089bb12369a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 13:26:38 -0400 Subject: NFS: Convert struct nfs_page to use krefs Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 14 ++++++++------ fs/nfs/write.c | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f8a4ba533930..257a7f8b2362 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -85,9 +85,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; - atomic_set(&req->wb_count, 1); req->wb_context = get_nfs_open_context(ctx); - + kref_init(&req->wb_kref); return req; } @@ -160,11 +159,9 @@ void nfs_clear_request(struct nfs_page *req) * * Note: Should never be called with the spinlock held! */ -void -nfs_release_request(struct nfs_page *req) +static void nfs_free_request(struct kref *kref) { - if (!atomic_dec_and_test(&req->wb_count)) - return; + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); /* Release struct file or cached credential */ nfs_clear_request(req); @@ -172,6 +169,11 @@ nfs_release_request(struct nfs_page *req) nfs_page_free(req); } +void nfs_release_request(struct nfs_page *req) +{ + kref_put(&req->wb_kref, nfs_free_request); +} + static int nfs_wait_bit_interruptible(void *word) { int ret = 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9e7c21da864f..e9404328ac02 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -117,7 +117,7 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page) if (PagePrivate(page)) { req = (struct nfs_page *)page_private(page); if (req != NULL) - atomic_inc(&req->wb_count); + kref_get(&req->wb_kref); } return req; } @@ -398,7 +398,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) if (PageDirty(req->wb_page)) set_bit(PG_NEED_FLUSH, &req->wb_flags); nfsi->npages++; - atomic_inc(&req->wb_count); + kref_get(&req->wb_kref); return 0; } @@ -531,7 +531,7 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u next = req->wb_index + 1; BUG_ON(!NFS_WBACK_BUSY(req)); - atomic_inc(&req->wb_count); + kref_get(&req->wb_kref); spin_unlock(&nfsi->req_lock); error = nfs_wait_on_request(req); nfs_release_request(req); -- cgit v1.2.1 From 9fd367f0f376ccfb2592eed9be0eece70429894f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 15:10:24 -0400 Subject: NFS cleanup: Rename NFS_PAGE_TAG_WRITEBACK to NFS_PAGE_TAG_LOCKED Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 14 +++++++------- fs/nfs/write.c | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 257a7f8b2362..23e9dea20902 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -108,29 +108,29 @@ void nfs_unlock_request(struct nfs_page *req) } /** - * nfs_set_page_writeback_locked - Lock a request for writeback + * nfs_set_page_tag_locked - Tag a request as locked * @req: */ -int nfs_set_page_writeback_locked(struct nfs_page *req) +static int nfs_set_page_tag_locked(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); if (!nfs_lock_request(req)) return 0; - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } /** - * nfs_clear_page_writeback - Unlock request and wake up sleepers + * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers */ -void nfs_clear_page_writeback(struct nfs_page *req) +void nfs_clear_page_tag_locked(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); if (req->wb_page != NULL) { spin_lock(&nfsi->req_lock); - radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); spin_unlock(&nfsi->req_lock); } nfs_unlock_request(req); @@ -421,7 +421,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, idx_start = req->wb_index + 1; if (req->wb_list_head != head) continue; - if (nfs_set_page_writeback_locked(req)) { + if (nfs_set_page_tag_locked(req)) { nfs_list_remove_request(req); nfs_list_add_request(req, dst); res++; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e9404328ac02..754066cc9146 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -289,7 +289,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, BUG(); } radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, - NFS_PAGE_TAG_WRITEBACK); + NFS_PAGE_TAG_LOCKED); ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); spin_unlock(req_lock); nfs_pageio_add_request(pgio, req); @@ -524,7 +524,7 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u idx_end = idx_start + npages - 1; next = idx_start; - while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) { + while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) { if (req->wb_index > idx_end) break; @@ -759,7 +759,7 @@ static void nfs_writepage_release(struct nfs_page *req) nfs_inode_remove_request(req); } else nfs_end_page_writeback(req->wb_page); - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); } static inline int flush_task_priority(int how) @@ -888,7 +888,7 @@ out_bad: } nfs_redirty_request(req); nfs_end_page_writeback(req->wb_page); - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); return -ENOMEM; } @@ -931,7 +931,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i nfs_list_remove_request(req); nfs_redirty_request(req); nfs_end_page_writeback(req->wb_page); - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); } return -ENOMEM; } @@ -1049,7 +1049,7 @@ remove_request: nfs_end_page_writeback(page); nfs_inode_remove_request(req); next: - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); } } @@ -1212,7 +1212,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_list_remove_request(req); nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); } return -ENOMEM; } @@ -1265,7 +1265,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) dprintk(" mismatch\n"); nfs_redirty_request(req); next: - nfs_clear_page_writeback(req); + nfs_clear_page_tag_locked(req); } } -- cgit v1.2.1 From 5c36968343fcd013a3f7ae93f246c2e75596780b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 15:27:42 -0400 Subject: NFS cleanup: speed up nfs_scan_commit using radix tree tags Add a tag for requests that are waiting for a COMMIT Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 14 +++++++------- fs/nfs/write.c | 6 +++++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 23e9dea20902..ad90cbe76703 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -381,10 +381,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) /** * nfs_scan_list - Scan a list for matching requests * @nfsi: NFS inode - * @head: One of the NFS inode request lists * @dst: Destination list * @idx_start: lower bound of page->index to scan * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for * * Moves elements from one of the inode request lists. * If the number of requests is set to 0, the entire address_space @@ -392,9 +392,9 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) * The requests are *not* checked to ensure that they form a contiguous set. * You must be holding the inode's req_lock when calling this function */ -int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, +int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, - unsigned int npages) + unsigned int npages, int tag) { struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; struct nfs_page *req; @@ -409,9 +409,9 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, idx_end = idx_start + npages - 1; for (;;) { - found = radix_tree_gang_lookup(&nfsi->nfs_page_tree, + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&pgvec[0], idx_start, - NFS_SCAN_MAXENTRIES); + NFS_SCAN_MAXENTRIES, tag); if (found <= 0) break; for (i = 0; i < found; i++) { @@ -419,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, if (req->wb_index > idx_end) goto out; idx_start = req->wb_index + 1; - if (req->wb_list_head != head) - continue; if (nfs_set_page_tag_locked(req)) { nfs_list_remove_request(req); + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); nfs_list_add_request(req, dst); res++; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 754066cc9146..0f779ca12ec3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -462,6 +462,9 @@ nfs_mark_request_commit(struct nfs_page *req) nfs_list_add_request(req, &nfsi->commit); nfsi->ncommit++; set_bit(PG_NEED_COMMIT, &(req)->wb_flags); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, + NFS_PAGE_TAG_COMMIT); spin_unlock(&nfsi->req_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -575,7 +578,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u int res = 0; if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages); + res = nfs_scan_list(nfsi, dst, idx_start, npages, + NFS_PAGE_TAG_COMMIT); nfsi->ncommit -= res; if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); -- cgit v1.2.1 From 2aefa104313996d1a9582476cee53d1296c834bf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 15:40:59 -0400 Subject: NFS: Remove the redundant 'dirty' and 'commit' lists from nfs_inode Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 --- fs/nfs/write.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7bcb3dfa6179..e7d2bba900b5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1149,14 +1149,11 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&nfsi->vfs_inode); spin_lock_init(&nfsi->req_lock); - INIT_LIST_HEAD(&nfsi->dirty); - INIT_LIST_HEAD(&nfsi->commit); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); atomic_set(&nfsi->data_updates, 0); - nfsi->ndirty = 0; nfsi->ncommit = 0; nfsi->npages = 0; nfs4_init_once(nfsi); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0f779ca12ec3..9ef9ec746bfb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -459,7 +459,6 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&nfsi->req_lock); - nfs_list_add_request(req, &nfsi->commit); nfsi->ncommit++; set_bit(PG_NEED_COMMIT, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, @@ -581,8 +580,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u res = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); nfsi->ncommit -= res; - if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); } return res; } -- cgit v1.2.1 From dce34ce298d85b81630401f4feb4bd7ac77fe9c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 15:47:53 -0400 Subject: NFS: Prevent integer overflow in nfs_scan_list() Also ensure that nfs_inode ncommit and npages are large enough to represent all possible values for the number of pages. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ad90cbe76703..68f6bf122006 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -425,6 +425,8 @@ int nfs_scan_list(struct nfs_inode *nfsi, req->wb_index, tag); nfs_list_add_request(req, dst); res++; + if (res == INT_MAX) + goto out; } } -- cgit v1.2.1 From edc05fc1c24ba49dae585da1b2a22686f0b221f0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 16:02:34 -0400 Subject: NFS: reduce latency by using conditional rescheduling in nfs_scan_list Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 68f6bf122006..8d2642f24b8f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -429,7 +429,8 @@ int nfs_scan_list(struct nfs_inode *nfsi, goto out; } } - + /* for latency reduction */ + cond_resched_lock(&nfsi->req_lock); } out: return res; -- cgit v1.2.1 From 3bec63db55463365110d00721ed60a31e4614cb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 16:02:44 -0400 Subject: NFS: Convert struct nfs_open_context to use a kref Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e7d2bba900b5..01fc8ab0c562 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -461,7 +461,6 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { - atomic_set(&ctx->count, 1); ctx->path.dentry = dget(dentry); ctx->path.mnt = mntget(mnt); ctx->cred = get_rpccred(cred); @@ -469,6 +468,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str ctx->lockowner = current->files; ctx->error = 0; ctx->dir_cookie = 0; + kref_init(&ctx->kref); } return ctx; } @@ -476,27 +476,33 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + kref_get(&ctx->kref); return ctx; } -void put_nfs_open_context(struct nfs_open_context *ctx) +static void nfs_free_open_context(struct kref *kref) { - if (atomic_dec_and_test(&ctx->count)) { - if (!list_empty(&ctx->list)) { - struct inode *inode = ctx->path.dentry->d_inode; - spin_lock(&inode->i_lock); - list_del(&ctx->list); - spin_unlock(&inode->i_lock); - } - if (ctx->state != NULL) - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - if (ctx->cred != NULL) - put_rpccred(ctx->cred); - dput(ctx->path.dentry); - mntput(ctx->path.mnt); - kfree(ctx); + struct nfs_open_context *ctx = container_of(kref, + struct nfs_open_context, kref); + + if (!list_empty(&ctx->list)) { + struct inode *inode = ctx->path.dentry->d_inode; + spin_lock(&inode->i_lock); + list_del(&ctx->list); + spin_unlock(&inode->i_lock); } + if (ctx->state != NULL) + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); + if (ctx->cred != NULL) + put_rpccred(ctx->cred); + dput(ctx->path.dentry); + mntput(ctx->path.mnt); + kfree(ctx); +} + +void put_nfs_open_context(struct nfs_open_context *ctx) +{ + kref_put(&ctx->kref, nfs_free_open_context); } /* -- cgit v1.2.1 From c6d00e639bdec5f33460bc95bae4efda7177a6ed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Jun 2007 16:02:44 -0400 Subject: NFSv4: Convert struct nfs4_opendata to use struct kref Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fee2d14b158b..d90209e79587 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -214,7 +214,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) } struct nfs4_opendata { - atomic_t count; + struct kref kref; struct nfs_openargs o_arg; struct nfs_openres o_res; struct nfs_open_confirmargs c_arg; @@ -245,7 +245,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); if (p->o_arg.seqid == NULL) goto err_free; - atomic_set(&p->count, 1); p->path.mnt = mntget(path->mnt); p->path.dentry = dget(path->dentry); p->dir = parent; @@ -275,6 +274,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; p->c_arg.seqid = p->o_arg.seqid; + kref_init(&p->kref); return p; err_free: kfree(p); @@ -283,16 +283,23 @@ err: return NULL; } -static void nfs4_opendata_free(struct nfs4_opendata *p) +static void nfs4_opendata_free(struct kref *kref) { - if (p != NULL && atomic_dec_and_test(&p->count)) { - nfs_free_seqid(p->o_arg.seqid); - nfs4_put_state_owner(p->owner); - dput(p->dir); - dput(p->path.dentry); - mntput(p->path.mnt); - kfree(p); - } + struct nfs4_opendata *p = container_of(kref, + struct nfs4_opendata, kref); + + nfs_free_seqid(p->o_arg.seqid); + nfs4_put_state_owner(p->owner); + dput(p->dir); + dput(p->path.dentry); + mntput(p->path.mnt); + kfree(p); +} + +static void nfs4_opendata_put(struct nfs4_opendata *p) +{ + if (p != NULL) + kref_put(&p->kref, nfs4_opendata_free); } static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) @@ -476,7 +483,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh); opendata->o_arg.u.delegation_type = delegation_type; status = nfs4_open_recover(opendata, state); - nfs4_opendata_free(opendata); + nfs4_opendata_put(opendata); return status; } @@ -522,7 +529,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, sizeof(opendata->o_arg.u.delegation.data)); ret = nfs4_open_recover(opendata, state); - nfs4_opendata_free(opendata); + nfs4_opendata_put(opendata); return ret; } @@ -593,7 +600,7 @@ static void nfs4_open_confirm_release(void *calldata) if (state != NULL) nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: - nfs4_opendata_free(data); + nfs4_opendata_put(data); } static const struct rpc_call_ops nfs4_open_confirm_ops = { @@ -611,7 +618,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) struct rpc_task *task; int status; - atomic_inc(&data->count); + kref_get(&data->kref); /* * If rpc_run_task() ends up calling ->rpc_release(), we * want to ensure that it takes the 'error' code path. @@ -696,7 +703,7 @@ static void nfs4_open_release(void *calldata) if (state != NULL) nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: - nfs4_opendata_free(data); + nfs4_opendata_put(data); } static const struct rpc_call_ops nfs4_open_ops = { @@ -717,7 +724,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) struct rpc_task *task; int status; - atomic_inc(&data->count); + kref_get(&data->kref); /* * If rpc_run_task() ends up calling ->rpc_release(), we * want to ensure that it takes the 'error' code path. @@ -826,7 +833,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s nfs4_drop_state_owner(state->owner); d_drop(ctx->path.dentry); } - nfs4_opendata_free(opendata); + nfs4_opendata_put(opendata); return ret; } @@ -987,7 +994,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct status = _nfs4_proc_open(opendata); if (status != 0) - goto err_opendata_free; + goto err_opendata_put; if (opendata->o_arg.open_flags & O_EXCL) nfs4_exclusive_attrset(opendata, sattr); @@ -995,16 +1002,16 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct status = -ENOMEM; state = nfs4_opendata_to_nfs4_state(opendata); if (state == NULL) - goto err_opendata_free; + goto err_opendata_put; if (opendata->o_res.delegation_type != 0) nfs_inode_set_delegation(state->inode, cred, &opendata->o_res); - nfs4_opendata_free(opendata); + nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); *res = state; return 0; -err_opendata_free: - nfs4_opendata_free(opendata); +err_opendata_put: + nfs4_opendata_put(opendata); err_release_rwsem: up_read(&clp->cl_sem); err_put_state_owner: -- cgit v1.2.1 From 34f52e3591f241b825353ba27def956d8487c400 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jun 2007 16:40:31 -0400 Subject: SUNRPC: Convert rpc_clnt->cl_users to a kref Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 96070bff93fc..c252a1c95857 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -161,15 +161,9 @@ nlm_destroy_host(struct nlm_host *host) */ nsm_unmonitor(host); - if ((clnt = host->h_rpcclnt) != NULL) { - if (atomic_read(&clnt->cl_users)) { - printk(KERN_WARNING - "lockd: active RPC handle\n"); - clnt->cl_dead = 1; - } else { - rpc_destroy_client(host->h_rpcclnt); - } - } + clnt = host->h_rpcclnt; + if (clnt != NULL) + rpc_shutdown_client(clnt); kfree(host); } -- cgit v1.2.1 From 90c5755ff5111ffdcca10a1e8a823dba29f37b6d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 9 Jun 2007 19:49:36 -0400 Subject: SUNRPC: Kill rpc_clnt->cl_oneshot Replace it with explicit calls to rpc_shutdown_client() or rpc_destroy_client() (for the case of asynchronous calls). Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 2 +- fs/nfs/mount_clnt.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 2102e2d0134d..3353ed8421a7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -61,6 +61,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) status); else status = 0; + rpc_shutdown_client(clnt); out: return status; } @@ -138,7 +139,6 @@ nsm_create(void) .program = &nsm_program, .version = SM_VERSION, .authflavor = RPC_AUTH_NULL, - .flags = (RPC_CLNT_CREATE_ONESHOT), }; return rpc_create(&args); diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index ca5a266a3140..878d7a5cb6d4 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -69,6 +69,7 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; status = rpc_call_sync(mnt_clnt, &msg, 0); + rpc_shutdown_client(mnt_clnt); return status < 0? status : (result.status? -EACCES : 0); } @@ -84,8 +85,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, .program = &mnt_program, .version = version, .authflavor = RPC_AUTH_UNIX, - .flags = (RPC_CLNT_CREATE_ONESHOT | - RPC_CLNT_CREATE_INTR), + .flags = RPC_CLNT_CREATE_INTR, }; return rpc_create(&args); -- cgit v1.2.1 From f61534dfd38f895b203e2aadaba04f21a992ca8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jun 2007 17:31:58 -0400 Subject: SUNRPC: Remove redundant calls to rpciod_up()/rpciod_down() Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 6 ------ fs/nfs/client.c | 15 --------------- fs/nfsd/nfs4callback.c | 12 +++--------- fs/nfsd/nfs4state.c | 1 - 4 files changed, 3 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 126b1bf02c0e..26809325469c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -123,9 +123,6 @@ lockd(struct svc_rqst *rqstp) /* Process request with signals blocked, but allow SIGKILL. */ allow_signal(SIGKILL); - /* kick rpciod */ - rpciod_up(); - dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); if (!nlm_timeout) @@ -202,9 +199,6 @@ lockd(struct svc_rqst *rqstp) /* Exit the RPC thread */ svc_exit_thread(rqstp); - /* release rpciod */ - rpciod_down(); - /* Release module */ unlock_kernel(); module_put_and_exit(0); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 881fa4900923..71d4c4cdac52 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -102,19 +102,10 @@ static struct nfs_client *nfs_alloc_client(const char *hostname, int nfsversion) { struct nfs_client *clp; - int error; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; - error = rpciod_up(); - if (error < 0) { - dprintk("%s: couldn't start rpciod! Error = %d\n", - __FUNCTION__, error); - goto error_1; - } - __set_bit(NFS_CS_RPCIOD, &clp->cl_res_state); - if (nfsversion == 4) { if (nfs_callback_up() < 0) goto error_2; @@ -154,9 +145,6 @@ error_3: if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) nfs_callback_down(); error_2: - rpciod_down(); - __clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state); -error_1: kfree(clp); error_0: return NULL; @@ -198,9 +186,6 @@ static void nfs_free_client(struct nfs_client *clp) if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) nfs_callback_down(); - if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state)) - rpciod_down(); - kfree(clp->cl_hostname); kfree(clp); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 864090edc28b..6b1b487db1ec 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -429,29 +429,23 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_err; } - /* Kick rpciod, put the call on the wire. */ - if (rpciod_up() != 0) - goto out_clnt; - /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); if (IS_ERR(msg.rpc_cred)) - goto out_rpciod; + goto out_release_clp; status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); put_rpccred(msg.rpc_cred); if (status != 0) { dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); - goto out_rpciod; + goto out_release_clp; } return; -out_rpciod: +out_release_clp: atomic_dec(&clp->cl_count); - rpciod_down(); -out_clnt: rpc_shutdown_client(cb->cb_client); out_err: cb->cb_client = NULL; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3cc8ce422ab1..8c52913d7cb6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -378,7 +378,6 @@ shutdown_callback_client(struct nfs4_client *clp) if (clnt) { clp->cl_callback.cb_client = NULL; rpc_shutdown_client(clnt); - rpciod_down(); } } -- cgit v1.2.1 From 1be27f36601973815171db684c711d30557cf50c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 27 Jun 2007 14:29:04 -0400 Subject: SUNRPC: Remove the tk_auth macro... We should almost always be deferencing the rpc_auth struct by means of the credential's cr_auth field instead of the rpc_clnt->cl_auth anyway. Fix up that historical mistake, and remove the macro that propagated it. Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 6 +++--- fs/nfs/nfs3xdr.c | 8 ++++---- fs/nfs/nfs4xdr.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index cd3ca7b5d3db..7fcc78f2aa71 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -223,7 +223,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) static int nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; u32 offset = (u32)args->offset; u32 count = args->count; @@ -380,7 +380,7 @@ static int nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) { struct rpc_task *task = req->rq_task; - struct rpc_auth *auth = task->tk_auth; + struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -541,7 +541,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) static int nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index b51df8eb9f01..b4647a22f349 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -319,7 +319,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg static int nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -458,7 +458,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) static int nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -643,7 +643,7 @@ static int nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_getaclargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); @@ -773,7 +773,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) static int nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1fcca516e6ee..859b13633258 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1071,7 +1071,7 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; uint32_t attrs[2] = { FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, FATTR4_WORD1_MOUNTED_ON_FILEID, @@ -1117,7 +1117,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; __be32 *p; @@ -1735,7 +1735,7 @@ out: */ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 2, @@ -1795,7 +1795,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, struct nfs_getaclargs *args) { struct xdr_stream xdr; - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; struct compound_hdr hdr = { .nops = 2, }; @@ -2030,7 +2030,7 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs struct compound_hdr hdr = { .nops = 3, }; - struct rpc_auth *auth = req->rq_task->tk_auth; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; int replen; int status; -- cgit v1.2.1 From 4e56e082dd89266d320ccfbc7bd0102186a765ac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Jul 2007 18:13:52 -0400 Subject: NFSv4: Clean up _nfs4_proc_lookup() vs _nfs4_proc_lookupfh() They differ only slightly in the arguments they take. Why have they not been merged? Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d90209e79587..84d0b7e0dd67 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1592,8 +1592,6 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, dprintk("NFS call lookupfh %s\n", name->name); status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply lookupfh: %d\n", status); - if (status == -NFS4ERR_MOVED) - status = -EREMOTE; return status; } @@ -1604,10 +1602,13 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_proc_lookupfh(server, dirfh, name, - fhandle, fattr), - &exception); + err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); + /* FIXME: !!!! */ + if (err == -NFS4ERR_MOVED) { + err = -EREMOTE; + break; + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -1615,28 +1616,10 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { - int status; - struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_lookup_arg args = { - .bitmask = server->attr_bitmask, - .dir_fh = NFS_FH(dir), - .name = name, - }; - struct nfs4_lookup_res res = { - .server = server, - .fattr = fattr, - .fh = fhandle, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], - .rpc_argp = &args, - .rpc_resp = &res, - }; - - nfs_fattr_init(fattr); + int status; dprintk("NFS call lookup %s\n", name->name); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); if (status == -NFS4ERR_MOVED) status = nfs4_get_referral(dir, name, fattr, fhandle); dprintk("NFS reply lookup: %d\n", status); -- cgit v1.2.1 From 587142f85f796cf0b823dd3080e815f02ff6b952 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 09:57:54 -0400 Subject: NFS: Replace NFS_I(inode)->req_lock with inode->i_lock There is no justification for keeping a special spinlock for the exclusive use of the NFS writeback code. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - fs/nfs/pagelist.c | 11 ++++---- fs/nfs/write.c | 84 ++++++++++++++++++++++++++----------------------------- 3 files changed, 46 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 01fc8ab0c562..9d5124166d20 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1154,7 +1154,6 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag struct nfs_inode *nfsi = (struct nfs_inode *) foo; inode_init_once(&nfsi->vfs_inode); - spin_lock_init(&nfsi->req_lock); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 8d2642f24b8f..f56dae5216f4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -126,12 +126,13 @@ static int nfs_set_page_tag_locked(struct nfs_page *req) */ void nfs_clear_page_tag_locked(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); if (req->wb_page != NULL) { - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); } nfs_unlock_request(req); } @@ -390,7 +391,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) * If the number of requests is set to 0, the entire address_space * starting at index idx_start, is scanned. * The requests are *not* checked to ensure that they form a contiguous set. - * You must be holding the inode's req_lock when calling this function + * You must be holding the inode's i_lock when calling this function */ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, @@ -430,7 +431,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, } } /* for latency reduction */ - cond_resched_lock(&nfsi->req_lock); + cond_resched_lock(&nfsi->vfs_inode.i_lock); } out: return res; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9ef9ec746bfb..73ac992ece85 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -124,12 +124,12 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page) static struct nfs_page *nfs_page_find_request(struct page *page) { + struct inode *inode = page->mapping->host; struct nfs_page *req = NULL; - spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock; - spin_lock(req_lock); + spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); return req; } @@ -251,16 +251,16 @@ static void nfs_end_page_writeback(struct page *page) static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct page *page) { + struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req; - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - spinlock_t *req_lock = &nfsi->req_lock; int ret; - spin_lock(req_lock); + spin_lock(&inode->i_lock); for(;;) { req = nfs_page_find_request_locked(page); if (req == NULL) { - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); return 1; } if (nfs_lock_request_dontget(req)) @@ -270,28 +270,28 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, * succeed provided that someone hasn't already marked the * request as dirty (in which case we don't care). */ - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) return ret; - spin_lock(req_lock); + spin_lock(&inode->i_lock); } if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { /* This request is marked for commit */ - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); nfs_unlock_request(req); nfs_pageio_complete(pgio); return 1; } if (nfs_set_page_writeback(page) != 0) { - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); BUG(); } radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); nfs_pageio_add_request(pgio, req); return ret; } @@ -412,7 +412,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) BUG_ON (!NFS_WBACK_BUSY(req)); - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); @@ -420,11 +420,11 @@ static void nfs_inode_remove_request(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); nfsi->npages--; if (!nfsi->npages) { - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); nfs_end_data_update(inode); iput(inode); } else - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); nfs_clear_request(req); nfs_release_request(req); } @@ -458,13 +458,13 @@ nfs_mark_request_commit(struct nfs_page *req) struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); nfsi->ncommit++; set_bit(PG_NEED_COMMIT, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } @@ -534,10 +534,10 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u BUG_ON(!NFS_WBACK_BUSY(req)); kref_get(&req->wb_kref); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); error = nfs_wait_on_request(req); nfs_release_request(req); - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); if (error < 0) return error; res++; @@ -602,7 +602,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req, *new = NULL; pgoff_t rqend, end; @@ -612,13 +611,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, /* Loop over all inode entries and see if we find * A request for the page we wish to update */ - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req) { if (!nfs_lock_request_dontget(req)) { int error; - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); error = nfs_wait_on_request(req); nfs_release_request(req); if (error < 0) { @@ -628,7 +627,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, } continue; } - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); if (new) nfs_release_request(new); break; @@ -639,14 +638,14 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, nfs_lock_request_dontget(new); error = nfs_inode_add_request(inode, new); if (error) { - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); nfs_unlock_request(new); return ERR_PTR(error); } - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); return new; } - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); new = nfs_create_request(ctx, inode, page, offset, bytes); if (IS_ERR(new)) @@ -974,9 +973,9 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) } if (nfs_write_need_commit(data)) { - spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock; + struct inode *inode = page->mapping->host; - spin_lock(req_lock); + spin_lock(&inode->i_lock); if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { /* Do nothing we need to resend the writes */ } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { @@ -987,7 +986,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) clear_bit(PG_NEED_COMMIT, &req->wb_flags); dprintk(" server reboot detected\n"); } - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); } else dprintk(" OK\n"); @@ -1277,13 +1276,12 @@ static const struct rpc_call_ops nfs_commit_ops = { int nfs_commit_inode(struct inode *inode, int how) { - struct nfs_inode *nfsi = NFS_I(inode); LIST_HEAD(head); int res; - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); if (res) { int error = nfs_commit_list(inode, &head, how); if (error < 0) @@ -1301,7 +1299,6 @@ static inline int nfs_commit_list(struct inode *inode, struct list_head *head, i long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) { struct inode *inode = mapping->host; - struct nfs_inode *nfsi = NFS_I(inode); pgoff_t idx_start, idx_end; unsigned int npages = 0; LIST_HEAD(head); @@ -1323,7 +1320,7 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr } } how &= ~FLUSH_NOCOMMIT; - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); do { ret = nfs_wait_on_requests_locked(inode, idx_start, npages); if (ret != 0) @@ -1334,18 +1331,19 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr if (pages == 0) break; if (how & FLUSH_INVALIDATE) { - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); nfs_cancel_commit_list(&head); ret = pages; - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); continue; } pages += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); ret = nfs_commit_list(inode, &head, how); - spin_lock(&nfsi->req_lock); + spin_lock(&inode->i_lock); + } while (ret >= 0); - spin_unlock(&nfsi->req_lock); + spin_unlock(&inode->i_lock); return ret; } @@ -1439,7 +1437,6 @@ int nfs_set_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; - spinlock_t *req_lock; struct nfs_page *req; int ret; @@ -1448,18 +1445,17 @@ int nfs_set_page_dirty(struct page *page) inode = mapping->host; if (!inode) goto out_raced; - req_lock = &NFS_I(inode)->req_lock; - spin_lock(req_lock); + spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req != NULL) { /* Mark any existing write requests for flushing */ ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags); - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); nfs_release_request(req); return ret; } ret = __set_page_dirty_nobuffers(page); - spin_unlock(req_lock); + spin_unlock(&inode->i_lock); return ret; out_raced: return !TestSetPageDirty(page); -- cgit v1.2.1 From 27b3f949b769a208e2849d28e7ad64cadac5d0e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 10:24:56 -0400 Subject: NFSv4: Fix a credential reference leak in nfs4_get_state_owner() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5d7ffbfc3483..0030248d63ec 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -83,7 +83,7 @@ nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred) if (!list_empty(&clp->cl_unused)) { sp = list_entry(clp->cl_unused.next, struct nfs4_state_owner, so_list); atomic_inc(&sp->so_count); - sp->so_cred = cred; + sp->so_cred = get_rpccred(cred); list_move(&sp->so_list, &clp->cl_state_owners); clp->cl_nunused--; } @@ -175,7 +175,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; - get_rpccred(cred); new = nfs4_alloc_state_owner(); spin_lock(&clp->cl_lock); sp = nfs4_find_state_owner(clp, cred); @@ -185,7 +184,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct list_add(&new->so_list, &clp->cl_state_owners); new->so_client = clp; new->so_id = nfs4_alloc_lockowner_id(clp); - new->so_cred = cred; + new->so_cred = get_rpccred(cred); sp = new; new = NULL; } @@ -193,7 +192,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct kfree(new); if (sp != NULL) return sp; - put_rpccred(cred); return NULL; } -- cgit v1.2.1 From 7af654f8d1b7460415af5d1d326233478dd0f563 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 12:49:23 -0400 Subject: NFSv4: Don't reuse expired nfs4_state_owner structs That just confuses certain NFSv4 servers. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 10 ---------- fs/nfs/nfs4state.c | 28 ---------------------------- 2 files changed, 38 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 71d4c4cdac52..6b424407d631 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -131,7 +131,6 @@ static struct nfs_client *nfs_alloc_client(const char *hostname, init_rwsem(&clp->cl_sem); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_state_owners); - INIT_LIST_HEAD(&clp->cl_unused); spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -155,15 +154,6 @@ static void nfs4_shutdown_client(struct nfs_client *clp) #ifdef CONFIG_NFS_V4 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) nfs4_kill_renewd(clp); - while (!list_empty(&clp->cl_unused)) { - struct nfs4_state_owner *sp; - - sp = list_entry(clp->cl_unused.next, - struct nfs4_state_owner, - so_list); - list_del(&sp->so_list); - kfree(sp); - } BUG_ON(!list_empty(&clp->cl_state_owners)); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0030248d63ec..2b00c45aebea 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -75,21 +75,6 @@ nfs4_alloc_lockowner_id(struct nfs_client *clp) return clp->cl_lockowner_id ++; } -static struct nfs4_state_owner * -nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred) -{ - struct nfs4_state_owner *sp = NULL; - - if (!list_empty(&clp->cl_unused)) { - sp = list_entry(clp->cl_unused.next, struct nfs4_state_owner, so_list); - atomic_inc(&sp->so_count); - sp->so_cred = get_rpccred(cred); - list_move(&sp->so_list, &clp->cl_state_owners); - clp->cl_nunused--; - } - return sp; -} - struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -178,8 +163,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new = nfs4_alloc_state_owner(); spin_lock(&clp->cl_lock); sp = nfs4_find_state_owner(clp, cred); - if (sp == NULL) - sp = nfs4_client_grab_unused(clp, cred); if (sp == NULL && new != NULL) { list_add(&new->so_list, &clp->cl_state_owners); new->so_client = clp; @@ -206,17 +189,6 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - if (clp->cl_nunused >= OPENOWNER_POOL_SIZE) - goto out_free; - if (list_empty(&sp->so_list)) - goto out_free; - list_move(&sp->so_list, &clp->cl_unused); - clp->cl_nunused++; - spin_unlock(&clp->cl_lock); - put_rpccred(cred); - cred = NULL; - return; -out_free: list_del(&sp->so_list); spin_unlock(&clp->cl_lock); put_rpccred(cred); -- cgit v1.2.1 From bd625ba80d84d9de003b8a4bf61fd937b82aca09 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Jul 2007 18:38:23 -0400 Subject: NFSv4: Fix the NFSv4 owner and owner_group size estimates Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 859b13633258..932bc79a9028 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -87,9 +87,11 @@ static int nfs4_stat_to_errno(int); #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + 2 * nfs4_name_maxsz)) + 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ nfs4_fattr_value_maxsz) #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) -- cgit v1.2.1 From 2cebf82883f49fd26148da5d9a43d1b4363f1d59 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 13:57:28 -0400 Subject: NFSv4: Fix the underestimate of NFSv4 open request size The maximum size depends on the filename size and a number of other elements which are currently not being counted. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 63 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 932bc79a9028..f6068bf3823a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -70,7 +70,8 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently use size 1 (u32) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define owner_id_maxsz (1 + 1) +#define open_owner_id_maxsz (1 + 1) +#define lock_owner_id_maxsz (1 + 1) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define op_encode_hdr_maxsz (1) @@ -120,6 +121,25 @@ static int nfs4_stat_to_errno(int); (op_decode_hdr_maxsz) #define encode_lookup_maxsz (op_encode_hdr_maxsz + \ 1 + ((3 + NFS4_FHSIZE) >> 2)) +#define encode_share_access_maxsz \ + (2) +#define encode_createmode_maxsz (1 + nfs4_fattr_maxsz) +#define encode_opentype_maxsz (1 + encode_createmode_maxsz) +#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) +#define encode_open_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_share_access_maxsz + 2 + \ + open_owner_id_maxsz + \ + encode_opentype_maxsz + \ + encode_claim_null_maxsz) +#define decode_ace_maxsz (3 + nfs4_owner_maxsz) +#define decode_delegation_maxsz (1 + XDR_QUADLEN(NFS4_STATEID_SIZE) + 1 + \ + decode_ace_maxsz) +#define decode_change_info_maxsz (5) +#define decode_open_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + decode_change_info_maxsz + 1 + \ + nfs4_fattr_bitmap_maxsz + \ + decode_delegation_maxsz) #define encode_remove_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define encode_rename_maxsz (op_encode_hdr_maxsz + \ @@ -136,7 +156,9 @@ static int nfs4_stat_to_errno(int); #define encode_create_maxsz (op_encode_hdr_maxsz + \ 2 + nfs4_name_maxsz + \ nfs4_fattr_maxsz) -#define decode_create_maxsz (op_decode_hdr_maxsz + 8) +#define decode_create_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + nfs4_fattr_bitmap_maxsz) #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ @@ -176,16 +198,21 @@ static int nfs4_stat_to_errno(int); op_decode_hdr_maxsz + 2 + \ decode_getattr_maxsz) #define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ - encode_putfh_maxsz + \ - op_encode_hdr_maxsz + \ - 13 + 3 + 2 + 64 + \ - encode_getattr_maxsz + \ - encode_getfh_maxsz) + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_open_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ - decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4 + 5 + 2 + 3 + \ - decode_getattr_maxsz + \ - decode_getfh_maxsz) + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_open_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -195,12 +222,12 @@ static int nfs4_stat_to_errno(int); op_decode_hdr_maxsz + 4) #define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + \ - 11) + encode_open_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + \ - 4 + 5 + 2 + 3) + decode_open_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -258,19 +285,19 @@ static int nfs4_stat_to_errno(int); op_encode_hdr_maxsz + \ 1 + 1 + 2 + 2 + \ 1 + 4 + 1 + 2 + \ - owner_id_maxsz) + lock_owner_id_maxsz) #define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_getattr_maxsz + \ op_decode_hdr_maxsz + \ 2 + 2 + 1 + 2 + \ - owner_id_maxsz) + lock_owner_id_maxsz) #define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz + \ op_encode_hdr_maxsz + \ 1 + 2 + 2 + 2 + \ - owner_id_maxsz) + lock_owner_id_maxsz) #define NFS4_dec_lockt_sz (NFS4_dec_lock_sz) #define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ -- cgit v1.2.1 From e6889620e89525ebf41f0eed937edb3dc065cf1d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 13:58:30 -0400 Subject: NFSv4: Fix underestimate of NFSv4 lookup request size Also fix up the underestimate of fs_locations Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f6068bf3823a..4c8f67d47523 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -119,8 +119,8 @@ static int nfs4_stat_to_errno(int); 3 + (NFS4_VERIFIER_SIZE >> 2)) #define decode_setclientid_confirm_maxsz \ (op_decode_hdr_maxsz) -#define encode_lookup_maxsz (op_encode_hdr_maxsz + \ - 1 + ((3 + NFS4_FHSIZE) >> 2)) +#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_lookup_maxsz (op_decode_hdr_maxsz) #define encode_share_access_maxsz \ (2) #define encode_createmode_maxsz (1 + nfs4_fattr_maxsz) @@ -161,6 +161,10 @@ static int nfs4_stat_to_errno(int); nfs4_fattr_bitmap_maxsz) #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +#define encode_fs_locations_maxsz \ + (encode_getattr_maxsz) +#define decode_fs_locations_maxsz \ + (0) #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ #define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ #define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ @@ -327,7 +331,7 @@ static int nfs4_stat_to_errno(int); encode_getfh_maxsz) #define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + \ + decode_lookup_maxsz + \ decode_getattr_maxsz + \ decode_getfh_maxsz) #define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ @@ -446,12 +450,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_fs_locations_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_getattr_maxsz) + encode_lookup_maxsz + \ + encode_fs_locations_maxsz) #define NFS4_dec_fs_locations_sz \ (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz) + decode_lookup_maxsz + \ + decode_fs_locations_maxsz) static struct { unsigned int mode; -- cgit v1.2.1 From 88d9093997e1c73ca98db41b5605dbde7783845f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 14:03:03 -0400 Subject: NFSv4: nfs_increment_open_seqid should not return a value It is a void function... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2b00c45aebea..0f79d56e97f0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -528,7 +528,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) * failed with a seqid incrementing error - * see comments nfs_fs.h:seqid_mutating_error() */ -static inline void nfs_increment_seqid(int status, struct nfs_seqid *seqid) +static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { switch (status) { case 0: @@ -557,7 +557,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) struct nfs4_state_owner, so_seqid); nfs4_drop_state_owner(sp); } - return nfs_increment_seqid(status, seqid); + nfs_increment_seqid(status, seqid); } /* @@ -567,7 +567,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { - return nfs_increment_seqid(status, seqid); + nfs_increment_seqid(status, seqid); } int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) -- cgit v1.2.1 From 9f958ab8858c75df800e0121b1920182820cbc39 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Jul 2007 13:58:33 -0400 Subject: NFSv4: Reduce the chances of an open_owner identifier collision Currently we just use a 32-bit counter. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 3 +- fs/nfs/nfs4_fs.h | 17 +++-- fs/nfs/nfs4proc.c | 8 +-- fs/nfs/nfs4state.c | 193 ++++++++++++++++++++++++++++++++++++++++++----------- fs/nfs/nfs4xdr.c | 27 ++++---- 5 files changed, 185 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6b424407d631..ccb455053ee4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -130,7 +130,6 @@ static struct nfs_client *nfs_alloc_client(const char *hostname, #ifdef CONFIG_NFS_V4 init_rwsem(&clp->cl_sem); INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_state_owners); spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -154,7 +153,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp) #ifdef CONFIG_NFS_V4 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) nfs4_kill_renewd(clp); - BUG_ON(!list_empty(&clp->cl_state_owners)); + BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c97a0ad8430e..44b56c915f72 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -70,19 +70,25 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status seqid->flags |= NFS_SEQID_CONFIRMED; } +struct nfs_unique_id { + struct rb_node rb_node; + __u64 id; +}; + /* * NFS4 state_owners and lock_owners are simply labels for ordered * sequences of RPC calls. Their sole purpose is to provide once-only * semantics by allowing the server to identify replayed requests. */ struct nfs4_state_owner { - spinlock_t so_lock; - struct list_head so_list; /* per-clientid list of state_owners */ + struct nfs_unique_id so_owner_id; struct nfs_client *so_client; - u32 so_id; /* 32-bit identifier, unique */ - atomic_t so_count; + struct rb_node so_client_node; struct rpc_cred *so_cred; /* Associated cred */ + + spinlock_t so_lock; + atomic_t so_count; struct list_head so_states; struct list_head so_delegations; struct nfs_seqid_counter so_seqid; @@ -108,7 +114,7 @@ struct nfs4_lock_state { #define NFS_LOCK_INITIALIZED 1 int ls_flags; struct nfs_seqid_counter ls_seqid; - u32 ls_id; + struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; }; @@ -189,7 +195,6 @@ extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); -extern u32 nfs4_alloc_lockowner_id(struct nfs_client *); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 84d0b7e0dd67..1840ebc78fd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -253,7 +253,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags, p->o_arg.clientid = server->nfs_client->cl_clientid; - p->o_arg.id = sp->so_id; + p->o_arg.id = sp->so_owner_id.id; p->o_arg.name = &p->path.dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; @@ -651,7 +651,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) return; /* Update sequence id. */ - data->o_arg.id = sp->so_id; + data->o_arg.id = sp->so_owner_id.id; data->o_arg.clientid = sp->so_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; @@ -3029,7 +3029,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; lsp = request->fl_u.nfs4_fl.owner; - arg.lock_owner.id = lsp->ls_id; + arg.lock_owner.id = lsp->ls_id.id; status = rpc_call_sync(server->client, &msg, 0); switch (status) { case 0: @@ -3243,7 +3243,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, goto out_free; p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; - p->arg.lock_owner.id = lsp->ls_id; + p->arg.lock_owner.id = lsp->ls_id.id; p->lsp = lsp; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0f79d56e97f0..ab0b5ab60e60 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -69,18 +70,14 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) return status; } -u32 -nfs4_alloc_lockowner_id(struct nfs_client *clp) -{ - return clp->cl_lockowner_id ++; -} - struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; + struct rb_node *pos; struct rpc_cred *cred = NULL; - list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); if (list_empty(&sp->so_states)) continue; cred = get_rpccred(sp->so_cred); @@ -92,32 +89,129 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; + struct rb_node *pos; - if (!list_empty(&clp->cl_state_owners)) { - sp = list_entry(clp->cl_state_owners.next, - struct nfs4_state_owner, so_list); + pos = rb_first(&clp->cl_state_owners); + if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); return get_rpccred(sp->so_cred); } return NULL; } +static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, + __u64 minval, int maxbits) +{ + struct rb_node **p, *parent; + struct nfs_unique_id *pos; + __u64 mask = ~0ULL; + + if (maxbits < 64) + mask = (1ULL << maxbits) - 1ULL; + + /* Ensure distribution is more or less flat */ + get_random_bytes(&new->id, sizeof(new->id)); + new->id &= mask; + if (new->id < minval) + new->id += minval; +retry: + p = &root->rb_node; + parent = NULL; + + while (*p != NULL) { + parent = *p; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + + if (new->id < pos->id) + p = &(*p)->rb_left; + else if (new->id > pos->id) + p = &(*p)->rb_right; + else + goto id_exists; + } + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, root); + return; +id_exists: + for (;;) { + new->id++; + if (new->id < minval || (new->id & mask) != new->id) { + new->id = minval; + break; + } + parent = rb_next(parent); + if (parent == NULL) + break; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + if (new->id < pos->id) + break; + } + goto retry; +} + +static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) +{ + rb_erase(&id->rb_node, root); +} + static struct nfs4_state_owner * nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred) { + struct rb_node **p = &clp->cl_state_owners.rb_node, + *parent = NULL; struct nfs4_state_owner *sp, *res = NULL; - list_for_each_entry(sp, &clp->cl_state_owners, so_list) { - if (sp->so_cred != cred) - continue; - atomic_inc(&sp->so_count); - /* Move to the head of the list */ - list_move(&sp->so_list, &clp->cl_state_owners); - res = sp; - break; + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + + if (cred < sp->so_cred) + p = &parent->rb_left; + else if (cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + res = sp; + break; + } } return res; } +static struct nfs4_state_owner * +nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +{ + struct rb_node **p = &clp->cl_state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + + if (new->so_cred < sp->so_cred) + p = &parent->rb_left; + else if (new->so_cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + return sp; + } + } + nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); + rb_link_node(&new->so_client_node, parent, p); + rb_insert_color(&new->so_client_node, &clp->cl_state_owners); + return new; +} + +static void +nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +{ + if (!RB_EMPTY_NODE(&sp->so_client_node)) + rb_erase(&sp->so_client_node, &clp->cl_state_owners); + nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); +} + /* * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to * create a new state_owner. @@ -145,10 +239,14 @@ nfs4_alloc_state_owner(void) void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_client; - spin_lock(&clp->cl_lock); - list_del_init(&sp->so_list); - spin_unlock(&clp->cl_lock); + if (!RB_EMPTY_NODE(&sp->so_client_node)) { + struct nfs_client *clp = sp->so_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_client_node, &clp->cl_state_owners); + RB_CLEAR_NODE(&sp->so_client_node); + spin_unlock(&clp->cl_lock); + } } /* @@ -160,22 +258,24 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; - new = nfs4_alloc_state_owner(); spin_lock(&clp->cl_lock); sp = nfs4_find_state_owner(clp, cred); - if (sp == NULL && new != NULL) { - list_add(&new->so_list, &clp->cl_state_owners); - new->so_client = clp; - new->so_id = nfs4_alloc_lockowner_id(clp); - new->so_cred = get_rpccred(cred); - sp = new; - new = NULL; - } spin_unlock(&clp->cl_lock); - kfree(new); if (sp != NULL) return sp; - return NULL; + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; + new->so_client = clp; + new->so_cred = cred; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner(clp, new); + spin_unlock(&clp->cl_lock); + if (sp == new) + get_rpccred(cred); + else + kfree(new); + return sp; } /* @@ -189,7 +289,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - list_del(&sp->so_list); + nfs4_remove_state_owner(clp, sp); spin_unlock(&clp->cl_lock); put_rpccred(cred); kfree(sp); @@ -386,12 +486,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f atomic_set(&lsp->ls_count, 1); lsp->ls_owner = fl_owner; spin_lock(&clp->cl_lock); - lsp->ls_id = nfs4_alloc_lockowner_id(clp); + nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; } +static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs_client *clp = lsp->ls_state->owner->so_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); + spin_unlock(&clp->cl_lock); + kfree(lsp); +} + /* * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. @@ -421,7 +531,8 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ return NULL; } spin_unlock(&state->state_lock); - kfree(new); + if (new != NULL) + nfs4_free_lock_state(new); return lsp; } @@ -442,7 +553,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - kfree(lsp); + nfs4_free_lock_state(lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -719,11 +830,13 @@ out_err: static void nfs4_state_mark_reclaim(struct nfs_client *clp) { struct nfs4_state_owner *sp; + struct rb_node *pos; struct nfs4_state *state; struct nfs4_lock_state *lock; /* Reset all sequence ids to zero */ - list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); sp->so_seqid.counter = 0; sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); @@ -742,6 +855,7 @@ static int reclaimer(void *ptr) { struct nfs_client *clp = ptr; struct nfs4_state_owner *sp; + struct rb_node *pos; struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; int status = 0; @@ -787,7 +901,8 @@ restart_loop: /* Mark all delegations for reclaim */ nfs_delegation_mark_reclaim(clp); /* Note: list is protected by exclusive lock on cl->cl_sem */ - list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); status = nfs4_reclaim_open_state(ops, sp); if (status < 0) { if (status == -NFS4ERR_NO_GRACE) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4c8f67d47523..c08738441f73 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -68,10 +68,10 @@ static int nfs4_stat_to_errno(int); #endif /* lock,open owner id: - * we currently use size 1 (u32) out of (NFS4_OPAQUE_LIMIT >> 2) + * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 1) -#define lock_owner_id_maxsz (1 + 1) +#define open_owner_id_maxsz (1 + 4) +#define lock_owner_id_maxsz (1 + 4) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define op_encode_hdr_maxsz (1) @@ -827,13 +827,14 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) WRITE64(nfs4_lock_length(args->fl)); WRITE32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+20); + RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); WRITE32(args->open_seqid->sequence->counter); WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); WRITE32(args->lock_seqid->sequence->counter); WRITE64(args->lock_owner.clientid); - WRITE32(4); - WRITE32(args->lock_owner.id); + WRITE32(16); + WRITEMEM("lock id:", 8); + WRITE64(args->lock_owner.id); } else { RESERVE_SPACE(NFS4_STATEID_SIZE+4); @@ -848,14 +849,15 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg { __be32 *p; - RESERVE_SPACE(40); + RESERVE_SPACE(52); WRITE32(OP_LOCKT); WRITE32(nfs4_lock_type(args->fl, 0)); WRITE64(args->fl->fl_start); WRITE64(nfs4_lock_length(args->fl)); WRITE64(args->lock_owner.clientid); - WRITE32(4); - WRITE32(args->lock_owner.id); + WRITE32(16); + WRITEMEM("lock id:", 8); + WRITE64(args->lock_owner.id); return 0; } @@ -920,10 +922,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena WRITE32(OP_OPEN); WRITE32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->open_flags); - RESERVE_SPACE(16); + RESERVE_SPACE(28); WRITE64(arg->clientid); - WRITE32(4); - WRITE32(arg->id); + WRITE32(16); + WRITEMEM("open id:", 8); + WRITE64(arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) -- cgit v1.2.1 From 1b45c46cf75d9c48eb611d5cc41607ac1f046606 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jul 2007 13:04:56 -0400 Subject: NFSv4: Fix atomic open for execute... Currently we do not check for the FMODE_EXEC flag as we should. For that particular case, we need to perform an ACCESS call to the server in order to check that the file is executable. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1840ebc78fd3..69aab8db4947 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -769,6 +769,8 @@ static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openf mask |= MAY_READ; if (openflags & FMODE_WRITE) mask |= MAY_WRITE; + if (openflags & FMODE_EXEC) + mask |= MAY_EXEC; status = nfs_access_get_cached(inode, cred, &cache); if (status == 0) goto out; @@ -1269,7 +1271,16 @@ out: static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state) { struct file *filp; + int ret; + /* If the open_intent is for execute, we have an extra check to make */ + if (nd->intent.open.flags & FMODE_EXEC) { + ret = _nfs4_do_access(state->inode, + state->owner->so_cred, + nd->intent.open.flags); + if (ret < 0) + goto out_close; + } filp = lookup_instantiate_filp(nd, path->dentry, NULL); if (!IS_ERR(filp)) { struct nfs_open_context *ctx; @@ -1277,8 +1288,10 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct ctx->state = state; return 0; } + ret = PTR_ERR(filp); +out_close: nfs4_close_state(path, state, nd->intent.open.flags); - return PTR_ERR(filp); + return ret; } struct dentry * -- cgit v1.2.1 From 1c816efa245111c52858fbe55d99474f3c149dd3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jul 2007 14:41:19 -0400 Subject: NFSv4: Fix a bug in __nfs4_find_state_byowner The test for state->state == 0 does not tell you that the stateid is in the process of being freed. It really tells you that the stateid is not yet initialised... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ab0b5ab60e60..ac816b303f3a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -333,13 +333,10 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) struct nfs4_state *state; list_for_each_entry(state, &nfsi->open_states, inode_states) { - /* Is this in the process of being freed? */ - if (state->state == 0) + if (state->owner != owner) continue; - if (state->owner == owner) { - atomic_inc(&state->count); + if (atomic_inc_not_zero(&state->count)) return state; - } } return NULL; } -- cgit v1.2.1 From 549d6ed5e85003370fe858e70864a71882491d28 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jul 2007 16:42:45 -0400 Subject: NFSv4: set the delegation in nfs4_opendata_to_nfs4_state This ensures that nfs4_open_release() and nfs4_open_confirm_release() can now handle an eventual delegation that was returned with out open. As such, it fixes a delegation "leak" when the user breaks out of an open call. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 69aab8db4947..4f0b06d549fa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -356,6 +356,21 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (state == NULL) goto put_inode; update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags); + if (data->o_res.delegation_type != 0) { + struct nfs_inode *nfsi = NFS_I(inode); + int delegation_flags = 0; + + if (nfsi->delegation) + delegation_flags = nfsi->delegation->flags; + if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) + nfs_inode_set_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + } put_inode: iput(inode); out: @@ -433,23 +448,8 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * opendata->o_res.delegation_type = delegation; opendata->o_arg.open_flags |= mode; newstate = nfs4_opendata_to_nfs4_state(opendata); - if (newstate != NULL) { - if (opendata->o_res.delegation_type != 0) { - struct nfs_inode *nfsi = NFS_I(newstate->inode); - int delegation_flags = 0; - if (nfsi->delegation) - delegation_flags = nfsi->delegation->flags; - if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) - nfs_inode_set_delegation(newstate->inode, - opendata->owner->so_cred, - &opendata->o_res); - else - nfs_inode_reclaim_delegation(newstate->inode, - opendata->owner->so_cred, - &opendata->o_res); - } + if (newstate != NULL) nfs4_close_state(&opendata->path, newstate, opendata->o_arg.open_flags); - } if (newstate != state) return -ESTALE; return 0; @@ -1005,8 +1005,6 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct state = nfs4_opendata_to_nfs4_state(opendata); if (state == NULL) goto err_opendata_put; - if (opendata->o_res.delegation_type != 0) - nfs_inode_set_delegation(state->inode, cred, &opendata->o_res); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); -- cgit v1.2.1 From 2ced46c27058710a6d731d6eca77f1dd14ccde75 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jul 2007 23:48:13 -0400 Subject: NFSv4: Fix up a bug in nfs4_open_recover() Don't clobber the delegation info... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 65 ++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4f0b06d549fa..03b60c67ca7c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -230,6 +230,16 @@ struct nfs4_opendata { int cancelled; }; + +static void nfs4_init_opendata_res(struct nfs4_opendata *p) +{ + p->o_res.f_attr = &p->f_attr; + p->o_res.dir_attr = &p->dir_attr; + p->o_res.server = p->o_arg.server; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init(&p->dir_attr); +} + static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, struct nfs4_state_owner *sp, int flags, const struct iattr *attrs) @@ -258,11 +268,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - p->o_res.f_attr = &p->f_attr; - p->o_res.dir_attr = &p->dir_attr; - p->o_res.server = server; - nfs_fattr_init(&p->f_attr); - nfs_fattr_init(&p->dir_attr); if (flags & O_EXCL) { u32 *s = (u32 *) p->o_arg.u.verifier.data; s[0] = jiffies; @@ -274,6 +279,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; p->c_arg.seqid = p->o_arg.seqid; + nfs4_init_opendata_res(p); kref_init(&p->kref); return p; err_free: @@ -394,64 +400,54 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state * return ERR_PTR(-ENOENT); } -static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, nfs4_stateid *stateid) +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res) { + struct nfs4_state *newstate; int ret; opendata->o_arg.open_flags = openflags; + memset(&opendata->o_res, 0, sizeof(opendata->o_res)); + memset(&opendata->c_res, 0, sizeof(opendata->c_res)); + nfs4_init_opendata_res(opendata); ret = _nfs4_proc_open(opendata); if (ret != 0) return ret; - memcpy(stateid->data, opendata->o_res.stateid.data, - sizeof(stateid->data)); + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (newstate != NULL) + nfs4_close_state(&opendata->path, newstate, openflags); + *res = newstate; return 0; } static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) { - nfs4_stateid stateid; struct nfs4_state *newstate; - int mode = 0; - int delegation = 0; int ret; /* memory barrier prior to reading state->n_* */ + clear_bit(NFS_DELEGATED_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &stateid); + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; - mode |= FMODE_READ|FMODE_WRITE; - if (opendata->o_res.delegation_type != 0) - delegation = opendata->o_res.delegation_type; - smp_rmb(); + if (newstate != state) + return -ESTALE; } if (state->n_wronly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &stateid); + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; - mode |= FMODE_WRITE; - if (opendata->o_res.delegation_type != 0) - delegation = opendata->o_res.delegation_type; - smp_rmb(); + if (newstate != state) + return -ESTALE; } if (state->n_rdonly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ, &stateid); + ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; - mode |= FMODE_READ; + if (newstate != state) + return -ESTALE; } - clear_bit(NFS_DELEGATED_STATE, &state->flags); - if (mode == 0) - return 0; - if (opendata->o_res.delegation_type == 0) - opendata->o_res.delegation_type = delegation; - opendata->o_arg.open_flags |= mode; - newstate = nfs4_opendata_to_nfs4_state(opendata); - if (newstate != NULL) - nfs4_close_state(&opendata->path, newstate, opendata->o_arg.open_flags); - if (newstate != state) - return -ESTALE; return 0; } @@ -730,6 +726,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) * want to ensure that it takes the 'error' code path. */ data->rpc_status = -ENOMEM; + data->cancelled = 0; task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); if (IS_ERR(task)) return PTR_ERR(task); -- cgit v1.2.1 From 901630278469c0d7610554227f39ed2d02d0d270 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jul 2007 14:55:18 -0400 Subject: NFSv4: Support recalling delegations by stateid There appear to be some rogue servers out there that issue multiple delegations with different stateids for the same file. Ensure that when we return delegations, we do so on a per-stateid basis rather than a per-file basis. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 80 +++++++++++++++++++++++++++++++++-------------------- fs/nfs/delegation.h | 10 +------ 2 files changed, 51 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 9f17b91205cf..cee2ba42b68d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -57,7 +57,7 @@ out_err: return status; } -static void nfs_delegation_claim_opens(struct inode *inode) +static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; @@ -72,6 +72,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); err = nfs4_open_delegation_recall(ctx, state); @@ -170,33 +172,55 @@ static void nfs_msync_inode(struct inode *inode) /* * Basic procedure for returning a delegation to the server */ -int __nfs_inode_return_delegation(struct inode *inode) +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_delegation *delegation; - int res = 0; nfs_msync_inode(inode); down_read(&clp->cl_sem); /* Guard against new delegated open calls */ down_write(&nfsi->rwsem); - spin_lock(&clp->cl_lock); - delegation = nfsi->delegation; - if (delegation != NULL) { - list_del_init(&delegation->super_list); - nfsi->delegation = NULL; - nfsi->delegation_state = 0; - } - spin_unlock(&clp->cl_lock); - nfs_delegation_claim_opens(inode); + nfs_delegation_claim_opens(inode, &delegation->stateid); up_write(&nfsi->rwsem); up_read(&clp->cl_sem); nfs_msync_inode(inode); - if (delegation != NULL) - res = nfs_do_return_delegation(inode, delegation); - return res; + return nfs_do_return_delegation(inode, delegation); +} + +static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) +{ + struct nfs_delegation *delegation = nfsi->delegation; + + if (delegation == NULL) + goto nomatch; + if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + goto nomatch; + list_del_init(&delegation->super_list); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + return delegation; +nomatch: + return NULL; +} + +int nfs_inode_return_delegation(struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int err = 0; + + if (nfsi->delegation_state != 0) { + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, NULL); + spin_unlock(&clp->cl_lock); + if (delegation != NULL) + err = __nfs_inode_return_delegation(inode, delegation); + } + return err; } /* @@ -218,8 +242,9 @@ restart: inode = igrab(delegation->inode); if (inode == NULL) continue; + nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - nfs_inode_return_delegation(inode); + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } @@ -243,8 +268,9 @@ restart: inode = igrab(delegation->inode); if (inode == NULL) continue; + nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - nfs_inode_return_delegation(inode); + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } @@ -285,8 +311,9 @@ restart: inode = igrab(delegation->inode); if (inode == NULL) continue; + nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - nfs_inode_return_delegation(inode); + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } @@ -316,21 +343,14 @@ static int recall_thread(void *data) down_read(&clp->cl_sem); down_write(&nfsi->rwsem); spin_lock(&clp->cl_lock); - delegation = nfsi->delegation; - if (delegation != NULL && memcmp(delegation->stateid.data, - args->stateid->data, - sizeof(delegation->stateid.data)) == 0) { - list_del_init(&delegation->super_list); - nfsi->delegation = NULL; - nfsi->delegation_state = 0; + delegation = nfs_detach_delegation_locked(nfsi, args->stateid); + if (delegation != NULL) args->result = 0; - } else { - delegation = NULL; + else args->result = -ENOENT; - } spin_unlock(&clp->cl_lock); complete(&args->started); - nfs_delegation_claim_opens(inode); + nfs_delegation_claim_opens(inode, args->stateid); up_write(&nfsi->rwsem); up_read(&clp->cl_sem); nfs_msync_inode(inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index f6e42fb21afb..7b22f1742445 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -26,7 +26,7 @@ struct nfs_delegation { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -int __nfs_inode_return_delegation(struct inode *inode); +int nfs_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); @@ -52,14 +52,6 @@ static inline int nfs_have_delegation(struct inode *inode, int flags) return 0; } -static inline int nfs_inode_return_delegation(struct inode *inode) -{ - int err = 0; - - if (NFS_I(inode)->delegation != NULL) - err = __nfs_inode_return_delegation(inode); - return err; -} #else static inline int nfs_have_delegation(struct inode *inode, int flags) { -- cgit v1.2.1 From 13437e12fb43cb7e285ff59248f781c91578eafe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Jul 2007 15:10:43 -0400 Subject: NFSv4: Support recalling delegations by stateid part 2 Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/delegation.h | 2 +- fs/nfs/nfs4proc.c | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index cee2ba42b68d..93a9f4bd9bd0 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -76,7 +76,7 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); - err = nfs4_open_delegation_recall(ctx, state); + err = nfs4_open_delegation_recall(ctx, state, stateid); if (err >= 0) err = nfs_delegation_claim_locks(ctx, state); put_nfs_open_context(ctx); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 7b22f1742445..8f79a3135167 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -39,7 +39,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 03b60c67ca7c..10946415de77 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -510,32 +510,30 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state) +static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs4_state_owner *sp = state->owner; struct nfs4_opendata *opendata; int ret; - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) - return 0; opendata = nfs4_opendata_alloc(&ctx->path, sp, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, + memcpy(opendata->o_arg.u.delegation.data, stateid->data, sizeof(opendata->o_arg.u.delegation.data)); ret = nfs4_open_recover(opendata, state); nfs4_opendata_put(opendata); return ret; } -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state) +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs4_exception exception = { }; struct nfs_server *server = NFS_SERVER(state->inode); int err; do { - err = _nfs4_open_delegation_recall(ctx, state); + err = _nfs4_open_delegation_recall(ctx, state, stateid); switch (err) { case 0: return err; -- cgit v1.2.1 From 8383e4602c89857ef926f29ca61ac0a83a614443 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Jul 2007 15:12:04 -0400 Subject: NFSv4: Use RCU to protect delegations Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 114 +++++++++++++++++++++++++++++----------------------- fs/nfs/delegation.h | 14 +++++-- 2 files changed, 73 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 93a9f4bd9bd0..56f4f6a99d4e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -27,6 +27,13 @@ static void nfs_free_delegation(struct nfs_delegation *delegation) kfree(delegation); } +static void nfs_free_delegation_callback(struct rcu_head *head) +{ + struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu); + + nfs_free_delegation(delegation); +} + static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) { struct inode *inode = state->inode; @@ -133,10 +140,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation->inode = inode; spin_lock(&clp->cl_lock); - if (nfsi->delegation == NULL) { - list_add(&delegation->super_list, &clp->cl_delegations); - nfsi->delegation = delegation; + if (rcu_dereference(nfsi->delegation) == NULL) { + list_add_rcu(&delegation->super_list, &clp->cl_delegations); nfsi->delegation_state = delegation->type; + rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; } else { if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, @@ -157,7 +164,7 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * int res = 0; res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); - nfs_free_delegation(delegation); + call_rcu(&delegation->rcu, nfs_free_delegation_callback); return res; } @@ -191,16 +198,16 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) { - struct nfs_delegation *delegation = nfsi->delegation; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL) goto nomatch; if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, sizeof(delegation->stateid.data)) != 0) goto nomatch; - list_del_init(&delegation->super_list); - nfsi->delegation = NULL; + list_del_rcu(&delegation->super_list); nfsi->delegation_state = 0; + rcu_assign_pointer(nfsi->delegation, NULL); return delegation; nomatch: return NULL; @@ -213,7 +220,7 @@ int nfs_inode_return_delegation(struct inode *inode) struct nfs_delegation *delegation; int err = 0; - if (nfsi->delegation_state != 0) { + if (rcu_dereference(nfsi->delegation) != NULL) { spin_lock(&clp->cl_lock); delegation = nfs_detach_delegation_locked(nfsi, NULL); spin_unlock(&clp->cl_lock); @@ -235,20 +242,23 @@ void nfs_return_all_delegations(struct super_block *sb) if (clp == NULL) return; restart: - spin_lock(&clp->cl_lock); - list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { if (delegation->inode->i_sb != sb) continue; inode = igrab(delegation->inode); if (inode == NULL) continue; - nfs_detach_delegation_locked(NFS_I(inode), NULL); + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - __nfs_inode_return_delegation(inode, delegation); + rcu_read_unlock(); + if (delegation != NULL) + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); } static int nfs_do_expire_all_delegations(void *ptr) @@ -259,23 +269,26 @@ static int nfs_do_expire_all_delegations(void *ptr) allow_signal(SIGKILL); restart: - spin_lock(&clp->cl_lock); if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) goto out; if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) goto out; - list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { inode = igrab(delegation->inode); if (inode == NULL) continue; - nfs_detach_delegation_locked(NFS_I(inode), NULL); + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - __nfs_inode_return_delegation(inode, delegation); + rcu_read_unlock(); + if (delegation) + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } + rcu_read_unlock(); out: - spin_unlock(&clp->cl_lock); nfs_put_client(clp); module_put_and_exit(0); } @@ -306,18 +319,21 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp) if (clp == NULL) return; restart: - spin_lock(&clp->cl_lock); - list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { inode = igrab(delegation->inode); if (inode == NULL) continue; - nfs_detach_delegation_locked(NFS_I(inode), NULL); + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); - __nfs_inode_return_delegation(inode, delegation); + rcu_read_unlock(); + if (delegation != NULL) + __nfs_inode_return_delegation(inode, delegation); iput(inode); goto restart; } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); } struct recall_threadargs { @@ -391,14 +407,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs { struct nfs_delegation *delegation; struct inode *res = NULL; - spin_lock(&clp->cl_lock); - list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { res = igrab(delegation->inode); break; } } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); return res; } @@ -408,10 +424,10 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs void nfs_delegation_mark_reclaim(struct nfs_client *clp) { struct nfs_delegation *delegation; - spin_lock(&clp->cl_lock); - list_for_each_entry(delegation, &clp->cl_delegations, super_list) + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); } /* @@ -419,39 +435,35 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) */ void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { - struct nfs_delegation *delegation, *n; - LIST_HEAD(head); - spin_lock(&clp->cl_lock); - list_for_each_entry_safe(delegation, n, &clp->cl_delegations, super_list) { + struct nfs_delegation *delegation; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) continue; - list_move(&delegation->super_list, &head); - NFS_I(delegation->inode)->delegation = NULL; - NFS_I(delegation->inode)->delegation_state = 0; - } - spin_unlock(&clp->cl_lock); - while(!list_empty(&head)) { - delegation = list_entry(head.next, struct nfs_delegation, super_list); - list_del(&delegation->super_list); - nfs_free_delegation(delegation); + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + if (delegation != NULL) + call_rcu(&delegation->rcu, nfs_free_delegation_callback); + goto restart; } + rcu_read_unlock(); } int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int res = 0; + int ret = 0; - if (nfsi->delegation_state == 0) - return 0; - spin_lock(&clp->cl_lock); - delegation = nfsi->delegation; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); if (delegation != NULL) { memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); - res = 1; + ret = 1; } - spin_unlock(&clp->cl_lock); - return res; + rcu_read_unlock(); + return ret; } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 8f79a3135167..5874ce7fdbae 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -22,6 +22,7 @@ struct nfs_delegation { long flags; loff_t maxsize; __u64 change_attr; + struct rcu_head rcu; }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -45,11 +46,16 @@ int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); static inline int nfs_have_delegation(struct inode *inode, int flags) { + struct nfs_delegation *delegation; + int ret = 0; + flags &= FMODE_READ|FMODE_WRITE; - smp_rmb(); - if ((NFS_I(inode)->delegation_state & flags) == flags) - return 1; - return 0; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags) + ret = 1; + rcu_read_unlock(); + return ret; } #else -- cgit v1.2.1 From 412c77cee6d6e73fbe1dc3d67f52163efed33fc4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jul 2007 16:10:55 -0400 Subject: NFSv4: Defer inode revalidation when setting up a delegation Currently we force a synchronous call to __nfs_revalidate_inode() in nfs_inode_set_delegation(). This not only ensures that we cannot call nfs_inode_set_delegation from an asynchronous context, but it also slows down any call to open(). Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 10 ++++++---- fs/nfs/inode.c | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 56f4f6a99d4e..20ac403469a0 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -124,10 +124,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct struct nfs_delegation *delegation; int status = 0; - /* Ensure we first revalidate the attributes and page cache! */ - if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); if (delegation == NULL) return -ENOMEM; @@ -154,6 +150,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct status = -EIO; } } + + /* Ensure we revalidate the attributes and page cache! */ + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); + spin_unlock(&clp->cl_lock); kfree(delegation); return status; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9d5124166d20..3d9fccf4ef93 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1072,8 +1072,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid &= ~NFS_INO_INVALID_DATA; if (data_stable) invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); - if (!nfs_have_delegation(inode, FMODE_READ)) + if (!nfs_have_delegation(inode, FMODE_READ) || + (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; + nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; return 0; out_changed: -- cgit v1.2.1 From 0f9f95e0ad1f9d07d77832c5b60f7d30440602ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Jul 2007 16:19:56 -0400 Subject: NFSv4: Clean up confirmation of sequence ids... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 10946415de77..61ba32af4d2f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -574,8 +574,8 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) sizeof(data->o_res.stateid.data)); renew_lease(data->o_res.server, data->timestamp); } - nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); + nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); } static void nfs4_open_confirm_release(void *calldata) @@ -674,6 +674,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = -ENOTDIR; } renew_lease(data->o_res.server, data->timestamp); + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) + nfs_confirm_seqid(&data->owner->so_seqid, 0); } nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); } @@ -748,7 +750,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - nfs_confirm_seqid(&data->owner->so_seqid, 0); if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); return 0; -- cgit v1.2.1 From 003707c7225dbd4bf879b6c204743554de0a08d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jul 2007 18:07:55 -0400 Subject: NFSv4: Always use the delegation if we have one Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 14 +++++---- fs/nfs/nfs4proc.c | 88 ++++++++++++++++++++++++++++++++++++++++-------------- fs/nfs/nfs4state.c | 28 +++++++++++------ 3 files changed, 94 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 44b56c915f72..4a1c4d80a577 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -122,7 +122,10 @@ struct nfs4_lock_state { /* bits for nfs4_state->flags */ enum { LK_STATE_IN_USE, - NFS_DELEGATED_STATE, + NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ + NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ + NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ }; struct nfs4_state { @@ -136,11 +139,12 @@ struct nfs4_state { unsigned long flags; /* Do we hold any locks? */ spinlock_t state_lock; /* Protects the lock_states list */ - nfs4_stateid stateid; + nfs4_stateid stateid; /* Current stateid: may be delegation */ + nfs4_stateid open_stateid; /* OPEN stateid */ - unsigned int n_rdonly; - unsigned int n_wronly; - unsigned int n_rdwr; + unsigned int n_rdonly; /* Number of read-only references */ + unsigned int n_wronly; /* Number of write-only references */ + unsigned int n_rdwr; /* Number of read/write references */ int state; /* State on the server (R,W, or RW) */ atomic_t count; }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 61ba32af4d2f..128fe23d3f1e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -319,7 +319,7 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } -static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) +static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) { switch (open_flags) { case FMODE_WRITE: @@ -331,9 +331,36 @@ static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_ case FMODE_READ|FMODE_WRITE: state->n_rdwr++; } + nfs4_state_set_mode_locked(state, state->state | open_flags); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +{ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); + memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); + switch (open_flags) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); + } } -static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +{ + spin_lock(&state->owner->so_lock); + spin_lock(&state->inode->i_lock); + nfs_set_open_stateid_locked(state, stateid, open_flags); + spin_unlock(&state->inode->i_lock); + spin_unlock(&state->owner->so_lock); +} + +static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) { struct inode *inode = state->inode; @@ -341,9 +368,13 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, /* Protect against nfs4_find_state_byowner() */ spin_lock(&state->owner->so_lock); spin_lock(&inode->i_lock); - memcpy(&state->stateid, stateid, sizeof(state->stateid)); + if (deleg_stateid != NULL) { + memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) + nfs_set_open_stateid_locked(state, open_stateid, open_flags); update_open_stateflags(state, open_flags); - nfs4_state_set_mode_locked(state, state->state | open_flags); spin_unlock(&inode->i_lock); spin_unlock(&state->owner->so_lock); } @@ -352,6 +383,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data { struct inode *inode; struct nfs4_state *state = NULL; + struct nfs_delegation *delegation; + nfs4_stateid *deleg_stateid = NULL; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) goto out; @@ -361,13 +394,14 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data state = nfs4_get_open_state(inode, data->owner); if (state == NULL) goto put_inode; - update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags); if (data->o_res.delegation_type != 0) { - struct nfs_inode *nfsi = NFS_I(inode); int delegation_flags = 0; - if (nfsi->delegation) - delegation_flags = nfsi->delegation->flags; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) nfs_inode_set_delegation(state->inode, data->owner->so_cred, @@ -377,6 +411,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data data->owner->so_cred, &data->o_res); } + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) + deleg_stateid = &delegation->stateid; + update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags); + rcu_read_unlock(); put_inode: iput(inode); out: @@ -911,8 +951,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred unlock_kernel(); if (err != 0) goto out_put_open_state; - set_bit(NFS_DELEGATED_STATE, &state->flags); - update_open_stateid(state, &delegation->stateid, open_flags); + update_open_stateid(state, NULL, &delegation->stateid, open_flags); out_ok: nfs4_put_state_owner(sp); up_read(&nfsi->rwsem); @@ -1149,8 +1188,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: - memcpy(&state->stateid, &calldata->res.stateid, - sizeof(state->stateid)); + nfs_set_open_stateid(state, &calldata->res.stateid, calldata->arg.open_flags); renew_lease(server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: @@ -1175,26 +1213,32 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) .rpc_resp = &calldata->res, .rpc_cred = state->owner->so_cred, }; - int mode = 0, old_mode; + int clear_rd, clear_wr, clear_rdwr; + int mode; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; - /* Recalculate the new open mode in case someone reopened the file - * while we were waiting in line to be scheduled. - */ + + mode = FMODE_READ|FMODE_WRITE; + clear_rd = clear_wr = clear_rdwr = 0; spin_lock(&state->owner->so_lock); spin_lock(&calldata->inode->i_lock); - mode = old_mode = state->state; + /* Calculate the change in open mode */ if (state->n_rdwr == 0) { - if (state->n_rdonly == 0) + if (state->n_rdonly == 0) { mode &= ~FMODE_READ; - if (state->n_wronly == 0) + clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (state->n_wronly == 0) { mode &= ~FMODE_WRITE; + clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + } } - nfs4_state_set_mode_locked(state, mode); spin_unlock(&calldata->inode->i_lock); spin_unlock(&state->owner->so_lock); - if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) { + if (!clear_rd && !clear_wr && !clear_rdwr) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; return; @@ -1238,7 +1282,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); - calldata->arg.stateid = &state->stateid; + calldata->arg.stateid = &state->open_stateid; /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); if (calldata->arg.seqid == NULL) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ac816b303f3a..4f78c0d1eab5 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -412,7 +412,8 @@ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - int oldstate, newstate = 0; + int call_close = 0; + int newstate; atomic_inc(&owner->so_count); /* Protect against nfs4_find_state() */ @@ -428,21 +429,26 @@ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) case FMODE_READ|FMODE_WRITE: state->n_rdwr--; } - oldstate = newstate = state->state; + newstate = FMODE_READ|FMODE_WRITE; if (state->n_rdwr == 0) { - if (state->n_rdonly == 0) + if (state->n_rdonly == 0) { newstate &= ~FMODE_READ; - if (state->n_wronly == 0) + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (state->n_wronly == 0) { newstate &= ~FMODE_WRITE; + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (newstate == 0) + clear_bit(NFS_DELEGATED_STATE, &state->flags); } - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - nfs4_state_set_mode_locked(state, newstate); - oldstate = newstate; - } + nfs4_state_set_mode_locked(state, newstate); spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); - if (oldstate == newstate) { + if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); } else @@ -838,6 +844,10 @@ static void nfs4_state_mark_reclaim(struct nfs_client *clp) sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.counter = 0; lock->ls_seqid.flags = 0; -- cgit v1.2.1 From 6f43ddccb31b5bd2297878f6f3735d45fd4dfce3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Jul 2007 16:49:11 -0400 Subject: NFSv4: Improve the debugging of bad sequence id errors... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 +++- fs/nfs/nfs4state.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 128fe23d3f1e..3b59c5ded3fb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1079,7 +1079,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int * the user though... */ if (status == -NFS4ERR_BAD_SEQID) { - printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n"); + printk(KERN_WARNING "NFS: v4 server %s " + " returned a bad sequence-id error!\n", + NFS_SERVER(dir)->nfs_client->cl_hostname); exception.retry = 1; continue; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4f78c0d1eab5..4fa4054cdf34 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -38,6 +38,7 @@ * subsequent patch. */ +#include #include #include #include @@ -648,6 +649,12 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case 0: break; case -NFS4ERR_BAD_SEQID: + if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) + return; + printk(KERN_WARNING "NFS: v4 server returned a bad" + "sequence-id error on an" + "unconfirmed sequence %p!\n", + seqid->sequence); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: -- cgit v1.2.1 From 1b370bc28f90955bccda8be5e7d7047ad1381da7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 Jul 2007 08:04:47 -0400 Subject: NFSv4: Allow nfs4_opendata_to_nfs4_state to return errors. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3b59c5ded3fb..52ba7630794c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -385,15 +385,19 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data struct nfs4_state *state = NULL; struct nfs_delegation *delegation; nfs4_stateid *deleg_stateid = NULL; + int ret; + ret = -EAGAIN; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) - goto out; + goto err; inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + ret = PTR_ERR(inode); if (IS_ERR(inode)) - goto out; + goto err; + ret = -ENOMEM; state = nfs4_get_open_state(inode, data->owner); if (state == NULL) - goto put_inode; + goto err_put_inode; if (data->o_res.delegation_type != 0) { int delegation_flags = 0; @@ -417,10 +421,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data deleg_stateid = &delegation->stateid; update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags); rcu_read_unlock(); -put_inode: iput(inode); -out: return state; +err_put_inode: + iput(inode); +err: + return ERR_PTR(ret); } static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) @@ -453,8 +459,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf if (ret != 0) return ret; newstate = nfs4_opendata_to_nfs4_state(opendata); - if (newstate != NULL) - nfs4_close_state(&opendata->path, newstate, openflags); + if (IS_ERR(newstate)) + return PTR_ERR(newstate); + nfs4_close_state(&opendata->path, newstate, openflags); *res = newstate; return 0; } @@ -631,7 +638,7 @@ static void nfs4_open_confirm_release(void *calldata) goto out_free; nfs_confirm_seqid(&data->owner->so_seqid, 0); state = nfs4_opendata_to_nfs4_state(data); - if (state != NULL) + if (!IS_ERR(state)) nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: nfs4_opendata_put(data); @@ -736,7 +743,7 @@ static void nfs4_open_release(void *calldata) goto out_free; nfs_confirm_seqid(&data->owner->so_seqid, 0); state = nfs4_opendata_to_nfs4_state(data); - if (state != NULL) + if (!IS_ERR(state)) nfs4_close_state(&data->path, state, data->o_arg.open_flags); out_free: nfs4_opendata_put(data); @@ -1036,9 +1043,9 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct if (opendata->o_arg.open_flags & O_EXCL) nfs4_exclusive_attrset(opendata, sattr); - status = -ENOMEM; state = nfs4_opendata_to_nfs4_state(opendata); - if (state == NULL) + status = PTR_ERR(state); + if (IS_ERR(state)) goto err_opendata_put; nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); -- cgit v1.2.1 From 3e309914a15333a5493058e4927e979c7434ae44 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 Jul 2007 13:19:59 -0400 Subject: NFSv4: Clean up _nfs4_proc_open() Use a flag instead of the 'data->rpc_status = -ENOMEM hack. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 52ba7630794c..05afb7ba3bc4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -226,6 +226,7 @@ struct nfs4_opendata { struct nfs4_state_owner *owner; struct iattr attrs; unsigned long timestamp; + unsigned int rpc_done : 1; int rpc_status; int cancelled; }; @@ -620,6 +621,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) memcpy(data->o_res.stateid.data, data->c_res.stateid.data, sizeof(data->o_res.stateid.data)); renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; } nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); @@ -634,7 +636,7 @@ static void nfs4_open_confirm_release(void *calldata) if (data->cancelled == 0) goto out_free; /* In case of error, no cleanup! */ - if (data->rpc_status != 0) + if (!data->rpc_done) goto out_free; nfs_confirm_seqid(&data->owner->so_seqid, 0); state = nfs4_opendata_to_nfs4_state(data); @@ -660,11 +662,8 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) int status; kref_get(&data->kref); - /* - * If rpc_run_task() ends up calling ->rpc_release(), we - * want to ensure that it takes the 'error' code path. - */ - data->rpc_status = -ENOMEM; + data->rpc_done = 0; + data->rpc_status = 0; task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data); if (IS_ERR(task)) return PTR_ERR(task); @@ -725,6 +724,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) nfs_confirm_seqid(&data->owner->so_seqid, 0); } nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); + data->rpc_done = 1; } static void nfs4_open_release(void *calldata) @@ -736,7 +736,7 @@ static void nfs4_open_release(void *calldata) if (data->cancelled == 0) goto out_free; /* In case of error, no cleanup! */ - if (data->rpc_status != 0) + if (data->rpc_status != 0 || !data->rpc_done) goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) @@ -768,11 +768,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; kref_get(&data->kref); - /* - * If rpc_run_task() ends up calling ->rpc_release(), we - * want to ensure that it takes the 'error' code path. - */ - data->rpc_status = -ENOMEM; + data->rpc_done = 0; + data->rpc_status = 0; data->cancelled = 0; task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); if (IS_ERR(task)) @@ -784,7 +781,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) } else status = data->rpc_status; rpc_put_task(task); - if (status != 0) + if (status != 0 || !data->rpc_done) return status; if (o_arg->open_flags & O_CREAT) { -- cgit v1.2.1 From aac00a8d0a53097063da532cbdf0b8775a4dcd53 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jul 2007 19:02:21 -0400 Subject: NFSv4: Check for the existence of a delegation in nfs4_open_prepare() We should not be calling open() on an inode that has a delegation unless we're doing a reclaim. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 195 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 05afb7ba3bc4..ea332e831d75 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -65,6 +65,7 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *) static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); +static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags); /* Prevent leaks of NFSv4 errors into userland */ int nfs4_map_errors(int err) @@ -224,6 +225,7 @@ struct nfs4_opendata { struct path path; struct dentry *dir; struct nfs4_state_owner *owner; + struct nfs4_state *state; struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; @@ -296,6 +298,8 @@ static void nfs4_opendata_free(struct kref *kref) struct nfs4_opendata, kref); nfs_free_seqid(p->o_arg.seqid); + if (p->state != NULL) + nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); dput(p->dir); dput(p->path.dentry); @@ -320,6 +324,15 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } +static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags) +{ + if ((delegation->type & open_flags) != open_flags) + return 0; + if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) + return 0; + return 1; +} + static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) { switch (open_flags) { @@ -380,6 +393,65 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta spin_unlock(&state->owner->so_lock); } +static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL || (delegation->type & open_flags) == open_flags) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + nfs_inode_return_delegation(inode); +} + +static struct nfs4_state *nfs4_try_open_delegated(struct nfs4_opendata *opendata) +{ + struct nfs4_state *state = opendata->state; + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *delegation; + int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL); + nfs4_stateid stateid; + int ret = -EAGAIN; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL) + goto out_unlock; + for (;;) { + if (!can_open_delegated(delegation, open_mode)) + break; + /* Save the delegation */ + memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + rcu_read_unlock(); + lock_kernel(); + ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode); + unlock_kernel(); + if (ret != 0) + goto out; + ret = -EAGAIN; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL) + break; + /* Is the delegation still valid? */ + if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0) + continue; + rcu_read_unlock(); + update_open_stateid(state, NULL, &stateid, open_mode); + goto out_return_state; + } +out_unlock: + rcu_read_unlock(); +out: + return ERR_PTR(ret); +out_return_state: + atomic_inc(&state->count); + return state; +} + static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { struct inode *inode; @@ -388,6 +460,11 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data nfs4_stateid *deleg_stateid = NULL; int ret; + if (!data->rpc_done) { + state = nfs4_try_open_delegated(data); + goto out; + } + ret = -EAGAIN; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) goto err; @@ -423,6 +500,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags); rcu_read_unlock(); iput(inode); +out: return state; err_put_inode: iput(inode); @@ -690,6 +768,23 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) return; + /* + * Check if we still need to send an OPEN call, or if we can use + * a delegation instead. + */ + if (data->state != NULL) { + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + if (delegation != NULL && + (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { + rcu_read_unlock(); + task->tk_action = NULL; + return; + } + rcu_read_unlock(); + } /* Update sequence id. */ data->o_arg.id = sp->so_owner_id.id; data->o_arg.clientid = sp->so_client->cl_clientid; @@ -906,90 +1001,6 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -/* - * Returns a referenced nfs4_state if there is an open delegation on the file - */ -static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) -{ - struct nfs_delegation *delegation; - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_client *clp = server->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs4_state_owner *sp = NULL; - struct nfs4_state *state = NULL; - int open_flags = flags & (FMODE_READ|FMODE_WRITE); - int err; - - err = -ENOMEM; - if (!(sp = nfs4_get_state_owner(server, cred))) { - dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); - return err; - } - err = nfs4_recover_expired_lease(server); - if (err != 0) - goto out_put_state_owner; - /* Protect against reboot recovery - NOTE ORDER! */ - down_read(&clp->cl_sem); - /* Protect against delegation recall */ - down_read(&nfsi->rwsem); - delegation = NFS_I(inode)->delegation; - err = -ENOENT; - if (delegation == NULL || (delegation->type & open_flags) != open_flags) - goto out_err; - err = -ENOMEM; - state = nfs4_get_open_state(inode, sp); - if (state == NULL) - goto out_err; - - err = -ENOENT; - if ((state->state & open_flags) == open_flags) { - spin_lock(&inode->i_lock); - update_open_stateflags(state, open_flags); - spin_unlock(&inode->i_lock); - goto out_ok; - } else if (state->state != 0) - goto out_put_open_state; - - lock_kernel(); - err = _nfs4_do_access(inode, cred, open_flags); - unlock_kernel(); - if (err != 0) - goto out_put_open_state; - update_open_stateid(state, NULL, &delegation->stateid, open_flags); -out_ok: - nfs4_put_state_owner(sp); - up_read(&nfsi->rwsem); - up_read(&clp->cl_sem); - *res = state; - return 0; -out_put_open_state: - nfs4_put_open_state(state); -out_err: - up_read(&nfsi->rwsem); - up_read(&clp->cl_sem); - if (err != -EACCES) - nfs_inode_return_delegation(inode); -out_put_state_owner: - nfs4_put_state_owner(sp); - return err; -} - -static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred) -{ - struct nfs4_exception exception = { }; - struct nfs4_state *res = ERR_PTR(-EIO); - int err; - - do { - err = _nfs4_open_delegated(inode, flags, cred, &res); - if (err == 0) - break; - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(inode), - err, &exception)); - } while (exception.retry); - return res; -} - /* * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* * fields corresponding to attributes that were used to store the verifier. @@ -1016,7 +1027,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct struct nfs_server *server = NFS_SERVER(dir); struct nfs_client *clp = server->nfs_client; struct nfs4_opendata *opendata; - int status; + int status; /* Protect against reboot recovery conflicts */ status = -ENOMEM; @@ -1027,12 +1038,17 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE)); down_read(&clp->cl_sem); status = -ENOMEM; opendata = nfs4_opendata_alloc(path, sp, flags, sattr); if (opendata == NULL) goto err_release_rwsem; + if (path->dentry->d_inode != NULL) + opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); + status = _nfs4_proc_open(opendata); if (status != 0) goto err_opendata_put; @@ -1099,6 +1115,11 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int exception.retry = 1; continue; } + if (status == -EAGAIN) { + /* We must have found a delegation */ + exception.retry = 1; + continue; + } res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), status, &exception)); } while (exception.retry); @@ -1390,9 +1411,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); if (IS_ERR(cred)) return PTR_ERR(cred); - state = nfs4_open_delegated(dentry->d_inode, openflags, cred); - if (IS_ERR(state)) - state = nfs4_do_open(dir, &path, openflags, NULL, cred); + state = nfs4_do_open(dir, &path, openflags, NULL, cred); put_rpccred(cred); if (IS_ERR(state)) { switch (PTR_ERR(state)) { -- cgit v1.2.1 From 6ee412689027dc7954453aed392ab5c7599c0f73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Jul 2007 14:11:36 -0400 Subject: NFSv4: Don't call OPEN if we already have an open stateid for a file If we already have a stateid with the correct open mode for a given file, then we can reuse that stateid instead of re-issuing an OPEN call without violating the close-to-open caching semantics. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ea332e831d75..1de076619253 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -324,6 +324,24 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } +static int can_open_cached(struct nfs4_state *state, int mode) +{ + int ret = 0; + switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) { + case FMODE_READ: + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + break; + case FMODE_WRITE: + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + break; + case FMODE_READ|FMODE_WRITE: + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + } + return ret; +} + static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags) { if ((delegation->type & open_flags) != open_flags) @@ -407,7 +425,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open nfs_inode_return_delegation(inode); } -static struct nfs4_state *nfs4_try_open_delegated(struct nfs4_opendata *opendata) +static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) { struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); @@ -418,9 +436,19 @@ static struct nfs4_state *nfs4_try_open_delegated(struct nfs4_opendata *opendata rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation == NULL) - goto out_unlock; for (;;) { + if (can_open_cached(state, open_mode)) { + spin_lock(&state->owner->so_lock); + if (can_open_cached(state, open_mode)) { + update_open_stateflags(state, open_mode); + spin_unlock(&state->owner->so_lock); + rcu_read_unlock(); + goto out_return_state; + } + spin_unlock(&state->owner->so_lock); + } + if (delegation == NULL) + break; if (!can_open_delegated(delegation, open_mode)) break; /* Save the delegation */ @@ -434,8 +462,9 @@ static struct nfs4_state *nfs4_try_open_delegated(struct nfs4_opendata *opendata ret = -EAGAIN; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); + /* If no delegation, try a cached open */ if (delegation == NULL) - break; + continue; /* Is the delegation still valid? */ if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0) continue; @@ -443,7 +472,6 @@ static struct nfs4_state *nfs4_try_open_delegated(struct nfs4_opendata *opendata update_open_stateid(state, NULL, &stateid, open_mode); goto out_return_state; } -out_unlock: rcu_read_unlock(); out: return ERR_PTR(ret); @@ -461,7 +489,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data int ret; if (!data->rpc_done) { - state = nfs4_try_open_delegated(data); + state = nfs4_try_open_cached(data); goto out; } @@ -775,13 +803,14 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; + if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) + goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { rcu_read_unlock(); - task->tk_action = NULL; - return; + goto out_no_action; } rcu_read_unlock(); } @@ -792,6 +821,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; data->timestamp = jiffies; rpc_call_setup(task, &msg, 0); + return; +out_no_action: + task->tk_action = NULL; + } static void nfs4_open_done(struct rpc_task *task, void *calldata) -- cgit v1.2.1 From 1ac7e2fd35905f3d44df06568bca5f9d140369b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Jul 2007 21:04:15 -0400 Subject: NFSv4: Clean up the callers of nfs4_open_recover_helper() Rely on nfs4_try_open_cached() when appropriate. Also fix an RCU violation in _nfs4_do_open_reclaim() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1de076619253..3a2af8053767 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -602,6 +602,19 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * if (newstate != state) return -ESTALE; } + /* + * We may have performed cached opens for all three recoveries. + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && + memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + spin_lock(&state->owner->so_lock); + spin_lock(&state->inode->i_lock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + spin_unlock(&state->inode->i_lock); + spin_unlock(&state->owner->so_lock); + } return 0; } @@ -611,26 +624,22 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * */ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) { - struct nfs_delegation *delegation = NFS_I(state->inode)->delegation; + struct nfs_delegation *delegation; struct nfs4_opendata *opendata; int delegation_type = 0; int status; - if (delegation != NULL) { - if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - memcpy(&state->stateid, &delegation->stateid, - sizeof(state->stateid)); - set_bit(NFS_DELEGATED_STATE, &state->flags); - return 0; - } - delegation_type = delegation->type; - } opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); if (opendata == NULL) return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; opendata->o_arg.fh = NFS_FH(state->inode); nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0) + delegation_type = delegation->flags; + rcu_read_unlock(); opendata->o_arg.u.delegation_type = delegation_type; status = nfs4_open_recover(opendata, state); nfs4_opendata_put(opendata); @@ -980,21 +989,10 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) */ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { - struct inode *inode = state->inode; - struct nfs_delegation *delegation = NFS_I(inode)->delegation; struct nfs4_opendata *opendata; - int openflags = state->state & (FMODE_READ|FMODE_WRITE); int ret; - if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - ret = _nfs4_do_access(inode, ctx->cred, openflags); - if (ret < 0) - return ret; - memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); - set_bit(NFS_DELEGATED_STATE, &state->flags); - return 0; - } - opendata = nfs4_opendata_alloc(&ctx->path, state->owner, openflags, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); if (opendata == NULL) return -ENOMEM; ret = nfs4_open_recover(opendata, state); -- cgit v1.2.1 From 8bda4e4c98d14566fc1a354c62fb59d70cc49b97 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Jul 2007 10:45:42 -0400 Subject: NFSv4: Fix up stateid locking... We really don't need to grab both the state->so_owner and the inode->i_lock. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 27 +++++++++++---------------- fs/nfs/nfs4state.c | 10 ++++++---- 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4a1c4d80a577..dd1aa2b598ce 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -139,9 +139,11 @@ struct nfs4_state { unsigned long flags; /* Do we hold any locks? */ spinlock_t state_lock; /* Protects the lock_states list */ + seqlock_t seqlock; /* Protects the stateid/open_stateid */ nfs4_stateid stateid; /* Current stateid: may be delegation */ nfs4_stateid open_stateid; /* OPEN stateid */ + /* The following 3 fields are protected by owner->so_lock */ unsigned int n_rdonly; /* Number of read-only references */ unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3a2af8053767..ba86ec654c2e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -385,29 +385,28 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) { - spin_lock(&state->owner->so_lock); - spin_lock(&state->inode->i_lock); + write_seqlock(&state->seqlock); nfs_set_open_stateid_locked(state, stateid, open_flags); - spin_unlock(&state->inode->i_lock); - spin_unlock(&state->owner->so_lock); + write_sequnlock(&state->seqlock); } static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) { - struct inode *inode = state->inode; - open_flags &= (FMODE_READ|FMODE_WRITE); - /* Protect against nfs4_find_state_byowner() */ - spin_lock(&state->owner->so_lock); - spin_lock(&inode->i_lock); + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) nfs_set_open_stateid_locked(state, open_stateid, open_flags); + write_sequnlock(&state->seqlock); + spin_lock(&state->owner->so_lock); update_open_stateflags(state, open_flags); - spin_unlock(&inode->i_lock); spin_unlock(&state->owner->so_lock); } @@ -608,12 +607,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { - spin_lock(&state->owner->so_lock); - spin_lock(&state->inode->i_lock); + write_seqlock(&state->seqlock); if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); - spin_unlock(&state->inode->i_lock); - spin_unlock(&state->owner->so_lock); + write_sequnlock(&state->seqlock); } return 0; } @@ -1280,7 +1277,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) mode = FMODE_READ|FMODE_WRITE; clear_rd = clear_wr = clear_rdwr = 0; spin_lock(&state->owner->so_lock); - spin_lock(&calldata->inode->i_lock); /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { @@ -1294,7 +1290,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); } } - spin_unlock(&calldata->inode->i_lock); spin_unlock(&state->owner->so_lock); if (!clear_rd && !clear_wr && !clear_rdwr) { /* Note: exit _without_ calling nfs4_close_done */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4fa4054cdf34..523cc2cbb5e1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -307,6 +307,7 @@ nfs4_alloc_open_state(void) atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); + seqlock_init(&state->seqlock); return state; } @@ -411,7 +412,6 @@ void nfs4_put_open_state(struct nfs4_state *state) */ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) { - struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; int call_close = 0; int newstate; @@ -419,7 +419,6 @@ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) atomic_inc(&owner->so_count); /* Protect against nfs4_find_state() */ spin_lock(&owner->so_lock); - spin_lock(&inode->i_lock); switch (mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: state->n_rdonly--; @@ -446,7 +445,6 @@ void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) clear_bit(NFS_DELEGATED_STATE, &state->flags); } nfs4_state_set_mode_locked(state, newstate); - spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); if (!call_close) { @@ -599,8 +597,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) { struct nfs4_lock_state *lsp; + int seq; - memcpy(dst, &state->stateid, sizeof(*dst)); + do { + seq = read_seqbegin(&state->seqlock); + memcpy(dst, &state->stateid, sizeof(*dst)); + } while (read_seqretry(&state->seqlock, seq)); if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) return; -- cgit v1.2.1 From 9eaa67c6a5b77f248c4703d81c4a6c6434e35385 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:19 -0400 Subject: NFS: Clean-up: use correct type when converting NFS blocks to local blocks inode->i_blocks is a blkcnt_t these days, which can be a u64 or unsigned long, depending on the setting of CONFIG_LSF. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ad2b40db1e65..76cf55d57101 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -183,9 +183,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) /* * Calculate the number of 512byte blocks used. */ -static inline unsigned long nfs_calc_block_size(u64 tsize) +static inline blkcnt_t nfs_calc_block_size(u64 tsize) { - loff_t used = (tsize + 511) >> 9; + blkcnt_t used = (tsize + 511) >> 9; return (used > ULONG_MAX) ? ULONG_MAX : used; } -- cgit v1.2.1 From 5680d48be88d12cd987e5579a6072a4ca34ca6ea Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:24 -0400 Subject: NFS: Clean-up: Define macros for maximum host and export path name lengths Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 14c7923697d2..e7d197085834 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -867,12 +867,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type, } } - p = nfs_copy_user_string(NULL, &data->hostname, 256); + p = nfs_copy_user_string(NULL, &data->hostname, NFS4_MAXNAMLEN); if (IS_ERR(p)) goto out_err; hostname = p; - p = nfs_copy_user_string(NULL, &data->mnt_path, 1024); + p = nfs_copy_user_string(NULL, &data->mnt_path, NFS4_MAXPATHLEN); if (IS_ERR(p)) goto out_err; mntpath = p; -- cgit v1.2.1 From 29eb981a3b8eb4e61cd5b9da835768045d0446cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:30 -0400 Subject: NFS: Clean-up: Replace nfs_copy_user_string with strndup_user The new string utility function strndup_user can be used instead of nfs_copy_user_string, eliminating an unnecessary duplication of function. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 53 ++++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e7d197085834..04ad881eac77 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -785,27 +785,6 @@ static void nfs4_fill_super(struct super_block *sb) nfs_initialise_sb(sb); } -static void *nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) -{ - void *p = NULL; - - if (!src->len) - return ERR_PTR(-EINVAL); - if (src->len < maxlen) - maxlen = src->len; - if (dst == NULL) { - p = dst = kmalloc(maxlen + 1, GFP_KERNEL); - if (p == NULL) - return ERR_PTR(-ENOMEM); - } - if (copy_from_user(dst, src->data, maxlen)) { - kfree(p); - return ERR_PTR(-EFAULT); - } - dst[maxlen] = '\0'; - return dst; -} - /* * Get the superblock for an NFS4 mountpoint */ @@ -819,8 +798,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, rpc_authflavor_t authflavour; struct nfs_fh mntfh; struct dentry *mntroot; - char *mntpath = NULL, *hostname = NULL, ip_addr[16]; - void *p; + char *p, *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; int error; if (data == NULL) { @@ -857,39 +835,39 @@ static int nfs4_get_sb(struct file_system_type *fs_type, dprintk("%s: Invalid number of RPC auth flavours %d.\n", __FUNCTION__, data->auth_flavourlen); error = -EINVAL; - goto out_err_noserver; + goto out; } if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) { error = -EFAULT; - goto out_err_noserver; + goto out; } } - p = nfs_copy_user_string(NULL, &data->hostname, NFS4_MAXNAMLEN); + p = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(p)) goto out_err; hostname = p; - p = nfs_copy_user_string(NULL, &data->mnt_path, NFS4_MAXPATHLEN); + p = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); if (IS_ERR(p)) goto out_err; mntpath = p; dprintk("MNTPATH: %s\n", mntpath); - p = nfs_copy_user_string(ip_addr, &data->client_addr, - sizeof(ip_addr) - 1); + p = strndup_user(data->client_addr.data, 16); if (IS_ERR(p)) goto out_err; + ip_addr = p; /* Get a volume representation */ server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, authflavour, &mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); - goto out_err_noserver; + goto out; } /* Get a superblock - note that we may end up sharing one that already exists */ @@ -919,25 +897,26 @@ static int nfs4_get_sb(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; mnt->mnt_sb = s; mnt->mnt_root = mntroot; + error = 0; + +out: + kfree(ip_addr); kfree(mntpath); kfree(hostname); - return 0; + return error; out_err: error = PTR_ERR(p); - goto out_err_noserver; + goto out; out_free: nfs_free_server(server); -out_err_noserver: - kfree(mntpath); - kfree(hostname); - return error; + goto out; error_splat_super: up_write(&s->s_umount); deactivate_super(s); - goto out_err_noserver; + goto out; } static void nfs4_kill_super(struct super_block *sb) -- cgit v1.2.1 From 0655960f76922a720ad14a510ed91a51395e742b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:35 -0400 Subject: NFS: Clean up error handling in nfs_get_sb The error return logic in nfs_get_sb now matches nfs4_get_sb, and is more maintainable. A subsequent patch will take advantage of this simplification. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 04ad881eac77..aab5cd61725d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -633,13 +633,13 @@ static int nfs_get_sb(struct file_system_type *fs_type, /* Validate the mount data */ error = nfs_validate_mount_data(data, &mntfh); if (error < 0) - return error; + goto out; /* Get a volume representation */ server = nfs_create_server(data, &mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); - goto out_err_noserver; + goto out; } /* Get a superblock - note that we may end up sharing one that already exists */ @@ -669,17 +669,19 @@ static int nfs_get_sb(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; mnt->mnt_sb = s; mnt->mnt_root = mntroot; - return 0; + error = 0; + +out: + return error; out_err_nosb: nfs_free_server(server); -out_err_noserver: - return error; + goto out; error_splat_super: up_write(&s->s_umount); deactivate_super(s); - return error; + goto out; } /* -- cgit v1.2.1 From 4d81cd16112f86dc279d90ef7a24f2b1be339c3c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:40 -0400 Subject: NFS: Clean-up: fix a compiler warning in fs/nfs/super.c /home/cel/linux/fs/nfs/super.c: In function 'nfs_pseudoflavour_to_name': /home/cel/linux/fs/nfs/super.c:270: warning: comparison between signed and unsigned Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aab5cd61725d..6eac5bf911e3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -263,11 +263,11 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) { RPC_AUTH_GSS_SPKM, "spkm" }, { RPC_AUTH_GSS_SPKMI, "spkmi" }, { RPC_AUTH_GSS_SPKMP, "spkmp" }, - { -1, "unknown" } + { UINT_MAX, "unknown" } }; int i; - for (i=0; sec_flavours[i].flavour != -1; i++) { + for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) { if (sec_flavours[i].flavour == flavour) break; } -- cgit v1.2.1 From fc50d58fd053862d6bafcf92f1ef2961296f3a1c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:46 -0400 Subject: NFS: Clean-up: Refactor IP address sanity checks in NFS client NFS and NFSv4 mounts can now share server address sanity checking. And, it provides an easy mechanism for adding IPv6 address checking at some later point. Signed-off-by: Chuck Lever Cc: Aurelien Charbon Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6eac5bf911e3..7f5bc28ea8d1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -446,6 +446,23 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) rpc_killall_tasks(rpc); } +/* + * Sanity-check a server address provided by the mount command + */ +static int nfs_verify_server_address(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sa = (struct sockaddr_in *) addr; + if (sa->sin_addr.s_addr != INADDR_ANY) + return 1; + break; + } + } + + return 0; +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle @@ -501,7 +518,7 @@ static int nfs_validate_mount_data(struct nfs_mount_data *data, #endif /* CONFIG_NFS_V3 */ /* We now require that the mount process passes the remote address */ - if (data->addr.sin_addr.s_addr == INADDR_ANY) { + if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) { dprintk("%s: mount program didn't pass remote address!\n", __FUNCTION__); return -EINVAL; @@ -819,13 +836,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (copy_from_user(&addr, data->host_addr, sizeof(addr))) return -EFAULT; - if (addr.sin_family != AF_INET || - addr.sin_addr.s_addr == INADDR_ANY - ) { + if (!nfs_verify_server_address((struct sockaddr *) &addr)) { dprintk("%s: mount program didn't pass remote IP address!\n", __FUNCTION__); return -EINVAL; } + /* RFC3530: The default port for NFS is 2049 */ if (addr.sin_port == 0) addr.sin_port = htons(NFS_PORT); -- cgit v1.2.1 From 5df36e78da9db1c5f02b429116ed98902bcc75e5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:12:56 -0400 Subject: NFS: Clean up nfs_validate_mount_data Move error handling code out of the main code path. The switch statement was also improperly indented, according to Documentation/CodingStyle. This prepares nfs_validate_mount_data for the addition of option string parsing. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 125 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7f5bc28ea8d1..baf75e9bd3fe 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -470,77 +470,86 @@ static int nfs_verify_server_address(struct sockaddr *addr) static int nfs_validate_mount_data(struct nfs_mount_data *data, struct nfs_fh *mntfh) { - if (data == NULL) { - dprintk("%s: missing data argument\n", __FUNCTION__); - return -EINVAL; - } - - if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { - dprintk("%s: bad mount version\n", __FUNCTION__); - return -EINVAL; - } + if (data == NULL) + goto out_no_data; switch (data->version) { - case 1: - data->namlen = 0; - case 2: - data->bsize = 0; - case 3: - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: mount structure version %d does not support NFSv3\n", - __FUNCTION__, - data->version); - return -EINVAL; - } - data->root.size = NFS2_FHSIZE; - memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); - case 4: - if (data->flags & NFS_MOUNT_SECFLAVOUR) { - dprintk("%s: mount structure version %d does not support strong security\n", - __FUNCTION__, - data->version); - return -EINVAL; - } - case 5: - memset(data->context, 0, sizeof(data->context)); + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) + goto out_no_v3; + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) + goto out_no_sec; + case 5: + memset(data->context, 0, sizeof(data->context)); + case 6: + if (data->flags & NFS_MOUNT_VER3) + mntfh->size = data->root.size; + else + mntfh->size = NFS2_FHSIZE; + + if (mntfh->size > sizeof(mntfh->data)) + goto out_invalid_fh; + + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); + break; + default: + goto out_bad_version; } - /* Set the pseudoflavor */ if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) data->pseudoflavor = RPC_AUTH_UNIX; #ifndef CONFIG_NFS_V3 - /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); - return -EPROTONOSUPPORT; - } -#endif /* CONFIG_NFS_V3 */ + if (data->flags & NFS_MOUNT_VER3) + goto out_v3_not_compiled; +#endif /* !CONFIG_NFS_V3 */ - /* We now require that the mount process passes the remote address */ - if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) { - dprintk("%s: mount program didn't pass remote address!\n", - __FUNCTION__); - return -EINVAL; - } + if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) + goto out_no_address; - /* Prepare the root filehandle */ - if (data->flags & NFS_MOUNT_VER3) - mntfh->size = data->root.size; - else - mntfh->size = NFS2_FHSIZE; + return 0; - if (mntfh->size > sizeof(mntfh->data)) { - dprintk("%s: invalid root filehandle\n", __FUNCTION__); - return -EINVAL; - } +out_no_data: + dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n"); + return -EINVAL; - memcpy(mntfh->data, data->root.data, mntfh->size); - if (mntfh->size < sizeof(mntfh->data)) - memset(mntfh->data + mntfh->size, 0, - sizeof(mntfh->data) - mntfh->size); +out_no_v3: + dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n", + data->version); + return -EINVAL; - return 0; +out_no_sec: + dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); + return -EINVAL; + +out_bad_version: + dfprintk(MOUNT, "NFS: bad nfs_mount_data version %d\n", + data->version); + return -EINVAL; + +#ifndef CONFIG_NFS_V3 +out_v3_not_compiled: + dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V3 */ + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; + +out_invalid_fh: + dfprintk(MOUNT, "NFS: invalid root filehandle\n"); + return -EINVAL; } /* -- cgit v1.2.1 From f0768ebd09385551277fcbc8b28c29eb491bf9e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:01 -0400 Subject: NFS: Introduce nfs4_validate_mount_options Refactor NFSv4 mount processing to break out mount data validation in the same way it's broken out in the NFSv2/v3 mount path. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 153 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 89 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index baf75e9bd3fe..ed3ec4477a0f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -813,6 +813,89 @@ static void nfs4_fill_super(struct super_block *sb) nfs_initialise_sb(sb); } +/* + * Validate NFSv4 mount options + */ +static int nfs4_validate_mount_data(struct nfs4_mount_data **options, + const char *dev_name, + struct sockaddr_in *addr, + rpc_authflavor_t *authflavour, + char **hostname, + char **mntpath, + char **ip_addr) +{ + struct nfs4_mount_data *data = *options; + char *c; + + if (data == NULL) + goto out_no_data; + + switch (data->version) { + case 1: + if (data->host_addrlen != sizeof(*addr)) + goto out_no_address; + if (copy_from_user(addr, data->host_addr, sizeof(*addr))) + return -EFAULT; + if (addr->sin_port == 0) + addr->sin_port = htons(NFS_PORT); + if (!nfs_verify_server_address((struct sockaddr *) addr)) + goto out_no_address; + + switch (data->auth_flavourlen) { + case 0: + *authflavour = RPC_AUTH_UNIX; + break; + case 1: + if (copy_from_user(authflavour, data->auth_flavours, + sizeof(*authflavour))) + return -EFAULT; + break; + default: + goto out_inval_auth; + } + + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + *hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + *mntpath = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + *ip_addr = c; + + break; + default: + goto out_bad_version; + } + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_inval_auth: + dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n", + data->auth_flavourlen); + return -EINVAL; + +out_no_address: + dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); + return -EINVAL; + +out_bad_version: + dfprintk(MOUNT, "NFS4: bad nfs_mount_data version %d\n", + data->version); + return -EINVAL; +} + /* * Get the superblock for an NFS4 mountpoint */ @@ -826,68 +909,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type, rpc_authflavor_t authflavour; struct nfs_fh mntfh; struct dentry *mntroot; - char *p, *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; + char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; int error; - if (data == NULL) { - dprintk("%s: missing data argument\n", __FUNCTION__); - return -EINVAL; - } - if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { - dprintk("%s: bad mount version\n", __FUNCTION__); - return -EINVAL; - } - - /* We now require that the mount process passes the remote address */ - if (data->host_addrlen != sizeof(addr)) - return -EINVAL; - - if (copy_from_user(&addr, data->host_addr, sizeof(addr))) - return -EFAULT; - - if (!nfs_verify_server_address((struct sockaddr *) &addr)) { - dprintk("%s: mount program didn't pass remote IP address!\n", - __FUNCTION__); - return -EINVAL; - } - - /* RFC3530: The default port for NFS is 2049 */ - if (addr.sin_port == 0) - addr.sin_port = htons(NFS_PORT); - - /* Grab the authentication type */ - authflavour = RPC_AUTH_UNIX; - if (data->auth_flavourlen != 0) { - if (data->auth_flavourlen != 1) { - dprintk("%s: Invalid number of RPC auth flavours %d.\n", - __FUNCTION__, data->auth_flavourlen); - error = -EINVAL; - goto out; - } - - if (copy_from_user(&authflavour, data->auth_flavours, - sizeof(authflavour))) { - error = -EFAULT; - goto out; - } - } - - p = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); - if (IS_ERR(p)) - goto out_err; - hostname = p; - - p = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); - if (IS_ERR(p)) - goto out_err; - mntpath = p; - - dprintk("MNTPATH: %s\n", mntpath); - - p = strndup_user(data->client_addr.data, 16); - if (IS_ERR(p)) - goto out_err; - ip_addr = p; + /* Validate the mount data */ + error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour, + &hostname, &mntpath, &ip_addr); + if (error < 0) + goto out; /* Get a volume representation */ server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, @@ -932,10 +961,6 @@ out: kfree(hostname); return error; -out_err: - error = PTR_ERR(p); - goto out; - out_free: nfs_free_server(server); goto out; -- cgit v1.2.1 From cce63cd6374e6f1b4ea897ece1454feb13993d7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:12 -0400 Subject: SUNRPC: Rename rpcb_getport_external routine In preparation for handling NFS mount option parsing in the kernel, rename rpcb_getport_external as rpcb_get_port_sync, and make it available always (instead of only when CONFIG_ROOT_NFS is enabled). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 49d1008ce1d7..f0db4703b1c9 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto) printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", program, version, NIPQUAD(servaddr)); set_sockaddr(&sin, servaddr, 0); - return rpcb_getport_external(&sin, program, version, proto); + return rpcb_getport_sync(&sin, program, version, proto); } -- cgit v1.2.1 From 43780b87fa799ae65df11d89d4539d8d6a7c67eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:22 -0400 Subject: SUNRPC: Add a convenient default for the hostname when calling rpc_create() A couple of callers just use a stringified IP address for the rpc client's hostname. Move the logic for constructing this into rpc_create(), so it can be shared. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 13 ++++--------- fs/nfsd/nfs4callback.c | 6 ------ 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 878d7a5cb6d4..2892ec843066 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -28,8 +28,7 @@ #define MOUNT_UMNT 3 */ -static struct rpc_clnt * mnt_create(char *, struct sockaddr_in *, - int, int); +static struct rpc_clnt * mnt_create(struct sockaddr_in *, int, int); static struct rpc_program mnt_program; struct mnt_fhstatus { @@ -52,14 +51,12 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, .rpc_argp = path, .rpc_resp = &result, }; - char hostname[32]; int status; dprintk("NFS: nfs_mount(%08x:%s)\n", (unsigned)ntohl(addr->sin_addr.s_addr), path); - sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr->sin_addr.s_addr)); - mnt_clnt = mnt_create(hostname, addr, version, protocol); + mnt_clnt = mnt_create(addr, version, protocol); if (IS_ERR(mnt_clnt)) return PTR_ERR(mnt_clnt); @@ -73,15 +70,13 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, return status < 0? status : (result.status? -EACCES : 0); } -static struct rpc_clnt * -mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, - int protocol) +static struct rpc_clnt *mnt_create(struct sockaddr_in *srvaddr, int version, + int protocol) { struct rpc_create_args args = { .protocol = protocol, .address = (struct sockaddr *)srvaddr, .addrsize = sizeof(*srvaddr), - .servername = hostname, .program = &mnt_program, .version = version, .authflavor = RPC_AUTH_UNIX, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6b1b487db1ec..5443c52b57aa 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -394,7 +394,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, }; - char clientname[16]; int status; if (atomic_read(&cb->cb_set)) @@ -417,11 +416,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) memset(program->stats, 0, sizeof(cb->cb_stat)); program->stats->program = program; - /* Just here to make some printk's more useful: */ - snprintf(clientname, sizeof(clientname), - "%u.%u.%u.%u", NIPQUAD(addr.sin_addr)); - args.servername = clientname; - /* Create RPC client */ cb->cb_client = rpc_create(&args); if (IS_ERR(cb->cb_client)) { -- cgit v1.2.1 From 3ea97309e6b18bce200211b3f9188e8023321adc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:27 -0400 Subject: NFS: Remake nfsroot_mount as a permanent part of NFS client In preparation for supporting NFSv2 and NFSv3 mount option handling in the kernel NFS client, convert mount_clnt.c to be a permanent part of the NFS client, instead of built only when CONFIG_ROOT_NFS is enabled. In addition, we also replace the "struct sockaddr_in *" argument with something more generic, to help support IPv6 at some later point. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 4 ++-- fs/nfs/mount_clnt.c | 55 +++++++++++++++++++++++++++-------------------------- fs/nfs/nfsroot.c | 3 ++- 3 files changed, 32 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index f4580b44eef4..b55cb236cf74 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,8 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o -nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o + write.o namespace.o mount_clnt.o +nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 2892ec843066..ee4899d96629 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -18,7 +18,7 @@ #include #ifdef RPC_DEBUG -# define NFSDBG_FACILITY NFSDBG_ROOT +# define NFSDBG_FACILITY NFSDBG_MOUNT #endif /* @@ -28,7 +28,6 @@ #define MOUNT_UMNT 3 */ -static struct rpc_clnt * mnt_create(struct sockaddr_in *, int, int); static struct rpc_program mnt_program; struct mnt_fhstatus { @@ -36,14 +35,21 @@ struct mnt_fhstatus { struct nfs_fh * fh; }; -/* - * Obtain an NFS file handle for the given host and path +/** + * nfs_mount - Obtain an NFS file handle for the given host and path + * @addr: pointer to server's address + * @len: size of server's address + * @hostname: name of server host, or NULL + * @path: pointer to string containing export path to mount + * @version: mount version to use for this request + * @protocol: transport protocol to use for thie request + * @fh: pointer to location to place returned file handle + * + * Uses default timeout parameters specified by underlying transport. */ -int -nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, - int version, int protocol) +int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, + int version, int protocol, struct nfs_fh *fh) { - struct rpc_clnt *mnt_clnt; struct mnt_fhstatus result = { .fh = fh }; @@ -51,12 +57,23 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, .rpc_argp = path, .rpc_resp = &result, }; + struct rpc_create_args args = { + .protocol = protocol, + .address = addr, + .addrsize = len, + .servername = hostname, + .program = &mnt_program, + .version = version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_INTR, + }; + struct rpc_clnt *mnt_clnt; int status; - dprintk("NFS: nfs_mount(%08x:%s)\n", - (unsigned)ntohl(addr->sin_addr.s_addr), path); + dprintk("NFS: sending MNT request for %s:%s\n", + (hostname ? hostname : "server"), path); - mnt_clnt = mnt_create(addr, version, protocol); + mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) return PTR_ERR(mnt_clnt); @@ -70,22 +87,6 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, return status < 0? status : (result.status? -EACCES : 0); } -static struct rpc_clnt *mnt_create(struct sockaddr_in *srvaddr, int version, - int protocol) -{ - struct rpc_create_args args = { - .protocol = protocol, - .address = (struct sockaddr *)srvaddr, - .addrsize = sizeof(*srvaddr), - .program = &mnt_program, - .version = version, - .authflavor = RPC_AUTH_UNIX, - .flags = RPC_CLNT_CREATE_INTR, - }; - - return rpc_create(&args); -} - /* * XDR encode/decode functions for MOUNT */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index f0db4703b1c9..3490322d1145 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -496,7 +496,8 @@ static int __init root_nfs_get_handle(void) NFS_MNT3_VERSION : NFS_MNT_VERSION; set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfsroot_mount(&sin, nfs_path, &fh, version, protocol); + status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, + nfs_path, version, protocol, &fh); if (status < 0) printk(KERN_ERR "Root-NFS: Server returned error %d " "while mounting %s\n", status, nfs_path); -- cgit v1.2.1 From 19207231c9874899e7511507ebb1b88d648a5743 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:33 -0400 Subject: NFS: Clean up in-kernel NFS mount Clean up white space and coding conventions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 83 ++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index ee4899d96629..961dc5243277 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -1,7 +1,5 @@ /* - * linux/fs/nfs/mount_clnt.c - * - * MOUNT client to support NFSroot. + * In-kernel MOUNT protocol client * * Copyright (C) 1997, Olaf Kirch */ @@ -21,18 +19,11 @@ # define NFSDBG_FACILITY NFSDBG_MOUNT #endif -/* -#define MOUNT_PROGRAM 100005 -#define MOUNT_VERSION 1 -#define MOUNT_MNT 1 -#define MOUNT_UMNT 3 - */ - static struct rpc_program mnt_program; struct mnt_fhstatus { - unsigned int status; - struct nfs_fh * fh; + u32 status; + struct nfs_fh *fh; }; /** @@ -90,8 +81,8 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, /* * XDR encode/decode functions for MOUNT */ -static int -xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path) +static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, + const char *path) { p = xdr_encode_string(p, path); @@ -99,8 +90,8 @@ xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path) return 0; } -static int -xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) +static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, + struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; @@ -111,8 +102,8 @@ xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) return 0; } -static int -xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) +static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, + struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; @@ -131,53 +122,53 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) #define MNT_fhstatus_sz (1 + 8) #define MNT_fhstatus3_sz (1 + 16) -static struct rpc_procinfo mnt_procedures[] = { -[MNTPROC_MNT] = { - .p_proc = MNTPROC_MNT, - .p_encode = (kxdrproc_t) xdr_encode_dirpath, - .p_decode = (kxdrproc_t) xdr_decode_fhstatus, - .p_arglen = MNT_dirpath_sz, - .p_replen = MNT_fhstatus_sz, - .p_statidx = MNTPROC_MNT, - .p_name = "MOUNT", +static struct rpc_procinfo mnt_procedures[] = { + [MNTPROC_MNT] = { + .p_proc = MNTPROC_MNT, + .p_encode = (kxdrproc_t) xdr_encode_dirpath, + .p_decode = (kxdrproc_t) xdr_decode_fhstatus, + .p_arglen = MNT_dirpath_sz, + .p_replen = MNT_fhstatus_sz, + .p_statidx = MNTPROC_MNT, + .p_name = "MOUNT", }, }; static struct rpc_procinfo mnt3_procedures[] = { -[MOUNTPROC3_MNT] = { - .p_proc = MOUNTPROC3_MNT, - .p_encode = (kxdrproc_t) xdr_encode_dirpath, - .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, - .p_arglen = MNT_dirpath_sz, - .p_replen = MNT_fhstatus3_sz, - .p_statidx = MOUNTPROC3_MNT, - .p_name = "MOUNT", + [MOUNTPROC3_MNT] = { + .p_proc = MOUNTPROC3_MNT, + .p_encode = (kxdrproc_t) xdr_encode_dirpath, + .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, + .p_arglen = MNT_dirpath_sz, + .p_replen = MNT_fhstatus3_sz, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", }, }; -static struct rpc_version mnt_version1 = { - .number = 1, - .nrprocs = 2, - .procs = mnt_procedures +static struct rpc_version mnt_version1 = { + .number = 1, + .nrprocs = 2, + .procs = mnt_procedures, }; -static struct rpc_version mnt_version3 = { - .number = 3, - .nrprocs = 2, - .procs = mnt3_procedures +static struct rpc_version mnt_version3 = { + .number = 3, + .nrprocs = 2, + .procs = mnt3_procedures, }; -static struct rpc_version * mnt_version[] = { +static struct rpc_version *mnt_version[] = { NULL, &mnt_version1, NULL, &mnt_version3, }; -static struct rpc_stat mnt_stats; +static struct rpc_stat mnt_stats; -static struct rpc_program mnt_program = { +static struct rpc_program mnt_program = { .name = "mount", .number = NFS_MNT_PROGRAM, .nrvers = ARRAY_SIZE(mnt_version), -- cgit v1.2.1 From 013a8c1ab5a214c608e12b602770449fb6b15a81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:38 -0400 Subject: NFS: Improve debugging output in NFS in-kernel mount client Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 961dc5243277..8afd9f7e7a97 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -66,7 +66,7 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) - return PTR_ERR(mnt_clnt); + goto out_clnt_err; if (version == NFS_MNT3_VERSION) msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; @@ -75,7 +75,31 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, status = rpc_call_sync(mnt_clnt, &msg, 0); rpc_shutdown_client(mnt_clnt); - return status < 0? status : (result.status? -EACCES : 0); + + if (status < 0) + goto out_call_err; + if (result.status != 0) + goto out_mnt_err; + + dprintk("NFS: MNT request succeeded\n"); + status = 0; + +out: + return status; + +out_clnt_err: + status = PTR_ERR(mnt_clnt); + dprintk("NFS: failed to create RPC client, status=%d\n", status); + goto out; + +out_call_err: + dprintk("NFS: failed to start MNT request, status=%d\n", status); + goto out; + +out_mnt_err: + dprintk("NFS: MNT server returned result %d\n", result.status); + status = -EACCES; + goto out; } /* -- cgit v1.2.1 From bf0fd7680f1cf31b9cbabcc037a204548e2c866d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:44 -0400 Subject: NFS: Add enums and match tables for mount option parsing This generic infrastructure works for both NFS and NFSv4 mounts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 528 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ed3ec4477a0f..7e56411e55fc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,164 @@ #define NFSDBG_FACILITY NFSDBG_VFS + +struct nfs_parsed_mount_data { + int flags; + int rsize, wsize; + int timeo, retrans; + int acregmin, acregmax, + acdirmin, acdirmax; + int namlen; + unsigned int bsize; + unsigned int auth_flavor_len; + rpc_authflavor_t auth_flavors[1]; + char *client_address; + + struct { + struct sockaddr_in address; + unsigned int program; + unsigned int version; + unsigned short port; + int protocol; + } mount_server; + + struct { + struct sockaddr_in address; + char *hostname; + char *export_path; + unsigned int program; + int protocol; + } nfs_server; +}; + +enum { + /* Mount options that take no arguments */ + Opt_soft, Opt_hard, + Opt_intr, Opt_nointr, + Opt_posix, Opt_noposix, + Opt_cto, Opt_nocto, + Opt_ac, Opt_noac, + Opt_lock, Opt_nolock, + Opt_v2, Opt_v3, + Opt_udp, Opt_tcp, + Opt_acl, Opt_noacl, + Opt_rdirplus, Opt_nordirplus, + + /* Mount options that take integer arguments */ + Opt_port, + Opt_rsize, Opt_wsize, Opt_bsize, + Opt_timeo, Opt_retrans, + Opt_acregmin, Opt_acregmax, + Opt_acdirmin, Opt_acdirmax, + Opt_actimeo, + Opt_namelen, + Opt_mountport, + Opt_mountprog, Opt_mountvers, + Opt_nfsprog, Opt_nfsvers, + + /* Mount options that take string arguments */ + Opt_sec, Opt_proto, Opt_mountproto, + Opt_addr, Opt_mounthost, Opt_clientaddr, + + /* Mount options that are ignored */ + Opt_userspace, Opt_deprecated, + + Opt_err +}; + +static match_table_t nfs_mount_option_tokens = { + { Opt_userspace, "bg" }, + { Opt_userspace, "fg" }, + { Opt_soft, "soft" }, + { Opt_hard, "hard" }, + { Opt_intr, "intr" }, + { Opt_nointr, "nointr" }, + { Opt_posix, "posix" }, + { Opt_noposix, "noposix" }, + { Opt_cto, "cto" }, + { Opt_nocto, "nocto" }, + { Opt_ac, "ac" }, + { Opt_noac, "noac" }, + { Opt_lock, "lock" }, + { Opt_nolock, "nolock" }, + { Opt_v2, "v2" }, + { Opt_v3, "v3" }, + { Opt_udp, "udp" }, + { Opt_tcp, "tcp" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_rdirplus, "rdirplus" }, + { Opt_nordirplus, "nordirplus" }, + + { Opt_port, "port=%u" }, + { Opt_rsize, "rsize=%u" }, + { Opt_wsize, "wsize=%u" }, + { Opt_bsize, "bsize=%u" }, + { Opt_timeo, "timeo=%u" }, + { Opt_retrans, "retrans=%u" }, + { Opt_acregmin, "acregmin=%u" }, + { Opt_acregmax, "acregmax=%u" }, + { Opt_acdirmin, "acdirmin=%u" }, + { Opt_acdirmax, "acdirmax=%u" }, + { Opt_actimeo, "actimeo=%u" }, + { Opt_userspace, "retry=%u" }, + { Opt_namelen, "namlen=%u" }, + { Opt_mountport, "mountport=%u" }, + { Opt_mountprog, "mountprog=%u" }, + { Opt_mountvers, "mountvers=%u" }, + { Opt_nfsprog, "nfsprog=%u" }, + { Opt_nfsvers, "nfsvers=%u" }, + { Opt_nfsvers, "vers=%u" }, + + { Opt_sec, "sec=%s" }, + { Opt_proto, "proto=%s" }, + { Opt_mountproto, "mountproto=%s" }, + { Opt_addr, "addr=%s" }, + { Opt_clientaddr, "clientaddr=%s" }, + { Opt_mounthost, "mounthost=%s" }, + + { Opt_err, NULL } +}; + +enum { + Opt_xprt_udp, Opt_xprt_tcp, + + Opt_xprt_err +}; + +static match_table_t nfs_xprt_protocol_tokens = { + { Opt_xprt_udp, "udp" }, + { Opt_xprt_tcp, "tcp" }, + + { Opt_xprt_err, NULL } +}; + +enum { + Opt_sec_none, Opt_sec_sys, + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, + Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, + + Opt_sec_err +}; + +static match_table_t nfs_secflavor_tokens = { + { Opt_sec_none, "none" }, + { Opt_sec_none, "null" }, + { Opt_sec_sys, "sys" }, + + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + + { Opt_sec_lkey, "lkey" }, + { Opt_sec_lkeyi, "lkeyi" }, + { Opt_sec_lkeyp, "lkeyp" }, + + { Opt_sec_err, NULL } +}; + + static void nfs_umount_begin(struct vfsmount *, int); static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); @@ -463,6 +622,375 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } +/* + * Error-check and convert a string of mount options from user space into + * a data structure + */ +static int nfs_parse_mount_options(char *raw, + struct nfs_parsed_mount_data *mnt) +{ + char *p, *string; + + if (!raw) { + dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); + return 1; + } + dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw); + + while ((p = strsep(&raw, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int option, token; + + if (!*p) + continue; + + dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p); + + token = match_token(p, nfs_mount_option_tokens, args); + switch (token) { + case Opt_soft: + mnt->flags |= NFS_MOUNT_SOFT; + break; + case Opt_hard: + mnt->flags &= ~NFS_MOUNT_SOFT; + break; + case Opt_intr: + mnt->flags |= NFS_MOUNT_INTR; + break; + case Opt_nointr: + mnt->flags &= ~NFS_MOUNT_INTR; + break; + case Opt_posix: + mnt->flags |= NFS_MOUNT_POSIX; + break; + case Opt_noposix: + mnt->flags &= ~NFS_MOUNT_POSIX; + break; + case Opt_cto: + mnt->flags &= ~NFS_MOUNT_NOCTO; + break; + case Opt_nocto: + mnt->flags |= NFS_MOUNT_NOCTO; + break; + case Opt_ac: + mnt->flags &= ~NFS_MOUNT_NOAC; + break; + case Opt_noac: + mnt->flags |= NFS_MOUNT_NOAC; + break; + case Opt_lock: + mnt->flags &= ~NFS_MOUNT_NONLM; + break; + case Opt_nolock: + mnt->flags |= NFS_MOUNT_NONLM; + break; + case Opt_v2: + mnt->flags &= ~NFS_MOUNT_VER3; + break; + case Opt_v3: + mnt->flags |= NFS_MOUNT_VER3; + break; + case Opt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = IPPROTO_UDP; + mnt->timeo = 7; + mnt->retrans = 5; + break; + case Opt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = IPPROTO_TCP; + mnt->timeo = 600; + mnt->retrans = 2; + break; + case Opt_acl: + mnt->flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_noacl: + mnt->flags |= NFS_MOUNT_NOACL; + break; + case Opt_rdirplus: + mnt->flags &= ~NFS_MOUNT_NORDIRPLUS; + break; + case Opt_nordirplus: + mnt->flags |= NFS_MOUNT_NORDIRPLUS; + break; + + case Opt_port: + if (match_int(args, &option)) + return 0; + if (option < 0 || option > 65535) + return 0; + mnt->nfs_server.address.sin_port = htonl(option); + break; + case Opt_rsize: + if (match_int(args, &mnt->rsize)) + return 0; + break; + case Opt_wsize: + if (match_int(args, &mnt->wsize)) + return 0; + break; + case Opt_bsize: + if (match_int(args, &option)) + return 0; + if (option < 0) + return 0; + mnt->bsize = option; + break; + case Opt_timeo: + if (match_int(args, &mnt->timeo)) + return 0; + break; + case Opt_retrans: + if (match_int(args, &mnt->retrans)) + return 0; + break; + case Opt_acregmin: + if (match_int(args, &mnt->acregmin)) + return 0; + break; + case Opt_acregmax: + if (match_int(args, &mnt->acregmax)) + return 0; + break; + case Opt_acdirmin: + if (match_int(args, &mnt->acdirmin)) + return 0; + break; + case Opt_acdirmax: + if (match_int(args, &mnt->acdirmax)) + return 0; + break; + case Opt_actimeo: + if (match_int(args, &option)) + return 0; + if (option < 0) + return 0; + mnt->acregmin = + mnt->acregmax = + mnt->acdirmin = + mnt->acdirmax = option; + break; + case Opt_namelen: + if (match_int(args, &mnt->namlen)) + return 0; + break; + case Opt_mountport: + if (match_int(args, &option)) + return 0; + if (option < 0 || option > 65535) + return 0; + mnt->mount_server.port = option; + break; + case Opt_mountprog: + if (match_int(args, &option)) + return 0; + if (option < 0) + return 0; + mnt->mount_server.program = option; + break; + case Opt_mountvers: + if (match_int(args, &option)) + return 0; + if (option < 0) + return 0; + mnt->mount_server.version = option; + break; + case Opt_nfsprog: + if (match_int(args, &option)) + return 0; + if (option < 0) + return 0; + mnt->nfs_server.program = option; + break; + case Opt_nfsvers: + if (match_int(args, &option)) + return 0; + switch (option) { + case 2: + mnt->flags &= ~NFS_MOUNT_VER3; + break; + case 3: + mnt->flags |= NFS_MOUNT_VER3; + break; + default: + goto out_unrec_vers; + } + break; + + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_secflavor_tokens, args); + kfree(string); + + /* + * The flags setting is for v2/v3. The flavor_len + * setting is for v4. v2/v3 also need to know the + * difference between NULL and UNIX. + */ + switch (token) { + case Opt_sec_none: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + goto out_unrec_sec; + } + break; + case Opt_proto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + kfree(string); + + switch (token) { + case Opt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = IPPROTO_UDP; + mnt->timeo = 7; + mnt->retrans = 5; + break; + case Opt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = IPPROTO_TCP; + mnt->timeo = 600; + mnt->retrans = 2; + break; + default: + goto out_unrec_xprt; + } + break; + case Opt_mountproto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + kfree(string); + + switch (token) { + case Opt_udp: + mnt->mount_server.protocol = IPPROTO_UDP; + break; + case Opt_tcp: + mnt->mount_server.protocol = IPPROTO_TCP; + break; + default: + goto out_unrec_xprt; + } + break; + case Opt_addr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->nfs_server.address.sin_family = AF_INET; + mnt->nfs_server.address.sin_addr.s_addr = + in_aton(string); + kfree(string); + break; + case Opt_clientaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->client_address = string; + break; + case Opt_mounthost: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->mount_server.address.sin_family = AF_INET; + mnt->mount_server.address.sin_addr.s_addr = + in_aton(string); + kfree(string); + break; + + case Opt_userspace: + case Opt_deprecated: + break; + + default: + goto out_unknown; + } + } + + return 1; + +out_nomem: + printk(KERN_INFO "NFS: not enough memory to parse option\n"); + return 0; + +out_unrec_vers: + printk(KERN_INFO "NFS: unrecognized NFS version number\n"); + return 0; + +out_unrec_xprt: + printk(KERN_INFO "NFS: unrecognized transport protocol\n"); + return 0; + +out_unrec_sec: + printk(KERN_INFO "NFS: unrecognized security flavor\n"); + return 0; + +out_unknown: + printk(KERN_INFO "NFS: unknown mount option: %s\n", p); + return 0; +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle -- cgit v1.2.1 From 0076d7b7bab580ca2e94637d351fa7cd357743a8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:49 -0400 Subject: NFS: Introduce generic mount client API For NFSv2 and v3 mounts, the first step is to contact the server's MOUNTD and request the file handle for the root of the mounted share. Add a function to the NFS client that handles this operation. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7e56411e55fc..48db52a7067a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -991,6 +991,63 @@ out_unknown: return 0; } +/* + * Use the remote server's MOUNT service to request the NFS file handle + * corresponding to the provided path. + */ +static int nfs_try_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh) +{ + struct sockaddr_in sin; + int status; + + if (args->mount_server.version == 0) { + if (args->flags & NFS_MOUNT_VER3) + args->mount_server.version = NFS_MNT3_VERSION; + else + args->mount_server.version = NFS_MNT_VERSION; + } + + /* + * Construct the mount server's address. + */ + if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY) + sin = args->mount_server.address; + else + sin = args->nfs_server.address; + if (args->mount_server.port == 0) { + status = rpcb_getport_sync(&sin, + args->mount_server.program, + args->mount_server.version, + args->mount_server.protocol); + if (status < 0) + goto out_err; + sin.sin_port = htons(status); + } else + sin.sin_port = htons(args->mount_server.port); + + /* + * Now ask the mount server to map our export path + * to a file handle. + */ + status = nfs_mount((struct sockaddr *) &sin, + sizeof(sin), + args->nfs_server.hostname, + args->nfs_server.export_path, + args->mount_server.version, + args->mount_server.protocol, + root_fh); + if (status < 0) + goto out_err; + + return status; + +out_err: + dfprintk(MOUNT, "NFS: unable to contact server on host " + NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr)); + return status; +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle -- cgit v1.2.1 From 136d558ce766967fe3cbf54c3351aba261b5d53b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:54 -0400 Subject: NFS: Add final pieces to support in-kernel mount option parsing Hook in final components required for supporting in-kernel mount option parsing for NFSv2 and NFSv3 mounts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 48db52a7067a..757aa3b7e64b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1051,10 +1051,28 @@ out_err: /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle + * + * For option strings, user space handles the following behaviors: + * + * + DNS: mapping server host name to IP address ("addr=" option) + * + * + failure mode: how to behave if a mount request can't be handled + * immediately ("fg/bg" option) + * + * + retry: how often to retry a mount request ("retry=" option) + * + * + breaking back: trying proto=udp after proto=tcp, v2 after v3, + * mountproto=tcp after mountproto=udp, and so on + * + * XXX: as far as I can tell, changing the NFS program number is not + * supported in the NFS client. */ -static int nfs_validate_mount_data(struct nfs_mount_data *data, - struct nfs_fh *mntfh) +static int nfs_validate_mount_data(struct nfs_mount_data **options, + struct nfs_fh *mntfh, + const char *dev_name) { + struct nfs_mount_data *data = *options; + if (data == NULL) goto out_no_data; @@ -1087,8 +1105,78 @@ static int nfs_validate_mount_data(struct nfs_mount_data *data, memset(mntfh->data + mntfh->size, 0, sizeof(mntfh->data) - mntfh->size); break; - default: - goto out_bad_version; + default: { + unsigned int len; + char *c; + int status; + struct nfs_parsed_mount_data args = { + .flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP), + .rsize = NFS_MAX_FILE_IO_SIZE, + .wsize = NFS_MAX_FILE_IO_SIZE, + .timeo = 600, + .retrans = 2, + .acregmin = 3, + .acregmax = 60, + .acdirmin = 30, + .acdirmax = 60, + .mount_server.protocol = IPPROTO_UDP, + .mount_server.program = NFS_MNT_PROGRAM, + .nfs_server.protocol = IPPROTO_TCP, + .nfs_server.program = NFS_PROGRAM, + }; + + if (nfs_parse_mount_options((char *) *options, &args) == 0) + return -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* + * NB: after this point, caller will free "data" + * if we return an error + */ + *options = data; + + c = strchr(dev_name, ':'); + if (c == NULL) + return -EINVAL; + len = c - dev_name - 1; + if (len > sizeof(data->hostname)) + return -EINVAL; + strncpy(data->hostname, dev_name, len); + args.nfs_server.hostname = data->hostname; + + c++; + if (strlen(c) > NFS_MAXPATHLEN) + return -EINVAL; + args.nfs_server.export_path = c; + + status = nfs_try_mount(&args, mntfh); + if (status) + return -EINVAL; + + /* + * Translate to nfs_mount_data, which nfs_fill_super + * can deal with. + */ + data->version = 6; + data->flags = args.flags; + data->rsize = args.rsize; + data->wsize = args.wsize; + data->timeo = args.timeo; + data->retrans = args.retrans; + data->acregmin = args.acregmin; + data->acregmax = args.acregmax; + data->acdirmin = args.acdirmin; + data->acdirmax = args.acdirmax; + data->addr = args.nfs_server.address; + data->namlen = args.namlen; + data->bsize = args.bsize; + data->pseudoflavor = args.auth_flavors[0]; + + break; + } } if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) @@ -1117,11 +1205,6 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -out_bad_version: - dfprintk(MOUNT, "NFS: bad nfs_mount_data version %d\n", - data->version); - return -EINVAL; - #ifndef CONFIG_NFS_V3 out_v3_not_compiled: dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); @@ -1242,7 +1325,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, int error; /* Validate the mount data */ - error = nfs_validate_mount_data(data, &mntfh); + error = nfs_validate_mount_data(&data, &mntfh, dev_name); if (error < 0) goto out; @@ -1283,6 +1366,8 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: + if (data != raw_data) + kfree(data); return error; out_err_nosb: -- cgit v1.2.1 From 8007122520f0a3599bdc4df47358a5d83b2574aa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 1 Jul 2007 12:13:59 -0400 Subject: NFS: Add support for mounting NFSv4 file systems with string options Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 757aa3b7e64b..064e69d2fdde 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1541,8 +1541,90 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, *ip_addr = c; break; - default: - goto out_bad_version; + default: { + unsigned int len; + struct nfs_parsed_mount_data args = { + .rsize = NFS_MAX_FILE_IO_SIZE, + .wsize = NFS_MAX_FILE_IO_SIZE, + .timeo = 600, + .retrans = 2, + .acregmin = 3, + .acregmax = 60, + .acdirmin = 30, + .acdirmax = 60, + .nfs_server.protocol = IPPROTO_TCP, + }; + + if (nfs_parse_mount_options((char *) *options, &args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address((struct sockaddr *) + &args.nfs_server.address)) + return -EINVAL; + *addr = args.nfs_server.address; + + switch (args.auth_flavor_len) { + case 0: + *authflavour = RPC_AUTH_UNIX; + break; + case 1: + *authflavour = (rpc_authflavor_t) args.auth_flavors[0]; + break; + default: + goto out_inval_auth; + } + + /* + * Translate to nfs4_mount_data, which nfs4_fill_super + * can deal with. + */ + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + *options = data; + + data->version = 1; + data->flags = args.flags & NFS4_MOUNT_FLAGMASK; + data->rsize = args.rsize; + data->wsize = args.wsize; + data->timeo = args.timeo; + data->retrans = args.retrans; + data->acregmin = args.acregmin; + data->acregmax = args.acregmax; + data->acdirmin = args.acdirmin; + data->acdirmax = args.acdirmax; + data->proto = args.nfs_server.protocol; + + /* + * Split "dev_name" into "hostname:mntpath". + */ + c = strchr(dev_name, ':'); + if (c == NULL) + return -EINVAL; + /* while calculating len, pretend ':' is '\0' */ + len = c - dev_name; + if (len > NFS4_MAXNAMLEN) + return -EINVAL; + *hostname = kzalloc(len, GFP_KERNEL); + if (*hostname == NULL) + return -ENOMEM; + strncpy(*hostname, dev_name, len - 1); + + c++; /* step over the ':' */ + len = strlen(c); + if (len > NFS4_MAXPATHLEN) + return -EINVAL; + *mntpath = kzalloc(len + 1, GFP_KERNEL); + if (*mntpath == NULL) + return -ENOMEM; + strncpy(*mntpath, c, len); + + dprintk("MNTPATH: %s\n", *mntpath); + + *ip_addr = args.client_address; + + break; + } } return 0; @@ -1559,11 +1641,6 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; - -out_bad_version: - dfprintk(MOUNT, "NFS4: bad nfs_mount_data version %d\n", - data->version); - return -EINVAL; } /* -- cgit v1.2.1 From 75180df2ed467866ada839fe73cf7cc7d75c0a22 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 May 2007 16:53:28 -0400 Subject: NFS: Add the mount option "nosharecache" Prior to David Howell's mount changes in 2.6.18, users who mounted different directories which happened to be from the same filesystem on the server would get different super blocks, and hence could choose different mount options. As long as there were no hard linked files that crossed from one subtree to another, this was quite safe. Post the changes, if the two directories are on the same filesystem (have the same 'fsid'), they will share the same super block, and hence the same mount options. Add a flag to allow users to elect not to share the NFS super block with another mount point, even if the fsids are the same. This will allow users to set different mount options for the two different super blocks, as was previously possible. It is still up to the user to ensure that there are no cache coherency issues when doing this, however the default behaviour will be to share super blocks whenever two paths result in the same fsid. Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 064e69d2fdde..1b555cd41e3b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -100,6 +100,7 @@ enum { Opt_udp, Opt_tcp, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, + Opt_sharecache, Opt_nosharecache, /* Mount options that take integer arguments */ Opt_port, @@ -146,6 +147,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_noacl, "noacl" }, { Opt_rdirplus, "rdirplus" }, { Opt_nordirplus, "nordirplus" }, + { Opt_sharecache, "sharecache" }, + { Opt_nosharecache, "nosharecache" }, { Opt_port, "port=%u" }, { Opt_rsize, "rsize=%u" }, @@ -450,6 +453,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; @@ -714,6 +718,12 @@ static int nfs_parse_mount_options(char *raw, case Opt_nordirplus: mnt->flags |= NFS_MOUNT_NORDIRPLUS; break; + case Opt_sharecache: + mnt->flags &= ~NFS_MOUNT_UNSHARED; + break; + case Opt_nosharecache: + mnt->flags |= NFS_MOUNT_UNSHARED; + break; case Opt_port: if (match_int(args, &option)) @@ -1309,6 +1319,9 @@ static int nfs_compare_super(struct super_block *sb, void *data) if (old->nfs_client != server->nfs_client) return 0; + /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ + if (old->flags & NFS_MOUNT_UNSHARED) + return 0; if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) return 0; return 1; @@ -1322,6 +1335,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, struct nfs_fh mntfh; struct nfs_mount_data *data = raw_data; struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; int error; /* Validate the mount data */ @@ -1336,8 +1350,11 @@ static int nfs_get_sb(struct file_system_type *fs_type, goto out; } + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, nfs_compare_super, nfs_set_super, server); + s = sget(fs_type, compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; @@ -1402,6 +1419,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, struct super_block *s; struct nfs_server *server; struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; int error; dprintk("--> nfs_xdev_get_sb()\n"); @@ -1413,8 +1431,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, goto out_err_noserver; } + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + s = sget(&nfs_fs_type, compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; @@ -1657,6 +1678,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_fh mntfh; struct dentry *mntroot; char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; int error; /* Validate the mount data */ @@ -1673,8 +1695,11 @@ static int nfs4_get_sb(struct file_system_type *fs_type, goto out; } + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, nfs_compare_super, nfs_set_super, server); + s = sget(fs_type, compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_free; @@ -1740,6 +1765,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, struct super_block *s; struct nfs_server *server; struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; int error; dprintk("--> nfs4_xdev_get_sb()\n"); @@ -1751,8 +1777,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, goto out_err_noserver; } + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + s = sget(&nfs_fs_type, compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; @@ -1807,6 +1836,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot; struct nfs_fh mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; int error; dprintk("--> nfs4_referral_get_sb()\n"); @@ -1818,8 +1848,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, goto out_err_noserver; } + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + s = sget(&nfs_fs_type, compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; -- cgit v1.2.1 From 275a5d24bf56b2d9dd4644c54a56366b89a028f1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 May 2007 16:53:28 -0400 Subject: NFS: Error when mounting the same filesystem with different options Unless the user sets the NFS_MOUNT_NOSHAREDCACHE mount flag, we should return EBUSY if the filesystem is already mounted on a superblock that has set conflicting mount options. Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 1b555cd41e3b..a2b1af89ca1a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1317,7 +1317,9 @@ static int nfs_compare_super(struct super_block *sb, void *data) { struct nfs_server *server = data, *old = NFS_SB(sb); - if (old->nfs_client != server->nfs_client) + if (memcmp(&old->nfs_client->cl_addr, + &server->nfs_client->cl_addr, + sizeof(old->nfs_client->cl_addr)) != 0) return 0; /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ if (old->flags & NFS_MOUNT_UNSHARED) @@ -1327,6 +1329,39 @@ static int nfs_compare_super(struct super_block *sb, void *data) return 1; } +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + +static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) +{ + const struct nfs_server *a = s->s_fs_info; + const struct rpc_clnt *clnt_a = a->client; + const struct rpc_clnt *clnt_b = b->client; + + if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK)) + goto Ebusy; + if (a->nfs_client != b->nfs_client) + goto Ebusy; + if (a->flags != b->flags) + goto Ebusy; + if (a->wsize != b->wsize) + goto Ebusy; + if (a->rsize != b->rsize) + goto Ebusy; + if (a->acregmin != b->acregmin) + goto Ebusy; + if (a->acregmax != b->acregmax) + goto Ebusy; + if (a->acdirmin != b->acdirmin) + goto Ebusy; + if (a->acdirmax != b->acdirmax) + goto Ebusy; + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + goto Ebusy; + return 0; +Ebusy: + return -EBUSY; +} + static int nfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { @@ -1361,8 +1396,11 @@ static int nfs_get_sb(struct file_system_type *fs_type, } if (s->s_fs_info != server) { + error = nfs_compare_mount_options(s, server, flags); nfs_free_server(server); server = NULL; + if (error < 0) + goto error_splat_super; } if (!s->s_root) { @@ -1442,8 +1480,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, } if (s->s_fs_info != server) { + error = nfs_compare_mount_options(s, server, flags); nfs_free_server(server); server = NULL; + if (error < 0) + goto error_splat_super; } if (!s->s_root) { -- cgit v1.2.1 From 6f2e64d3e1f661095e274c9d9d47e3f39a6cf1c0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Jul 2007 10:53:21 -0400 Subject: NFSv4: Make the NFS state model work with the nosharedcache mount option Consider the case where the user has mounted the remote filesystem server:/foo on the two local directories /bar and /baz using the nosharedcache mount option. The files /bar/file and /baz/file are represented by different inodes in the local namespace, but refer to the same file /foo/file on the server. Consider the case where a process opens both /bar/file and /baz/file, then closes /bar/file: because the nfs4_state is not shared between /bar/file and /baz/file, the kernel will see that the nfs4_state for /bar/file is no longer referenced, so it will send off a CLOSE rpc call. Unless the open_owners differ, then that CLOSE call will invalidate the open state on /baz/file too. Conclusion: we cannot share open state owners between two different non-shared mount instances of the same filesystem. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4state.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index dd1aa2b598ce..6c028e734fe6 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -83,6 +83,7 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_client *so_client; + struct nfs_server *so_server; struct rb_node so_client_node; struct rpc_cred *so_cred; /* Associated cred */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 523cc2cbb5e1..e9662ba81d86 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -156,8 +156,9 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred) +nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) { + struct nfs_client *clp = server->nfs_client; struct rb_node **p = &clp->cl_state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp, *res = NULL; @@ -166,6 +167,14 @@ nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred) parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + if (server < sp->so_server) { + p = &parent->rb_left; + continue; + } + if (server > sp->so_server) { + p = &parent->rb_right; + continue; + } if (cred < sp->so_cred) p = &parent->rb_left; else if (cred > sp->so_cred) @@ -190,6 +199,14 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + if (new->so_server < sp->so_server) { + p = &parent->rb_left; + continue; + } + if (new->so_server > sp->so_server) { + p = &parent->rb_right; + continue; + } if (new->so_cred < sp->so_cred) p = &parent->rb_left; else if (new->so_cred > sp->so_cred) @@ -260,7 +277,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct struct nfs4_state_owner *sp, *new; spin_lock(&clp->cl_lock); - sp = nfs4_find_state_owner(clp, cred); + sp = nfs4_find_state_owner(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) return sp; @@ -268,6 +285,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct if (new == NULL) return NULL; new->so_client = clp; + new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); sp = nfs4_insert_state_owner(clp, new); -- cgit v1.2.1 From c98451bdb2f3e6d6cc1e03adad641e9497512b49 Mon Sep 17 00:00:00 2001 From: Frank van Maarseveen Date: Mon, 9 Jul 2007 22:25:29 +0200 Subject: NLM: fix source address of callback to client Use the destination address of the original NLM request as the source address in callbacks to the client. Signed-off-by: Frank van Maarseveen Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index c252a1c95857..572601e98dcd 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -44,9 +44,8 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, */ static struct nlm_host * nlm_lookup_host(int server, const struct sockaddr_in *sin, - int proto, int version, - const char *hostname, - int hostname_len) + int proto, int version, const char *hostname, + int hostname_len, const struct sockaddr_in *ssin) { struct hlist_head *chain; struct hlist_node *pos; @@ -54,7 +53,9 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, struct nsm_handle *nsm = NULL; int hash; - dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n", + dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT + ", p=%d, v=%d, my role=%s, name=%.*s)\n", + NIPQUAD(ssin->sin_addr.s_addr), NIPQUAD(sin->sin_addr.s_addr), proto, version, server? "server" : "client", hostname_len, @@ -91,6 +92,8 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, continue; if (host->h_server != server) continue; + if (!nlm_cmp_addr(&host->h_saddr, ssin)) + continue; /* Move to head of hash chain. */ hlist_del(&host->h_hash); @@ -118,6 +121,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, host->h_name = nsm->sm_name; host->h_addr = *sin; host->h_addr.sin_port = 0; /* ouch! */ + host->h_saddr = *ssin; host->h_version = version; host->h_proto = proto; host->h_rpcclnt = NULL; @@ -174,8 +178,10 @@ struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, const char *hostname, int hostname_len) { + struct sockaddr_in ssin = {0}; + return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len); + hostname, hostname_len, &ssin); } /* @@ -185,9 +191,12 @@ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp, const char *hostname, int hostname_len) { + struct sockaddr_in ssin = {0}; + + ssin.sin_addr = rqstp->rq_daddr.addr; return nlm_lookup_host(1, svc_addr_in(rqstp), rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len); + hostname, hostname_len, &ssin); } /* @@ -198,8 +207,9 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host(%08x)\n", - (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); + dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", + NIPQUAD(host->h_saddr.sin_addr), + NIPQUAD(host->h_addr.sin_addr)); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -226,6 +236,7 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = (struct sockaddr *)&host->h_addr, .addrsize = sizeof(host->h_addr), + .saddress = (struct sockaddr *)&host->h_saddr, .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, -- cgit v1.2.1 From 137d6acaa64afa4cf3d977417424e731ea04705a Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Mon, 9 Jul 2007 15:32:29 -0700 Subject: NFSv4: Make sure unlock is really an unlock when cancelling a lock I ran into a curious issue when a lock is being canceled. The cancellation results in a lock request to the vfs layer instead of an unlock request. This is particularly insidious when the process that owns the lock is exiting. In that case, sometimes the erroneous lock is applied AFTER the process has entered zombie state, preventing the lock from ever being released. Eventually other processes block on the lock causing a slow degredation of the system. In the 2.6.16 kernel this was investigated on, the problem is compounded by the fact that the cl_sem is held while blocking on the vfs lock, which results in most processes accessing the nfs file system in question hanging. In more detail, here is how the situation occurs: first _nfs4_do_setlk(): static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) ... ret = nfs4_wait_for_completion_rpc_task(task); if (ret == 0) { ... } else data->cancelled = 1; then nfs4_lock_release(): static void nfs4_lock_release(void *calldata) ... if (data->cancelled != 0) { struct rpc_task *task; task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); The problem is the same file_lock that was passed in to _nfs4_do_setlk() gets passed to nfs4_do_unlck() from nfs4_lock_release(). So the type is still F_RDLCK or FWRLCK, not F_UNLCK. At some point, when cancelling the lock, the type needs to be changed to F_UNLCK. It seemed easiest to do that in nfs4_do_unlck(), but it could be done in nfs4_lock_release(). The concern I had with doing it there was if something still needed the original file_lock, though it turns out the original file_lock still needs to be modified by nfs4_do_unlck() because nfs4_do_unlck() uses the original file_lock to pass to the vfs layer, and a copy of the original file_lock for the RPC request. It seems like the simplest solution is to force all situations where nfs4_do_unlck() is being used to result in an unlock, so with that in mind, I made the following change: Signed-off-by: Frank Filz Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ba86ec654c2e..fee2da856c95 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3275,6 +3275,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, { struct nfs4_unlockdata *data; + /* Ensure this is an unlock - when canceling a lock, the + * canceled lock is passed in, and it won't be an unlock. + */ + fl->fl_type = F_UNLCK; + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); if (data == NULL) { nfs_free_seqid(seqid); -- cgit v1.2.1 From cfc94cdf8e0f14e692a5a40ef3cc10f464b2511b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 May 2007 13:19:52 +0200 Subject: debugfs: add rename for debugfs files Implement debugfs_rename() to allow renaming files/directories in debugfs. Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index ec8896b264de..1d533a2ec3a6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -368,6 +368,69 @@ void debugfs_remove(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_remove); +/** + * debugfs_rename - rename a file/directory in the debugfs filesystem + * @old_dir: a pointer to the parent dentry for the renamed object. This + * should be a directory dentry. + * @old_dentry: dentry of an object to be renamed. + * @new_dir: a pointer to the parent dentry where the object should be + * moved. This should be a directory dentry. + * @new_name: a pointer to a string containing the target name. + * + * This function renames a file/directory in debugfs. The target must not + * exist for rename to succeed. + * + * This function will return a pointer to old_dentry (which is updated to + * reflect renaming) if it succeeds. If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, + struct dentry *new_dir, const char *new_name) +{ + int error; + struct dentry *dentry = NULL, *trap; + const char *old_name; + + trap = lock_rename(new_dir, old_dir); + /* Source or destination directories don't exist? */ + if (!old_dir->d_inode || !new_dir->d_inode) + goto exit; + /* Source does not exist, cyclic rename, or mountpoint? */ + if (!old_dentry->d_inode || old_dentry == trap || + d_mountpoint(old_dentry)) + goto exit; + dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); + /* Lookup failed, cyclic rename or target exists? */ + if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + goto exit; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + dentry); + if (error) { + fsnotify_oldname_free(old_name); + goto exit; + } + d_move(old_dentry, dentry); + fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), + NULL, old_dentry->d_inode); + fsnotify_oldname_free(old_name); + unlock_rename(new_dir, old_dir); + dput(dentry); + return old_dentry; +exit: + if (dentry && !IS_ERR(dentry)) + dput(dentry); + unlock_rename(new_dir, old_dir); + return NULL; +} +EXPORT_SYMBOL_GPL(debugfs_rename); + static decl_subsys(debug, NULL, NULL); static int __init debugfs_init(void) -- cgit v1.2.1 From fa7f912ad4ae0ed7591add52422e48282389652d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:13 +0900 Subject: sysfs: move release_sysfs_dirent() to dir.c There is no reason this function should be inlined and soon to follow sysfs object reference simplification will make it heavier. Move it to dir.c. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 12 ++++++++++++ fs/sysfs/sysfs.h | 13 +------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c4342a019972..2544aae6f583 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -15,6 +15,18 @@ DECLARE_RWSEM(sysfs_rename_sem); spinlock_t sysfs_lock = SPIN_LOCK_UNLOCKED; +void release_sysfs_dirent(struct sysfs_dirent * sd) +{ + if (sd->s_type & SYSFS_KOBJ_LINK) { + struct sysfs_symlink * sl = sd->s_element; + kfree(sl->link_name); + kobject_put(sl->target_kobj); + kfree(sl); + } + kfree(sd->s_iattr); + kmem_cache_free(sysfs_dir_cachep, sd); +} + static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) { struct sysfs_dirent * sd = dentry->d_fsdata; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 502c949c402d..687d959f606d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -18,6 +18,7 @@ extern void sysfs_delete_inode(struct inode *inode); extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, umode_t, int); @@ -99,18 +100,6 @@ static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) return kobj; } -static inline void release_sysfs_dirent(struct sysfs_dirent * sd) -{ - if (sd->s_type & SYSFS_KOBJ_LINK) { - struct sysfs_symlink * sl = sd->s_element; - kfree(sl->link_name); - kobject_put(sl->target_kobj); - kfree(sl); - } - kfree(sd->s_iattr); - kmem_cache_free(sysfs_dir_cachep, sd); -} - static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) { if (sd) { -- cgit v1.2.1 From 2b611bb7abdcc08278453fc9f6517401fd69ef95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:13 +0900 Subject: sysfs: allocate inode number using ida sysfs used simple incrementing allocator which is not guaranteed to be unique. This patch makes sysfs use ida to give each sd a unique and packed inode number. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2544aae6f583..f09626cc568a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -9,12 +9,42 @@ #include #include #include +#include #include #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); spinlock_t sysfs_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_IDA(sysfs_ino_ida); + +int sysfs_alloc_ino(ino_t *pino) +{ + int ino, rc; + + retry: + spin_lock(&sysfs_ino_lock); + rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); + spin_unlock(&sysfs_ino_lock); + + if (rc == -EAGAIN) { + if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) + goto retry; + rc = -ENOMEM; + } + + *pino = ino; + return rc; +} + +static void sysfs_free_ino(ino_t ino) +{ + spin_lock(&sysfs_ino_lock); + ida_remove(&sysfs_ino_ida, ino); + spin_unlock(&sysfs_ino_lock); +} + void release_sysfs_dirent(struct sysfs_dirent * sd) { if (sd->s_type & SYSFS_KOBJ_LINK) { @@ -24,6 +54,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) kfree(sl); } kfree(sd->s_iattr); + sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); } @@ -54,14 +85,6 @@ static struct dentry_operations sysfs_dentry_ops = { .d_iput = sysfs_d_iput, }; -static unsigned int sysfs_inode_counter; -ino_t sysfs_get_inum(void) -{ - if (unlikely(sysfs_inode_counter < 3)) - sysfs_inode_counter = 3; - return sysfs_inode_counter++; -} - /* * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent */ @@ -73,7 +96,11 @@ static struct sysfs_dirent * __sysfs_new_dirent(void * element) if (!sd) return NULL; - sd->s_ino = sysfs_get_inum(); + if (sysfs_alloc_ino(&sd->s_ino)) { + kmem_cache_free(sysfs_dir_cachep, sd); + return NULL; + } + atomic_set(&sd->s_count, 1); atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); -- cgit v1.2.1 From 7a23ad44047b1084a032bc0d127fe08af024593a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:13 +0900 Subject: sysfs: make sysfs_put() ignore NULL sd Make sysfs_put() ignore NULL sd instead of oopsing. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/sysfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 687d959f606d..f4fdbbffd571 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -111,7 +111,7 @@ static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) static inline void sysfs_put(struct sysfs_dirent * sd) { - if (atomic_dec_and_test(&sd->s_count)) + if (sd && atomic_dec_and_test(&sd->s_count)) release_sysfs_dirent(sd); } -- cgit v1.2.1 From 93e3cd8270d036953120eca83610f95d3f7374c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:13 +0900 Subject: sysfs: fix error handling in binattr write() Error handling in fs/sysfs/bin.c:write() was wrong because size_t count is used to receive return value from flush_write() which is negative on failure. This patch updates write() such that int variable is used instead. read() is updated the same way for consistency. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index d3b9f5f07db1..606267a36275 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -33,16 +33,13 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) } static ssize_t -read(struct file * file, char __user * userbuf, size_t count, loff_t * off) +read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { char *buffer = file->private_data; struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; - int ret; - - if (count > PAGE_SIZE) - count = PAGE_SIZE; + int count = min_t(size_t, bytes, PAGE_SIZE); if (size) { if (offs > size) @@ -51,15 +48,14 @@ read(struct file * file, char __user * userbuf, size_t count, loff_t * off) count = size - offs; } - ret = fill_read(dentry, buffer, offs, count); - if (ret < 0) - return ret; - count = ret; + count = fill_read(dentry, buffer, offs, count); + if (count < 0) + return count; if (copy_to_user(userbuf, buffer, count)) return -EFAULT; - pr_debug("offs = %lld, *off = %lld, count = %zd\n", offs, *off, count); + pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); *off = offs + count; @@ -78,16 +74,15 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) return attr->write(kobj, buffer, offset, count); } -static ssize_t write(struct file * file, const char __user * userbuf, - size_t count, loff_t * off) +static ssize_t write(struct file *file, const char __user *userbuf, + size_t bytes, loff_t *off) { char *buffer = file->private_data; struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; + int count = min_t(size_t, bytes, PAGE_SIZE); - if (count > PAGE_SIZE) - count = PAGE_SIZE; if (size) { if (offs > size) return 0; -- cgit v1.2.1 From dfeb9fb0343363aadc3ee00a9347d120bc2a26b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:14 +0900 Subject: sysfs: flatten cleanup paths in sysfs_add_link() and create_dir() Flatten cleanup paths in sysfs_add_link() and create_dir() to improve readability and ease further changes to these functions. This is in preparation of object reference simplification. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 73 ++++++++++++++++++++++++++++++++---------------------- fs/sysfs/symlink.c | 27 +++++++++++--------- 2 files changed, 58 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f09626cc568a..b4c482461403 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -207,40 +207,53 @@ static int init_symlink(struct inode * inode) return 0; } -static int create_dir(struct kobject * k, struct dentry * p, - const char * n, struct dentry ** d) +static int create_dir(struct kobject *kobj, struct dentry *parent, + const char *name, struct dentry **p_dentry) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct dentry *dentry; + struct sysfs_dirent *sd; - mutex_lock(&p->d_inode->i_mutex); - *d = lookup_one_len(n, p, strlen(n)); - if (!IS_ERR(*d)) { - if (sysfs_dirent_exist(p->d_fsdata, n)) - error = -EEXIST; - else - error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, - SYSFS_DIR); - if (!error) { - error = sysfs_create(*d, mode, init_dir); - if (!error) { - inc_nlink(p->d_inode); - (*d)->d_op = &sysfs_dentry_ops; - d_rehash(*d); - } - } - if (error && (error != -EEXIST)) { - struct sysfs_dirent *sd = (*d)->d_fsdata; - if (sd) { - list_del_init(&sd->s_sibling); - sysfs_put(sd); - } - d_drop(*d); - } - dput(*d); - } else - error = PTR_ERR(*d); - mutex_unlock(&p->d_inode->i_mutex); + mutex_lock(&parent->d_inode->i_mutex); + + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_unlock; + } + + error = -EEXIST; + if (sysfs_dirent_exist(parent->d_fsdata, name)) + goto out_dput; + + error = sysfs_make_dirent(parent->d_fsdata, dentry, kobj, mode, + SYSFS_DIR); + if (error) + goto out_drop; + + error = sysfs_create(dentry, mode, init_dir); + if (error) + goto out_sput; + + inc_nlink(parent->d_inode); + dentry->d_op = &sysfs_dentry_ops; + d_rehash(dentry); + + *p_dentry = dentry; + error = 0; + goto out_dput; + + out_sput: + sd = dentry->d_fsdata; + list_del_init(&sd->s_sibling); + sysfs_put(sd); + out_drop: + d_drop(dentry); + out_dput: + dput(dentry); + out_unlock: + mutex_unlock(&parent->d_inode->i_mutex); return error; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 7b9c5bfde920..b463f17f6638 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -49,30 +49,33 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj { struct sysfs_dirent * parent_sd = parent->d_fsdata; struct sysfs_symlink * sl; - int error = 0; + int error; error = -ENOMEM; - sl = kmalloc(sizeof(*sl), GFP_KERNEL); + sl = kzalloc(sizeof(*sl), GFP_KERNEL); if (!sl) - goto exit1; + goto err_out; sl->link_name = kmalloc(strlen(name) + 1, GFP_KERNEL); if (!sl->link_name) - goto exit2; + goto err_out; strcpy(sl->link_name, name); sl->target_kobj = kobject_get(target); error = sysfs_make_dirent(parent_sd, NULL, sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!error) - return 0; - - kobject_put(target); - kfree(sl->link_name); -exit2: - kfree(sl); -exit1: + if (error) + goto err_out; + + return 0; + + err_out: + if (sl) { + kobject_put(sl->target_kobj); + kfree(sl->link_name); + kfree(sl); + } return error; } -- cgit v1.2.1 From 996b73764e9bb9d5e751fd15b130ba38637d66a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:14 +0900 Subject: sysfs: flatten and fix sysfs_rename_dir() error handling Error handling in sysfs_rename_dir() was broken. * When lookup_one_len() fails, 0 is returned. * If parent inode check fails, returns with inode mutex and rename rwsem held. This patch fixes the above bugs and flattens error handling such that it's more readable and easier to modify. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 73 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index b4c482461403..90bed5df254f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -452,8 +452,9 @@ void sysfs_remove_dir(struct kobject * kobj) int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, const char *new_name) { - int error = 0; + int error; struct dentry * new_dentry; + struct sysfs_dirent *sd, *parent_sd; if (!new_parent) return -EFAULT; @@ -462,40 +463,48 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, mutex_lock(&new_parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); - if (!IS_ERR(new_dentry)) { - /* By allowing two different directories with the - * same d_parent we allow this routine to move - * between different shadows of the same directory - */ - if (kobj->dentry->d_parent->d_inode != new_parent->d_inode) - return -EINVAL; - else if (new_dentry->d_parent->d_inode != new_parent->d_inode) - error = -EINVAL; - else if (new_dentry == kobj->dentry) - error = -EINVAL; - else if (!new_dentry->d_inode) { - error = kobject_set_name(kobj, "%s", new_name); - if (!error) { - struct sysfs_dirent *sd, *parent_sd; - - d_add(new_dentry, NULL); - d_move(kobj->dentry, new_dentry); - - sd = kobj->dentry->d_fsdata; - parent_sd = new_parent->d_fsdata; - - list_del_init(&sd->s_sibling); - list_add(&sd->s_sibling, &parent_sd->s_children); - } - else - d_drop(new_dentry); - } else - error = -EEXIST; - dput(new_dentry); + if (IS_ERR(new_dentry)) { + error = PTR_ERR(new_dentry); + goto out_unlock; } + + /* By allowing two different directories with the same + * d_parent we allow this routine to move between different + * shadows of the same directory + */ + error = -EINVAL; + if (kobj->dentry->d_parent->d_inode != new_parent->d_inode || + new_dentry->d_parent->d_inode != new_parent->d_inode || + new_dentry == kobj->dentry) + goto out_dput; + + error = -EEXIST; + if (new_dentry->d_inode) + goto out_dput; + + error = kobject_set_name(kobj, "%s", new_name); + if (error) + goto out_drop; + + d_add(new_dentry, NULL); + d_move(kobj->dentry, new_dentry); + + sd = kobj->dentry->d_fsdata; + parent_sd = new_parent->d_fsdata; + + list_del_init(&sd->s_sibling); + list_add(&sd->s_sibling, &parent_sd->s_children); + + error = 0; + goto out_unlock; + + out_drop: + d_drop(new_dentry); + out_dput: + dput(new_dentry); + out_unlock: mutex_unlock(&new_parent->d_inode->i_mutex); up_write(&sysfs_rename_sem); - return error; } -- cgit v1.2.1 From a26cd7226c24c3be5dd5f48a74832fe64beb8489 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:14 +0900 Subject: sysfs: consolidate sysfs_dirent creation functions Currently there are four functions to create sysfs_dirent - __sysfs_new_dirent(), sysfs_new_dirent(), __sysfs_make_dirent() and sysfs_make_dirent(). Other than sysfs_make_dirent(), no function has two users if calls to implement other functions are excluded. This patch consolidates sysfs_dirent creation functions into the following two. * sysfs_new_dirent() : allocate and initialize * sysfs_attach_dirent() : attach to sysfs_dirent hierarchy and/or associate with dentry This simplifies interface and gives callers more flexibility. This is in preparation of object reference simplification. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 82 +++++++++++++++++------------------------------------- fs/sysfs/file.c | 21 ++++++++++---- fs/sysfs/symlink.c | 7 +++-- fs/sysfs/sysfs.h | 7 +++-- 4 files changed, 50 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 90bed5df254f..f16aa7e3eafc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -85,10 +85,7 @@ static struct dentry_operations sysfs_dentry_ops = { .d_iput = sysfs_d_iput, }; -/* - * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent - */ -static struct sysfs_dirent * __sysfs_new_dirent(void * element) +struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode, int type) { struct sysfs_dirent * sd; @@ -105,25 +102,25 @@ static struct sysfs_dirent * __sysfs_new_dirent(void * element) atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); INIT_LIST_HEAD(&sd->s_sibling); + sd->s_element = element; + sd->s_mode = mode; + sd->s_type = type; return sd; } -static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *sd) +void sysfs_attach_dirent(struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd, struct dentry *dentry) { - if (sd) - list_add(&sd->s_sibling, &parent_sd->s_children); -} + if (dentry) { + sd->s_dentry = dentry; + dentry->d_fsdata = sysfs_get(sd); + dentry->d_op = &sysfs_dentry_ops; + } -static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd, - void * element) -{ - struct sysfs_dirent *sd; - sd = __sysfs_new_dirent(element); - __sysfs_list_dirent(parent_sd, sd); - return sd; + if (parent_sd) + list_add(&sd->s_sibling, &parent_sd->s_children); } /* @@ -151,39 +148,6 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, return 0; } - -static struct sysfs_dirent * -__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type) -{ - struct sysfs_dirent * sd; - - sd = __sysfs_new_dirent(element); - if (!sd) - goto out; - - sd->s_mode = mode; - sd->s_type = type; - sd->s_dentry = dentry; - if (dentry) { - dentry->d_fsdata = sysfs_get(sd); - dentry->d_op = &sysfs_dentry_ops; - } - -out: - return sd; -} - -int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, - void * element, umode_t mode, int type) -{ - struct sysfs_dirent *sd; - - sd = __sysfs_make_dirent(dentry, element, mode, type); - __sysfs_list_dirent(parent_sd, sd); - - return sd ? 0 : -ENOMEM; -} - static int init_dir(struct inode * inode) { inode->i_op = &sysfs_dir_inode_operations; @@ -227,10 +191,11 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, if (sysfs_dirent_exist(parent->d_fsdata, name)) goto out_dput; - error = sysfs_make_dirent(parent->d_fsdata, dentry, kobj, mode, - SYSFS_DIR); - if (error) + error = -ENOMEM; + sd = sysfs_new_dirent(kobj, mode, SYSFS_DIR); + if (!sd) goto out_drop; + sysfs_attach_dirent(sd, parent->d_fsdata, dentry); error = sysfs_create(dentry, mode, init_dir); if (error) @@ -245,7 +210,6 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, goto out_dput; out_sput: - sd = dentry->d_fsdata; list_del_init(&sd->s_sibling); sysfs_put(sd); out_drop: @@ -557,13 +521,16 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) { struct dentry * dentry = file->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; + struct sysfs_dirent * sd; mutex_lock(&dentry->d_inode->i_mutex); - file->private_data = sysfs_new_dirent(parent_sd, NULL); + sd = sysfs_new_dirent(NULL, 0, 0); + if (sd) + sysfs_attach_dirent(sd, parent_sd, NULL); mutex_unlock(&dentry->d_inode->i_mutex); - return file->private_data ? 0 : -ENOMEM; - + file->private_data = sd; + return sd ? 0 : -ENOMEM; } static int sysfs_dir_close(struct inode *inode, struct file *file) @@ -736,9 +703,10 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) if (!shadow) goto nomem; - sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR); + sd = sysfs_new_dirent(kobj, inode->i_mode, SYSFS_DIR); if (!sd) goto nomem; + sysfs_attach_dirent(sd, NULL, shadow); d_instantiate(shadow, igrab(inode)); inc_nlink(inode); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b502c7197ec0..fd4b6dc03d2d 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -444,14 +444,25 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) { struct sysfs_dirent * parent_sd = dir->d_fsdata; umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; - int error = -EEXIST; + struct sysfs_dirent *sd; + int error = 0; mutex_lock(&dir->d_inode->i_mutex); - if (!sysfs_dirent_exist(parent_sd, attr->name)) - error = sysfs_make_dirent(parent_sd, NULL, (void *)attr, - mode, type); - mutex_unlock(&dir->d_inode->i_mutex); + if (sysfs_dirent_exist(parent_sd, attr->name)) { + error = -EEXIST; + goto out_unlock; + } + + sd = sysfs_new_dirent((void *)attr, mode, type); + if (!sd) { + error = -ENOMEM; + goto out_unlock; + } + sysfs_attach_dirent(sd, parent_sd, NULL); + + out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); return error; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index b463f17f6638..d96bb9cbc9d4 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -49,6 +49,7 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj { struct sysfs_dirent * parent_sd = parent->d_fsdata; struct sysfs_symlink * sl; + struct sysfs_dirent * sd; int error; error = -ENOMEM; @@ -63,10 +64,10 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj strcpy(sl->link_name, name); sl->target_kobj = kobject_get(target); - error = sysfs_make_dirent(parent_sd, NULL, sl, S_IFLNK|S_IRWXUGO, - SYSFS_KOBJ_LINK); - if (error) + sd = sysfs_new_dirent(sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) goto err_out; + sysfs_attach_dirent(sd, parent_sd, NULL); return 0; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index f4fdbbffd571..f8f49cc5c852 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -20,8 +20,11 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); -extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, - umode_t, int); +extern struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode, + int type); +extern void sysfs_attach_dirent(struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd, + struct dentry *dentry); extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern int sysfs_hash_and_remove(struct dentry * dir, const char * name); -- cgit v1.2.1 From 13b3086d2ea483cbcae5a4236446cecc082a72cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:14 +0900 Subject: sysfs: add sysfs_dirent->s_parent Add sysfs_dirent->s_parent. With this patch, each sd points to and holds a reference to its parent. This allows walking sysfs tree without referencing sd->s_dentry which can go away anytime if the user doesn't control when it's deleted. sd->s_parent is initialized and parent is referenced in sysfs_attach_dirent(). Reference to parent is released when the sd is released, so as long as reference to a sd is held, s_parent can be followed. dentry walk in sysfs_readdir() is convereted to s_parent walk. This will be used to reimplement symlink such that it uses only sysfs_dirent tree. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 30 +++++++++++++++++++++++------- fs/sysfs/mount.c | 1 + fs/sysfs/sysfs.h | 1 + 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f16aa7e3eafc..5d50e1ddfbd1 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -47,6 +47,11 @@ static void sysfs_free_ino(ino_t ino) void release_sysfs_dirent(struct sysfs_dirent * sd) { + struct sysfs_dirent *parent_sd; + + repeat: + parent_sd = sd->s_parent; + if (sd->s_type & SYSFS_KOBJ_LINK) { struct sysfs_symlink * sl = sd->s_element; kfree(sl->link_name); @@ -56,6 +61,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); + + sd = parent_sd; + if (sd && atomic_dec_and_test(&sd->s_count)) + goto repeat; } static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) @@ -119,8 +128,10 @@ void sysfs_attach_dirent(struct sysfs_dirent *sd, dentry->d_op = &sysfs_dentry_ops; } - if (parent_sd) + if (parent_sd) { + sd->s_parent = sysfs_get(parent_sd); list_add(&sd->s_sibling, &parent_sd->s_children); + } } /* @@ -571,7 +582,10 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ case 1: - ino = parent_ino(dentry); + if (parent_sd->s_parent) + ino = parent_sd->s_parent->s_ino; + else + ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) break; filp->f_pos++; @@ -688,13 +702,13 @@ int sysfs_make_shadowed_dir(struct kobject *kobj, struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) { + struct dentry *dir = kobj->dentry; + struct inode *inode = dir->d_inode; + struct dentry *parent = dir->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; + struct dentry *shadow; struct sysfs_dirent *sd; - struct dentry *parent, *dir, *shadow; - struct inode *inode; - dir = kobj->dentry; - inode = dir->d_inode; - parent = dir->d_parent; shadow = ERR_PTR(-EINVAL); if (!sysfs_is_shadowed_inode(inode)) goto out; @@ -706,6 +720,8 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) sd = sysfs_new_dirent(kobj, inode->i_mode, SYSFS_DIR); if (!sd) goto nomem; + /* point to parent_sd but don't attach to it */ + sd->s_parent = sysfs_get(parent_sd); sysfs_attach_dirent(sd, NULL, shadow); d_instantiate(shadow, igrab(inode)); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 00ab9125d398..31c1fc67f604 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -28,6 +28,7 @@ static const struct super_operations sysfs_ops = { }; static struct sysfs_dirent sysfs_root = { + .s_count = ATOMIC_INIT(1), .s_sibling = LIST_HEAD_INIT(sysfs_root.s_sibling), .s_children = LIST_HEAD_INIT(sysfs_root.s_children), .s_element = NULL, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index f8f49cc5c852..ce05d6fd7522 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -1,5 +1,6 @@ struct sysfs_dirent { atomic_t s_count; + struct sysfs_dirent * s_parent; struct list_head s_sibling; struct list_head s_children; void * s_element; -- cgit v1.2.1 From 0c096b507f15397da890051ee73de4266d3941fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:15 +0900 Subject: sysfs: add sysfs_dirent->s_name Add s_name to sysfs_dirent. This is to further reduce dependency to the associated dentry. Name is copied for directories and symlinks but not for attributes. Where possible, name dereferences are converted to use sd->s_name. sysfs_symlink->link_name and sysfs_get_name() are unused now and removed. This change allows symlink to be implemented using sysfs_dirent tree proper, which is the last remaining dentry-dependent sysfs walk. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 67 +++++++++++++++++++++++++++++++++++------------------- fs/sysfs/file.c | 2 +- fs/sysfs/inode.c | 33 +-------------------------- fs/sysfs/symlink.c | 8 +------ fs/sysfs/sysfs.h | 7 +++--- 5 files changed, 50 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5d50e1ddfbd1..6e8d6f54f082 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -54,10 +54,11 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) if (sd->s_type & SYSFS_KOBJ_LINK) { struct sysfs_symlink * sl = sd->s_element; - kfree(sl->link_name); kobject_put(sl->target_kobj); kfree(sl); } + if (sd->s_type & SYSFS_COPY_NAME) + kfree(sd->s_name); kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); @@ -94,29 +95,41 @@ static struct dentry_operations sysfs_dentry_ops = { .d_iput = sysfs_d_iput, }; -struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode, int type) +struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element, + umode_t mode, int type) { - struct sysfs_dirent * sd; + char *dup_name = NULL; + struct sysfs_dirent *sd = NULL; + + if (type & SYSFS_COPY_NAME) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + goto err_out; + } sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); if (!sd) - return NULL; + goto err_out; - if (sysfs_alloc_ino(&sd->s_ino)) { - kmem_cache_free(sysfs_dir_cachep, sd); - return NULL; - } + if (sysfs_alloc_ino(&sd->s_ino)) + goto err_out; atomic_set(&sd->s_count, 1); atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); INIT_LIST_HEAD(&sd->s_sibling); + sd->s_name = name; sd->s_element = element; sd->s_mode = mode; sd->s_type = type; return sd; + + err_out: + kfree(dup_name); + kmem_cache_free(sysfs_dir_cachep, sd); + return NULL; } void sysfs_attach_dirent(struct sysfs_dirent *sd, @@ -148,8 +161,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_element) { - const unsigned char *existing = sysfs_get_name(sd); - if (strcmp(existing, new)) + if (strcmp(sd->s_name, new)) continue; else return -EEXIST; @@ -203,7 +215,7 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, goto out_dput; error = -ENOMEM; - sd = sysfs_new_dirent(kobj, mode, SYSFS_DIR); + sd = sysfs_new_dirent(name, kobj, mode, SYSFS_DIR); if (!sd) goto out_drop; sysfs_attach_dirent(sd, parent->d_fsdata, dentry); @@ -334,9 +346,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_type & SYSFS_NOT_PINNED) { - const unsigned char * name = sysfs_get_name(sd); - - if (strcmp(name, dentry->d_name.name)) + if (strcmp(sd->s_name, dentry->d_name.name)) continue; if (sd->s_type & SYSFS_KOBJ_LINK) @@ -427,9 +437,11 @@ void sysfs_remove_dir(struct kobject * kobj) int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, const char *new_name) { + struct sysfs_dirent *sd = kobj->dentry->d_fsdata; + struct sysfs_dirent *parent_sd = new_parent->d_fsdata; + struct dentry *new_dentry; + char *dup_name; int error; - struct dentry * new_dentry; - struct sysfs_dirent *sd, *parent_sd; if (!new_parent) return -EFAULT; @@ -457,22 +469,31 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, if (new_dentry->d_inode) goto out_dput; + /* rename kobject and sysfs_dirent */ + error = -ENOMEM; + new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out_drop; + error = kobject_set_name(kobj, "%s", new_name); if (error) - goto out_drop; + goto out_free; + kfree(sd->s_name); + sd->s_name = new_name; + + /* move under the new parent */ d_add(new_dentry, NULL); d_move(kobj->dentry, new_dentry); - sd = kobj->dentry->d_fsdata; - parent_sd = new_parent->d_fsdata; - list_del_init(&sd->s_sibling); list_add(&sd->s_sibling, &parent_sd->s_children); error = 0; goto out_unlock; + out_free: + kfree(dup_name); out_drop: d_drop(new_dentry); out_dput: @@ -535,7 +556,7 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) struct sysfs_dirent * sd; mutex_lock(&dentry->d_inode->i_mutex); - sd = sysfs_new_dirent(NULL, 0, 0); + sd = sysfs_new_dirent("_DIR_", NULL, 0, 0); if (sd) sysfs_attach_dirent(sd, parent_sd, NULL); mutex_unlock(&dentry->d_inode->i_mutex); @@ -605,7 +626,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) if (!next->s_element) continue; - name = sysfs_get_name(next); + name = next->s_name; len = strlen(name); ino = next->s_ino; @@ -717,7 +738,7 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) if (!shadow) goto nomem; - sd = sysfs_new_dirent(kobj, inode->i_mode, SYSFS_DIR); + sd = sysfs_new_dirent("_SHADOW_", kobj, inode->i_mode, SYSFS_DIR); if (!sd) goto nomem; /* point to parent_sd but don't attach to it */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index fd4b6dc03d2d..8240b1687dd0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -454,7 +454,7 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) goto out_unlock; } - sd = sysfs_new_dirent((void *)attr, mode, type); + sd = sysfs_new_dirent(attr->name, (void *)attr, mode, type); if (!sd) { error = -ENOMEM; goto out_unlock; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5266eec15f6e..5c605b0003a8 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -191,37 +191,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) return error; } -/* - * Get the name for corresponding element represented by the given sysfs_dirent - */ -const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) -{ - struct attribute * attr; - struct bin_attribute * bin_attr; - struct sysfs_symlink * sl; - - BUG_ON(!sd || !sd->s_element); - - switch (sd->s_type) { - case SYSFS_DIR: - /* Always have a dentry so use that */ - return sd->s_dentry->d_name.name; - - case SYSFS_KOBJ_ATTR: - attr = sd->s_element; - return attr->name; - - case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_element; - return bin_attr->attr.name; - - case SYSFS_KOBJ_LINK: - sl = sd->s_element; - return sl->link_name; - } - return NULL; -} - static inline void orphan_all_buffers(struct inode *node) { struct sysfs_buffer_collection *set; @@ -305,7 +274,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; - if (!strcmp(sysfs_get_name(sd), name)) { + if (!strcmp(sd->s_name, name)) { list_del_init(&sd->s_sibling); sysfs_drop_dentry(sd, dir); sysfs_put(sd); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index d96bb9cbc9d4..c72820450e7c 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -57,14 +57,9 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj if (!sl) goto err_out; - sl->link_name = kmalloc(strlen(name) + 1, GFP_KERNEL); - if (!sl->link_name) - goto err_out; - - strcpy(sl->link_name, name); sl->target_kobj = kobject_get(target); - sd = sysfs_new_dirent(sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + sd = sysfs_new_dirent(name, sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); if (!sd) goto err_out; sysfs_attach_dirent(sd, parent_sd, NULL); @@ -74,7 +69,6 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj err_out: if (sl) { kobject_put(sl->target_kobj); - kfree(sl->link_name); kfree(sl); } return error; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce05d6fd7522..d34b008537d5 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -3,6 +3,7 @@ struct sysfs_dirent { struct sysfs_dirent * s_parent; struct list_head s_sibling; struct list_head s_children; + const char * s_name; void * s_element; int s_type; umode_t s_mode; @@ -21,8 +22,8 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); -extern struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode, - int type); +extern struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element, + umode_t mode, int type); extern void sysfs_attach_dirent(struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd, struct dentry *dentry); @@ -34,7 +35,6 @@ extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * na extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); -extern const unsigned char * sysfs_get_name(struct sysfs_dirent *sd); extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); @@ -48,7 +48,6 @@ extern const struct inode_operations sysfs_dir_inode_operations; extern const struct inode_operations sysfs_symlink_inode_operations; struct sysfs_symlink { - char * link_name; struct kobject * target_kobj; }; -- cgit v1.2.1 From 3e5190380ebef77f2b015c9e7a4ca225a3d75021 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:15 +0900 Subject: sysfs: make sysfs_dirent->s_element a union Make sd->s_element a union of sysfs_elem_{dir|symlink|attr|bin_attr} and rename it to s_elem. This is to achieve... * some level of type checking : changing symlink to point to sysfs_dirent instead of kobject is much safer and less painful now. * easier / standardized dereferencing * allow sysfs_elem_* to contain more than one entry Where possible, pointer is obtained by directly deferencing from sd instead of going through other entities. This reduces dependencies to dentry, inode and kobject. to_attr() and to_bin_attr() are unused now and removed. This is in preparation of object reference simplification. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 18 +++++++++++------- fs/sysfs/dir.c | 31 ++++++++++++++---------------- fs/sysfs/file.c | 19 +++++++++--------- fs/sysfs/inode.c | 2 +- fs/sysfs/mount.c | 1 - fs/sysfs/symlink.c | 23 ++++------------------ fs/sysfs/sysfs.h | 56 ++++++++++++++++++++++++++++++------------------------ 7 files changed, 71 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 606267a36275..67a0d5030c96 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -23,7 +23,8 @@ static int fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) { - struct bin_attribute * attr = to_bin_attr(dentry); + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct kobject * kobj = to_kobj(dentry->d_parent); if (!attr->read) @@ -65,7 +66,8 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) static int flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) { - struct bin_attribute *attr = to_bin_attr(dentry); + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct kobject *kobj = to_kobj(dentry->d_parent); if (!attr->write) @@ -101,9 +103,9 @@ static ssize_t write(struct file *file, const char __user *userbuf, static int mmap(struct file *file, struct vm_area_struct *vma) { - struct dentry *dentry = file->f_path.dentry; - struct bin_attribute *attr = to_bin_attr(dentry); - struct kobject *kobj = to_kobj(dentry->d_parent); + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; + struct kobject *kobj = to_kobj(file->f_path.dentry->d_parent); if (!attr->mmap) return -EINVAL; @@ -114,7 +116,8 @@ static int mmap(struct file *file, struct vm_area_struct *vma) static int open(struct inode * inode, struct file * file) { struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); - struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; int error = -EINVAL; if (!kobj || !attr) @@ -150,7 +153,8 @@ static int open(struct inode * inode, struct file * file) static int release(struct inode * inode, struct file * file) { struct kobject * kobj = to_kobj(file->f_path.dentry->d_parent); - struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; u8 * buffer = file->private_data; kobject_put(kobj); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6e8d6f54f082..079122695675 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -52,11 +52,8 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) repeat: parent_sd = sd->s_parent; - if (sd->s_type & SYSFS_KOBJ_LINK) { - struct sysfs_symlink * sl = sd->s_element; - kobject_put(sl->target_kobj); - kfree(sl); - } + if (sd->s_type & SYSFS_KOBJ_LINK) + kobject_put(sd->s_elem.symlink.target_kobj); if (sd->s_type & SYSFS_COPY_NAME) kfree(sd->s_name); kfree(sd->s_iattr); @@ -95,8 +92,7 @@ static struct dentry_operations sysfs_dentry_ops = { .d_iput = sysfs_d_iput, }; -struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element, - umode_t mode, int type) +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) { char *dup_name = NULL; struct sysfs_dirent *sd = NULL; @@ -120,7 +116,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element, INIT_LIST_HEAD(&sd->s_sibling); sd->s_name = name; - sd->s_element = element; sd->s_mode = mode; sd->s_type = type; @@ -160,7 +155,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, struct sysfs_dirent * sd; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_element) { + if (sd->s_type) { if (strcmp(sd->s_name, new)) continue; else @@ -215,9 +210,10 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, goto out_dput; error = -ENOMEM; - sd = sysfs_new_dirent(name, kobj, mode, SYSFS_DIR); + sd = sysfs_new_dirent(name, mode, SYSFS_DIR); if (!sd) goto out_drop; + sd->s_elem.dir.kobj = kobj; sysfs_attach_dirent(sd, parent->d_fsdata, dentry); error = sysfs_create(dentry, mode, init_dir); @@ -290,10 +286,10 @@ static int sysfs_attach_attr(struct sysfs_dirent * sd, struct dentry * dentry) int error = 0; if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { - bin_attr = sd->s_element; + bin_attr = sd->s_elem.bin_attr.bin_attr; attr = &bin_attr->attr; } else { - attr = sd->s_element; + attr = sd->s_elem.attr.attr; init = init_file; } @@ -404,7 +400,7 @@ static void __sysfs_remove_dir(struct dentry *dentry) mutex_lock(&dentry->d_inode->i_mutex); parent_sd = dentry->d_fsdata; list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { - if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED)) + if (!sd->s_type || !(sd->s_type & SYSFS_NOT_PINNED)) continue; list_del_init(&sd->s_sibling); sysfs_drop_dentry(sd, dentry); @@ -556,7 +552,7 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) struct sysfs_dirent * sd; mutex_lock(&dentry->d_inode->i_mutex); - sd = sysfs_new_dirent("_DIR_", NULL, 0, 0); + sd = sysfs_new_dirent("_DIR_", 0, 0); if (sd) sysfs_attach_dirent(sd, parent_sd, NULL); mutex_unlock(&dentry->d_inode->i_mutex); @@ -623,7 +619,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) next = list_entry(p, struct sysfs_dirent, s_sibling); - if (!next->s_element) + if (!next->s_type) continue; name = next->s_name; @@ -671,7 +667,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) struct sysfs_dirent *next; next = list_entry(p, struct sysfs_dirent, s_sibling); - if (next->s_element) + if (next->s_type) n--; p = p->next; } @@ -738,9 +734,10 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) if (!shadow) goto nomem; - sd = sysfs_new_dirent("_SHADOW_", kobj, inode->i_mode, SYSFS_DIR); + sd = sysfs_new_dirent("_SHADOW_", inode->i_mode, SYSFS_DIR); if (!sd) goto nomem; + sd->s_elem.dir.kobj = kobj; /* point to parent_sd but don't attach to it */ sd->s_parent = sysfs_get(parent_sd); sysfs_attach_dirent(sd, NULL, shadow); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 8240b1687dd0..04f6b0ebc889 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -88,7 +88,6 @@ remove_from_collection(struct sysfs_buffer *buffer, struct inode *node) static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { struct sysfs_dirent * sd = dentry->d_fsdata; - struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; int ret = 0; @@ -100,7 +99,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer return -ENOMEM; buffer->event = atomic_read(&sd->s_event); - count = ops->show(kobj,attr,buffer->page); + count = ops->show(kobj, sd->s_elem.attr.attr, buffer->page); BUG_ON(count > (ssize_t)PAGE_SIZE); if (count >= 0) { buffer->needs_read_fill = 0; @@ -199,11 +198,11 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t static int flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) { - struct attribute * attr = to_attr(dentry); + struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; - return ops->store(kobj,attr,buffer->page,count); + return ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count); } @@ -248,7 +247,8 @@ out: static int sysfs_open_file(struct inode *inode, struct file *file) { struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); - struct attribute * attr = to_attr(file->f_path.dentry); + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct attribute *attr = attr_sd->s_elem.attr.attr; struct sysfs_buffer_collection *set; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; @@ -341,15 +341,15 @@ static int sysfs_open_file(struct inode *inode, struct file *file) static int sysfs_release(struct inode * inode, struct file * filp) { struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); - struct attribute * attr = to_attr(filp->f_path.dentry); - struct module * owner = attr->owner; + struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; + struct attribute *attr = attr_sd->s_elem.attr.attr; struct sysfs_buffer * buffer = filp->private_data; if (buffer) remove_from_collection(buffer, inode); kobject_put(kobj); /* After this point, attr should not be accessed. */ - module_put(owner); + module_put(attr->owner); if (buffer) { if (buffer->page) @@ -454,11 +454,12 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) goto out_unlock; } - sd = sysfs_new_dirent(attr->name, (void *)attr, mode, type); + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) { error = -ENOMEM; goto out_unlock; } + sd->s_elem.attr.attr = (void *)attr; sysfs_attach_dirent(sd, parent_sd, NULL); out_unlock: diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5c605b0003a8..617d10cea07d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -272,7 +272,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) parent_sd = dir->d_fsdata; mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (!sd->s_element) + if (!sd->s_type) continue; if (!strcmp(sd->s_name, name)) { list_del_init(&sd->s_sibling); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 31c1fc67f604..8f6d8b1b211f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -31,7 +31,6 @@ static struct sysfs_dirent sysfs_root = { .s_count = ATOMIC_INIT(1), .s_sibling = LIST_HEAD_INIT(sysfs_root.s_sibling), .s_children = LIST_HEAD_INIT(sysfs_root.s_children), - .s_element = NULL, .s_type = SYSFS_ROOT, .s_iattr = NULL, .s_ino = 1, diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c72820450e7c..27df635b786a 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -48,30 +48,15 @@ static void fill_object_path(struct kobject * kobj, char * buffer, int length) static int sysfs_add_link(struct dentry * parent, const char * name, struct kobject * target) { struct sysfs_dirent * parent_sd = parent->d_fsdata; - struct sysfs_symlink * sl; struct sysfs_dirent * sd; - int error; - error = -ENOMEM; - sl = kzalloc(sizeof(*sl), GFP_KERNEL); - if (!sl) - goto err_out; - - sl->target_kobj = kobject_get(target); - - sd = sysfs_new_dirent(name, sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); if (!sd) - goto err_out; - sysfs_attach_dirent(sd, parent_sd, NULL); + return -ENOMEM; + sd->s_elem.symlink.target_kobj = kobject_get(target); + sysfs_attach_dirent(sd, parent_sd, NULL); return 0; - - err_out: - if (sl) { - kobject_put(sl->target_kobj); - kfree(sl); - } - return error; } /** diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d34b008537d5..39ab0481379c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -1,10 +1,33 @@ +struct sysfs_elem_dir { + struct kobject * kobj; +}; + +struct sysfs_elem_symlink { + struct kobject * target_kobj; +}; + +struct sysfs_elem_attr { + struct attribute * attr; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute * bin_attr; +}; + struct sysfs_dirent { atomic_t s_count; struct sysfs_dirent * s_parent; struct list_head s_sibling; struct list_head s_children; const char * s_name; - void * s_element; + + union { + struct sysfs_elem_dir dir; + struct sysfs_elem_symlink symlink; + struct sysfs_elem_attr attr; + struct sysfs_elem_bin_attr bin_attr; + } s_elem; + int s_type; umode_t s_mode; ino_t s_ino; @@ -22,8 +45,8 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); -extern struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element, - umode_t mode, int type); +extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, + int type); extern void sysfs_attach_dirent(struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd, struct dentry *dentry); @@ -47,10 +70,6 @@ extern const struct file_operations bin_fops; extern const struct inode_operations sysfs_dir_inode_operations; extern const struct inode_operations sysfs_symlink_inode_operations; -struct sysfs_symlink { - struct kobject * target_kobj; -}; - struct sysfs_buffer { struct list_head associates; size_t count; @@ -70,19 +89,7 @@ struct sysfs_buffer_collection { static inline struct kobject * to_kobj(struct dentry * dentry) { struct sysfs_dirent * sd = dentry->d_fsdata; - return ((struct kobject *) sd->s_element); -} - -static inline struct attribute * to_attr(struct dentry * dentry) -{ - struct sysfs_dirent * sd = dentry->d_fsdata; - return ((struct attribute *) sd->s_element); -} - -static inline struct bin_attribute * to_bin_attr(struct dentry * dentry) -{ - struct sysfs_dirent * sd = dentry->d_fsdata; - return ((struct bin_attribute *) sd->s_element); + return sd->s_elem.dir.kobj; } static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) @@ -92,11 +99,10 @@ static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) spin_lock(&dcache_lock); if (!d_unhashed(dentry)) { struct sysfs_dirent * sd = dentry->d_fsdata; - if (sd->s_type & SYSFS_KOBJ_LINK) { - struct sysfs_symlink * sl = sd->s_element; - kobj = kobject_get(sl->target_kobj); - } else - kobj = kobject_get(sd->s_element); + if (sd->s_type & SYSFS_KOBJ_LINK) + kobj = kobject_get(sd->s_elem.symlink.target_kobj); + else + kobj = kobject_get(sd->s_elem.dir.kobj); } spin_unlock(&dcache_lock); -- cgit v1.2.1 From aecdcedaab49ca40620dc7dd70f67ee7269a66c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:15 +0900 Subject: sysfs: implement kobj_sysfs_assoc_lock kobj->dentry can go away anytime unless the user controls when the associated sysfs node is deleted. This patch implements kobj_sysfs_assoc_lock which protects kobj->dentry. This will be used to maintain kobj based API when converting sysfs to use sysfs_dirent tree instead of dentry/kobject. Note that this lock belongs to kobject/driver-model not sysfs. Once sysfs is converted to not use kobject in its interface, this can be removed from sysfs. This is in preparation of object reference simplification. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 8 +++++++- fs/sysfs/sysfs.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 079122695675..e9fddcc59447 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -15,6 +15,7 @@ DECLARE_RWSEM(sysfs_rename_sem); spinlock_t sysfs_lock = SPIN_LOCK_UNLOCKED; +spinlock_t kobj_sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; static DEFINE_IDA(sysfs_ino_ida); @@ -426,8 +427,13 @@ static void __sysfs_remove_dir(struct dentry *dentry) void sysfs_remove_dir(struct kobject * kobj) { - __sysfs_remove_dir(kobj->dentry); + struct dentry *d = kobj->dentry; + + spin_lock(&kobj_sysfs_assoc_lock); kobj->dentry = NULL; + spin_unlock(&kobj_sysfs_assoc_lock); + + __sysfs_remove_dir(d); } int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 39ab0481379c..718e2e123fae 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -62,6 +62,7 @@ extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern spinlock_t sysfs_lock; +extern spinlock_t kobj_sysfs_assoc_lock; extern struct rw_semaphore sysfs_rename_sem; extern struct super_block * sysfs_sb; extern const struct file_operations sysfs_dir_operations; -- cgit v1.2.1 From 2b29ac252afff87b8465b064ca2d9740cf1f6e52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:15 +0900 Subject: sysfs: reimplement symlink using sysfs_dirent tree sysfs symlink is implemented by referencing dentry and kobject from sysfs_dirent - symlink entry references kobject, dentry is used to walk the tree. This complicates object lifetimes rules and is dangerous - for example, there is no way to tell to which module the target of a symlink belongs and referencing that kobject can make it linger after the module is gone. This patch reimplements symlink using only sysfs_dirent tree. sd for a symlink points and holds reference to the target sysfs_dirent and all walking is done using sysfs_dirent tree. Simpler and safer. Please read the following message for more info. http://article.gmane.org/gmane.linux.kernel/510293 Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 2 +- fs/sysfs/symlink.c | 88 +++++++++++++++++++++++++++++------------------------- fs/sysfs/sysfs.h | 9 +++--- 3 files changed, 53 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e9fddcc59447..2a94dc36d166 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -54,7 +54,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) parent_sd = sd->s_parent; if (sd->s_type & SYSFS_KOBJ_LINK) - kobject_put(sd->s_elem.symlink.target_kobj); + sysfs_put(sd->s_elem.symlink.target_sd); if (sd->s_type & SYSFS_COPY_NAME) kfree(sd->s_name); kfree(sd->s_iattr); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 27df635b786a..ff605d3f4d33 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,50 +11,49 @@ #include "sysfs.h" -static int object_depth(struct kobject * kobj) +static int object_depth(struct sysfs_dirent *sd) { - struct kobject * p = kobj; int depth = 0; - do { depth++; } while ((p = p->parent)); + + for (; sd->s_parent; sd = sd->s_parent) + depth++; + return depth; } -static int object_path_length(struct kobject * kobj) +static int object_path_length(struct sysfs_dirent * sd) { - struct kobject * p = kobj; int length = 1; - do { - length += strlen(kobject_name(p)) + 1; - p = p->parent; - } while (p); + + for (; sd->s_parent; sd = sd->s_parent) + length += strlen(sd->s_name) + 1; + return length; } -static void fill_object_path(struct kobject * kobj, char * buffer, int length) +static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length) { - struct kobject * p; - --length; - for (p = kobj; p; p = p->parent) { - int cur = strlen(kobject_name(p)); + for (; sd->s_parent; sd = sd->s_parent) { + int cur = strlen(sd->s_name); /* back up enough to print this bus id with '/' */ length -= cur; - strncpy(buffer + length,kobject_name(p),cur); + strncpy(buffer + length, sd->s_name, cur); *(buffer + --length) = '/'; } } -static int sysfs_add_link(struct dentry * parent, const char * name, struct kobject * target) +static int sysfs_add_link(struct sysfs_dirent * parent_sd, const char * name, + struct sysfs_dirent * target_sd) { - struct sysfs_dirent * parent_sd = parent->d_fsdata; struct sysfs_dirent * sd; sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); if (!sd) return -ENOMEM; - sd->s_elem.symlink.target_kobj = kobject_get(target); + sd->s_elem.symlink.target_sd = target_sd; sysfs_attach_dirent(sd, parent_sd, NULL); return 0; } @@ -68,6 +67,8 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) { struct dentry *dentry = NULL; + struct sysfs_dirent *parent_sd = NULL; + struct sysfs_dirent *target_sd = NULL; int error = -EEXIST; BUG_ON(!name); @@ -80,11 +81,27 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char if (!dentry) return -EFAULT; + parent_sd = dentry->d_fsdata; + + /* target->dentry can go away beneath us but is protected with + * kobj_sysfs_assoc_lock. Fetch target_sd from it. + */ + spin_lock(&kobj_sysfs_assoc_lock); + if (target->dentry) + target_sd = sysfs_get(target->dentry->d_fsdata); + spin_unlock(&kobj_sysfs_assoc_lock); + + if (!target_sd) + return -ENOENT; mutex_lock(&dentry->d_inode->i_mutex); if (!sysfs_dirent_exist(dentry->d_fsdata, name)) - error = sysfs_add_link(dentry, name, target); + error = sysfs_add_link(parent_sd, name, target_sd); mutex_unlock(&dentry->d_inode->i_mutex); + + if (error) + sysfs_put(target_sd); + return error; } @@ -100,14 +117,14 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) sysfs_hash_and_remove(kobj->dentry,name); } -static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target, - char *path) +static int sysfs_get_target_path(struct sysfs_dirent * parent_sd, + struct sysfs_dirent * target_sd, char *path) { char * s; int depth, size; - depth = object_depth(kobj); - size = object_path_length(target) + depth * 3 - 1; + depth = object_depth(parent_sd); + size = object_path_length(target_sd) + depth * 3 - 1; if (size > PATH_MAX) return -ENAMETOOLONG; @@ -116,7 +133,7 @@ static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target, for (s = path; depth--; s += 3) strcpy(s,"../"); - fill_object_path(target, path, size); + fill_object_path(target_sd, path, size); pr_debug("%s: path = '%s'\n", __FUNCTION__, path); return 0; @@ -124,27 +141,16 @@ static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target, static int sysfs_getlink(struct dentry *dentry, char * path) { - struct kobject *kobj, *target_kobj; - int error = 0; - - kobj = sysfs_get_kobject(dentry->d_parent); - if (!kobj) - return -EINVAL; - - target_kobj = sysfs_get_kobject(dentry); - if (!target_kobj) { - kobject_put(kobj); - return -EINVAL; - } + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent *target_sd = sd->s_elem.symlink.target_sd; + int error; down_read(&sysfs_rename_sem); - error = sysfs_get_target_path(kobj, target_kobj, path); + error = sysfs_get_target_path(parent_sd, target_sd, path); up_read(&sysfs_rename_sem); - - kobject_put(kobj); - kobject_put(target_kobj); - return error; + return error; } static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 718e2e123fae..60717660ac55 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -3,7 +3,7 @@ struct sysfs_elem_dir { }; struct sysfs_elem_symlink { - struct kobject * target_kobj; + struct sysfs_dirent * target_sd; }; struct sysfs_elem_attr { @@ -100,10 +100,11 @@ static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) spin_lock(&dcache_lock); if (!d_unhashed(dentry)) { struct sysfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & SYSFS_KOBJ_LINK) - kobj = kobject_get(sd->s_elem.symlink.target_kobj); - else - kobj = kobject_get(sd->s_elem.dir.kobj); + sd = sd->s_elem.symlink.target_sd; + + kobj = kobject_get(sd->s_elem.dir.kobj); } spin_unlock(&dcache_lock); -- cgit v1.2.1 From eb36165353d0e5ac32b063f555acedcbaf6d3b75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:16 +0900 Subject: sysfs: implement bin_buffer Implement bin_buffer which contains a mutex and pointer to PAGE_SIZE buffer to properly synchronize accesses to per-openfile buffer and prepare for immediate-kobj-disconnect. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 67a0d5030c96..5dc47fe5de5e 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -20,6 +20,11 @@ #include "sysfs.h" +struct bin_buffer { + struct mutex mutex; + void *buffer; +}; + static int fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) { @@ -36,7 +41,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) static ssize_t read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { - char *buffer = file->private_data; + struct bin_buffer *bb = file->private_data; struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; @@ -49,17 +54,23 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) count = size - offs; } - count = fill_read(dentry, buffer, offs, count); + mutex_lock(&bb->mutex); + + count = fill_read(dentry, bb->buffer, offs, count); if (count < 0) - return count; + goto out_unlock; - if (copy_to_user(userbuf, buffer, count)) - return -EFAULT; + if (copy_to_user(userbuf, bb->buffer, count)) { + count = -EFAULT; + goto out_unlock; + } pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); *off = offs + count; + out_unlock: + mutex_unlock(&bb->mutex); return count; } @@ -79,7 +90,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) static ssize_t write(struct file *file, const char __user *userbuf, size_t bytes, loff_t *off) { - char *buffer = file->private_data; + struct bin_buffer *bb = file->private_data; struct dentry *dentry = file->f_path.dentry; int size = dentry->d_inode->i_size; loff_t offs = *off; @@ -92,25 +103,38 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - if (copy_from_user(buffer, userbuf, count)) - return -EFAULT; + mutex_lock(&bb->mutex); + + if (copy_from_user(bb->buffer, userbuf, count)) { + count = -EFAULT; + goto out_unlock; + } - count = flush_write(dentry, buffer, offs, count); + count = flush_write(dentry, bb->buffer, offs, count); if (count > 0) *off = offs + count; + + out_unlock: + mutex_unlock(&bb->mutex); return count; } static int mmap(struct file *file, struct vm_area_struct *vma) { + struct bin_buffer *bb = file->private_data; struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct kobject *kobj = to_kobj(file->f_path.dentry->d_parent); + int rc; if (!attr->mmap) return -EINVAL; - return attr->mmap(kobj, attr, vma); + mutex_lock(&bb->mutex); + rc = attr->mmap(kobj, attr, vma); + mutex_unlock(&bb->mutex); + + return rc; } static int open(struct inode * inode, struct file * file) @@ -118,6 +142,7 @@ static int open(struct inode * inode, struct file * file) struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; + struct bin_buffer *bb = NULL; int error = -EINVAL; if (!kobj || !attr) @@ -135,14 +160,22 @@ static int open(struct inode * inode, struct file * file) goto Error; error = -ENOMEM; - file->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!file->private_data) + bb = kzalloc(sizeof(*bb), GFP_KERNEL); + if (!bb) goto Error; + bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->buffer) + goto Error; + + mutex_init(&bb->mutex); + file->private_data = bb; + error = 0; - goto Done; + goto Done; Error: + kfree(bb); module_put(attr->attr.owner); Done: if (error) @@ -155,11 +188,12 @@ static int release(struct inode * inode, struct file * file) struct kobject * kobj = to_kobj(file->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; - u8 * buffer = file->private_data; + struct bin_buffer *bb = file->private_data; kobject_put(kobj); module_put(attr->attr.owner); - kfree(buffer); + kfree(bb->buffer); + kfree(bb); return 0; } -- cgit v1.2.1 From 0ab66088c855eca68513bdd7442a426c4b374ced Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:16 +0900 Subject: sysfs: implement sysfs_dirent active reference and immediate disconnect sysfs: implement sysfs_dirent active reference and immediate disconnect Opening a sysfs node references its associated kobject, so userland can arbitrarily prolong lifetime of a kobject which complicates lifetime rules in drivers. This patch implements active reference and makes the association between kobject and sysfs immediately breakable. Now each sysfs_dirent has two reference counts - s_count and s_active. s_count is a regular reference count which guarantees that the containing sysfs_dirent is accessible. As long as s_count reference is held, all sysfs internal fields in sysfs_dirent are accessible including s_parent and s_name. The newly added s_active is active reference count. This is acquired by invoking sysfs_get_active() and it's the caller's responsibility to ensure sysfs_dirent itself is accessible (should be holding s_count one way or the other). Dereferencing sysfs_dirent to access objects out of sysfs proper requires active reference. This includes access to the associated kobjects, attributes and ops. The active references can be drained and denied by calling sysfs_deactivate(). All active sysfs_dirents must be deactivated after deletion but before the default reference is dropped. This enables immediate disconnect of sysfs nodes. Once a sysfs_dirent is deleted, it won't access any entity external to sysfs proper. Because attr/bin_attr ops access both the node itself and its parent for kobject, they need to hold active references to both. sysfs_get/put_active_two() helpers are provided to help grabbing both references. Parent's is acquired first and released last. Unlike other operations, mmapped area lingers on after mmap() is finished and the module implement implementing it and kobj need to stay referenced till all the mapped pages are gone. This is accomplished by holding one set of active references to the bin_attr and its parent if there have been any mmap during lifetime of an openfile. The references are dropped when the openfile is released. This change makes sysfs lifetime rules independent from both kobject's and module's. It not only fixes several race conditions caused by sysfs not holding onto the proper module when referencing kobject, but also helps fixing and simplifying lifetime management in driver model and drivers by taking sysfs out of the equation. Please read the following message for more info. http://article.gmane.org/gmane.linux.kernel/510293 Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 95 ++++++++++++++++++++++++++-------------- fs/sysfs/dir.c | 28 ++++++++++-- fs/sysfs/file.c | 130 ++++++++++++++++++++++++++++++++----------------------- fs/sysfs/inode.c | 8 +++- fs/sysfs/sysfs.h | 123 +++++++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 271 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 5dc47fe5de5e..618b8aea6a7b 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -23,6 +23,7 @@ struct bin_buffer { struct mutex mutex; void *buffer; + int mmapped; }; static int @@ -30,12 +31,20 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; - struct kobject * kobj = to_kobj(dentry->d_parent); + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; - if (!attr->read) - return -EIO; + rc = -EIO; + if (attr->read) + rc = attr->read(kobj, buffer, off, count); - return attr->read(kobj, buffer, off, count); + sysfs_put_active_two(attr_sd); + + return rc; } static ssize_t @@ -79,12 +88,20 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; - struct kobject *kobj = to_kobj(dentry->d_parent); + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; - if (!attr->write) - return -EIO; + rc = -EIO; + if (attr->write) + rc = attr->write(kobj, buffer, offset, count); - return attr->write(kobj, buffer, offset, count); + sysfs_put_active_two(attr_sd); + + return rc; } static ssize_t write(struct file *file, const char __user *userbuf, @@ -124,14 +141,24 @@ static int mmap(struct file *file, struct vm_area_struct *vma) struct bin_buffer *bb = file->private_data; struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; - struct kobject *kobj = to_kobj(file->f_path.dentry->d_parent); + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; int rc; - if (!attr->mmap) - return -EINVAL; - mutex_lock(&bb->mutex); - rc = attr->mmap(kobj, attr, vma); + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; + + rc = -EINVAL; + if (attr->mmap) + rc = attr->mmap(kobj, attr, vma); + + if (rc == 0 && !bb->mmapped) + bb->mmapped = 1; + else + sysfs_put_active_two(attr_sd); + mutex_unlock(&bb->mutex); return rc; @@ -139,58 +166,60 @@ static int mmap(struct file *file, struct vm_area_struct *vma) static int open(struct inode * inode, struct file * file) { - struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct bin_buffer *bb = NULL; - int error = -EINVAL; + int error; - if (!kobj || !attr) - goto Done; + /* need attr_sd for attr */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; - /* Grab the module reference for this attribute if we have one */ + /* Grab the module reference for this attribute */ error = -ENODEV; - if (!try_module_get(attr->attr.owner)) - goto Done; + if (!try_module_get(attr->attr.owner)) + goto err_sput; error = -EACCES; if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) - goto Error; + goto err_mput; if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) - goto Error; + goto err_mput; error = -ENOMEM; bb = kzalloc(sizeof(*bb), GFP_KERNEL); if (!bb) - goto Error; + goto err_mput; bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!bb->buffer) - goto Error; + goto err_mput; mutex_init(&bb->mutex); file->private_data = bb; - error = 0; - goto Done; + /* open succeeded, put active reference and pin attr_sd */ + sysfs_put_active(attr_sd); + sysfs_get(attr_sd); + return 0; - Error: - kfree(bb); + err_mput: module_put(attr->attr.owner); - Done: - if (error) - kobject_put(kobj); + err_sput: + sysfs_put_active(attr_sd); + kfree(bb); return error; } static int release(struct inode * inode, struct file * file) { - struct kobject * kobj = to_kobj(file->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct bin_buffer *bb = file->private_data; - kobject_put(kobj); + if (bb->mmapped) + sysfs_put_active_two(attr_sd); + sysfs_put(attr_sd); module_put(attr->attr.owner); kfree(bb->buffer); kfree(bb); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2a94dc36d166..e0d377aaf2cc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -53,6 +53,19 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) repeat: parent_sd = sd->s_parent; + /* If @sd is being released after deletion, s_active is write + * locked. If @sd is cursor for directory walk or being + * released prematurely, s_active has no reader or writer. + * + * sysfs_deactivate() lies to lockdep that s_active is + * unlocked immediately. Lie one more time to cover the + * previous lie. + */ + if (!down_write_trylock(&sd->s_active)) + rwsem_acquire(&sd->s_active.dep_map, + SYSFS_S_ACTIVE_DEACTIVATE, 0, _RET_IP_); + up_write(&sd->s_active); + if (sd->s_type & SYSFS_KOBJ_LINK) sysfs_put(sd->s_elem.symlink.target_sd); if (sd->s_type & SYSFS_COPY_NAME) @@ -113,6 +126,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_event, 1); + init_rwsem(&sd->s_active); INIT_LIST_HEAD(&sd->s_children); INIT_LIST_HEAD(&sd->s_sibling); @@ -371,7 +385,6 @@ static void remove_dir(struct dentry * d) d_delete(d); sd = d->d_fsdata; list_del_init(&sd->s_sibling); - sysfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -380,6 +393,9 @@ static void remove_dir(struct dentry * d) mutex_unlock(&parent->d_inode->i_mutex); dput(parent); + + sysfs_deactivate(sd); + sysfs_put(sd); } void sysfs_remove_subdir(struct dentry * d) @@ -390,6 +406,7 @@ void sysfs_remove_subdir(struct dentry * d) static void __sysfs_remove_dir(struct dentry *dentry) { + LIST_HEAD(removed); struct sysfs_dirent * parent_sd; struct sysfs_dirent * sd, * tmp; @@ -403,12 +420,17 @@ static void __sysfs_remove_dir(struct dentry *dentry) list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_type || !(sd->s_type & SYSFS_NOT_PINNED)) continue; - list_del_init(&sd->s_sibling); + list_move(&sd->s_sibling, &removed); sysfs_drop_dentry(sd, dentry); - sysfs_put(sd); } mutex_unlock(&dentry->d_inode->i_mutex); + list_for_each_entry_safe(sd, tmp, &removed, s_sibling) { + list_del_init(&sd->s_sibling); + sysfs_deactivate(sd); + sysfs_put(sd); + } + remove_dir(dentry); /** * Drop reference from dget() on entrance. diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 04f6b0ebc889..310430baf572 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -87,8 +87,8 @@ remove_from_collection(struct sysfs_buffer *buffer, struct inode *node) */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { - struct sysfs_dirent * sd = dentry->d_fsdata; - struct kobject * kobj = to_kobj(dentry->d_parent); + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; struct sysfs_ops * ops = buffer->ops; int ret = 0; ssize_t count; @@ -98,8 +98,15 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; - buffer->event = atomic_read(&sd->s_event); - count = ops->show(kobj, sd->s_elem.attr.attr, buffer->page); + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; + + buffer->event = atomic_read(&attr_sd->s_event); + count = ops->show(kobj, attr_sd->s_elem.attr.attr, buffer->page); + + sysfs_put_active_two(attr_sd); + BUG_ON(count > (ssize_t)PAGE_SIZE); if (count >= 0) { buffer->needs_read_fill = 0; @@ -195,14 +202,23 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t * passing the buffer that we acquired in fill_write_buffer(). */ -static int +static int flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; - struct kobject * kobj = to_kobj(dentry->d_parent); + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; struct sysfs_ops * ops = buffer->ops; + int rc; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; + + rc = ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count); + + sysfs_put_active_two(attr_sd); - return ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count); + return rc; } @@ -246,22 +262,22 @@ out: static int sysfs_open_file(struct inode *inode, struct file *file) { - struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct attribute *attr = attr_sd->s_elem.attr.attr; + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; struct sysfs_buffer_collection *set; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; - int error = 0; + int error; - if (!kobj || !attr) - goto Einval; + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; - /* Grab the module reference for this attribute if we have one */ - if (!try_module_get(attr->owner)) { - error = -ENODEV; - goto Done; - } + /* Grab the module reference for this attribute */ + error = -ENODEV; + if (!try_module_get(attr->owner)) + goto err_sput; /* if the kobject has no ktype, then we assume that it is a subsystem * itself, and use ops for it. @@ -276,30 +292,30 @@ static int sysfs_open_file(struct inode *inode, struct file *file) /* No sysfs operations, either from having no subsystem, * or the subsystem have no operations. */ + error = -EACCES; if (!ops) - goto Eaccess; + goto err_mput; /* make sure we have a collection to add our buffers to */ mutex_lock(&inode->i_mutex); if (!(set = inode->i_private)) { - if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) { - error = -ENOMEM; - goto Done; - } else { + error = -ENOMEM; + if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) + goto err_mput; + else INIT_LIST_HEAD(&set->associates); - } } mutex_unlock(&inode->i_mutex); + error = -EACCES; + /* File needs write support. * The inode's perms must say it's ok, * and we must have a store method. */ if (file->f_mode & FMODE_WRITE) { - if (!(inode->i_mode & S_IWUGO) || !ops->store) - goto Eaccess; - + goto err_mput; } /* File needs read support. @@ -308,46 +324,45 @@ static int sysfs_open_file(struct inode *inode, struct file *file) */ if (file->f_mode & FMODE_READ) { if (!(inode->i_mode & S_IRUGO) || !ops->show) - goto Eaccess; + goto err_mput; } /* No error? Great, allocate a buffer for the file, and store it * it in file->private_data for easy access. */ + error = -ENOMEM; buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); - if (buffer) { - INIT_LIST_HEAD(&buffer->associates); - init_MUTEX(&buffer->sem); - buffer->needs_read_fill = 1; - buffer->ops = ops; - add_to_collection(buffer, inode); - file->private_data = buffer; - } else - error = -ENOMEM; - goto Done; + if (!buffer) + goto err_mput; - Einval: - error = -EINVAL; - goto Done; - Eaccess: - error = -EACCES; + INIT_LIST_HEAD(&buffer->associates); + init_MUTEX(&buffer->sem); + buffer->needs_read_fill = 1; + buffer->ops = ops; + add_to_collection(buffer, inode); + file->private_data = buffer; + + /* open succeeded, put active references and pin attr_sd */ + sysfs_put_active_two(attr_sd); + sysfs_get(attr_sd); + return 0; + + err_mput: module_put(attr->owner); - Done: - if (error) - kobject_put(kobj); + err_sput: + sysfs_put_active_two(attr_sd); return error; } static int sysfs_release(struct inode * inode, struct file * filp) { - struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; struct attribute *attr = attr_sd->s_elem.attr.attr; struct sysfs_buffer * buffer = filp->private_data; if (buffer) remove_from_collection(buffer, inode); - kobject_put(kobj); + sysfs_put(attr_sd); /* After this point, attr should not be accessed. */ module_put(attr->owner); @@ -376,18 +391,25 @@ static int sysfs_release(struct inode * inode, struct file * filp) static unsigned int sysfs_poll(struct file *filp, poll_table *wait) { struct sysfs_buffer * buffer = filp->private_data; - struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); - struct sysfs_dirent * sd = filp->f_path.dentry->d_fsdata; - int res = 0; + struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; + + /* need parent for the kobj, grab both */ + if (!sysfs_get_active_two(attr_sd)) + goto trigger; poll_wait(filp, &kobj->poll, wait); - if (buffer->event != atomic_read(&sd->s_event)) { - res = POLLERR|POLLPRI; - buffer->needs_read_fill = 1; - } + sysfs_put_active_two(attr_sd); - return res; + if (buffer->event != atomic_read(&attr_sd->s_event)) + goto trigger; + + return 0; + + trigger: + buffer->needs_read_fill = 1; + return POLLERR|POLLPRI; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 617d10cea07d..7b9a8f132d5a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -277,12 +277,16 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) if (!strcmp(sd->s_name, name)) { list_del_init(&sd->s_sibling); sysfs_drop_dentry(sd, dir); - sysfs_put(sd); found = 1; break; } } mutex_unlock(&dir->d_inode->i_mutex); - return found ? 0 : -ENOENT; + if (!found) + return -ENOENT; + + sysfs_deactivate(sd); + sysfs_put(sd); + return 0; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 60717660ac55..d998e8e27841 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -14,8 +14,14 @@ struct sysfs_elem_bin_attr { struct bin_attribute * bin_attr; }; +/* + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ struct sysfs_dirent { atomic_t s_count; + struct rw_semaphore s_active; struct sysfs_dirent * s_parent; struct list_head s_sibling; struct list_head s_children; @@ -36,6 +42,17 @@ struct sysfs_dirent { atomic_t s_event; }; +/* + * A sysfs file which deletes another file when written to need to + * write lock the s_active of the victim while its s_active is read + * locked for the write operation. Tell lockdep that this is okay. + */ +enum sysfs_s_active_class +{ + SYSFS_S_ACTIVE_NORMAL, /* file r/w access, etc - default */ + SYSFS_S_ACTIVE_DEACTIVATE, /* file deactivation */ +}; + extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; @@ -87,43 +104,107 @@ struct sysfs_buffer_collection { struct list_head associates; }; -static inline struct kobject * to_kobj(struct dentry * dentry) +static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) { - struct sysfs_dirent * sd = dentry->d_fsdata; - return sd->s_elem.dir.kobj; + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; } -static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) +static inline void sysfs_put(struct sysfs_dirent * sd) { - struct kobject * kobj = NULL; - - spin_lock(&dcache_lock); - if (!d_unhashed(dentry)) { - struct sysfs_dirent * sd = dentry->d_fsdata; - - if (sd->s_type & SYSFS_KOBJ_LINK) - sd = sd->s_elem.symlink.target_sd; + if (sd && atomic_dec_and_test(&sd->s_count)) + release_sysfs_dirent(sd); +} - kobj = kobject_get(sd->s_elem.dir.kobj); +/** + * sysfs_get_active - get an active reference to sysfs_dirent + * @sd: sysfs_dirent to get an active reference to + * + * Get an active reference of @sd. This function is noop if @sd + * is NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +static inline struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +{ + if (sd) { + if (unlikely(!down_read_trylock(&sd->s_active))) + sd = NULL; } - spin_unlock(&dcache_lock); + return sd; +} - return kobj; +/** + * sysfs_put_active - put an active reference to sysfs_dirent + * @sd: sysfs_dirent to put an active reference to + * + * Put an active reference to @sd. This function is noop if @sd + * is NULL. + */ +static inline void sysfs_put_active(struct sysfs_dirent *sd) +{ + if (sd) + up_read(&sd->s_active); } -static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) +/** + * sysfs_get_active_two - get active references to sysfs_dirent and parent + * @sd: sysfs_dirent of interest + * + * Get active reference to @sd and its parent. Parent's active + * reference is grabbed first. This function is noop if @sd is + * NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +static inline struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd) { if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); + if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent))) + return NULL; + if (unlikely(!sysfs_get_active(sd))) { + sysfs_put_active(sd->s_parent); + return NULL; + } } return sd; } -static inline void sysfs_put(struct sysfs_dirent * sd) +/** + * sysfs_put_active_two - put active references to sysfs_dirent and parent + * @sd: sysfs_dirent of interest + * + * Put active references to @sd and its parent. This function is + * noop if @sd is NULL. + */ +static inline void sysfs_put_active_two(struct sysfs_dirent *sd) { - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); + if (sd) { + sysfs_put_active(sd); + sysfs_put_active(sd->s_parent); + } +} + +/** + * sysfs_deactivate - deactivate sysfs_dirent + * @sd: sysfs_dirent to deactivate + * + * Deny new active references and drain existing ones. s_active + * will be unlocked when the sysfs_dirent is released. + */ +static inline void sysfs_deactivate(struct sysfs_dirent *sd) +{ + down_write_nested(&sd->s_active, SYSFS_S_ACTIVE_DEACTIVATE); + + /* s_active will be unlocked by the thread doing the final put + * on @sd. Lie to lockdep. + */ + rwsem_release(&sd->s_active.dep_map, 1, _RET_IP_); } static inline int sysfs_is_shadowed_inode(struct inode *inode) -- cgit v1.2.1 From 73107cb3ad3963c0f929ae681c05081eafb1c079 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:16 +0900 Subject: sysfs: kill attribute file orphaning Now that sysfs_dirent can be disconnected from kobject on deletion, there is no need to orphan each attribute files. All [bin_]attribute nodes are automatically orphaned when the parent node is deleted. Kill attribute file orphaning. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 65 ++++++++++++-------------------------------------------- fs/sysfs/inode.c | 25 ---------------------- fs/sysfs/mount.c | 8 ------- fs/sysfs/sysfs.h | 16 -------------- 4 files changed, 13 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 310430baf572..d673d9b5d33f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -50,29 +50,15 @@ static struct sysfs_ops subsys_sysfs_ops = { .store = subsys_attr_store, }; -/** - * add_to_collection - add buffer to a collection - * @buffer: buffer to be added - * @node: inode of set to add to - */ - -static inline void -add_to_collection(struct sysfs_buffer *buffer, struct inode *node) -{ - struct sysfs_buffer_collection *set = node->i_private; - - mutex_lock(&node->i_mutex); - list_add(&buffer->associates, &set->associates); - mutex_unlock(&node->i_mutex); -} - -static inline void -remove_from_collection(struct sysfs_buffer *buffer, struct inode *node) -{ - mutex_lock(&node->i_mutex); - list_del(&buffer->associates); - mutex_unlock(&node->i_mutex); -} +struct sysfs_buffer { + size_t count; + loff_t pos; + char * page; + struct sysfs_ops * ops; + struct semaphore sem; + int needs_read_fill; + int event; +}; /** * fill_read_buffer - allocate and fill buffer from object. @@ -144,10 +130,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) down(&buffer->sem); if (buffer->needs_read_fill) { - if (buffer->orphaned) - retval = -ENODEV; - else - retval = fill_read_buffer(file->f_path.dentry,buffer); + retval = fill_read_buffer(file->f_path.dentry,buffer); if (retval) goto out; } @@ -246,16 +229,11 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t ssize_t len; down(&buffer->sem); - if (buffer->orphaned) { - len = -ENODEV; - goto out; - } len = fill_write_buffer(buffer, buf, count); if (len > 0) len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; -out: up(&buffer->sem); return len; } @@ -265,7 +243,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct attribute *attr = attr_sd->s_elem.attr.attr; struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; - struct sysfs_buffer_collection *set; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; int error; @@ -289,26 +266,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file) else ops = &subsys_sysfs_ops; + error = -EACCES; + /* No sysfs operations, either from having no subsystem, * or the subsystem have no operations. */ - error = -EACCES; if (!ops) goto err_mput; - /* make sure we have a collection to add our buffers to */ - mutex_lock(&inode->i_mutex); - if (!(set = inode->i_private)) { - error = -ENOMEM; - if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) - goto err_mput; - else - INIT_LIST_HEAD(&set->associates); - } - mutex_unlock(&inode->i_mutex); - - error = -EACCES; - /* File needs write support. * The inode's perms must say it's ok, * and we must have a store method. @@ -335,11 +300,9 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (!buffer) goto err_mput; - INIT_LIST_HEAD(&buffer->associates); init_MUTEX(&buffer->sem); buffer->needs_read_fill = 1; buffer->ops = ops; - add_to_collection(buffer, inode); file->private_data = buffer; /* open succeeded, put active references and pin attr_sd */ @@ -358,10 +321,8 @@ static int sysfs_release(struct inode * inode, struct file * filp) { struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; struct attribute *attr = attr_sd->s_elem.attr.attr; - struct sysfs_buffer * buffer = filp->private_data; + struct sysfs_buffer *buffer = filp->private_data; - if (buffer) - remove_from_collection(buffer, inode); sysfs_put(attr_sd); /* After this point, attr should not be accessed. */ module_put(attr->owner); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 7b9a8f132d5a..d9ccc830b73a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -191,24 +191,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) return error; } -static inline void orphan_all_buffers(struct inode *node) -{ - struct sysfs_buffer_collection *set; - struct sysfs_buffer *buf; - - mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD); - set = node->i_private; - if (set) { - list_for_each_entry(buf, &set->associates, associates) { - down(&buf->sem); - buf->orphaned = 1; - up(&buf->sem); - } - } - mutex_unlock(&node->i_mutex); -} - - /* * Unhashes the dentry corresponding to given sysfs_dirent * Called with parent inode's i_mutex held. @@ -216,7 +198,6 @@ static inline void orphan_all_buffers(struct inode *node) void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) { struct dentry *dentry = NULL; - struct inode *inode; /* We're not holding a reference to ->s_dentry dentry but the * field will stay valid as long as sysfs_lock is held. @@ -236,17 +217,11 @@ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!d_unhashed(dentry) && dentry->d_inode) { - inode = dentry->d_inode; - spin_lock(&inode->i_lock); - __iget(inode); - spin_unlock(&inode->i_lock); dget_locked(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - orphan_all_buffers(inode); - iput(inode); } else { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8f6d8b1b211f..37ff9ffc55f0 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,12 +19,9 @@ struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; struct kmem_cache *sysfs_dir_cachep; -static void sysfs_clear_inode(struct inode *inode); - static const struct super_operations sysfs_ops = { .statfs = simple_statfs, .drop_inode = sysfs_delete_inode, - .clear_inode = sysfs_clear_inode, }; static struct sysfs_dirent sysfs_root = { @@ -36,11 +33,6 @@ static struct sysfs_dirent sysfs_root = { .s_ino = 1, }; -static void sysfs_clear_inode(struct inode *inode) -{ - kfree(inode->i_private); -} - static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d998e8e27841..8b09e9d882c2 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -88,22 +88,6 @@ extern const struct file_operations bin_fops; extern const struct inode_operations sysfs_dir_inode_operations; extern const struct inode_operations sysfs_symlink_inode_operations; -struct sysfs_buffer { - struct list_head associates; - size_t count; - loff_t pos; - char * page; - struct sysfs_ops * ops; - struct semaphore sem; - int orphaned; - int needs_read_fill; - int event; -}; - -struct sysfs_buffer_collection { - struct list_head associates; -}; - static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) { if (sd) { -- cgit v1.2.1 From 198a2a847015805c6f57d8cc732bdaaccb494007 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:16 +0900 Subject: sysfs: separate out sysfs_attach_dentry() Consolidate sd <-> dentry association into sysfs_attach_dentry() and call it after dentry and inode are properly set up. This is in preparation of sysfs_drop_dentry() updates. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 59 +++++++++++++++++++++++++------------------------------- fs/sysfs/inode.c | 4 ++-- fs/sysfs/sysfs.h | 3 ++- 3 files changed, 30 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e0d377aaf2cc..01eeb4b954b1 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -142,14 +142,24 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) return NULL; } +static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry) +{ + dentry->d_op = &sysfs_dentry_ops; + dentry->d_fsdata = sysfs_get(sd); + + /* protect sd->s_dentry against sysfs_d_iput */ + spin_lock(&sysfs_lock); + sd->s_dentry = dentry; + spin_unlock(&sysfs_lock); + + d_rehash(dentry); +} + void sysfs_attach_dirent(struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd, struct dentry *dentry) { - if (dentry) { - sd->s_dentry = dentry; - dentry->d_fsdata = sysfs_get(sd); - dentry->d_op = &sysfs_dentry_ops; - } + if (dentry) + sysfs_attach_dentry(sd, dentry); if (parent_sd) { sd->s_parent = sysfs_get(parent_sd); @@ -229,15 +239,13 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, if (!sd) goto out_drop; sd->s_elem.dir.kobj = kobj; - sysfs_attach_dirent(sd, parent->d_fsdata, dentry); - error = sysfs_create(dentry, mode, init_dir); + error = sysfs_create(sd, dentry, mode, init_dir); if (error) goto out_sput; inc_nlink(parent->d_inode); - dentry->d_op = &sysfs_dentry_ops; - d_rehash(dentry); + sysfs_attach_dirent(sd, parent->d_fsdata, dentry); *p_dentry = dentry; error = 0; @@ -308,42 +316,28 @@ static int sysfs_attach_attr(struct sysfs_dirent * sd, struct dentry * dentry) init = init_file; } - dentry->d_fsdata = sysfs_get(sd); - /* protect sd->s_dentry against sysfs_d_iput */ - spin_lock(&sysfs_lock); - sd->s_dentry = dentry; - spin_unlock(&sysfs_lock); - error = sysfs_create(dentry, (attr->mode & S_IALLUGO) | S_IFREG, init); - if (error) { - sysfs_put(sd); + error = sysfs_create(sd, dentry, + (attr->mode & S_IALLUGO) | S_IFREG, init); + if (error) return error; - } if (bin_attr) { dentry->d_inode->i_size = bin_attr->size; dentry->d_inode->i_fop = &bin_fops; } - dentry->d_op = &sysfs_dentry_ops; - d_rehash(dentry); + + sysfs_attach_dentry(sd, dentry); return 0; } static int sysfs_attach_link(struct sysfs_dirent * sd, struct dentry * dentry) { - int err = 0; + int err; - dentry->d_fsdata = sysfs_get(sd); - /* protect sd->s_dentry against sysfs_d_iput */ - spin_lock(&sysfs_lock); - sd->s_dentry = dentry; - spin_unlock(&sysfs_lock); - err = sysfs_create(dentry, S_IFLNK|S_IRWXUGO, init_symlink); - if (!err) { - dentry->d_op = &sysfs_dentry_ops; - d_rehash(dentry); - } else - sysfs_put(sd); + err = sysfs_create(sd, dentry, S_IFLNK|S_IRWXUGO, init_symlink); + if (!err) + sysfs_attach_dentry(sd, dentry); return err; } @@ -773,7 +767,6 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) d_instantiate(shadow, igrab(inode)); inc_nlink(inode); inc_nlink(parent->d_inode); - shadow->d_op = &sysfs_dentry_ops; dget(shadow); /* Extra count - pin the dentry in core */ diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index d9ccc830b73a..88857a399d0c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -156,13 +156,13 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) return inode; } -int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +int sysfs_create(struct sysfs_dirent *sd, struct dentry *dentry, int mode, + int (*init)(struct inode *)) { int error = 0; struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - struct sysfs_dirent * sd = dentry->d_fsdata; if ((inode = sysfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 8b09e9d882c2..9fa77d648aa5 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -58,7 +58,8 @@ extern struct kmem_cache *sysfs_dir_cachep; extern void sysfs_delete_inode(struct inode *inode); extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); -extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern int sysfs_create(struct sysfs_dirent *sd, struct dentry *dentry, + int mode, int (*init)(struct inode *)); extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); -- cgit v1.2.1 From dbde0fcf9f8f6d477af3c32d9979e789ee680cde Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:16 +0900 Subject: sysfs: reimplement sysfs_drop_dentry() This patch reimplements sysfs_drop_dentry() such that remove_dir() can use it to drop dentry instead of using a separate mechanism. With this change, making directories reclaimable is much easier. This patch used to contain fixes for two race conditions around sd->s_dentry but that part has been separated out and included into mainline early as commit 6aa054aadfea613a437ad0b15d38eca2b963fc0a and dd14cbc994709a1c5a64ed3621f583c49a27e521. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 18 ++++--------- fs/sysfs/inode.c | 82 +++++++++++++++++++++++++++++++++++++++++--------------- fs/sysfs/sysfs.h | 2 +- 3 files changed, 67 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 01eeb4b954b1..bc11a263aa53 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -372,22 +372,19 @@ const struct inode_operations sysfs_dir_inode_operations = { static void remove_dir(struct dentry * d) { - struct dentry * parent = dget(d->d_parent); - struct sysfs_dirent * sd; + struct dentry *parent = d->d_parent; + struct sysfs_dirent *sd = d->d_fsdata; mutex_lock(&parent->d_inode->i_mutex); - d_delete(d); - sd = d->d_fsdata; + list_del_init(&sd->s_sibling); - if (d->d_inode) - simple_rmdir(parent->d_inode,d); pr_debug(" o %s removing done (%d)\n",d->d_name.name, atomic_read(&d->d_count)); mutex_unlock(&parent->d_inode->i_mutex); - dput(parent); + sysfs_drop_dentry(sd); sysfs_deactivate(sd); sysfs_put(sd); } @@ -404,7 +401,6 @@ static void __sysfs_remove_dir(struct dentry *dentry) struct sysfs_dirent * parent_sd; struct sysfs_dirent * sd, * tmp; - dget(dentry); if (!dentry) return; @@ -415,21 +411,17 @@ static void __sysfs_remove_dir(struct dentry *dentry) if (!sd->s_type || !(sd->s_type & SYSFS_NOT_PINNED)) continue; list_move(&sd->s_sibling, &removed); - sysfs_drop_dentry(sd, dentry); } mutex_unlock(&dentry->d_inode->i_mutex); list_for_each_entry_safe(sd, tmp, &removed, s_sibling) { list_del_init(&sd->s_sibling); + sysfs_drop_dentry(sd); sysfs_deactivate(sd); sysfs_put(sd); } remove_dir(dentry); - /** - * Drop reference from dget() on entrance. - */ - dput(dentry); } /** diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 88857a399d0c..6ad47c13b94d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -191,13 +191,25 @@ int sysfs_create(struct sysfs_dirent *sd, struct dentry *dentry, int mode, return error; } -/* - * Unhashes the dentry corresponding to given sysfs_dirent - * Called with parent inode's i_mutex held. +/** + * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent + * @sd: target sysfs_dirent + * + * Drop dentry for @sd. @sd must have been unlinked from its + * parent on entry to this function such that it can't be looked + * up anymore. + * + * @sd->s_dentry which is protected with sysfs_lock points to the + * currently associated dentry but we're not holding a reference + * to it and racing with dput(). Grab dcache_lock and verify + * dentry before dropping it. If @sd->s_dentry is NULL or dput() + * beats us, no need to bother. */ -void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) +void sysfs_drop_dentry(struct sysfs_dirent *sd) { - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *parent = NULL; + struct inode *dir; + struct timespec curtime; /* We're not holding a reference to ->s_dentry dentry but the * field will stay valid as long as sysfs_lock is held. @@ -205,30 +217,57 @@ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) spin_lock(&sysfs_lock); spin_lock(&dcache_lock); - /* dget dentry if it's still alive */ - if (sd->s_dentry && sd->s_dentry->d_inode) + if (sd->s_dentry && sd->s_dentry->d_inode) { + /* get dentry if it's there and dput() didn't kill it yet */ dentry = dget_locked(sd->s_dentry); + parent = dentry->d_parent; + } else if (sd->s_parent->s_dentry->d_inode) { + /* We need to update the parent even if dentry for the + * victim itself doesn't exist. + */ + parent = dget_locked(sd->s_parent->s_dentry); + } + + /* drop */ + if (dentry) { + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + } spin_unlock(&dcache_lock); spin_unlock(&sysfs_lock); - /* drop dentry */ + /* nothing to do if the parent isn't in dcache */ + if (!parent) + return; + + /* adjust nlink and update timestamp */ + dir = parent->d_inode; + mutex_lock(&dir->i_mutex); + + curtime = CURRENT_TIME; + + dir->i_ctime = dir->i_mtime = curtime; + if (dentry) { - spin_lock(&dcache_lock); - spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && dentry->d_inode) { - dget_locked(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - simple_unlink(parent->d_inode, dentry); - } else { - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + dentry->d_inode->i_ctime = curtime; + drop_nlink(dentry->d_inode); + if (sd->s_type & SYSFS_DIR) { + drop_nlink(dentry->d_inode); + drop_nlink(dir); + /* XXX: unpin if directory, this will go away soon */ + dput(dentry); } + } + + mutex_unlock(&dir->i_mutex); + /* bye bye */ + if (dentry) dput(dentry); - } + else + dput(parent); } int sysfs_hash_and_remove(struct dentry * dir, const char * name) @@ -251,7 +290,6 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) continue; if (!strcmp(sd->s_name, name)) { list_del_init(&sd->s_sibling); - sysfs_drop_dentry(sd, dir); found = 1; break; } @@ -261,7 +299,9 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) if (!found) return -ENOENT; + sysfs_drop_dentry(sd); sysfs_deactivate(sd); sysfs_put(sd); + return 0; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 9fa77d648aa5..fc6aa863b947 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -76,7 +76,7 @@ extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * na extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); -extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent); +extern void sysfs_drop_dentry(struct sysfs_dirent *sd); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern spinlock_t sysfs_lock; -- cgit v1.2.1 From 7b595756ec1f49e0049a9e01a1298d53a7faaa15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: kill unnecessary attribute->owner sysfs is now completely out of driver/module lifetime game. After deletion, a sysfs node doesn't access anything outside sysfs proper, so there's no reason to hold onto the attribute owners. Note that often the wrong modules were accounted for as owners leading to accessing removed modules. This patch kills now unnecessary attribute->owner. Note that with this change, userland holding a sysfs node does not prevent the backing module from being unloaded. For more info regarding lifetime rule cleanup, please read the following message. http://article.gmane.org/gmane.linux.kernel/510293 (tweaked by Greg to not delete the field just yet, to make it easier to merge things properly.) Signed-off-by: Tejun Heo Cc: Cornelia Huck Cc: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/main.c | 2 -- fs/ocfs2/cluster/masklog.c | 1 - fs/partitions/check.c | 1 - fs/sysfs/bin.c | 19 +++++-------------- fs/sysfs/file.c | 21 +++++---------------- 5 files changed, 10 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 606128f5c927..02ca6f1e55d7 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -840,8 +840,6 @@ static int __init ecryptfs_init(void) goto out; } kobj_set_kset_s(&ecryptfs_subsys, fs_subsys); - sysfs_attr_version.attr.owner = THIS_MODULE; - sysfs_attr_version_str.attr.owner = THIS_MODULE; rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 2b205f5d5790..e9e042b93dbf 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -74,7 +74,6 @@ struct mlog_attribute { #define define_mask(_name) { \ .attr = { \ .name = #_name, \ - .owner = THIS_MODULE, \ .mode = S_IRUGO | S_IWUSR, \ }, \ .mask = ML_##_name, \ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 9a3a058f3553..98e0b85a9bb2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -397,7 +397,6 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, static struct attribute addpartattr = { .name = "whole_disk", .mode = S_IRUSR | S_IRGRP | S_IROTH, - .owner = THIS_MODULE, }; sysfs_create_file(&p->kobj, &addpartattr); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 618b8aea6a7b..3c5574a40b09 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -175,25 +175,20 @@ static int open(struct inode * inode, struct file * file) if (!sysfs_get_active(attr_sd)) return -ENODEV; - /* Grab the module reference for this attribute */ - error = -ENODEV; - if (!try_module_get(attr->attr.owner)) - goto err_sput; - error = -EACCES; if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) - goto err_mput; + goto err_out; if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) - goto err_mput; + goto err_out; error = -ENOMEM; bb = kzalloc(sizeof(*bb), GFP_KERNEL); if (!bb) - goto err_mput; + goto err_out; bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!bb->buffer) - goto err_mput; + goto err_out; mutex_init(&bb->mutex); file->private_data = bb; @@ -203,9 +198,7 @@ static int open(struct inode * inode, struct file * file) sysfs_get(attr_sd); return 0; - err_mput: - module_put(attr->attr.owner); - err_sput: + err_out: sysfs_put_active(attr_sd); kfree(bb); return error; @@ -214,13 +207,11 @@ static int open(struct inode * inode, struct file * file) static int release(struct inode * inode, struct file * file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; struct bin_buffer *bb = file->private_data; if (bb->mmapped) sysfs_put_active_two(attr_sd); sysfs_put(attr_sd); - module_put(attr->attr.owner); kfree(bb->buffer); kfree(bb); return 0; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d673d9b5d33f..a84b734f7b29 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -241,7 +241,6 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t static int sysfs_open_file(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct attribute *attr = attr_sd->s_elem.attr.attr; struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; @@ -251,11 +250,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (!sysfs_get_active_two(attr_sd)) return -ENODEV; - /* Grab the module reference for this attribute */ - error = -ENODEV; - if (!try_module_get(attr->owner)) - goto err_sput; - /* if the kobject has no ktype, then we assume that it is a subsystem * itself, and use ops for it. */ @@ -272,7 +266,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) * or the subsystem have no operations. */ if (!ops) - goto err_mput; + goto err_out; /* File needs write support. * The inode's perms must say it's ok, @@ -280,7 +274,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) */ if (file->f_mode & FMODE_WRITE) { if (!(inode->i_mode & S_IWUGO) || !ops->store) - goto err_mput; + goto err_out; } /* File needs read support. @@ -289,7 +283,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) */ if (file->f_mode & FMODE_READ) { if (!(inode->i_mode & S_IRUGO) || !ops->show) - goto err_mput; + goto err_out; } /* No error? Great, allocate a buffer for the file, and store it @@ -298,7 +292,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) error = -ENOMEM; buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); if (!buffer) - goto err_mput; + goto err_out; init_MUTEX(&buffer->sem); buffer->needs_read_fill = 1; @@ -310,9 +304,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) sysfs_get(attr_sd); return 0; - err_mput: - module_put(attr->owner); - err_sput: + err_out: sysfs_put_active_two(attr_sd); return error; } @@ -320,12 +312,9 @@ static int sysfs_open_file(struct inode *inode, struct file *file) static int sysfs_release(struct inode * inode, struct file * filp) { struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct attribute *attr = attr_sd->s_elem.attr.attr; struct sysfs_buffer *buffer = filp->private_data; sysfs_put(attr_sd); - /* After this point, attr should not be accessed. */ - module_put(attr->owner); if (buffer) { if (buffer->page) -- cgit v1.2.1 From 42b37df6abb42ae021e15bf865b43f3629c7f3ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: make sysfs_alloc_ino() static sysfs_alloc_ino() isn't used out side of fs/sysfs/dir.c. Make it static. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index bc11a263aa53..a63d12e4be56 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -20,7 +20,7 @@ spinlock_t kobj_sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; static DEFINE_IDA(sysfs_ino_ida); -int sysfs_alloc_ino(ino_t *pino) +static int sysfs_alloc_ino(ino_t *pino) { int ino, rc; -- cgit v1.2.1 From 7f7cfffe60ed6271c4028ec79ae1c297b44bcb14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: fix parent refcounting during rename and move Parent reference wasn't properly transferred during rename and move. Fix it. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a63d12e4be56..a26e3db89432 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -497,6 +497,9 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, d_move(kobj->dentry, new_dentry); list_del_init(&sd->s_sibling); + sysfs_get(parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = parent_sd; list_add(&sd->s_sibling, &parent_sd->s_children); error = 0; @@ -550,6 +553,9 @@ again: /* Remove from old parent's list and insert into new parent's list. */ list_del_init(&sd->s_sibling); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = new_parent_sd; list_add(&sd->s_sibling, &new_parent_sd->s_children); out: -- cgit v1.2.1 From fc9f54b9982e14e6dbe023425c87ffbfd6992c45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: reorganize sysfs_new_indoe() and sysfs_create() Reorganize/clean up sysfs_new_inode() and sysfs_create(). * sysfs_init_inode() is separated out from sysfs_new_inode() and is responsible for basic initialization. * sysfs_instantiate() replaces the last step of sysfs_create() and is responsible for dentry instantitaion. * type-specific initialization is moved out to the callers. * mode is specified only once when creating a sysfs_dirent. * spurious list_del_init(&sd->s_sibling) dropped from create_dir() This change is to * prepare for inode allocation fix. * separate alloc and init code for synchronization update. * make dentry/inode initialization more flexible for later changes. This patch doesn't introduce visible behavior change. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 130 ++++++++++++++++++++----------------------------------- fs/sysfs/inode.c | 108 +++++++++++++++++++++++---------------------- fs/sysfs/mount.c | 18 ++++---- fs/sysfs/sysfs.h | 6 +-- 4 files changed, 118 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a26e3db89432..bbf3525fd222 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -191,39 +191,18 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, return 0; } -static int init_dir(struct inode * inode) -{ - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - return 0; -} - -static int init_file(struct inode * inode) -{ - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - return 0; -} - -static int init_symlink(struct inode * inode) -{ - inode->i_op = &sysfs_symlink_inode_operations; - return 0; -} - static int create_dir(struct kobject *kobj, struct dentry *parent, const char *name, struct dentry **p_dentry) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct dentry *dentry; + struct inode *inode; struct sysfs_dirent *sd; mutex_lock(&parent->d_inode->i_mutex); + /* allocate */ dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -231,7 +210,7 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, } error = -EEXIST; - if (sysfs_dirent_exist(parent->d_fsdata, name)) + if (dentry->d_inode) goto out_dput; error = -ENOMEM; @@ -240,19 +219,31 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, goto out_drop; sd->s_elem.dir.kobj = kobj; - error = sysfs_create(sd, dentry, mode, init_dir); - if (error) + inode = sysfs_new_inode(sd); + if (!inode) goto out_sput; + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + + /* link in */ + error = -EEXIST; + if (sysfs_dirent_exist(parent->d_fsdata, name)) + goto out_iput; + + sysfs_instantiate(dentry, inode); inc_nlink(parent->d_inode); sysfs_attach_dirent(sd, parent->d_fsdata, dentry); *p_dentry = dentry; error = 0; - goto out_dput; + goto out_unlock; /* pin directory dentry in core */ + out_iput: + iput(inode); out_sput: - list_del_init(&sd->s_sibling); sysfs_put(sd); out_drop: d_drop(dentry); @@ -298,71 +289,46 @@ int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent) return error; } -/* attaches attribute's sysfs_dirent to the dentry corresponding to the - * attribute file - */ -static int sysfs_attach_attr(struct sysfs_dirent * sd, struct dentry * dentry) -{ - struct attribute * attr = NULL; - struct bin_attribute * bin_attr = NULL; - int (* init) (struct inode *) = NULL; - int error = 0; - - if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { - bin_attr = sd->s_elem.bin_attr.bin_attr; - attr = &bin_attr->attr; - } else { - attr = sd->s_elem.attr.attr; - init = init_file; - } - - error = sysfs_create(sd, dentry, - (attr->mode & S_IALLUGO) | S_IFREG, init); - if (error) - return error; - - if (bin_attr) { - dentry->d_inode->i_size = bin_attr->size; - dentry->d_inode->i_fop = &bin_fops; - } - - sysfs_attach_dentry(sd, dentry); - - return 0; -} - -static int sysfs_attach_link(struct sysfs_dirent * sd, struct dentry * dentry) -{ - int err; - - err = sysfs_create(sd, dentry, S_IFLNK|S_IRWXUGO, init_symlink); - if (!err) - sysfs_attach_dentry(sd, dentry); - - return err; -} - static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct sysfs_dirent * sd; - int err = 0; + struct inode *inode; + int found = 0; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_type & SYSFS_NOT_PINNED) { - if (strcmp(sd->s_name, dentry->d_name.name)) - continue; - - if (sd->s_type & SYSFS_KOBJ_LINK) - err = sysfs_attach_link(sd, dentry); - else - err = sysfs_attach_attr(sd, dentry); + if ((sd->s_type & SYSFS_NOT_PINNED) && + !strcmp(sd->s_name, dentry->d_name.name)) { + found = 1; break; } } - return ERR_PTR(err); + /* no such entry */ + if (!found) + return NULL; + + /* attach dentry and inode */ + inode = sysfs_new_inode(sd); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* initialize inode according to type */ + if (sd->s_type & SYSFS_KOBJ_ATTR) { + inode->i_size = PAGE_SIZE; + inode->i_fop = &sysfs_file_operations; + } else if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { + struct bin_attribute *bin_attr = sd->s_elem.bin_attr.bin_attr; + inode->i_size = bin_attr->size; + inode->i_fop = &bin_fops; + } else if (sd->s_type & SYSFS_KOBJ_LINK) + inode->i_op = &sysfs_symlink_inode_operations; + + sysfs_instantiate(dentry, inode); + sysfs_attach_dentry(sd, dentry); + + return NULL; } const struct inode_operations sysfs_dir_inode_operations = { diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 6ad47c13b94d..26d8503c8997 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -133,62 +133,68 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) */ static struct lock_class_key sysfs_inode_imutex_key; -struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) +void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { - struct inode * inode = new_inode(sysfs_sb); - if (inode) { - inode->i_blocks = 0; - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; - inode->i_ino = sd->s_ino; - lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - - if (sd->s_iattr) { - /* sysfs_dirent has non-default attributes - * get them for the new inode from persistent copy - * in sysfs_dirent - */ - set_inode_attr(inode, sd->s_iattr); - } else - set_default_inode_attr(inode, mode); - } + inode->i_blocks = 0; + inode->i_mapping->a_ops = &sysfs_aops; + inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; + inode->i_op = &sysfs_inode_operations; + inode->i_ino = sd->s_ino; + lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, sd->s_mode); +} + +/** + * sysfs_new_inode - allocate new inode for sysfs_dirent + * @sd: sysfs_dirent to allocate inode for + * + * Allocate inode for @sd and initialize basics. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode * sysfs_new_inode(struct sysfs_dirent *sd) +{ + struct inode *inode; + + inode = new_inode(sysfs_sb); + if (inode) + sysfs_init_inode(sd, inode); + return inode; } -int sysfs_create(struct sysfs_dirent *sd, struct dentry *dentry, int mode, - int (*init)(struct inode *)) +/** + * sysfs_instantiate - instantiate dentry + * @dentry: dentry to be instantiated + * @inode: inode associated with @sd + * + * Instantiate @dentry with @inode. + * + * LOCKING: + * None. + */ +void sysfs_instantiate(struct dentry *dentry, struct inode *inode) { - int error = 0; - struct inode * inode = NULL; - if (dentry) { - if (!dentry->d_inode) { - if ((inode = sysfs_new_inode(mode, sd))) { - if (dentry->d_parent && dentry->d_parent->d_inode) { - struct inode *p_inode = dentry->d_parent->d_inode; - p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; - } - goto Proceed; - } - else - error = -ENOMEM; - } else - error = -EEXIST; - } else - error = -ENOENT; - goto Done; - - Proceed: - if (init) - error = init(inode); - if (!error) { - d_instantiate(dentry, inode); - if (S_ISDIR(mode)) - dget(dentry); /* pin only directory dentry in core */ - } else - iput(inode); - Done: - return error; + BUG_ON(!dentry || dentry->d_inode); + + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } + + d_instantiate(dentry, inode); } /** diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 37ff9ffc55f0..6d3a6249d21c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -29,6 +29,7 @@ static struct sysfs_dirent sysfs_root = { .s_sibling = LIST_HEAD_INIT(sysfs_root.s_sibling), .s_children = LIST_HEAD_INIT(sysfs_root.s_children), .s_type = SYSFS_ROOT, + .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, .s_iattr = NULL, .s_ino = 1, }; @@ -45,18 +46,19 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sysfs_sb = sb; - inode = sysfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, - &sysfs_root); - if (inode) { - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - } else { + inode = new_inode(sysfs_sb); + if (!inode) { pr_debug("sysfs: could not get root inode\n"); return -ENOMEM; } + sysfs_init_inode(&sysfs_root, inode); + + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + root = d_alloc_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__FUNCTION__); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index fc6aa863b947..143fdbe56c14 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -57,9 +57,9 @@ extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; extern void sysfs_delete_inode(struct inode *inode); -extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); -extern int sysfs_create(struct sysfs_dirent *sd, struct dentry *dentry, - int mode, int (*init)(struct inode *)); +extern void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode); +extern struct inode * sysfs_new_inode(struct sysfs_dirent *sd); +extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode); extern void release_sysfs_dirent(struct sysfs_dirent * sd); extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); -- cgit v1.2.1 From 8312a8d7c1d19d31027bd4ca127ce671962c23d4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: use iget_locked() instead of new_inode() After dentry is reclaimed, sysfs always used to allocate new dentry and inode if the file is accessed again. This causes problem with operations which only pin the inode. For example, if inotify watch is added to a sysfs file and the dentry for the file is reclaimed, the next update event creates new dentry and new inode making the inotify watch miss all the events from there on. This patch fixes it by using iget_locked() instead of new_inode(). sysfs_new_inode() is renamed to sysfs_get_inode() and inode is initialized iff the inode is newly allocated. sysfs_instantiate() is responsible for unlocking new inodes. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 37 +++++++++++++++++++++---------------- fs/sysfs/inode.c | 24 +++++++++++++++--------- fs/sysfs/sysfs.h | 2 +- 3 files changed, 37 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index bbf3525fd222..06dff2c30c9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -219,14 +219,16 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, goto out_drop; sd->s_elem.dir.kobj = kobj; - inode = sysfs_new_inode(sd); + inode = sysfs_get_inode(sd); if (!inode) goto out_sput; - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); + if (inode->i_state & I_NEW) { + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for ".") */ + inc_nlink(inode); + } /* link in */ error = -EEXIST; @@ -310,20 +312,23 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; /* attach dentry and inode */ - inode = sysfs_new_inode(sd); + inode = sysfs_get_inode(sd); if (!inode) return ERR_PTR(-ENOMEM); - /* initialize inode according to type */ - if (sd->s_type & SYSFS_KOBJ_ATTR) { - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - } else if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { - struct bin_attribute *bin_attr = sd->s_elem.bin_attr.bin_attr; - inode->i_size = bin_attr->size; - inode->i_fop = &bin_fops; - } else if (sd->s_type & SYSFS_KOBJ_LINK) - inode->i_op = &sysfs_symlink_inode_operations; + if (inode->i_state & I_NEW) { + /* initialize inode according to type */ + if (sd->s_type & SYSFS_KOBJ_ATTR) { + inode->i_size = PAGE_SIZE; + inode->i_fop = &sysfs_file_operations; + } else if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { + struct bin_attribute *bin_attr = + sd->s_elem.bin_attr.bin_attr; + inode->i_size = bin_attr->size; + inode->i_fop = &bin_fops; + } else if (sd->s_type & SYSFS_KOBJ_LINK) + inode->i_op = &sysfs_symlink_inode_operations; + } sysfs_instantiate(dentry, inode); sysfs_attach_dentry(sd, dentry); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 26d8503c8997..3eab9c46a71b 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -153,10 +153,12 @@ void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) } /** - * sysfs_new_inode - allocate new inode for sysfs_dirent + * sysfs_get_inode - get inode for sysfs_dirent * @sd: sysfs_dirent to allocate inode for * - * Allocate inode for @sd and initialize basics. + * Get inode for @sd. If such inode doesn't exist, a new inode + * is allocated and basics are initialized. New inode is + * returned locked. * * LOCKING: * Kernel thread context (may sleep). @@ -164,12 +166,12 @@ void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. */ -struct inode * sysfs_new_inode(struct sysfs_dirent *sd) +struct inode * sysfs_get_inode(struct sysfs_dirent *sd) { struct inode *inode; - inode = new_inode(sysfs_sb); - if (inode) + inode = iget_locked(sysfs_sb, sd->s_ino); + if (inode && (inode->i_state & I_NEW)) sysfs_init_inode(sd, inode); return inode; @@ -180,7 +182,7 @@ struct inode * sysfs_new_inode(struct sysfs_dirent *sd) * @dentry: dentry to be instantiated * @inode: inode associated with @sd * - * Instantiate @dentry with @inode. + * Unlock @inode if locked and instantiate @dentry with @inode. * * LOCKING: * None. @@ -189,9 +191,13 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) { BUG_ON(!dentry || dentry->d_inode); - if (dentry->d_parent && dentry->d_parent->d_inode) { - struct inode *p_inode = dentry->d_parent->d_inode; - p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + if (inode->i_state & I_NEW) { + unlock_new_inode(inode); + + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } } d_instantiate(dentry, inode); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 143fdbe56c14..627bf3940dfa 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -58,7 +58,7 @@ extern struct kmem_cache *sysfs_dir_cachep; extern void sysfs_delete_inode(struct inode *inode); extern void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode); -extern struct inode * sysfs_new_inode(struct sysfs_dirent *sd); +extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd); extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode); extern void release_sysfs_dirent(struct sysfs_dirent * sd); -- cgit v1.2.1 From 0b8ead82f5d9d8f08c0d1236f2e350b70a977753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:18 +0900 Subject: sysfs: fix root sysfs_dirent -> root dentry association The root sysfs_dirent didn't point to the root dentry fix it. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 6d3a6249d21c..d0e160307c5d 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -65,6 +65,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) iput(inode); return -ENOMEM; } + sysfs_root.s_dentry = root; root->d_fsdata = &sysfs_root; sb->s_root = root; return 0; -- cgit v1.2.1 From b6b4a4399c2a83d1af77c99dee0d0b5cc15ec268 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:18 +0900 Subject: sysfs: move s_active functions to fs/sysfs/dir.c These functions are about to receive more complexity and doesn't really need to be inlined in the first place. Move them from fs/sysfs/sysfs.h to fs/sysfs/dir.c. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 94 ++++---------------------------------------------------- 2 files changed, 94 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 06dff2c30c9b..f5f0b936f181 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -20,6 +20,94 @@ spinlock_t kobj_sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; static DEFINE_IDA(sysfs_ino_ida); +/** + * sysfs_get_active - get an active reference to sysfs_dirent + * @sd: sysfs_dirent to get an active reference to + * + * Get an active reference of @sd. This function is noop if @sd + * is NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +{ + if (sd) { + if (unlikely(!down_read_trylock(&sd->s_active))) + sd = NULL; + } + return sd; +} + +/** + * sysfs_put_active - put an active reference to sysfs_dirent + * @sd: sysfs_dirent to put an active reference to + * + * Put an active reference to @sd. This function is noop if @sd + * is NULL. + */ +void sysfs_put_active(struct sysfs_dirent *sd) +{ + if (sd) + up_read(&sd->s_active); +} + +/** + * sysfs_get_active_two - get active references to sysfs_dirent and parent + * @sd: sysfs_dirent of interest + * + * Get active reference to @sd and its parent. Parent's active + * reference is grabbed first. This function is noop if @sd is + * NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd) +{ + if (sd) { + if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent))) + return NULL; + if (unlikely(!sysfs_get_active(sd))) { + sysfs_put_active(sd->s_parent); + return NULL; + } + } + return sd; +} + +/** + * sysfs_put_active_two - put active references to sysfs_dirent and parent + * @sd: sysfs_dirent of interest + * + * Put active references to @sd and its parent. This function is + * noop if @sd is NULL. + */ +void sysfs_put_active_two(struct sysfs_dirent *sd) +{ + if (sd) { + sysfs_put_active(sd); + sysfs_put_active(sd->s_parent); + } +} + +/** + * sysfs_deactivate - deactivate sysfs_dirent + * @sd: sysfs_dirent to deactivate + * + * Deny new active references and drain existing ones. s_active + * will be unlocked when the sysfs_dirent is released. + */ +void sysfs_deactivate(struct sysfs_dirent *sd) +{ + down_write_nested(&sd->s_active, SYSFS_S_ACTIVE_DEACTIVATE); + + /* s_active will be unlocked by the thread doing the final put + * on @sd. Lie to lockdep. + */ + rwsem_release(&sd->s_active.dep_map, 1, _RET_IP_); +} + static int sysfs_alloc_ino(ino_t *pino) { int ino, rc; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 627bf3940dfa..f8779eaa53ff 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -56,6 +56,12 @@ enum sysfs_s_active_class extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; +extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); +extern void sysfs_put_active(struct sysfs_dirent *sd); +extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); +extern void sysfs_put_active_two(struct sysfs_dirent *sd); +extern void sysfs_deactivate(struct sysfs_dirent *sd); + extern void sysfs_delete_inode(struct inode *inode); extern void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode); extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd); @@ -104,94 +110,6 @@ static inline void sysfs_put(struct sysfs_dirent * sd) release_sysfs_dirent(sd); } -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -static inline struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (sd) { - if (unlikely(!down_read_trylock(&sd->s_active))) - sd = NULL; - } - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -static inline void sysfs_put_active(struct sysfs_dirent *sd) -{ - if (sd) - up_read(&sd->s_active); -} - -/** - * sysfs_get_active_two - get active references to sysfs_dirent and parent - * @sd: sysfs_dirent of interest - * - * Get active reference to @sd and its parent. Parent's active - * reference is grabbed first. This function is noop if @sd is - * NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -static inline struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd) -{ - if (sd) { - if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent))) - return NULL; - if (unlikely(!sysfs_get_active(sd))) { - sysfs_put_active(sd->s_parent); - return NULL; - } - } - return sd; -} - -/** - * sysfs_put_active_two - put active references to sysfs_dirent and parent - * @sd: sysfs_dirent of interest - * - * Put active references to @sd and its parent. This function is - * noop if @sd is NULL. - */ -static inline void sysfs_put_active_two(struct sysfs_dirent *sd) -{ - if (sd) { - sysfs_put_active(sd); - sysfs_put_active(sd->s_parent); - } -} - -/** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. s_active - * will be unlocked when the sysfs_dirent is released. - */ -static inline void sysfs_deactivate(struct sysfs_dirent *sd) -{ - down_write_nested(&sd->s_active, SYSFS_S_ACTIVE_DEACTIVATE); - - /* s_active will be unlocked by the thread doing the final put - * on @sd. Lie to lockdep. - */ - rwsem_release(&sd->s_active.dep_map, 1, _RET_IP_); -} - static inline int sysfs_is_shadowed_inode(struct inode *inode) { return S_ISDIR(inode->i_mode) && inode->i_op->follow_link; -- cgit v1.2.1 From 8619f979898397582e366877fd5feeba7560d70c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:18 +0900 Subject: sysfs: slim down sysfs_dirent->s_active Make sysfs_dirent->s_active an atomic_t instead of rwsem. This reduces the size of sysfs_dirent from 136 to 104 on 64bit and from 76 to 60 on 32bit with lock debugging turned off. With lock debugging turned on the reduction is much larger. s_active starts at zero and each active reference increments s_active. Putting a reference decrements s_active. Deactivation subtracts SD_DEACTIVATED_BIAS which is currently INT_MIN and assumed to be small enough to make s_active negative. If s_active is negative, sysfs_get() no longer grants new references. Deactivation succeeds immediately if there is no active user; otherwise, it waits using a completion for the last put. Due to the removal of lockdep tricks, this change makes things less trickier in release_sysfs_dirent(). As all the complexity is contained in three s_active functions, I think it's more readable this way. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 74 ++++++++++++++++++++++++++++++++++++-------------------- fs/sysfs/sysfs.h | 13 ++-------- 2 files changed, 50 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f5f0b936f181..40596a0eee52 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -32,11 +33,24 @@ static DEFINE_IDA(sysfs_ino_ida); */ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) { - if (sd) { - if (unlikely(!down_read_trylock(&sd->s_active))) - sd = NULL; + if (unlikely(!sd)) + return NULL; + + while (1) { + int v, t; + + v = atomic_read(&sd->s_active); + if (unlikely(v < 0)) + return NULL; + + t = atomic_cmpxchg(&sd->s_active, v, v + 1); + if (likely(t == v)) + return sd; + if (t < 0) + return NULL; + + cpu_relax(); } - return sd; } /** @@ -48,8 +62,21 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - if (sd) - up_read(&sd->s_active); + struct completion *cmpl; + int v; + + if (unlikely(!sd)) + return; + + v = atomic_dec_return(&sd->s_active); + if (likely(v != SD_DEACTIVATED_BIAS)) + return; + + /* atomic_dec_return() is a mb(), we'll always see the updated + * sd->s_sibling.next. + */ + cmpl = (void *)sd->s_sibling.next; + complete(cmpl); } /** @@ -95,17 +122,25 @@ void sysfs_put_active_two(struct sysfs_dirent *sd) * sysfs_deactivate - deactivate sysfs_dirent * @sd: sysfs_dirent to deactivate * - * Deny new active references and drain existing ones. s_active - * will be unlocked when the sysfs_dirent is released. + * Deny new active references and drain existing ones. */ void sysfs_deactivate(struct sysfs_dirent *sd) { - down_write_nested(&sd->s_active, SYSFS_S_ACTIVE_DEACTIVATE); + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(!list_empty(&sd->s_sibling)); + sd->s_sibling.next = (void *)&wait; - /* s_active will be unlocked by the thread doing the final put - * on @sd. Lie to lockdep. + /* atomic_add_return() is a mb(), put_active() will always see + * the updated sd->s_sibling.next. */ - rwsem_release(&sd->s_active.dep_map, 1, _RET_IP_); + v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); + + if (v != SD_DEACTIVATED_BIAS) + wait_for_completion(&wait); + + INIT_LIST_HEAD(&sd->s_sibling); } static int sysfs_alloc_ino(ino_t *pino) @@ -141,19 +176,6 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) repeat: parent_sd = sd->s_parent; - /* If @sd is being released after deletion, s_active is write - * locked. If @sd is cursor for directory walk or being - * released prematurely, s_active has no reader or writer. - * - * sysfs_deactivate() lies to lockdep that s_active is - * unlocked immediately. Lie one more time to cover the - * previous lie. - */ - if (!down_write_trylock(&sd->s_active)) - rwsem_acquire(&sd->s_active.dep_map, - SYSFS_S_ACTIVE_DEACTIVATE, 0, _RET_IP_); - up_write(&sd->s_active); - if (sd->s_type & SYSFS_KOBJ_LINK) sysfs_put(sd->s_elem.symlink.target_sd); if (sd->s_type & SYSFS_COPY_NAME) @@ -213,8 +235,8 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) goto err_out; atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_active, 0); atomic_set(&sd->s_event, 1); - init_rwsem(&sd->s_active); INIT_LIST_HEAD(&sd->s_children); INIT_LIST_HEAD(&sd->s_sibling); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index f8779eaa53ff..ae006b070bf0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -21,7 +21,7 @@ struct sysfs_elem_bin_attr { */ struct sysfs_dirent { atomic_t s_count; - struct rw_semaphore s_active; + atomic_t s_active; struct sysfs_dirent * s_parent; struct list_head s_sibling; struct list_head s_children; @@ -42,16 +42,7 @@ struct sysfs_dirent { atomic_t s_event; }; -/* - * A sysfs file which deletes another file when written to need to - * write lock the s_active of the victim while its s_active is read - * locked for the write operation. Tell lockdep that this is okay. - */ -enum sysfs_s_active_class -{ - SYSFS_S_ACTIVE_NORMAL, /* file r/w access, etc - default */ - SYSFS_S_ACTIVE_DEACTIVATE, /* file deactivation */ -}; +#define SD_DEACTIVATED_BIAS INT_MIN extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; -- cgit v1.2.1 From 0c73f18b7d95de8a007039337063a770b5fc8e7a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:18 +0900 Subject: sysfs: use singly-linked list for sysfs_dirent tree Make sysfs_dirent use singly linked list for its tree structure. sysfs_link_sibling() and sysfs_unlink_sibling() functions are added to handle simpler cases. It adds some complexity and cpu cycle overhead but reduced memory footprint is worthwhile on big machines. This change reduces the sizeof sysfs_dirent from 104 to 88 on 64bit and from 60 to 52 on 32bit. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 146 ++++++++++++++++++++++++++++++++++++++----------------- fs/sysfs/inode.c | 12 +++-- fs/sysfs/mount.c | 3 -- fs/sysfs/sysfs.h | 4 +- 4 files changed, 111 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 40596a0eee52..b4074adbab01 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,6 +21,48 @@ spinlock_t kobj_sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; static DEFINE_IDA(sysfs_ino_ida); +/** + * sysfs_link_sibling - link sysfs_dirent into sibling list + * @sd: sysfs_dirent of interest + * + * Link @sd into its sibling list which starts from + * sd->s_parent->s_children. + * + * Locking: + * mutex_lock(sd->s_parent->dentry->d_inode->i_mutex) + */ +static void sysfs_link_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *parent_sd = sd->s_parent; + + BUG_ON(sd->s_sibling); + sd->s_sibling = parent_sd->s_children; + parent_sd->s_children = sd; +} + +/** + * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list + * @sd: sysfs_dirent of interest + * + * Unlink @sd from its sibling list which starts from + * sd->s_parent->s_children. + * + * Locking: + * mutex_lock(sd->s_parent->dentry->d_inode->i_mutex) + */ +static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent **pos; + + for (pos = &sd->s_parent->s_children; *pos; pos = &(*pos)->s_sibling) { + if (*pos == sd) { + *pos = sd->s_sibling; + sd->s_sibling = NULL; + break; + } + } +} + /** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to @@ -73,9 +115,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling.next. + * sd->s_sibling. */ - cmpl = (void *)sd->s_sibling.next; + cmpl = (void *)sd->s_sibling; complete(cmpl); } @@ -129,18 +171,18 @@ void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(!list_empty(&sd->s_sibling)); - sd->s_sibling.next = (void *)&wait; + BUG_ON(sd->s_sibling); + sd->s_sibling = (void *)&wait; /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling.next. + * the updated sd->s_sibling. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); if (v != SD_DEACTIVATED_BIAS) wait_for_completion(&wait); - INIT_LIST_HEAD(&sd->s_sibling); + sd->s_sibling = NULL; } static int sysfs_alloc_ino(ino_t *pino) @@ -237,8 +279,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); atomic_set(&sd->s_event, 1); - INIT_LIST_HEAD(&sd->s_children); - INIT_LIST_HEAD(&sd->s_sibling); sd->s_name = name; sd->s_mode = mode; @@ -273,7 +313,7 @@ void sysfs_attach_dirent(struct sysfs_dirent *sd, if (parent_sd) { sd->s_parent = sysfs_get(parent_sd); - list_add(&sd->s_sibling, &parent_sd->s_children); + sysfs_link_sibling(sd); } } @@ -289,7 +329,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, { struct sysfs_dirent * sd; - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { if (sd->s_type) { if (strcmp(sd->s_name, new)) continue; @@ -409,7 +449,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode; int found = 0; - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { if ((sd->s_type & SYSFS_NOT_PINNED) && !strcmp(sd->s_name, dentry->d_name.name)) { found = 1; @@ -458,7 +498,7 @@ static void remove_dir(struct dentry * d) mutex_lock(&parent->d_inode->i_mutex); - list_del_init(&sd->s_sibling); + sysfs_unlink_sibling(sd); pr_debug(" o %s removing done (%d)\n",d->d_name.name, atomic_read(&d->d_count)); @@ -478,9 +518,9 @@ void sysfs_remove_subdir(struct dentry * d) static void __sysfs_remove_dir(struct dentry *dentry) { - LIST_HEAD(removed); - struct sysfs_dirent * parent_sd; - struct sysfs_dirent * sd, * tmp; + struct sysfs_dirent *removed = NULL; + struct sysfs_dirent *parent_sd; + struct sysfs_dirent **pos; if (!dentry) return; @@ -488,15 +528,25 @@ static void __sysfs_remove_dir(struct dentry *dentry) pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); mutex_lock(&dentry->d_inode->i_mutex); parent_sd = dentry->d_fsdata; - list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { - if (!sd->s_type || !(sd->s_type & SYSFS_NOT_PINNED)) - continue; - list_move(&sd->s_sibling, &removed); + pos = &parent_sd->s_children; + while (*pos) { + struct sysfs_dirent *sd = *pos; + + if (sd->s_type && (sd->s_type & SYSFS_NOT_PINNED)) { + *pos = sd->s_sibling; + sd->s_sibling = removed; + removed = sd; + } else + pos = &(*pos)->s_sibling; } mutex_unlock(&dentry->d_inode->i_mutex); - list_for_each_entry_safe(sd, tmp, &removed, s_sibling) { - list_del_init(&sd->s_sibling); + while (removed) { + struct sysfs_dirent *sd = removed; + + removed = sd->s_sibling; + sd->s_sibling = NULL; + sysfs_drop_dentry(sd); sysfs_deactivate(sd); sysfs_put(sd); @@ -577,11 +627,11 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, d_add(new_dentry, NULL); d_move(kobj->dentry, new_dentry); - list_del_init(&sd->s_sibling); + sysfs_unlink_sibling(sd); sysfs_get(parent_sd); sysfs_put(sd->s_parent); sd->s_parent = parent_sd; - list_add(&sd->s_sibling, &parent_sd->s_children); + sysfs_link_sibling(sd); error = 0; goto out_unlock; @@ -633,11 +683,11 @@ again: dput(new_dentry); /* Remove from old parent's list and insert into new parent's list. */ - list_del_init(&sd->s_sibling); + sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; - list_add(&sd->s_sibling, &new_parent_sd->s_children); + sysfs_link_sibling(sd); out: mutex_unlock(&new_parent_dentry->d_inode->i_mutex); @@ -668,7 +718,7 @@ static int sysfs_dir_close(struct inode *inode, struct file *file) struct sysfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); - list_del_init(&cursor->s_sibling); + sysfs_unlink_sibling(cursor); mutex_unlock(&dentry->d_inode->i_mutex); release_sysfs_dirent(cursor); @@ -687,7 +737,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) struct dentry *dentry = filp->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent *cursor = filp->private_data; - struct list_head *p, *q = &cursor->s_sibling; + struct sysfs_dirent **pos; ino_t ino; int i = filp->f_pos; @@ -710,16 +760,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: + pos = &parent_sd->s_children; + while (*pos != cursor) + pos = &(*pos)->s_sibling; + + /* unlink cursor */ + *pos = cursor->s_sibling; + if (filp->f_pos == 2) - list_move(q, &parent_sd->s_children); + pos = &parent_sd->s_children; - for (p=q->next; p!= &parent_sd->s_children; p=p->next) { - struct sysfs_dirent *next; + for ( ; *pos; pos = &(*pos)->s_sibling) { + struct sysfs_dirent *next = *pos; const char * name; int len; - next = list_entry(p, struct sysfs_dirent, - s_sibling); if (!next->s_type) continue; @@ -729,12 +784,14 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) if (filldir(dirent, name, len, filp->f_pos, ino, dt_type(next)) < 0) - return 0; + break; - list_move(q, p); - p = q; filp->f_pos++; } + + /* put cursor back in */ + cursor->s_sibling = *pos; + *pos = cursor; } return 0; } @@ -759,20 +816,21 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) if (file->f_pos >= 2) { struct sysfs_dirent *sd = dentry->d_fsdata; struct sysfs_dirent *cursor = file->private_data; - struct list_head *p; + struct sysfs_dirent **pos; loff_t n = file->f_pos - 2; - list_del(&cursor->s_sibling); - p = sd->s_children.next; - while (n && p != &sd->s_children) { - struct sysfs_dirent *next; - next = list_entry(p, struct sysfs_dirent, - s_sibling); + sysfs_unlink_sibling(cursor); + + pos = &sd->s_children; + while (n && *pos) { + struct sysfs_dirent *next = *pos; if (next->s_type) n--; - p = p->next; + pos = &(*pos)->s_sibling; } - list_add_tail(&cursor->s_sibling, p); + + cursor->s_sibling = *pos; + *pos = cursor; } } mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 3eab9c46a71b..732fd7f371e0 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -284,8 +284,8 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) int sysfs_hash_and_remove(struct dentry * dir, const char * name) { - struct sysfs_dirent * sd; - struct sysfs_dirent * parent_sd; + struct sysfs_dirent **pos, *sd; + struct sysfs_dirent *parent_sd = dir->d_fsdata; int found = 0; if (!dir) @@ -295,13 +295,15 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return -ENOENT; - parent_sd = dir->d_fsdata; mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) { + sd = *pos; + if (!sd->s_type) continue; if (!strcmp(sd->s_name, name)) { - list_del_init(&sd->s_sibling); + *pos = sd->s_sibling; + sd->s_sibling = NULL; found = 1; break; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index d0e160307c5d..4be9593ea000 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -26,11 +26,8 @@ static const struct super_operations sysfs_ops = { static struct sysfs_dirent sysfs_root = { .s_count = ATOMIC_INIT(1), - .s_sibling = LIST_HEAD_INIT(sysfs_root.s_sibling), - .s_children = LIST_HEAD_INIT(sysfs_root.s_children), .s_type = SYSFS_ROOT, .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, - .s_iattr = NULL, .s_ino = 1, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ae006b070bf0..6f8aaf3805d2 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -23,8 +23,8 @@ struct sysfs_dirent { atomic_t s_count; atomic_t s_active; struct sysfs_dirent * s_parent; - struct list_head s_sibling; - struct list_head s_children; + struct sysfs_dirent * s_sibling; + struct sysfs_dirent * s_children; const char * s_name; union { -- cgit v1.2.1 From 9d9307dabb3de8140fb3801bf6eb01f231dbd83d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Jun 2007 03:45:18 +0900 Subject: sysfs: Fix oops in sysfs_drop_dentry on x86_64 Fix oops on x86_64 caused by the dereference of dir in sysfs_drop_dentry() made before checking if dir is not NULL (cf. http://marc.info/?l=linux-kernel&m=118151626704924&w=2). Signed-off-by: Rafael J. Wysocki Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 732fd7f371e0..ee31bf369a88 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -285,7 +285,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) int sysfs_hash_and_remove(struct dentry * dir, const char * name) { struct sysfs_dirent **pos, *sd; - struct sysfs_dirent *parent_sd = dir->d_fsdata; + struct sysfs_dirent *parent_sd; int found = 0; if (!dir) @@ -295,6 +295,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return -ENOENT; + parent_sd = dir->d_fsdata; mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) { sd = *pos; -- cgit v1.2.1 From d0bcb5689a521df98bff7549fcb8b17499660a99 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:21 +0900 Subject: sysfs: make sysfs_drop_dentry() access inodes using ilookup() sysfs_drop_dentry() used to go through sd->s_dentry and sd->s_parent->s_dentry to access the inodes. This is incorrect because inode can be cached without dentry. This patch makes sysfs_drop_dentry() access inodes using ilookup() on sd->s_ino. This is both correct and simpler. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/inode.c | 63 +++++++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index ee31bf369a88..63daa06c4194 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -219,9 +219,9 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) */ void sysfs_drop_dentry(struct sysfs_dirent *sd) { - struct dentry *dentry = NULL, *parent = NULL; - struct inode *dir; + struct dentry *dentry = NULL; struct timespec curtime; + struct inode *inode; /* We're not holding a reference to ->s_dentry dentry but the * field will stay valid as long as sysfs_lock is held. @@ -229,19 +229,9 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) spin_lock(&sysfs_lock); spin_lock(&dcache_lock); + /* drop dentry if it's there and dput() didn't kill it yet */ if (sd->s_dentry && sd->s_dentry->d_inode) { - /* get dentry if it's there and dput() didn't kill it yet */ dentry = dget_locked(sd->s_dentry); - parent = dentry->d_parent; - } else if (sd->s_parent->s_dentry->d_inode) { - /* We need to update the parent even if dentry for the - * victim itself doesn't exist. - */ - parent = dget_locked(sd->s_parent->s_dentry); - } - - /* drop */ - if (dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -250,36 +240,39 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) spin_unlock(&dcache_lock); spin_unlock(&sysfs_lock); - /* nothing to do if the parent isn't in dcache */ - if (!parent) - return; + dput(dentry); + /* XXX: unpin if directory, this will go away soon */ + if (sd->s_type & SYSFS_DIR) + dput(dentry); /* adjust nlink and update timestamp */ - dir = parent->d_inode; - mutex_lock(&dir->i_mutex); - curtime = CURRENT_TIME; - dir->i_ctime = dir->i_mtime = curtime; + inode = ilookup(sysfs_sb, sd->s_ino); + if (inode) { + mutex_lock(&inode->i_mutex); - if (dentry) { - dentry->d_inode->i_ctime = curtime; - drop_nlink(dentry->d_inode); - if (sd->s_type & SYSFS_DIR) { - drop_nlink(dentry->d_inode); - drop_nlink(dir); - /* XXX: unpin if directory, this will go away soon */ - dput(dentry); - } + inode->i_ctime = curtime; + drop_nlink(inode); + if (sd->s_type & SYSFS_DIR) + drop_nlink(inode); + + mutex_unlock(&inode->i_mutex); + iput(inode); } - mutex_unlock(&dir->i_mutex); + /* adjust nlink and udpate timestamp of the parent */ + inode = ilookup(sysfs_sb, sd->s_parent->s_ino); + if (inode) { + mutex_lock(&inode->i_mutex); - /* bye bye */ - if (dentry) - dput(dentry); - else - dput(parent); + inode->i_ctime = inode->i_mtime = curtime; + if (sd->s_type & SYSFS_DIR) + drop_nlink(inode); + + mutex_unlock(&inode->i_mutex); + iput(inode); + } } int sysfs_hash_and_remove(struct dentry * dir, const char * name) -- cgit v1.2.1 From b402d72cf7b338a074e3c12b305ec79284e18845 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:21 +0900 Subject: sysfs: rename sysfs_dirent->s_type to s_flags and make room for flags Rename sysfs_dirent->s_type to s_flags, pack type into lower eight bits and reserve the rest for flags. sysfs_type() can used to access the type. All existing sd->s_type accesses are converted to use sysfs_type(). While at it, type test is changed to equality test instead of bit-and test where appropriate. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 33 ++++++++++++++++++++------------- fs/sysfs/inode.c | 8 ++++---- fs/sysfs/mount.c | 2 +- fs/sysfs/sysfs.h | 7 ++++++- 4 files changed, 31 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index b4074adbab01..eb9bc0a8717b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -218,9 +218,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) repeat: parent_sd = sd->s_parent; - if (sd->s_type & SYSFS_KOBJ_LINK) + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sysfs_put(sd->s_elem.symlink.target_sd); - if (sd->s_type & SYSFS_COPY_NAME) + if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); @@ -282,7 +282,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) sd->s_name = name; sd->s_mode = mode; - sd->s_type = type; + sd->s_flags = type; return sd; @@ -330,7 +330,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, struct sysfs_dirent * sd; for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { - if (sd->s_type) { + if (sysfs_type(sd)) { if (strcmp(sd->s_name, new)) continue; else @@ -446,11 +446,12 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, { struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct sysfs_dirent * sd; + struct bin_attribute *bin_attr; struct inode *inode; int found = 0; for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { - if ((sd->s_type & SYSFS_NOT_PINNED) && + if ((sysfs_type(sd) & SYSFS_NOT_PINNED) && !strcmp(sd->s_name, dentry->d_name.name)) { found = 1; break; @@ -468,16 +469,22 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, if (inode->i_state & I_NEW) { /* initialize inode according to type */ - if (sd->s_type & SYSFS_KOBJ_ATTR) { + switch (sysfs_type(sd)) { + case SYSFS_KOBJ_ATTR: inode->i_size = PAGE_SIZE; inode->i_fop = &sysfs_file_operations; - } else if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) { - struct bin_attribute *bin_attr = - sd->s_elem.bin_attr.bin_attr; + break; + case SYSFS_KOBJ_BIN_ATTR: + bin_attr = sd->s_elem.bin_attr.bin_attr; inode->i_size = bin_attr->size; inode->i_fop = &bin_fops; - } else if (sd->s_type & SYSFS_KOBJ_LINK) + break; + case SYSFS_KOBJ_LINK: inode->i_op = &sysfs_symlink_inode_operations; + break; + default: + BUG(); + } } sysfs_instantiate(dentry, inode); @@ -532,7 +539,7 @@ static void __sysfs_remove_dir(struct dentry *dentry) while (*pos) { struct sysfs_dirent *sd = *pos; - if (sd->s_type && (sd->s_type & SYSFS_NOT_PINNED)) { + if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) { *pos = sd->s_sibling; sd->s_sibling = removed; removed = sd; @@ -775,7 +782,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) const char * name; int len; - if (!next->s_type) + if (!sysfs_type(next)) continue; name = next->s_name; @@ -824,7 +831,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) pos = &sd->s_children; while (n && *pos) { struct sysfs_dirent *next = *pos; - if (next->s_type) + if (sysfs_type(next)) n--; pos = &(*pos)->s_sibling; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 63daa06c4194..ee3a5d957051 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -242,7 +242,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) dput(dentry); /* XXX: unpin if directory, this will go away soon */ - if (sd->s_type & SYSFS_DIR) + if (sysfs_type(sd) == SYSFS_DIR) dput(dentry); /* adjust nlink and update timestamp */ @@ -254,7 +254,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) inode->i_ctime = curtime; drop_nlink(inode); - if (sd->s_type & SYSFS_DIR) + if (sysfs_type(sd) == SYSFS_DIR) drop_nlink(inode); mutex_unlock(&inode->i_mutex); @@ -267,7 +267,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) mutex_lock(&inode->i_mutex); inode->i_ctime = inode->i_mtime = curtime; - if (sd->s_type & SYSFS_DIR) + if (sysfs_type(sd) == SYSFS_DIR) drop_nlink(inode); mutex_unlock(&inode->i_mutex); @@ -293,7 +293,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) { sd = *pos; - if (!sd->s_type) + if (!sysfs_type(sd)) continue; if (!strcmp(sd->s_name, name)) { *pos = sd->s_sibling; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 4be9593ea000..078537e5d696 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -26,7 +26,7 @@ static const struct super_operations sysfs_ops = { static struct sysfs_dirent sysfs_root = { .s_count = ATOMIC_INIT(1), - .s_type = SYSFS_ROOT, + .s_flags = SYSFS_ROOT, .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, .s_ino = 1, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6f8aaf3805d2..06b5085804a1 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -34,7 +34,7 @@ struct sysfs_dirent { struct sysfs_elem_bin_attr bin_attr; } s_elem; - int s_type; + unsigned int s_flags; umode_t s_mode; ino_t s_ino; struct dentry * s_dentry; @@ -86,6 +86,11 @@ extern const struct file_operations bin_fops; extern const struct inode_operations sysfs_dir_inode_operations; extern const struct inode_operations sysfs_symlink_inode_operations; +static inline unsigned int sysfs_type(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_TYPE_MASK; +} + static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) { if (sd) { -- cgit v1.2.1 From 380e6fbb729a55b73d5d8409551474884e0d93fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:22 +0900 Subject: sysfs: implement SYSFS_FLAG_REMOVED flag Implement SYSFS_FLAG_REMOVED flag which currently is used only to improve sanity check in sysfs_deactivate(). The flag will be used to make directory entries reclamiable. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 +++- fs/sysfs/inode.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index eb9bc0a8717b..f2ea00683ec9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -171,7 +171,7 @@ void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling); + BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); sd->s_sibling = (void *)&wait; /* atomic_add_return() is a mb(), put_active() will always see @@ -506,6 +506,7 @@ static void remove_dir(struct dentry * d) mutex_lock(&parent->d_inode->i_mutex); sysfs_unlink_sibling(sd); + sd->s_flags |= SYSFS_FLAG_REMOVED; pr_debug(" o %s removing done (%d)\n",d->d_name.name, atomic_read(&d->d_count)); @@ -540,6 +541,7 @@ static void __sysfs_remove_dir(struct dentry *dentry) struct sysfs_dirent *sd = *pos; if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) { + sd->s_flags |= SYSFS_FLAG_REMOVED; *pos = sd->s_sibling; sd->s_sibling = removed; removed = sd; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index ee3a5d957051..e2f6ef138d20 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -296,6 +296,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) if (!sysfs_type(sd)) continue; if (!strcmp(sd->s_name, name)) { + sd->s_flags |= SYSFS_FLAG_REMOVED; *pos = sd->s_sibling; sd->s_sibling = NULL; found = 1; -- cgit v1.2.1 From f0b0af4792d751106e2003f96af76fa95e10c68d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:22 +0900 Subject: sysfs: implement sysfs_find_dirent() and sysfs_get_dirent() Implement sysfs_find_dirent() and sysfs_get_dirent(). sysfs_dirent_exist() is replaced by sysfs_find_dirent(). These will be used to make directory entries reclamiable. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 61 +++++++++++++++++++++++++++++++++++++++--------------- fs/sysfs/file.c | 2 +- fs/sysfs/symlink.c | 2 +- fs/sysfs/sysfs.h | 5 ++++- 4 files changed, 50 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f2ea00683ec9..4762a9aa0b27 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -317,28 +317,55 @@ void sysfs_attach_dirent(struct sysfs_dirent *sd, } } -/* +/** + * sysfs_find_dirent - find sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for * - * Return -EEXIST if there is already a sysfs element with the same name for - * the same parent. + * Look for sysfs_dirent with name @name under @parent_sd. * - * called with parent inode's i_mutex held + * LOCKING: + * mutex_lock(parent->i_mutex) + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. */ -int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, - const unsigned char *new) +struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const unsigned char *name) { - struct sysfs_dirent * sd; + struct sysfs_dirent *sd; - for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { - if (sysfs_type(sd)) { - if (strcmp(sd->s_name, new)) - continue; - else - return -EEXIST; - } - } + for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) + if (sysfs_type(sd) && !strcmp(sd->s_name, name)) + return sd; + return NULL; +} - return 0; +/** + * sysfs_get_dirent - find and get sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for + * + * Look for sysfs_dirent with name @name under @parent_sd and get + * it if found. + * + * LOCKING: + * Kernel thread context (may sleep) + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. + */ +struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const unsigned char *name) +{ + struct sysfs_dirent *sd; + + mutex_lock(&parent_sd->s_dentry->d_inode->i_mutex); + sd = sysfs_find_dirent(parent_sd, name); + sysfs_get(sd); + mutex_unlock(&parent_sd->s_dentry->d_inode->i_mutex); + + return sd; } static int create_dir(struct kobject *kobj, struct dentry *parent, @@ -382,7 +409,7 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, /* link in */ error = -EEXIST; - if (sysfs_dirent_exist(parent->d_fsdata, name)) + if (sysfs_find_dirent(parent->d_fsdata, name)) goto out_iput; sysfs_instantiate(dentry, inode); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a84b734f7b29..e448b88e313e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -421,7 +421,7 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) mutex_lock(&dir->d_inode->i_mutex); - if (sysfs_dirent_exist(parent_sd, attr->name)) { + if (sysfs_find_dirent(parent_sd, attr->name)) { error = -EEXIST; goto out_unlock; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index ff605d3f4d33..45b62e229627 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -95,7 +95,7 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char return -ENOENT; mutex_lock(&dentry->d_inode->i_mutex); - if (!sysfs_dirent_exist(dentry->d_fsdata, name)) + if (!sysfs_find_dirent(dentry->d_fsdata, name)) error = sysfs_add_link(parent_sd, name, target_sd); mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 06b5085804a1..f1629b4520aa 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -59,7 +59,10 @@ extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd); extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode); extern void release_sysfs_dirent(struct sysfs_dirent * sd); -extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); +extern struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const unsigned char *name); +extern struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const unsigned char *name); extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); extern void sysfs_attach_dirent(struct sysfs_dirent *sd, -- cgit v1.2.1 From 608e266a2d4e62c1b98c1c573064b6afe8c06a58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:22 +0900 Subject: sysfs: make kobj point to sysfs_dirent instead of dentry As kobj sysfs dentries and inodes are gonna be made reclaimable, dentry can't be used as naming token for sysfs file/directory, replace kobj->dentry with kobj->sd. The only external interface change is shadow directory handling. All other changes are contained in kobj and sysfs. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 6 +-- fs/sysfs/dir.c | 119 ++++++++++++++++++++++++++--------------------------- fs/sysfs/file.c | 47 ++++++++++----------- fs/sysfs/group.c | 55 ++++++++++++------------- fs/sysfs/inode.c | 11 ++--- fs/sysfs/symlink.c | 22 +++++----- fs/sysfs/sysfs.h | 10 +++-- 7 files changed, 135 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 3c5574a40b09..55796bdacd3d 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -234,9 +234,9 @@ const struct file_operations bin_fops = { int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) { - BUG_ON(!kobj || !kobj->dentry || !attr); + BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->dentry, &attr->attr, SYSFS_KOBJ_BIN_ATTR); + return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); } @@ -248,7 +248,7 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { - if (sysfs_hash_and_remove(kobj->dentry, attr->attr.name) < 0) { + if (sysfs_hash_and_remove(kobj->sd, attr->attr.name) < 0) { printk(KERN_ERR "%s: " "bad dentry or inode or no such file: \"%s\"\n", __FUNCTION__, attr->attr.name); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4762a9aa0b27..31b6cf30636d 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -368,9 +368,10 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, return sd; } -static int create_dir(struct kobject *kobj, struct dentry *parent, - const char *name, struct dentry **p_dentry) +static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, + const char *name, struct sysfs_dirent **p_sd) { + struct dentry *parent = parent_sd->s_dentry; int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct dentry *dentry; @@ -409,14 +410,14 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, /* link in */ error = -EEXIST; - if (sysfs_find_dirent(parent->d_fsdata, name)) + if (sysfs_find_dirent(parent_sd, name)) goto out_iput; sysfs_instantiate(dentry, inode); inc_nlink(parent->d_inode); - sysfs_attach_dirent(sd, parent->d_fsdata, dentry); + sysfs_attach_dirent(sd, parent_sd, dentry); - *p_dentry = dentry; + *p_sd = sd; error = 0; goto out_unlock; /* pin directory dentry in core */ @@ -433,38 +434,37 @@ static int create_dir(struct kobject *kobj, struct dentry *parent, return error; } - -int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d) +int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd) { - return create_dir(k,k->dentry,n,d); + return create_dir(kobj, kobj->sd, name, p_sd); } /** * sysfs_create_dir - create a directory for an object. * @kobj: object we're creating directory for. - * @shadow_parent: parent parent object. + * @shadow_parent: parent object. */ - -int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent) +int sysfs_create_dir(struct kobject *kobj, + struct sysfs_dirent *shadow_parent_sd) { - struct dentry * dentry = NULL; - struct dentry * parent; + struct sysfs_dirent *parent_sd, *sd; int error = 0; BUG_ON(!kobj); - if (shadow_parent) - parent = shadow_parent; + if (shadow_parent_sd) + parent_sd = shadow_parent_sd; else if (kobj->parent) - parent = kobj->parent->dentry; + parent_sd = kobj->parent->sd; else if (sysfs_mount && sysfs_mount->mnt_sb) - parent = sysfs_mount->mnt_sb->s_root; + parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata; else return -EFAULT; - error = create_dir(kobj,parent,kobject_name(kobj),&dentry); + error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); if (!error) - kobj->dentry = dentry; + kobj->sd = sd; return error; } @@ -525,18 +525,16 @@ const struct inode_operations sysfs_dir_inode_operations = { .setattr = sysfs_setattr, }; -static void remove_dir(struct dentry * d) +static void remove_dir(struct sysfs_dirent *sd) { - struct dentry *parent = d->d_parent; - struct sysfs_dirent *sd = d->d_fsdata; + struct dentry *parent = sd->s_parent->s_dentry; mutex_lock(&parent->d_inode->i_mutex); sysfs_unlink_sibling(sd); sd->s_flags |= SYSFS_FLAG_REMOVED; - pr_debug(" o %s removing done (%d)\n",d->d_name.name, - atomic_read(&d->d_count)); + pr_debug(" o %s removing done\n", sd->s_name); mutex_unlock(&parent->d_inode->i_mutex); @@ -545,25 +543,26 @@ static void remove_dir(struct dentry * d) sysfs_put(sd); } -void sysfs_remove_subdir(struct dentry * d) +void sysfs_remove_subdir(struct sysfs_dirent *sd) { - remove_dir(d); + remove_dir(sd); } -static void __sysfs_remove_dir(struct dentry *dentry) +static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_dirent *removed = NULL; - struct sysfs_dirent *parent_sd; struct sysfs_dirent **pos; + struct dentry *dir; - if (!dentry) + if (!dir_sd) return; - pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); - mutex_lock(&dentry->d_inode->i_mutex); - parent_sd = dentry->d_fsdata; - pos = &parent_sd->s_children; + dir = dir_sd->s_dentry; + + pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); + mutex_lock(&dir->d_inode->i_mutex); + pos = &dir_sd->s_children; while (*pos) { struct sysfs_dirent *sd = *pos; @@ -575,7 +574,7 @@ static void __sysfs_remove_dir(struct dentry *dentry) } else pos = &(*pos)->s_sibling; } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); while (removed) { struct sysfs_dirent *sd = removed; @@ -588,7 +587,7 @@ static void __sysfs_remove_dir(struct dentry *dentry) sysfs_put(sd); } - remove_dir(dentry); + remove_dir(dir_sd); } /** @@ -602,25 +601,25 @@ static void __sysfs_remove_dir(struct dentry *dentry) void sysfs_remove_dir(struct kobject * kobj) { - struct dentry *d = kobj->dentry; + struct sysfs_dirent *sd = kobj->sd; spin_lock(&kobj_sysfs_assoc_lock); - kobj->dentry = NULL; + kobj->sd = NULL; spin_unlock(&kobj_sysfs_assoc_lock); - __sysfs_remove_dir(d); + __sysfs_remove_dir(sd); } -int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, +int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, const char *new_name) { - struct sysfs_dirent *sd = kobj->dentry->d_fsdata; - struct sysfs_dirent *parent_sd = new_parent->d_fsdata; + struct sysfs_dirent *sd = kobj->sd; + struct dentry *new_parent = new_parent_sd->s_dentry; struct dentry *new_dentry; char *dup_name; int error; - if (!new_parent) + if (!new_parent_sd) return -EFAULT; down_write(&sysfs_rename_sem); @@ -637,9 +636,9 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, * shadows of the same directory */ error = -EINVAL; - if (kobj->dentry->d_parent->d_inode != new_parent->d_inode || + if (sd->s_parent->s_dentry->d_inode != new_parent->d_inode || new_dentry->d_parent->d_inode != new_parent->d_inode || - new_dentry == kobj->dentry) + new_dentry == sd->s_dentry) goto out_dput; error = -EEXIST; @@ -661,12 +660,12 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, /* move under the new parent */ d_add(new_dentry, NULL); - d_move(kobj->dentry, new_dentry); + d_move(sd->s_dentry, new_dentry); sysfs_unlink_sibling(sd); - sysfs_get(parent_sd); + sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); - sd->s_parent = parent_sd; + sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); error = 0; @@ -691,9 +690,9 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent) int error; old_parent_dentry = kobj->parent ? - kobj->parent->dentry : sysfs_mount->mnt_sb->s_root; + kobj->parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root; new_parent_dentry = new_parent ? - new_parent->dentry : sysfs_mount->mnt_sb->s_root; + new_parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root; if (old_parent_dentry->d_inode == new_parent_dentry->d_inode) return 0; /* nothing to move */ @@ -705,7 +704,7 @@ again: } new_parent_sd = new_parent_dentry->d_fsdata; - sd = kobj->dentry->d_fsdata; + sd = kobj->sd; new_dentry = lookup_one_len(kobj->name, new_parent_dentry, strlen(kobj->name)); @@ -715,7 +714,7 @@ again: } else error = 0; d_add(new_dentry, NULL); - d_move(kobj->dentry, new_dentry); + d_move(sd->s_dentry, new_dentry); dput(new_dentry); /* Remove from old parent's list and insert into new parent's list. */ @@ -885,7 +884,7 @@ int sysfs_make_shadowed_dir(struct kobject *kobj, struct inode *inode; struct inode_operations *i_op; - inode = kobj->dentry->d_inode; + inode = kobj->sd->s_dentry->d_inode; if (inode->i_op != &sysfs_dir_inode_operations) return -EINVAL; @@ -912,16 +911,16 @@ int sysfs_make_shadowed_dir(struct kobject *kobj, * directory. */ -struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) +struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) { - struct dentry *dir = kobj->dentry; + struct dentry *dir = kobj->sd->s_dentry; struct inode *inode = dir->d_inode; struct dentry *parent = dir->d_parent; struct sysfs_dirent *parent_sd = parent->d_fsdata; struct dentry *shadow; struct sysfs_dirent *sd; - shadow = ERR_PTR(-EINVAL); + sd = ERR_PTR(-EINVAL); if (!sysfs_is_shadowed_inode(inode)) goto out; @@ -944,25 +943,25 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) dget(shadow); /* Extra count - pin the dentry in core */ out: - return shadow; + return sd; nomem: dput(shadow); - shadow = ERR_PTR(-ENOMEM); + sd = ERR_PTR(-ENOMEM); goto out; } /** * sysfs_remove_shadow_dir - remove an object's directory. - * @shadow: dentry of shadow directory + * @shadow_sd: sysfs_dirent of shadow directory * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ -void sysfs_remove_shadow_dir(struct dentry *shadow) +void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd) { - __sysfs_remove_dir(shadow); + __sysfs_remove_dir(shadow_sd); } const struct file_operations sysfs_dir_operations = { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e448b88e313e..20703b9ee064 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -385,7 +385,7 @@ static struct dentry *step_down(struct dentry *dir, const char * name) void sysfs_notify(struct kobject * k, char *dir, char *attr) { - struct dentry *de = k->dentry; + struct dentry *de = k->sd->s_dentry; if (de) dget(de); if (de && dir) @@ -412,16 +412,17 @@ const struct file_operations sysfs_file_operations = { }; -int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) +int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, + int type) { - struct sysfs_dirent * parent_sd = dir->d_fsdata; + struct dentry *dir = dir_sd->s_dentry; umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; struct sysfs_dirent *sd; int error = 0; mutex_lock(&dir->d_inode->i_mutex); - if (sysfs_find_dirent(parent_sd, attr->name)) { + if (sysfs_find_dirent(dir_sd, attr->name)) { error = -EEXIST; goto out_unlock; } @@ -432,7 +433,7 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) goto out_unlock; } sd->s_elem.attr.attr = (void *)attr; - sysfs_attach_dirent(sd, parent_sd, NULL); + sysfs_attach_dirent(sd, dir_sd, NULL); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); @@ -448,9 +449,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) { - BUG_ON(!kobj || !kobj->dentry || !attr); + BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); + return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); } @@ -464,16 +465,16 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct dentry *dir; + struct sysfs_dirent *dir_sd; int error; - dir = lookup_one_len(group, kobj->dentry, strlen(group)); - if (IS_ERR(dir)) - error = PTR_ERR(dir); - else { - error = sysfs_add_file(dir, attr, SYSFS_KOBJ_ATTR); - dput(dir); - } + dir_sd = sysfs_get_dirent(kobj->sd, group); + if (!dir_sd) + return -ENOENT; + + error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); + sysfs_put(dir_sd); + return error; } EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); @@ -486,7 +487,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); */ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) { - struct dentry * dir = kobj->dentry; + struct dentry *dir = kobj->sd->s_dentry; struct dentry * victim; int res = -ENOENT; @@ -522,7 +523,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) */ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) { - struct dentry *dir = kobj->dentry; + struct dentry *dir = kobj->sd->s_dentry; struct dentry *victim; struct inode * inode; struct iattr newattrs; @@ -560,7 +561,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->dentry, attr->name); + sysfs_hash_and_remove(kobj->sd, attr->name); } @@ -573,12 +574,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct dentry *dir; + struct sysfs_dirent *dir_sd; - dir = lookup_one_len(group, kobj->dentry, strlen(group)); - if (!IS_ERR(dir)) { - sysfs_hash_and_remove(dir, attr->name); - dput(dir); + dir_sd = sysfs_get_dirent(kobj->sd, group); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, attr->name); + sysfs_put(dir_sd); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 52eed2a7a5ef..f318b73c790c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,26 +18,25 @@ #include "sysfs.h" -static void remove_files(struct dentry * dir, - const struct attribute_group * grp) +static void remove_files(struct sysfs_dirent *dir_sd, + const struct attribute_group *grp) { struct attribute *const* attr; for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir,(*attr)->name); + sysfs_hash_and_remove(dir_sd, (*attr)->name); } -static int create_files(struct dentry * dir, - const struct attribute_group * grp) +static int create_files(struct sysfs_dirent *dir_sd, + const struct attribute_group *grp) { struct attribute *const* attr; int error = 0; - for (attr = grp->attrs; *attr && !error; attr++) { - error = sysfs_add_file(dir, *attr, SYSFS_KOBJ_ATTR); - } + for (attr = grp->attrs; *attr && !error; attr++) + error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); if (error) - remove_files(dir,grp); + remove_files(dir_sd, grp); return error; } @@ -45,44 +44,44 @@ static int create_files(struct dentry * dir, int sysfs_create_group(struct kobject * kobj, const struct attribute_group * grp) { - struct dentry * dir; + struct sysfs_dirent *sd; int error; - BUG_ON(!kobj || !kobj->dentry); + BUG_ON(!kobj || !kobj->sd); if (grp->name) { - error = sysfs_create_subdir(kobj,grp->name,&dir); + error = sysfs_create_subdir(kobj, grp->name, &sd); if (error) return error; } else - dir = kobj->dentry; - dir = dget(dir); - if ((error = create_files(dir,grp))) { + sd = kobj->sd; + sysfs_get(sd); + error = create_files(sd, grp); + if (error) { if (grp->name) - sysfs_remove_subdir(dir); + sysfs_remove_subdir(sd); } - dput(dir); + sysfs_put(sd); return error; } void sysfs_remove_group(struct kobject * kobj, const struct attribute_group * grp) { - struct dentry * dir; + struct sysfs_dirent *dir_sd = kobj->sd; + struct sysfs_dirent *sd; if (grp->name) { - dir = lookup_one_len_kern(grp->name, kobj->dentry, - strlen(grp->name)); - BUG_ON(IS_ERR(dir)); - } - else - dir = dget(kobj->dentry); + sd = sysfs_get_dirent(dir_sd, grp->name); + BUG_ON(!sd); + } else + sd = sysfs_get(dir_sd); - remove_files(dir,grp); + remove_files(sd, grp); if (grp->name) - sysfs_remove_subdir(dir); - /* release the ref. taken in this routine */ - dput(dir); + sysfs_remove_subdir(sd); + + sysfs_put(sd); } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e2f6ef138d20..1be853706e99 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -275,22 +275,23 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) } } -int sysfs_hash_and_remove(struct dentry * dir, const char * name) +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) { + struct dentry *dir; struct sysfs_dirent **pos, *sd; - struct sysfs_dirent *parent_sd; int found = 0; - if (!dir) + if (!dir_sd) return -ENOENT; + dir = dir_sd->s_dentry; + if (dir->d_inode == NULL) /* no inode means this hasn't been made visible yet */ return -ENOENT; - parent_sd = dir->d_fsdata; mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) { + for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) { sd = *pos; if (!sysfs_type(sd)) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 45b62e229627..43cc5222f136 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -66,7 +66,6 @@ static int sysfs_add_link(struct sysfs_dirent * parent_sd, const char * name, */ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) { - struct dentry *dentry = NULL; struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; int error = -EEXIST; @@ -75,29 +74,28 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char if (!kobj) { if (sysfs_mount && sysfs_mount->mnt_sb) - dentry = sysfs_mount->mnt_sb->s_root; + parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata; } else - dentry = kobj->dentry; + parent_sd = kobj->sd; - if (!dentry) + if (!parent_sd) return -EFAULT; - parent_sd = dentry->d_fsdata; - /* target->dentry can go away beneath us but is protected with + /* target->sd can go away beneath us but is protected with * kobj_sysfs_assoc_lock. Fetch target_sd from it. */ spin_lock(&kobj_sysfs_assoc_lock); - if (target->dentry) - target_sd = sysfs_get(target->dentry->d_fsdata); + if (target->sd) + target_sd = sysfs_get(target->sd); spin_unlock(&kobj_sysfs_assoc_lock); if (!target_sd) return -ENOENT; - mutex_lock(&dentry->d_inode->i_mutex); - if (!sysfs_find_dirent(dentry->d_fsdata, name)) + mutex_lock(&parent_sd->s_dentry->d_inode->i_mutex); + if (!sysfs_find_dirent(parent_sd, name)) error = sysfs_add_link(parent_sd, name, target_sd); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&parent_sd->s_dentry->d_inode->i_mutex); if (error) sysfs_put(target_sd); @@ -114,7 +112,7 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char void sysfs_remove_link(struct kobject * kobj, const char * name) { - sysfs_hash_and_remove(kobj->dentry,name); + sysfs_hash_and_remove(kobj->sd, name); } static int sysfs_get_target_path(struct sysfs_dirent * parent_sd, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index f1629b4520aa..27a5f4b4e3b0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -69,12 +69,14 @@ extern void sysfs_attach_dirent(struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd, struct dentry *dentry); -extern int sysfs_add_file(struct dentry *, const struct attribute *, int); -extern int sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern int sysfs_add_file(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type); +extern int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); -extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); -extern void sysfs_remove_subdir(struct dentry *); +extern int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd); +extern void sysfs_remove_subdir(struct sysfs_dirent *sd); extern void sysfs_drop_dentry(struct sysfs_dirent *sd); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -- cgit v1.2.1 From 5f9953237f684ea1778adb9d26162da00b282225 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:23 +0900 Subject: sysfs: consolidate sysfs spinlocks Replace sysfs_lock and kobj_sysfs_assoc_lock with sysfs_assoc_lock. sysfs_lock was originally to be used to protect sysfs_dirent tree but mutex seems better choice, so there is no reason to keep sysfs_lock separate. Merge the two spinlocks into one. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 19 +++++++++---------- fs/sysfs/inode.c | 16 ++++++++-------- fs/sysfs/symlink.c | 6 +++--- fs/sysfs/sysfs.h | 3 +-- 4 files changed, 21 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 31b6cf30636d..1b5643407a95 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -15,8 +15,7 @@ #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); -spinlock_t sysfs_lock = SPIN_LOCK_UNLOCKED; -spinlock_t kobj_sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; +spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; static DEFINE_IDA(sysfs_ino_ida); @@ -236,10 +235,10 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) struct sysfs_dirent * sd = dentry->d_fsdata; if (sd) { - /* sd->s_dentry is protected with sysfs_lock. This - * allows sysfs_drop_dentry() to dereference it. + /* sd->s_dentry is protected with sysfs_assoc_lock. + * This allows sysfs_drop_dentry() to dereference it. */ - spin_lock(&sysfs_lock); + spin_lock(&sysfs_assoc_lock); /* The dentry might have been deleted or another * lookup could have happened updating sd->s_dentry to @@ -248,7 +247,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) */ if (sd->s_dentry == dentry) sd->s_dentry = NULL; - spin_unlock(&sysfs_lock); + spin_unlock(&sysfs_assoc_lock); sysfs_put(sd); } iput(inode); @@ -298,9 +297,9 @@ static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry) dentry->d_fsdata = sysfs_get(sd); /* protect sd->s_dentry against sysfs_d_iput */ - spin_lock(&sysfs_lock); + spin_lock(&sysfs_assoc_lock); sd->s_dentry = dentry; - spin_unlock(&sysfs_lock); + spin_unlock(&sysfs_assoc_lock); d_rehash(dentry); } @@ -603,9 +602,9 @@ void sysfs_remove_dir(struct kobject * kobj) { struct sysfs_dirent *sd = kobj->sd; - spin_lock(&kobj_sysfs_assoc_lock); + spin_lock(&sysfs_assoc_lock); kobj->sd = NULL; - spin_unlock(&kobj_sysfs_assoc_lock); + spin_unlock(&sysfs_assoc_lock); __sysfs_remove_dir(sd); } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 1be853706e99..e4c23939fb36 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -211,11 +211,11 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) * parent on entry to this function such that it can't be looked * up anymore. * - * @sd->s_dentry which is protected with sysfs_lock points to the - * currently associated dentry but we're not holding a reference - * to it and racing with dput(). Grab dcache_lock and verify - * dentry before dropping it. If @sd->s_dentry is NULL or dput() - * beats us, no need to bother. + * @sd->s_dentry which is protected with sysfs_assoc_lock points + * to the currently associated dentry but we're not holding a + * reference to it and racing with dput(). Grab dcache_lock and + * verify dentry before dropping it. If @sd->s_dentry is NULL or + * dput() beats us, no need to bother. */ void sysfs_drop_dentry(struct sysfs_dirent *sd) { @@ -224,9 +224,9 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) struct inode *inode; /* We're not holding a reference to ->s_dentry dentry but the - * field will stay valid as long as sysfs_lock is held. + * field will stay valid as long as sysfs_assoc_lock is held. */ - spin_lock(&sysfs_lock); + spin_lock(&sysfs_assoc_lock); spin_lock(&dcache_lock); /* drop dentry if it's there and dput() didn't kill it yet */ @@ -238,7 +238,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) } spin_unlock(&dcache_lock); - spin_unlock(&sysfs_lock); + spin_unlock(&sysfs_assoc_lock); dput(dentry); /* XXX: unpin if directory, this will go away soon */ diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 43cc5222f136..cbd95a4109de 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -82,12 +82,12 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char return -EFAULT; /* target->sd can go away beneath us but is protected with - * kobj_sysfs_assoc_lock. Fetch target_sd from it. + * sysfs_assoc_lock. Fetch target_sd from it. */ - spin_lock(&kobj_sysfs_assoc_lock); + spin_lock(&sysfs_assoc_lock); if (target->sd) target_sd = sysfs_get(target->sd); - spin_unlock(&kobj_sysfs_assoc_lock); + spin_unlock(&sysfs_assoc_lock); if (!target_sd) return -ENOENT; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 27a5f4b4e3b0..457267721f4e 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -81,8 +81,7 @@ extern void sysfs_remove_subdir(struct sysfs_dirent *sd); extern void sysfs_drop_dentry(struct sysfs_dirent *sd); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -extern spinlock_t sysfs_lock; -extern spinlock_t kobj_sysfs_assoc_lock; +extern spinlock_t sysfs_assoc_lock; extern struct rw_semaphore sysfs_rename_sem; extern struct super_block * sysfs_sb; extern const struct file_operations sysfs_dir_operations; -- cgit v1.2.1 From 3007e997de91ec59af39a3f9c91595b31ae6e08b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:23 +0900 Subject: sysfs: use sysfs_mutex to protect the sysfs_dirent tree As kobj sysfs dentries and inodes are gonna be made reclaimable, i_mutex can't be used to protect sysfs_dirent tree. Use sysfs_mutex globally instead. As the whole tree is protected with sysfs_mutex, there is no reason to keep sysfs_rename_sem. Drop it. While at it, add docbook comments to functions which require sysfs_mutex locking. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 101 +++++++++++++++++++++++++++++++++++++---------------- fs/sysfs/file.c | 31 ++++++++-------- fs/sysfs/inode.c | 11 ++---- fs/sysfs/symlink.c | 51 ++++++++++++++------------- fs/sysfs/sysfs.h | 2 +- 5 files changed, 116 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 1b5643407a95..9fc8558fd86c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -14,7 +14,7 @@ #include #include "sysfs.h" -DECLARE_RWSEM(sysfs_rename_sem); +DEFINE_MUTEX(sysfs_mutex); spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; @@ -28,7 +28,7 @@ static DEFINE_IDA(sysfs_ino_ida); * sd->s_parent->s_children. * * Locking: - * mutex_lock(sd->s_parent->dentry->d_inode->i_mutex) + * mutex_lock(sysfs_mutex) */ static void sysfs_link_sibling(struct sysfs_dirent *sd) { @@ -47,7 +47,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) * sd->s_parent->s_children. * * Locking: - * mutex_lock(sd->s_parent->dentry->d_inode->i_mutex) + * mutex_lock(sysfs_mutex) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { @@ -215,6 +215,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) struct sysfs_dirent *parent_sd; repeat: + /* Moving/renaming is always done while holding reference. + * sd->s_parent won't change beneath us. + */ parent_sd = sd->s_parent; if (sysfs_type(sd) == SYSFS_KOBJ_LINK) @@ -291,6 +294,17 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) return NULL; } +/** + * sysfs_attach_dentry - associate sysfs_dirent with dentry + * @sd: target sysfs_dirent + * @dentry: dentry to associate + * + * Associate @sd with @dentry. This is protected by + * sysfs_assoc_lock to avoid race with sysfs_d_iput(). + * + * LOCKING: + * mutex_lock(sysfs_mutex) + */ static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry) { dentry->d_op = &sysfs_dentry_ops; @@ -304,6 +318,17 @@ static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry) d_rehash(dentry); } +/** + * sysfs_attach_dirent - attach sysfs_dirent to its parent and dentry + * @sd: sysfs_dirent to attach + * @parent_sd: parent to attach to (optional) + * @dentry: dentry to be associated to @sd (optional) + * + * Attach @sd to @parent_sd and/or @dentry. Both are optional. + * + * LOCKING: + * mutex_lock(sysfs_mutex) + */ void sysfs_attach_dirent(struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd, struct dentry *dentry) { @@ -324,7 +349,7 @@ void sysfs_attach_dirent(struct sysfs_dirent *sd, * Look for sysfs_dirent with name @name under @parent_sd. * * LOCKING: - * mutex_lock(parent->i_mutex) + * mutex_lock(sysfs_mutex) * * RETURNS: * Pointer to sysfs_dirent if found, NULL if not. @@ -349,7 +374,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, * it if found. * * LOCKING: - * Kernel thread context (may sleep) + * Kernel thread context (may sleep). Grabs sysfs_mutex. * * RETURNS: * Pointer to sysfs_dirent if found, NULL if not. @@ -359,10 +384,10 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, { struct sysfs_dirent *sd; - mutex_lock(&parent_sd->s_dentry->d_inode->i_mutex); + mutex_lock(&sysfs_mutex); sd = sysfs_find_dirent(parent_sd, name); sysfs_get(sd); - mutex_unlock(&parent_sd->s_dentry->d_inode->i_mutex); + mutex_unlock(&sysfs_mutex); return sd; } @@ -408,14 +433,20 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, } /* link in */ + mutex_lock(&sysfs_mutex); + error = -EEXIST; - if (sysfs_find_dirent(parent_sd, name)) + if (sysfs_find_dirent(parent_sd, name)) { + mutex_unlock(&sysfs_mutex); goto out_iput; + } sysfs_instantiate(dentry, inode); inc_nlink(parent->d_inode); sysfs_attach_dirent(sd, parent_sd, dentry); + mutex_unlock(&sysfs_mutex); + *p_sd = sd; error = 0; goto out_unlock; /* pin directory dentry in core */ @@ -493,6 +524,8 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); + mutex_lock(&sysfs_mutex); + if (inode->i_state & I_NEW) { /* initialize inode according to type */ switch (sysfs_type(sd)) { @@ -516,6 +549,8 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, sysfs_instantiate(dentry, inode); sysfs_attach_dentry(sd, dentry); + mutex_unlock(&sysfs_mutex); + return NULL; } @@ -526,17 +561,13 @@ const struct inode_operations sysfs_dir_inode_operations = { static void remove_dir(struct sysfs_dirent *sd) { - struct dentry *parent = sd->s_parent->s_dentry; - - mutex_lock(&parent->d_inode->i_mutex); - + mutex_lock(&sysfs_mutex); sysfs_unlink_sibling(sd); sd->s_flags |= SYSFS_FLAG_REMOVED; + mutex_unlock(&sysfs_mutex); pr_debug(" o %s removing done\n", sd->s_name); - mutex_unlock(&parent->d_inode->i_mutex); - sysfs_drop_dentry(sd); sysfs_deactivate(sd); sysfs_put(sd); @@ -552,15 +583,12 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_dirent *removed = NULL; struct sysfs_dirent **pos; - struct dentry *dir; if (!dir_sd) return; - dir = dir_sd->s_dentry; - pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&sysfs_mutex); pos = &dir_sd->s_children; while (*pos) { struct sysfs_dirent *sd = *pos; @@ -573,7 +601,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) } else pos = &(*pos)->s_sibling; } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&sysfs_mutex); while (removed) { struct sysfs_dirent *sd = removed; @@ -621,7 +649,6 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, if (!new_parent_sd) return -EFAULT; - down_write(&sysfs_rename_sem); mutex_lock(&new_parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); @@ -661,12 +688,16 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, d_add(new_dentry, NULL); d_move(sd->s_dentry, new_dentry); + mutex_lock(&sysfs_mutex); + sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); + mutex_unlock(&sysfs_mutex); + error = 0; goto out_unlock; @@ -678,7 +709,6 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, dput(new_dentry); out_unlock: mutex_unlock(&new_parent->d_inode->i_mutex); - up_write(&sysfs_rename_sem); return error; } @@ -717,12 +747,15 @@ again: dput(new_dentry); /* Remove from old parent's list and insert into new parent's list. */ + mutex_lock(&sysfs_mutex); + sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); + mutex_unlock(&sysfs_mutex); out: mutex_unlock(&new_parent_dentry->d_inode->i_mutex); mutex_unlock(&old_parent_dentry->d_inode->i_mutex); @@ -736,11 +769,12 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent * sd; - mutex_lock(&dentry->d_inode->i_mutex); sd = sysfs_new_dirent("_DIR_", 0, 0); - if (sd) + if (sd) { + mutex_lock(&sysfs_mutex); sysfs_attach_dirent(sd, parent_sd, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&sysfs_mutex); + } file->private_data = sd; return sd ? 0 : -ENOMEM; @@ -748,12 +782,11 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) static int sysfs_dir_close(struct inode *inode, struct file *file) { - struct dentry * dentry = file->f_path.dentry; struct sysfs_dirent * cursor = file->private_data; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&sysfs_mutex); sysfs_unlink_sibling(cursor); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&sysfs_mutex); release_sysfs_dirent(cursor); @@ -794,6 +827,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: + mutex_lock(&sysfs_mutex); + pos = &parent_sd->s_children; while (*pos != cursor) pos = &(*pos)->s_sibling; @@ -826,6 +861,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) /* put cursor back in */ cursor->s_sibling = *pos; *pos = cursor; + + mutex_unlock(&sysfs_mutex); } return 0; } @@ -834,7 +871,6 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -842,10 +878,11 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { + mutex_lock(&sysfs_mutex); + file->f_pos = offset; if (file->f_pos >= 2) { struct sysfs_dirent *sd = dentry->d_fsdata; @@ -866,8 +903,10 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) cursor->s_sibling = *pos; *pos = cursor; } + + mutex_unlock(&sysfs_mutex); } - mutex_unlock(&dentry->d_inode->i_mutex); + return offset; } @@ -933,7 +972,9 @@ struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) sd->s_elem.dir.kobj = kobj; /* point to parent_sd but don't attach to it */ sd->s_parent = sysfs_get(parent_sd); + mutex_lock(&sysfs_mutex); sysfs_attach_dirent(sd, NULL, shadow); + mutex_unlock(&sysfs_mutex); d_instantiate(shadow, igrab(inode)); inc_nlink(inode); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 20703b9ee064..d0deed3e60b5 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -415,29 +415,28 @@ const struct file_operations sysfs_file_operations = { int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type) { - struct dentry *dir = dir_sd->s_dentry; umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; struct sysfs_dirent *sd; - int error = 0; - mutex_lock(&dir->d_inode->i_mutex); + sd = sysfs_new_dirent(attr->name, mode, type); + if (!sd) + return -ENOMEM; + sd->s_elem.attr.attr = (void *)attr; - if (sysfs_find_dirent(dir_sd, attr->name)) { - error = -EEXIST; - goto out_unlock; - } + mutex_lock(&sysfs_mutex); - sd = sysfs_new_dirent(attr->name, mode, type); - if (!sd) { - error = -ENOMEM; - goto out_unlock; + if (!sysfs_find_dirent(dir_sd, attr->name)) { + sysfs_attach_dirent(sd, dir_sd, NULL); + sd = NULL; } - sd->s_elem.attr.attr = (void *)attr; - sysfs_attach_dirent(sd, dir_sd, NULL); - out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - return error; + mutex_unlock(&sysfs_mutex); + + if (sd) { + sysfs_put(sd); + return -EEXIST; + } + return 0; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e4c23939fb36..d439c0b4bfce 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -277,20 +277,14 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) { - struct dentry *dir; struct sysfs_dirent **pos, *sd; int found = 0; if (!dir_sd) return -ENOENT; - dir = dir_sd->s_dentry; + mutex_lock(&sysfs_mutex); - if (dir->d_inode == NULL) - /* no inode means this hasn't been made visible yet */ - return -ENOENT; - - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) { sd = *pos; @@ -304,7 +298,8 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) break; } } - mutex_unlock(&dir->d_inode->i_mutex); + + mutex_unlock(&sysfs_mutex); if (!found) return -ENOENT; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index cbd95a4109de..683316f0aa96 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -44,20 +44,6 @@ static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length) } } -static int sysfs_add_link(struct sysfs_dirent * parent_sd, const char * name, - struct sysfs_dirent * target_sd) -{ - struct sysfs_dirent * sd; - - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - return -ENOMEM; - - sd->s_elem.symlink.target_sd = target_sd; - sysfs_attach_dirent(sd, parent_sd, NULL); - return 0; -} - /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. @@ -68,7 +54,8 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char { struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; - int error = -EEXIST; + struct sysfs_dirent *sd = NULL; + int error; BUG_ON(!name); @@ -78,8 +65,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char } else parent_sd = kobj->sd; + error = -EFAULT; if (!parent_sd) - return -EFAULT; + goto out_put; /* target->sd can go away beneath us but is protected with * sysfs_assoc_lock. Fetch target_sd from it. @@ -89,17 +77,30 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char target_sd = sysfs_get(target->sd); spin_unlock(&sysfs_assoc_lock); + error = -ENOENT; if (!target_sd) - return -ENOENT; + goto out_put; + + error = -ENOMEM; + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) + goto out_put; + sd->s_elem.symlink.target_sd = target_sd; - mutex_lock(&parent_sd->s_dentry->d_inode->i_mutex); - if (!sysfs_find_dirent(parent_sd, name)) - error = sysfs_add_link(parent_sd, name, target_sd); - mutex_unlock(&parent_sd->s_dentry->d_inode->i_mutex); + mutex_lock(&sysfs_mutex); + error = -EEXIST; + if (sysfs_find_dirent(parent_sd, name)) + goto out_unlock; + sysfs_attach_dirent(sd, parent_sd, NULL); + mutex_unlock(&sysfs_mutex); - if (error) - sysfs_put(target_sd); + return 0; + out_unlock: + mutex_unlock(&sysfs_mutex); + out_put: + sysfs_put(target_sd); + sysfs_put(sd); return error; } @@ -144,9 +145,9 @@ static int sysfs_getlink(struct dentry *dentry, char * path) struct sysfs_dirent *target_sd = sd->s_elem.symlink.target_sd; int error; - down_read(&sysfs_rename_sem); + mutex_lock(&sysfs_mutex); error = sysfs_get_target_path(parent_sd, target_sd, path); - up_read(&sysfs_rename_sem); + mutex_unlock(&sysfs_mutex); return error; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 457267721f4e..26051616ed11 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -82,7 +82,7 @@ extern void sysfs_drop_dentry(struct sysfs_dirent *sd); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern spinlock_t sysfs_assoc_lock; -extern struct rw_semaphore sysfs_rename_sem; +extern struct mutex sysfs_mutex; extern struct super_block * sysfs_sb; extern const struct file_operations sysfs_dir_operations; extern const struct file_operations sysfs_file_operations; -- cgit v1.2.1 From fb6896da37f19be4b75154c14d1cd79231255b17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:24 +0900 Subject: sysfs: restructure add/remove paths and fix inode update The original add/remove code had the following problems. * parent's timestamps are updated on dentry instantiation. this is incorrect with reclaimable files. * updating parent's timestamps isn't synchronized. * parent nlink update assumes the inode is accessible which won't be true once directory dentries are made reclaimable. This patch restructures add/remove paths to resolve the above problems. Add/removal are done in the following steps. 1. sysfs_addrm_start() : acquire locks including sysfs_mutex and other resources. 2-a. sysfs_add_one() : add new sd. linking the new sd into the children list is caller's responsibility. 2-b. sysfs_remove_one() : remove a sd. unlinking the sd from the children list is caller's responsibility. 3. sysfs_addrm_finish() : release all resources and clean up. Steps 2-a and/or 2-b can be repeated multiple times. Parent's inode is looked up during sysfs_addrm_start(). If available (always at the moment), it's pinned and nlink is updated as sd's are added and removed. Timestamps are updated during finish if any sd has been added or removed. If parent's inode is not available during start, sysfs_mutex ensures that parent inode is not created till add/remove is complete. All the complexity is contained inside the helper functions. Especially, dentry/inode handling is properly hidden from the rest of sysfs which now mostly operate on sysfs_dirents. As an added bonus, codes which use these helpers to add and remove sysfs_dirents are now more structured and simpler. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 250 +++++++++++++++++++++++++++++++++++++++-------------- fs/sysfs/file.c | 17 ++-- fs/sysfs/inode.c | 46 ++-------- fs/sysfs/symlink.c | 20 +++-- fs/sysfs/sysfs.h | 20 ++++- 5 files changed, 229 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 9fc8558fd86c..edb30621b82f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -30,7 +30,7 @@ static DEFINE_IDA(sysfs_ino_ida); * Locking: * mutex_lock(sysfs_mutex) */ -static void sysfs_link_sibling(struct sysfs_dirent *sd) +void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; @@ -49,7 +49,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) * Locking: * mutex_lock(sysfs_mutex) */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +void sysfs_unlink_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent **pos; @@ -165,7 +165,7 @@ void sysfs_put_active_two(struct sysfs_dirent *sd) * * Deny new active references and drain existing ones. */ -void sysfs_deactivate(struct sysfs_dirent *sd) +static void sysfs_deactivate(struct sysfs_dirent *sd) { DECLARE_COMPLETION_ONSTACK(wait); int v; @@ -318,27 +318,164 @@ static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry) d_rehash(dentry); } +static int sysfs_ilookup_test(struct inode *inode, void *arg) +{ + struct sysfs_dirent *sd = arg; + return inode->i_ino == sd->s_ino; +} + /** - * sysfs_attach_dirent - attach sysfs_dirent to its parent and dentry - * @sd: sysfs_dirent to attach - * @parent_sd: parent to attach to (optional) - * @dentry: dentry to be associated to @sd (optional) + * sysfs_addrm_start - prepare for sysfs_dirent add/remove + * @acxt: pointer to sysfs_addrm_cxt to be used + * @parent_sd: parent sysfs_dirent * - * Attach @sd to @parent_sd and/or @dentry. Both are optional. + * This function is called when the caller is about to add or + * remove sysfs_dirent under @parent_sd. This function acquires + * sysfs_mutex, grabs inode for @parent_sd if available and lock + * i_mutex of it. @acxt is used to keep and pass context to + * other addrm functions. * * LOCKING: - * mutex_lock(sysfs_mutex) + * Kernel thread context (may sleep). sysfs_mutex is locked on + * return. i_mutex of parent inode is locked on return if + * available. */ -void sysfs_attach_dirent(struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd, struct dentry *dentry) +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd) { - if (dentry) - sysfs_attach_dentry(sd, dentry); + struct inode *inode; - if (parent_sd) { - sd->s_parent = sysfs_get(parent_sd); - sysfs_link_sibling(sd); + memset(acxt, 0, sizeof(*acxt)); + acxt->parent_sd = parent_sd; + + /* Lookup parent inode. inode initialization and I_NEW + * clearing are protected by sysfs_mutex. By grabbing it and + * looking up with _nowait variant, inode state can be + * determined reliably. + */ + mutex_lock(&sysfs_mutex); + + inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, + parent_sd); + + if (inode && !(inode->i_state & I_NEW)) { + /* parent inode available */ + acxt->parent_inode = inode; + + /* sysfs_mutex is below i_mutex in lock hierarchy. + * First, trylock i_mutex. If fails, unlock + * sysfs_mutex and lock them in order. + */ + if (!mutex_trylock(&inode->i_mutex)) { + mutex_unlock(&sysfs_mutex); + mutex_lock(&inode->i_mutex); + mutex_lock(&sysfs_mutex); + } + } else + iput(inode); +} + +/** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory. @sd is NOT + * linked into the children list of the parent. The caller + * should invoke sysfs_link_sibling() after this function + * completes if @sd needs to be on the children list. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + */ +void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + sd->s_parent = sysfs_get(acxt->parent_sd); + + if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) + inc_nlink(acxt->parent_inode); + + acxt->cnt++; +} + +/** + * sysfs_remove_one - remove sysfs_dirent from parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Mark @sd removed and drop nlink of parent inode if @sd is a + * directory. @sd is NOT unlinked from the children list of the + * parent. The caller is repsonsible for removing @sd from the + * children list before calling this function. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + */ +void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + BUG_ON(sd->s_sibling || (sd->s_flags & SYSFS_FLAG_REMOVED)); + + sd->s_flags |= SYSFS_FLAG_REMOVED; + sd->s_sibling = acxt->removed; + acxt->removed = sd; + + if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) + drop_nlink(acxt->parent_inode); + + acxt->cnt++; +} + +/** + * sysfs_addrm_finish - finish up sysfs_dirent add/remove + * @acxt: addrm context to finish up + * + * Finish up sysfs_dirent add/remove. Resources acquired by + * sysfs_addrm_start() are released and removed sysfs_dirents are + * cleaned up. Timestamps on the parent inode are updated. + * + * LOCKING: + * All mutexes acquired by sysfs_addrm_start() are released. + * + * RETURNS: + * Number of added/removed sysfs_dirents since sysfs_addrm_start(). + */ +int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) +{ + /* release resources acquired by sysfs_addrm_start() */ + mutex_unlock(&sysfs_mutex); + if (acxt->parent_inode) { + struct inode *inode = acxt->parent_inode; + + /* if added/removed, update timestamps on the parent */ + if (acxt->cnt) + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + mutex_unlock(&inode->i_mutex); + iput(inode); + } + + /* kill removed sysfs_dirents */ + while (acxt->removed) { + struct sysfs_dirent *sd = acxt->removed; + + acxt->removed = sd->s_sibling; + sd->s_sibling = NULL; + + sysfs_drop_dentry(sd); + sysfs_deactivate(sd); + sysfs_put(sd); } + + return acxt->cnt; } /** @@ -396,19 +533,20 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, const char *name, struct sysfs_dirent **p_sd) { struct dentry *parent = parent_sd->s_dentry; + struct sysfs_addrm_cxt acxt; int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct dentry *dentry; struct inode *inode; struct sysfs_dirent *sd; - mutex_lock(&parent->d_inode->i_mutex); + sysfs_addrm_start(&acxt, parent_sd); /* allocate */ dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); - goto out_unlock; + goto out_finish; } error = -EEXIST; @@ -433,23 +571,18 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, } /* link in */ - mutex_lock(&sysfs_mutex); - error = -EEXIST; - if (sysfs_find_dirent(parent_sd, name)) { - mutex_unlock(&sysfs_mutex); + if (sysfs_find_dirent(parent_sd, name)) goto out_iput; - } + sysfs_add_one(&acxt, sd); + sysfs_link_sibling(sd); sysfs_instantiate(dentry, inode); - inc_nlink(parent->d_inode); - sysfs_attach_dirent(sd, parent_sd, dentry); - - mutex_unlock(&sysfs_mutex); + sysfs_attach_dentry(sd, dentry); *p_sd = sd; error = 0; - goto out_unlock; /* pin directory dentry in core */ + goto out_finish; /* pin directory dentry in core */ out_iput: iput(inode); @@ -459,8 +592,8 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, d_drop(dentry); out_dput: dput(dentry); - out_unlock: - mutex_unlock(&parent->d_inode->i_mutex); + out_finish: + sysfs_addrm_finish(&acxt); return error; } @@ -561,16 +694,12 @@ const struct inode_operations sysfs_dir_inode_operations = { static void remove_dir(struct sysfs_dirent *sd) { - mutex_lock(&sysfs_mutex); - sysfs_unlink_sibling(sd); - sd->s_flags |= SYSFS_FLAG_REMOVED; - mutex_unlock(&sysfs_mutex); + struct sysfs_addrm_cxt acxt; - pr_debug(" o %s removing done\n", sd->s_name); - - sysfs_drop_dentry(sd); - sysfs_deactivate(sd); - sysfs_put(sd); + sysfs_addrm_start(&acxt, sd->s_parent); + sysfs_unlink_sibling(sd); + sysfs_remove_one(&acxt, sd); + sysfs_addrm_finish(&acxt); } void sysfs_remove_subdir(struct sysfs_dirent *sd) @@ -581,38 +710,26 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { - struct sysfs_dirent *removed = NULL; + struct sysfs_addrm_cxt acxt; struct sysfs_dirent **pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); - mutex_lock(&sysfs_mutex); + sysfs_addrm_start(&acxt, dir_sd); pos = &dir_sd->s_children; while (*pos) { struct sysfs_dirent *sd = *pos; if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) { - sd->s_flags |= SYSFS_FLAG_REMOVED; *pos = sd->s_sibling; - sd->s_sibling = removed; - removed = sd; + sd->s_sibling = NULL; + sysfs_remove_one(&acxt, sd); } else pos = &(*pos)->s_sibling; } - mutex_unlock(&sysfs_mutex); - - while (removed) { - struct sysfs_dirent *sd = removed; - - removed = sd->s_sibling; - sd->s_sibling = NULL; - - sysfs_drop_dentry(sd); - sysfs_deactivate(sd); - sysfs_put(sd); - } + sysfs_addrm_finish(&acxt); remove_dir(dir_sd); } @@ -772,7 +889,8 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) sd = sysfs_new_dirent("_DIR_", 0, 0); if (sd) { mutex_lock(&sysfs_mutex); - sysfs_attach_dirent(sd, parent_sd, NULL); + sd->s_parent = sysfs_get(parent_sd); + sysfs_link_sibling(sd); mutex_unlock(&sysfs_mutex); } @@ -957,6 +1075,7 @@ struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) struct sysfs_dirent *parent_sd = parent->d_fsdata; struct dentry *shadow; struct sysfs_dirent *sd; + struct sysfs_addrm_cxt acxt; sd = ERR_PTR(-EINVAL); if (!sysfs_is_shadowed_inode(inode)) @@ -970,15 +1089,18 @@ struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) if (!sd) goto nomem; sd->s_elem.dir.kobj = kobj; - /* point to parent_sd but don't attach to it */ - sd->s_parent = sysfs_get(parent_sd); - mutex_lock(&sysfs_mutex); - sysfs_attach_dirent(sd, NULL, shadow); - mutex_unlock(&sysfs_mutex); + sysfs_addrm_start(&acxt, parent_sd); + + /* add but don't link into children list */ + sysfs_add_one(&acxt, sd); + + /* attach and instantiate dentry */ + sysfs_attach_dentry(sd, shadow); d_instantiate(shadow, igrab(inode)); - inc_nlink(inode); - inc_nlink(parent->d_inode); + inc_nlink(inode); /* tj: synchronization? */ + + sysfs_addrm_finish(&acxt); dget(shadow); /* Extra count - pin the dentry in core */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d0deed3e60b5..69bacf1db596 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -416,6 +416,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type) { umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; + struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; sd = sysfs_new_dirent(attr->name, mode, type); @@ -423,20 +424,18 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, return -ENOMEM; sd->s_elem.attr.attr = (void *)attr; - mutex_lock(&sysfs_mutex); + sysfs_addrm_start(&acxt, dir_sd); if (!sysfs_find_dirent(dir_sd, attr->name)) { - sysfs_attach_dirent(sd, dir_sd, NULL); - sd = NULL; + sysfs_add_one(&acxt, sd); + sysfs_link_sibling(sd); } - mutex_unlock(&sysfs_mutex); + if (sysfs_addrm_finish(&acxt)) + return 0; - if (sd) { - sysfs_put(sd); - return -EEXIST; - } - return 0; + sysfs_put(sd); + return -EEXIST; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index d439c0b4bfce..f95966847a81 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -191,15 +191,9 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) { BUG_ON(!dentry || dentry->d_inode); - if (inode->i_state & I_NEW) { + if (inode->i_state & I_NEW) unlock_new_inode(inode); - if (dentry->d_parent && dentry->d_parent->d_inode) { - struct inode *p_inode = dentry->d_parent->d_inode; - p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; - } - } - d_instantiate(dentry, inode); } @@ -220,7 +214,6 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) void sysfs_drop_dentry(struct sysfs_dirent *sd) { struct dentry *dentry = NULL; - struct timespec curtime; struct inode *inode; /* We're not holding a reference to ->s_dentry dentry but the @@ -246,13 +239,11 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) dput(dentry); /* adjust nlink and update timestamp */ - curtime = CURRENT_TIME; - inode = ilookup(sysfs_sb, sd->s_ino); if (inode) { mutex_lock(&inode->i_mutex); - inode->i_ctime = curtime; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (sysfs_type(sd) == SYSFS_DIR) drop_nlink(inode); @@ -260,30 +251,17 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd) mutex_unlock(&inode->i_mutex); iput(inode); } - - /* adjust nlink and udpate timestamp of the parent */ - inode = ilookup(sysfs_sb, sd->s_parent->s_ino); - if (inode) { - mutex_lock(&inode->i_mutex); - - inode->i_ctime = inode->i_mtime = curtime; - if (sysfs_type(sd) == SYSFS_DIR) - drop_nlink(inode); - - mutex_unlock(&inode->i_mutex); - iput(inode); - } } int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) { + struct sysfs_addrm_cxt acxt; struct sysfs_dirent **pos, *sd; - int found = 0; if (!dir_sd) return -ENOENT; - mutex_lock(&sysfs_mutex); + sysfs_addrm_start(&acxt, dir_sd); for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) { sd = *pos; @@ -291,22 +269,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) if (!sysfs_type(sd)) continue; if (!strcmp(sd->s_name, name)) { - sd->s_flags |= SYSFS_FLAG_REMOVED; *pos = sd->s_sibling; sd->s_sibling = NULL; - found = 1; + sysfs_remove_one(&acxt, sd); break; } } - mutex_unlock(&sysfs_mutex); - - if (!found) - return -ENOENT; - - sysfs_drop_dentry(sd); - sysfs_deactivate(sd); - sysfs_put(sd); - - return 0; + if (sysfs_addrm_finish(&acxt)) + return 0; + return -ENOENT; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 683316f0aa96..2f86e0422290 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -55,6 +55,7 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; + struct sysfs_addrm_cxt acxt; int error; BUG_ON(!name); @@ -87,17 +88,18 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char goto out_put; sd->s_elem.symlink.target_sd = target_sd; - mutex_lock(&sysfs_mutex); - error = -EEXIST; - if (sysfs_find_dirent(parent_sd, name)) - goto out_unlock; - sysfs_attach_dirent(sd, parent_sd, NULL); - mutex_unlock(&sysfs_mutex); + sysfs_addrm_start(&acxt, parent_sd); - return 0; + if (!sysfs_find_dirent(parent_sd, name)) { + sysfs_add_one(&acxt, sd); + sysfs_link_sibling(sd); + } - out_unlock: - mutex_unlock(&sysfs_mutex); + if (sysfs_addrm_finish(&acxt)) + return 0; + + error = -EEXIST; + /* fall through */ out_put: sysfs_put(target_sd); sysfs_put(sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 26051616ed11..3e9a5ee38233 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -44,14 +44,29 @@ struct sysfs_dirent { #define SD_DEACTIVATED_BIAS INT_MIN +struct sysfs_addrm_cxt { + struct sysfs_dirent *parent_sd; + struct inode *parent_inode; + struct sysfs_dirent *removed; + int cnt; +}; + extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; +extern void sysfs_link_sibling(struct sysfs_dirent *sd); +extern void sysfs_unlink_sibling(struct sysfs_dirent *sd); extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); extern void sysfs_put_active(struct sysfs_dirent *sd); extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); extern void sysfs_put_active_two(struct sysfs_dirent *sd); -extern void sysfs_deactivate(struct sysfs_dirent *sd); +extern void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd); +extern void sysfs_add_one(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd); +extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd); +extern int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); extern void sysfs_delete_inode(struct inode *inode); extern void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode); @@ -65,9 +80,6 @@ extern struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name); extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); -extern void sysfs_attach_dirent(struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd, - struct dentry *dentry); extern int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); -- cgit v1.2.1 From a0edd7c848945a75e2f41673f43bc37d0a5fed15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:24 +0900 Subject: sysfs: move sysfs_drop_dentry() to dir.c and make it static After add/remove path restructuring, the only user of sysfs_drop_dentry() is sysfs_addrm_finish(). Move sysfs_drop_dentry() to dir.c and make it static. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/inode.c | 56 -------------------------------------------------------- fs/sysfs/sysfs.h | 1 - 3 files changed, 56 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index edb30621b82f..c6f3b697064c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -434,6 +434,62 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) acxt->cnt++; } +/** + * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent + * @sd: target sysfs_dirent + * + * Drop dentry for @sd. @sd must have been unlinked from its + * parent on entry to this function such that it can't be looked + * up anymore. + * + * @sd->s_dentry which is protected with sysfs_assoc_lock points + * to the currently associated dentry but we're not holding a + * reference to it and racing with dput(). Grab dcache_lock and + * verify dentry before dropping it. If @sd->s_dentry is NULL or + * dput() beats us, no need to bother. + */ +static void sysfs_drop_dentry(struct sysfs_dirent *sd) +{ + struct dentry *dentry = NULL; + struct inode *inode; + + /* We're not holding a reference to ->s_dentry dentry but the + * field will stay valid as long as sysfs_assoc_lock is held. + */ + spin_lock(&sysfs_assoc_lock); + spin_lock(&dcache_lock); + + /* drop dentry if it's there and dput() didn't kill it yet */ + if (sd->s_dentry && sd->s_dentry->d_inode) { + dentry = dget_locked(sd->s_dentry); + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + } + + spin_unlock(&dcache_lock); + spin_unlock(&sysfs_assoc_lock); + + dput(dentry); + /* XXX: unpin if directory, this will go away soon */ + if (sysfs_type(sd) == SYSFS_DIR) + dput(dentry); + + /* adjust nlink and update timestamp */ + inode = ilookup(sysfs_sb, sd->s_ino); + if (inode) { + mutex_lock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + drop_nlink(inode); + if (sysfs_type(sd) == SYSFS_DIR) + drop_nlink(inode); + + mutex_unlock(&inode->i_mutex); + iput(inode); + } +} + /** * sysfs_addrm_finish - finish up sysfs_dirent add/remove * @acxt: addrm context to finish up diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index f95966847a81..3756e152285a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -197,62 +197,6 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode) d_instantiate(dentry, inode); } -/** - * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent - * @sd: target sysfs_dirent - * - * Drop dentry for @sd. @sd must have been unlinked from its - * parent on entry to this function such that it can't be looked - * up anymore. - * - * @sd->s_dentry which is protected with sysfs_assoc_lock points - * to the currently associated dentry but we're not holding a - * reference to it and racing with dput(). Grab dcache_lock and - * verify dentry before dropping it. If @sd->s_dentry is NULL or - * dput() beats us, no need to bother. - */ -void sysfs_drop_dentry(struct sysfs_dirent *sd) -{ - struct dentry *dentry = NULL; - struct inode *inode; - - /* We're not holding a reference to ->s_dentry dentry but the - * field will stay valid as long as sysfs_assoc_lock is held. - */ - spin_lock(&sysfs_assoc_lock); - spin_lock(&dcache_lock); - - /* drop dentry if it's there and dput() didn't kill it yet */ - if (sd->s_dentry && sd->s_dentry->d_inode) { - dentry = dget_locked(sd->s_dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - } - - spin_unlock(&dcache_lock); - spin_unlock(&sysfs_assoc_lock); - - dput(dentry); - /* XXX: unpin if directory, this will go away soon */ - if (sysfs_type(sd) == SYSFS_DIR) - dput(dentry); - - /* adjust nlink and update timestamp */ - inode = ilookup(sysfs_sb, sd->s_ino); - if (inode) { - mutex_lock(&inode->i_mutex); - - inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); - if (sysfs_type(sd) == SYSFS_DIR) - drop_nlink(inode); - - mutex_unlock(&inode->i_mutex); - iput(inode); - } -} - int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) { struct sysfs_addrm_cxt acxt; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3e9a5ee38233..92fe1e51a29b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -90,7 +90,6 @@ extern int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); extern void sysfs_remove_subdir(struct sysfs_dirent *sd); -extern void sysfs_drop_dentry(struct sysfs_dirent *sd); extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern spinlock_t sysfs_assoc_lock; -- cgit v1.2.1 From 53e0ae92690c52eceb997905d85fbb42de5fff63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:25 +0900 Subject: sysfs: implement sysfs_get_dentry() Some sysfs operations require dentry and inode. sysfs_get_dentry() looks up and gets dentry for the specified sysfs_dirent. It finds the first ancestor with dentry attached and starts looking up dentries from there. Looking up from the nearest ancestor is necessary to support shadowed directories because we can't reliably lookup dentry for one of the shadows. Dentries for each shadow will be pinned in memory such that they can serve as the starting point for dentry lookup. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 1 + 2 files changed, 99 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c6f3b697064c..987211296106 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -62,6 +62,104 @@ void sysfs_unlink_sibling(struct sysfs_dirent *sd) } } +/** + * sysfs_get_dentry - get dentry for the given sysfs_dirent + * @sd: sysfs_dirent of interest + * + * Get dentry for @sd. Dentry is looked up if currently not + * present. This function climbs sysfs_dirent tree till it + * reaches a sysfs_dirent with valid dentry attached and descends + * down from there looking up dentry for each step. + * + * LOCKING: + * Kernel thread context (may sleep) + * + * RETURNS: + * Pointer to found dentry on success, ERR_PTR() value on error. + */ +struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *cur; + struct dentry *parent_dentry, *dentry; + int i, depth; + + /* Find the first parent which has valid s_dentry and get the + * dentry. + */ + mutex_lock(&sysfs_mutex); + restart0: + spin_lock(&sysfs_assoc_lock); + restart1: + spin_lock(&dcache_lock); + + dentry = NULL; + depth = 0; + cur = sd; + while (!cur->s_dentry || !cur->s_dentry->d_inode) { + if (cur->s_flags & SYSFS_FLAG_REMOVED) { + dentry = ERR_PTR(-ENOENT); + depth = 0; + break; + } + cur = cur->s_parent; + depth++; + } + if (!IS_ERR(dentry)) + dentry = dget_locked(cur->s_dentry); + + spin_unlock(&dcache_lock); + spin_unlock(&sysfs_assoc_lock); + + /* from the found dentry, look up depth times */ + while (depth--) { + /* find and get depth'th ancestor */ + for (cur = sd, i = 0; cur && i < depth; i++) + cur = cur->s_parent; + + /* This can happen if tree structure was modified due + * to move/rename. Restart. + */ + if (i != depth) { + dput(dentry); + goto restart0; + } + + sysfs_get(cur); + + mutex_unlock(&sysfs_mutex); + + /* look it up */ + parent_dentry = dentry; + dentry = lookup_one_len_kern(cur->s_name, parent_dentry, + strlen(cur->s_name)); + dput(parent_dentry); + + if (IS_ERR(dentry)) { + sysfs_put(cur); + return dentry; + } + + mutex_lock(&sysfs_mutex); + spin_lock(&sysfs_assoc_lock); + + /* This, again, can happen if tree structure has + * changed and we looked up the wrong thing. Restart. + */ + if (cur->s_dentry != dentry) { + dput(dentry); + sysfs_put(cur); + goto restart1; + } + + spin_unlock(&sysfs_assoc_lock); + + sysfs_put(cur); + } + + mutex_unlock(&sysfs_mutex); + return dentry; +} + /** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 92fe1e51a29b..72530dc666fd 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -54,6 +54,7 @@ struct sysfs_addrm_cxt { extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; +extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); extern void sysfs_link_sibling(struct sysfs_dirent *sd); extern void sysfs_unlink_sibling(struct sysfs_dirent *sd); extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -- cgit v1.2.1 From 51225039f3cf9d250596d1344494b293274b9169 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 04:27:25 +0900 Subject: sysfs: make directory dentries and inodes reclaimable This patch makes dentries and inodes for sysfs directories reclaimable. * sysfs_notify() is modified to walk sysfs_dirent tree instead of dentry tree. * sysfs_update_file() and sysfs_chmod_file() use sysfs_get_dentry() to grab the victim dentry. * sysfs_rename_dir() and sysfs_move_dir() grab all dentries using sysfs_get_dentry() on startup. * Dentries for all shadowed directories are pinned in memory to serve as lookup start point. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 231 ++++++++++++++++++++++++++++++------------------------- fs/sysfs/file.c | 134 ++++++++++++++------------------ fs/sysfs/mount.c | 2 +- fs/sysfs/sysfs.h | 1 + 4 files changed, 187 insertions(+), 181 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 987211296106..aee966c44aac 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -568,10 +568,10 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd) spin_unlock(&dcache_lock); spin_unlock(&sysfs_assoc_lock); - dput(dentry); - /* XXX: unpin if directory, this will go away soon */ - if (sysfs_type(sd) == SYSFS_DIR) + /* dentries for shadowed inodes are pinned, unpin */ + if (dentry && sysfs_is_shadowed_inode(dentry->d_inode)) dput(dentry); + dput(dentry); /* adjust nlink and update timestamp */ inode = ilookup(sysfs_sb, sd->s_ino); @@ -686,69 +686,29 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, const char *name, struct sysfs_dirent **p_sd) { - struct dentry *parent = parent_sd->s_dentry; - struct sysfs_addrm_cxt acxt; - int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - struct dentry *dentry; - struct inode *inode; + struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; - sysfs_addrm_start(&acxt, parent_sd); - /* allocate */ - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_finish; - } - - error = -EEXIST; - if (dentry->d_inode) - goto out_dput; - - error = -ENOMEM; sd = sysfs_new_dirent(name, mode, SYSFS_DIR); if (!sd) - goto out_drop; + return -ENOMEM; sd->s_elem.dir.kobj = kobj; - inode = sysfs_get_inode(sd); - if (!inode) - goto out_sput; - - if (inode->i_state & I_NEW) { - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - /* directory inodes start off with i_nlink == 2 (for ".") */ - inc_nlink(inode); - } - /* link in */ - error = -EEXIST; - if (sysfs_find_dirent(parent_sd, name)) - goto out_iput; - - sysfs_add_one(&acxt, sd); - sysfs_link_sibling(sd); - sysfs_instantiate(dentry, inode); - sysfs_attach_dentry(sd, dentry); - - *p_sd = sd; - error = 0; - goto out_finish; /* pin directory dentry in core */ + sysfs_addrm_start(&acxt, parent_sd); + if (!sysfs_find_dirent(parent_sd, name)) { + sysfs_add_one(&acxt, sd); + sysfs_link_sibling(sd); + } + if (sysfs_addrm_finish(&acxt)) { + *p_sd = sd; + return 0; + } - out_iput: - iput(inode); - out_sput: sysfs_put(sd); - out_drop: - d_drop(dentry); - out_dput: - dput(dentry); - out_finish: - sysfs_addrm_finish(&acxt); - return error; + return -EEXIST; } int sysfs_create_subdir(struct kobject *kobj, const char *name, @@ -785,6 +745,17 @@ int sysfs_create_dir(struct kobject *kobj, return error; } +static int sysfs_count_nlink(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *child; + int nr = 0; + + for (child = sd->s_children; child; child = child->s_sibling) + if (sysfs_type(child) == SYSFS_DIR) + nr++; + return nr + 2; +} + static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -795,7 +766,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, int found = 0; for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { - if ((sysfs_type(sd) & SYSFS_NOT_PINNED) && + if (sysfs_type(sd) && !strcmp(sd->s_name, dentry->d_name.name)) { found = 1; break; @@ -816,6 +787,11 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, if (inode->i_state & I_NEW) { /* initialize inode according to type */ switch (sysfs_type(sd)) { + case SYSFS_DIR: + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + inode->i_nlink = sysfs_count_nlink(sd); + break; case SYSFS_KOBJ_ATTR: inode->i_size = PAGE_SIZE; inode->i_fop = &sysfs_file_operations; @@ -876,7 +852,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) while (*pos) { struct sysfs_dirent *sd = *pos; - if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) { + if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR) { *pos = sd->s_sibling; sd->s_sibling = NULL; sysfs_remove_one(&acxt, sd); @@ -912,14 +888,25 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, const char *new_name) { struct sysfs_dirent *sd = kobj->sd; - struct dentry *new_parent = new_parent_sd->s_dentry; - struct dentry *new_dentry; - char *dup_name; + struct dentry *new_parent = NULL; + struct dentry *old_dentry = NULL, *new_dentry = NULL; + const char *dup_name = NULL; int error; - if (!new_parent_sd) - return -EFAULT; + /* get dentries */ + old_dentry = sysfs_get_dentry(sd); + if (IS_ERR(old_dentry)) { + error = PTR_ERR(old_dentry); + goto out_dput; + } + + new_parent = sysfs_get_dentry(new_parent_sd); + if (IS_ERR(new_parent)) { + error = PTR_ERR(new_parent); + goto out_dput; + } + /* lock new_parent and get dentry for new name */ mutex_lock(&new_parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); @@ -933,14 +920,14 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, * shadows of the same directory */ error = -EINVAL; - if (sd->s_parent->s_dentry->d_inode != new_parent->d_inode || + if (old_dentry->d_parent->d_inode != new_parent->d_inode || new_dentry->d_parent->d_inode != new_parent->d_inode || - new_dentry == sd->s_dentry) - goto out_dput; + old_dentry == new_dentry) + goto out_unlock; error = -EEXIST; if (new_dentry->d_inode) - goto out_dput; + goto out_unlock; /* rename kobject and sysfs_dirent */ error = -ENOMEM; @@ -950,9 +937,9 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, error = kobject_set_name(kobj, "%s", new_name); if (error) - goto out_free; + goto out_drop; - kfree(sd->s_name); + dup_name = sd->s_name; sd->s_name = new_name; /* move under the new parent */ @@ -972,45 +959,58 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, error = 0; goto out_unlock; - out_free: - kfree(dup_name); out_drop: d_drop(new_dentry); - out_dput: - dput(new_dentry); out_unlock: mutex_unlock(&new_parent->d_inode->i_mutex); + out_dput: + kfree(dup_name); + dput(new_parent); + dput(old_dentry); + dput(new_dentry); return error; } -int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent) +int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) { - struct dentry *old_parent_dentry, *new_parent_dentry, *new_dentry; - struct sysfs_dirent *new_parent_sd, *sd; + struct sysfs_dirent *sd = kobj->sd; + struct sysfs_dirent *new_parent_sd; + struct dentry *old_parent, *new_parent = NULL; + struct dentry *old_dentry = NULL, *new_dentry = NULL; int error; - old_parent_dentry = kobj->parent ? - kobj->parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root; - new_parent_dentry = new_parent ? - new_parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root; + BUG_ON(!sd->s_parent); + new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + + /* get dentries */ + old_dentry = sysfs_get_dentry(sd); + if (IS_ERR(old_dentry)) { + error = PTR_ERR(old_dentry); + goto out_dput; + } + old_parent = sd->s_parent->s_dentry; + + new_parent = sysfs_get_dentry(new_parent_sd); + if (IS_ERR(new_parent)) { + error = PTR_ERR(new_parent); + goto out_dput; + } - if (old_parent_dentry->d_inode == new_parent_dentry->d_inode) - return 0; /* nothing to move */ + if (old_parent->d_inode == new_parent->d_inode) { + error = 0; + goto out_dput; /* nothing to move */ + } again: - mutex_lock(&old_parent_dentry->d_inode->i_mutex); - if (!mutex_trylock(&new_parent_dentry->d_inode->i_mutex)) { - mutex_unlock(&old_parent_dentry->d_inode->i_mutex); + mutex_lock(&old_parent->d_inode->i_mutex); + if (!mutex_trylock(&new_parent->d_inode->i_mutex)) { + mutex_unlock(&old_parent->d_inode->i_mutex); goto again; } - new_parent_sd = new_parent_dentry->d_fsdata; - sd = kobj->sd; - - new_dentry = lookup_one_len(kobj->name, new_parent_dentry, - strlen(kobj->name)); + new_dentry = lookup_one_len(kobj->name, new_parent, strlen(kobj->name)); if (IS_ERR(new_dentry)) { error = PTR_ERR(new_dentry); - goto out; + goto out_unlock; } else error = 0; d_add(new_dentry, NULL); @@ -1027,10 +1027,14 @@ again: sysfs_link_sibling(sd); mutex_unlock(&sysfs_mutex); -out: - mutex_unlock(&new_parent_dentry->d_inode->i_mutex); - mutex_unlock(&old_parent_dentry->d_inode->i_mutex); + out_unlock: + mutex_unlock(&new_parent->d_inode->i_mutex); + mutex_unlock(&old_parent->d_inode->i_mutex); + out_dput: + dput(new_parent); + dput(old_dentry); + dput(new_dentry); return error; } @@ -1191,12 +1195,20 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) int sysfs_make_shadowed_dir(struct kobject *kobj, void * (*follow_link)(struct dentry *, struct nameidata *)) { + struct dentry *dentry; struct inode *inode; struct inode_operations *i_op; - inode = kobj->sd->s_dentry->d_inode; - if (inode->i_op != &sysfs_dir_inode_operations) + /* get dentry for @kobj->sd, dentry of a shadowed dir is pinned */ + dentry = sysfs_get_dentry(kobj->sd); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + inode = dentry->d_inode; + if (inode->i_op != &sysfs_dir_inode_operations) { + dput(dentry); return -EINVAL; + } i_op = kmalloc(sizeof(*i_op), GFP_KERNEL); if (!i_op) @@ -1223,17 +1235,23 @@ int sysfs_make_shadowed_dir(struct kobject *kobj, struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) { - struct dentry *dir = kobj->sd->s_dentry; - struct inode *inode = dir->d_inode; - struct dentry *parent = dir->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct dentry *shadow; + struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + struct dentry *dir, *parent, *shadow; + struct inode *inode; struct sysfs_dirent *sd; struct sysfs_addrm_cxt acxt; + dir = sysfs_get_dentry(kobj->sd); + if (IS_ERR(dir)) { + sd = (void *)dir; + goto out; + } + parent = dir->d_parent; + + inode = dir->d_inode; sd = ERR_PTR(-EINVAL); if (!sysfs_is_shadowed_inode(inode)) - goto out; + goto out_dput; shadow = d_alloc(parent, &dir->d_name); if (!shadow) @@ -1258,12 +1276,15 @@ struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj) dget(shadow); /* Extra count - pin the dentry in core */ -out: - return sd; -nomem: + goto out_dput; + + nomem: dput(shadow); sd = ERR_PTR(-ENOMEM); - goto out; + out_dput: + dput(dir); + out: + return sd; } /** diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 69bacf1db596..cc497994b2a8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -362,43 +362,22 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) return POLLERR|POLLPRI; } - -static struct dentry *step_down(struct dentry *dir, const char * name) +void sysfs_notify(struct kobject *k, char *dir, char *attr) { - struct dentry * de; - - if (dir == NULL || dir->d_inode == NULL) - return NULL; - - mutex_lock(&dir->d_inode->i_mutex); - de = lookup_one_len(name, dir, strlen(name)); - mutex_unlock(&dir->d_inode->i_mutex); - dput(dir); - if (IS_ERR(de)) - return NULL; - if (de->d_inode == NULL) { - dput(de); - return NULL; - } - return de; -} + struct sysfs_dirent *sd = k->sd; -void sysfs_notify(struct kobject * k, char *dir, char *attr) -{ - struct dentry *de = k->sd->s_dentry; - if (de) - dget(de); - if (de && dir) - de = step_down(de, dir); - if (de && attr) - de = step_down(de, attr); - if (de) { - struct sysfs_dirent * sd = de->d_fsdata; - if (sd) - atomic_inc(&sd->s_event); + mutex_lock(&sysfs_mutex); + + if (sd && dir) + sd = sysfs_find_dirent(sd, dir); + if (sd && attr) + sd = sysfs_find_dirent(sd, attr); + if (sd) { + atomic_inc(&sd->s_event); wake_up_interruptible(&k->poll); - dput(de); } + + mutex_unlock(&sysfs_mutex); } EXPORT_SYMBOL_GPL(sysfs_notify); @@ -485,30 +464,31 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); */ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) { - struct dentry *dir = kobj->sd->s_dentry; - struct dentry * victim; - int res = -ENOENT; - - mutex_lock(&dir->d_inode->i_mutex); - victim = lookup_one_len(attr->name, dir, strlen(attr->name)); - if (!IS_ERR(victim)) { - /* make sure dentry is really there */ - if (victim->d_inode && - (victim->d_parent->d_inode == dir->d_inode)) { - victim->d_inode->i_mtime = CURRENT_TIME; - fsnotify_modify(victim); - res = 0; - } else - d_drop(victim); - - /** - * Drop the reference acquired from lookup_one_len() above. - */ - dput(victim); + struct sysfs_dirent *victim_sd = NULL; + struct dentry *victim = NULL; + int rc; + + rc = -ENOENT; + victim_sd = sysfs_get_dirent(kobj->sd, attr->name); + if (!victim_sd) + goto out; + + victim = sysfs_get_dentry(victim_sd); + if (IS_ERR(victim)) { + rc = PTR_ERR(victim); + victim = NULL; + goto out; } - mutex_unlock(&dir->d_inode->i_mutex); - return res; + mutex_lock(&victim->d_inode->i_mutex); + victim->d_inode->i_mtime = CURRENT_TIME; + fsnotify_modify(victim); + mutex_unlock(&victim->d_inode->i_mutex); + rc = 0; + out: + dput(victim); + sysfs_put(victim_sd); + return rc; } @@ -521,30 +501,34 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) */ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) { - struct dentry *dir = kobj->sd->s_dentry; - struct dentry *victim; + struct sysfs_dirent *victim_sd = NULL; + struct dentry *victim = NULL; struct inode * inode; struct iattr newattrs; - int res = -ENOENT; - - mutex_lock(&dir->d_inode->i_mutex); - victim = lookup_one_len(attr->name, dir, strlen(attr->name)); - if (!IS_ERR(victim)) { - if (victim->d_inode && - (victim->d_parent->d_inode == dir->d_inode)) { - inode = victim->d_inode; - mutex_lock(&inode->i_mutex); - newattrs.ia_mode = (mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - res = notify_change(victim, &newattrs); - mutex_unlock(&inode->i_mutex); - } - dput(victim); + int rc; + + rc = -ENOENT; + victim_sd = sysfs_get_dirent(kobj->sd, attr->name); + if (!victim_sd) + goto out; + + victim = sysfs_get_dentry(victim_sd); + if (IS_ERR(victim)) { + rc = PTR_ERR(victim); + victim = NULL; + goto out; } - mutex_unlock(&dir->d_inode->i_mutex); - return res; + inode = victim->d_inode; + mutex_lock(&inode->i_mutex); + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + rc = notify_change(victim, &newattrs); + mutex_unlock(&inode->i_mutex); + out: + dput(victim); + sysfs_put(victim_sd); + return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 078537e5d696..402cc356203c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -24,7 +24,7 @@ static const struct super_operations sysfs_ops = { .drop_inode = sysfs_delete_inode, }; -static struct sysfs_dirent sysfs_root = { +struct sysfs_dirent sysfs_root = { .s_count = ATOMIC_INIT(1), .s_flags = SYSFS_ROOT, .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 72530dc666fd..6a37f2386a8d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -52,6 +52,7 @@ struct sysfs_addrm_cxt { }; extern struct vfsmount * sysfs_mount; +extern struct sysfs_dirent sysfs_root; extern struct kmem_cache *sysfs_dir_cachep; extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); -- cgit v1.2.1 From 91a6902958f052358899f58683d44e36228d85c2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Jun 2007 13:57:22 +0800 Subject: sysfs: add parameter "struct bin_attribute *" in .read/.write methods for sysfs binary attributes Well, first of all, I don't want to change so many files either. What I do: Adding a new parameter "struct bin_attribute *" in the .read/.write methods for the sysfs binary attributes. In fact, only the four lines change in fs/sysfs/bin.c and include/linux/sysfs.h do the real work. But I have to update all the files that use binary attributes to make them compatible with the new .read and .write methods. I'm not sure if I missed any. :( Why I do this: For a sysfs attribute, we can get a pointer pointing to the struct attribute in the .show/.store method, while we can't do this for the binary attributes. I don't know why this is different, but this does make it not so handy to use the binary attributes as the regular ones. So I think this patch is reasonable. :) Who benefits from it: The patch that exposes ACPI tables in sysfs requires such an improvement. All the table binary attributes share the same .read method. Parameter "struct bin_attribute *" is used to get the table signature and instance number which are used to distinguish different ACPI table binary attributes. Without this parameter, we need to offer different .read methods for different ACPI table binary attributes. This is impossible as there are various ACPI tables on different platforms, and we don't know what they are until they are loaded. Signed-off-by: Zhang Rui Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/bin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 55796bdacd3d..135353f8a296 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -40,7 +40,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) rc = -EIO; if (attr->read) - rc = attr->read(kobj, buffer, off, count); + rc = attr->read(kobj, attr, buffer, off, count); sysfs_put_active_two(attr_sd); @@ -97,7 +97,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) rc = -EIO; if (attr->write) - rc = attr->write(kobj, buffer, offset, count); + rc = attr->write(kobj, attr, buffer, offset, count); sysfs_put_active_two(attr_sd); -- cgit v1.2.1 From 29ce20586be54ceba49c55ae049541398cd2c416 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 13 Jul 2007 11:44:32 +0200 Subject: security: revalidate rw permissions for sys_splice and sys_vmsplice Revalidate read/write permissions for splice(2) and vmslice(2), in case security policy has changed since the files were opened. Acked-by: Stephen Smalley Signed-off-by: James Morris Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ed2ce995475c..ef808227bc11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -961,6 +962,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; + ret = security_file_permission(out, MAY_WRITE); + if (unlikely(ret < 0)) + return ret; + return out->f_op->splice_write(pipe, out, ppos, len, flags); } @@ -983,6 +988,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + ret = security_file_permission(in, MAY_READ); + if (unlikely(ret < 0)) + return ret; + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -- cgit v1.2.1 From 51a92c0f6ce8fa85fa0e18ecda1d847e606e8066 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jul 2007 14:11:43 +0200 Subject: splice: fix offset mangling with direct splicing (sendfile) If the output actor doesn't transfer the full amount of data, we will increment ppos too much. Two related bugs in there: - We need to break out and return actor() retval if it is shorted than what we spliced into the pipe. - Adjust ppos only according to actor() return. Also fix loop problem in generic_file_splice_read(), it should not keep going when data has already been transferred. Signed-off-by: Jens Axboe --- fs/splice.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ef808227bc11..6c9828651e6f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -492,7 +492,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret < 0) @@ -1060,15 +1060,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->flags &= ~SPLICE_F_NONBLOCK; while (len) { - size_t read_len, max_read_len; - - /* - * Do at most PIPE_BUFFERS pages worth of transfer: - */ - max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + size_t read_len; - ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); - if (unlikely(ret < 0)) + ret = do_splice_to(in, &sd->pos, pipe, len, flags); + if (unlikely(ret <= 0)) goto out_release; read_len = ret; @@ -1080,26 +1075,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); - if (unlikely(ret < 0)) + if (unlikely(ret <= 0)) goto out_release; bytes += ret; len -= ret; - /* - * In nonblocking mode, if we got back a short read then - * that was due to either an IO error or due to the - * pagecache entry not being there. In the IO error case - * the _next_ splice attempt will produce a clean IO error - * return value (not a short read), so in both cases it's - * correct to break out of the loop here: - */ - if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) - break; + if (ret < read_len) + goto out_release; } pipe->nrbufs = pipe->curbuf = 0; - return bytes; out_release: @@ -1161,10 +1147,12 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .pos = *ppos, .u.file = out, }; - size_t ret; + long ret; ret = splice_direct_to_actor(in, &sd, direct_splice_actor); - *ppos = sd.pos; + if (ret > 0) + *ppos += ret; + return ret; } -- cgit v1.2.1 From 4fbef206daead133085fe33905f5e842d38fb8da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jul 2007 22:42:20 +0200 Subject: nfsd: fix nfsd_vfs_read() splice actor setup When nfsd was transitioned to use splice instead of sendfile() for data transfers, a line setting the page index was lost. Restore it, so that nfsd is functional when that path is used. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8604e35bd48e..945b1cedde2b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -879,6 +879,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, .u.data = rqstp, }; + rqstp->rq_resused = 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { oldfs = get_fs(); -- cgit v1.2.1 From 1fa40b01ae4d1b00e366d4949edcc230f5cd6d99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 May 2007 18:23:50 +1000 Subject: [XFS] Only use refcounted pages for I/O Many block drivers (aoe, iscsi) really want refcountable pages in bios, which is what almost everyone send down. XFS unfortunately has a few places where it sends down buffers that may come from kmalloc, which breaks them. Fix the places that use kmalloc()d buffers. SGI-PV: 964546 SGI-Modid: xfs-linux-melb:xfs-kern:28562a Signed-Off-By: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_buf.c | 49 ++++++++++++++++++++-------------------------- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/xfs_log.c | 19 +++++++++--------- 3 files changed, 32 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index fe4f66a5af14..208daf58b826 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -314,7 +314,7 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_hash_list)); - if (bp->b_flags & _XBF_PAGE_CACHE) { + if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) @@ -323,18 +323,11 @@ xfs_buf_free( for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; - ASSERT(!PagePrivate(page)); + if (bp->b_flags & _XBF_PAGE_CACHE) + ASSERT(!PagePrivate(page)); page_cache_release(page); } _xfs_buf_free_pages(bp); - } else if (bp->b_flags & _XBF_KMEM_ALLOC) { - /* - * XXX(hch): bp->b_count_desired might be incorrect (see - * xfs_buf_associate_memory for details), but fortunately - * the Linux version of kmem_free ignores the len argument.. - */ - kmem_free(bp->b_addr, bp->b_count_desired); - _xfs_buf_free_pages(bp); } xfs_buf_deallocate(bp); @@ -764,41 +757,41 @@ xfs_buf_get_noaddr( size_t len, xfs_buftarg_t *target) { - size_t malloc_len = len; + unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + int error, i; xfs_buf_t *bp; - void *data; - int error; bp = xfs_buf_allocate(0); if (unlikely(bp == NULL)) goto fail; _xfs_buf_initialize(bp, target, 0, len, 0); - try_again: - data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE); - if (unlikely(data == NULL)) + error = _xfs_buf_get_pages(bp, page_count, 0); + if (error) goto fail_free_buf; - /* check whether alignment matches.. */ - if ((__psunsigned_t)data != - ((__psunsigned_t)data & ~target->bt_smask)) { - /* .. else double the size and try again */ - kmem_free(data, malloc_len); - malloc_len <<= 1; - goto try_again; + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(GFP_KERNEL); + if (!bp->b_pages[i]) + goto fail_free_mem; } + bp->b_flags |= _XBF_PAGES; - error = xfs_buf_associate_memory(bp, data, len); - if (error) + error = _xfs_buf_map_pages(bp, XBF_MAPPED); + if (unlikely(error)) { + printk(KERN_WARNING "%s: failed to map pages\n", + __FUNCTION__); goto fail_free_mem; - bp->b_flags |= _XBF_KMEM_ALLOC; + } xfs_buf_unlock(bp); - XB_TRACE(bp, "no_daddr", data); + XB_TRACE(bp, "no_daddr", len); return bp; + fail_free_mem: - kmem_free(data, malloc_len); + while (--i >= 0) + __free_page(bp->b_pages[i]); fail_free_buf: xfs_buf_free(bp); fail: diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index b6241f6201a5..b5908a34b15d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -63,7 +63,7 @@ typedef enum { /* flags used only internally */ _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ + _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ } xfs_buf_flags_t; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c48bf61f17bd..635f99e6302f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1199,11 +1199,18 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = (xlog_in_core_t *) kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); iclog = *iclogp; - iclog->hic_data = (xlog_in_core_2_t *) - kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE); - iclog->ic_prev = prev_iclog; prev_iclog = iclog; + + bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); + if (!XFS_BUF_CPSEMA(bp)) + ASSERT(0); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + iclog->ic_bp = bp; + iclog->hic_data = bp->b_addr; + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); head = &iclog->ic_header; @@ -1216,11 +1223,6 @@ xlog_alloc_log(xfs_mount_t *mp, INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); - iclog->ic_bp = bp; iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; @@ -1528,7 +1530,6 @@ xlog_dealloc_log(xlog_t *log) } #endif next_iclog = iclog->ic_next; - kmem_free(iclog->hic_data, log->l_iclog_size); kmem_free(iclog, sizeof(xlog_in_core_t)); iclog = next_iclog; } -- cgit v1.2.1 From 4cc929ee305c69573cb842aade059dbe2a93940c Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Mon, 14 May 2007 18:24:02 +1000 Subject: [XFS] Don't grow filesystems past the size they can index. When growing a filesystem we don't check to see if the new size overflows the page cache index range, so we can do silly things like grow a filesystem page 16TB on a 32bit. Check new filesystem sizes against the limits the kernel can support. SGI-PV: 957886 SGI-Modid: xfs-linux-melb:xfs-kern:28563a Signed-Off-By: Nathan Scott Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_fsops.c | 2 ++ fs/xfs/xfs_mount.c | 35 +++++++++++++++++++++++------------ fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_rtalloc.c | 4 +++- 4 files changed, 29 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b599e6be9ec1..25e5eae8a976 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -140,6 +140,8 @@ xfs_growfs_data_private( pct = in->imaxpct; if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; dpct = pct - mp->m_sb.sb_imax_pct; error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a96bde6df96d..5de1f392e632 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -202,6 +202,27 @@ xfs_mount_free( kmem_free(mp, sizeof(xfs_mount_t)); } +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + +#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return E2BIG; +#else /* Limited by UINT_MAX of sectors */ + if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) + return E2BIG; +#endif + return 0; +} /* * Check the validity of the SB found. @@ -284,18 +305,8 @@ xfs_mount_validate_sb( return XFS_ERROR(EFSCORRUPTED); } - ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); - ASSERT(sbp->sb_blocklog >= BBSHIFT); - -#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ - if (unlikely( - (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX || - (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) { -#else /* Limited by UINT_MAX of sectors */ - if (unlikely( - (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || - (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { -#endif + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, "file system too large to be mounted on this system."); return XFS_ERROR(E2BIG); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 82304b94646d..871a5bfd8617 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -624,6 +624,7 @@ extern int xfs_sync_inodes(xfs_mount_t *, int, int *); extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *, xfs_agnumber_t); extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern struct xfs_dmops xfs_dmcore_stub; extern struct xfs_qmops xfs_qmcore_stub; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b3a5f07bd073..47082c01872d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1882,11 +1882,13 @@ xfs_growfs_rt( (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; /* * Read in the last block of the device, make sure it exists. */ error = xfs_read_buf(mp, mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), + XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp); if (error) return error; -- cgit v1.2.1 From 40095b64f5da601a8ab61fbe4b40feb46830052e Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 14 May 2007 18:24:09 +1000 Subject: [XFS] Sleeping with the ilock waiting for I/O completion is Bad. Recent fixes to the filesystem freezing code introduced a vn_iowait call in the middle of the sync code. Unfortunately, at the point where this call was added we are holding the ilock. The ilock is needed by I/O completion for unwritten extent conversion and now updating the file size. Hence I/o cannot complete if we hold the ilock while waiting for I/O completion. Fix up the bug and clean the code up around it. SGI-PV: 963674 SGI-Modid: xfs-linux-melb:xfs-kern:28566a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_vfsops.c | 73 ++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 65c561201cb8..92c1425d06ce 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -1128,58 +1128,41 @@ xfs_sync_inodes( * in the inode list. */ - if ((flags & SYNC_CLOSE) && (vp != NULL)) { - /* - * This is the shutdown case. We just need to - * flush and invalidate all the pages associated - * with the inode. Drop the inode lock since - * we can't hold it across calls to the buffer - * cache. - * - * We don't set the VREMAPPING bit in the vnode - * here, because we don't hold the vnode lock - * exclusively. It doesn't really matter, though, - * because we only come here when we're shutting - * down anyway. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (XFS_FORCED_SHUTDOWN(mp)) { - bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); - } else { - error = bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF); + /* + * If we have to flush data or wait for I/O completion + * we need to drop the ilock that we currently hold. + * If we need to drop the lock, insert a marker if we + * have not already done so. + */ + if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) || + ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) { + if (mount_locked) { + IPOINTER_INSERT(ip, mp); } + xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_ilock(ip, XFS_ILOCK_SHARED); - - } else if ((flags & SYNC_DELWRI) && (vp != NULL)) { - if (VN_DIRTY(vp)) { - /* We need to have dropped the lock here, - * so insert a marker if we have not already - * done so. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - /* - * Drop the inode lock since we can't hold it - * across calls to the buffer cache. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (flags & SYNC_CLOSE) { + /* Shutdown case. Flush and invalidate. */ + if (XFS_FORCED_SHUTDOWN(mp)) + bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); + else + error = bhv_vop_flushinval_pages(vp, 0, + -1, FI_REMAPF); + } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) { error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, fflag, FI_NONE); - xfs_ilock(ip, XFS_ILOCK_SHARED); } + /* + * When freezing, we need to wait ensure all I/O (including direct + * I/O) is complete to ensure no further data modification can take + * place after this point + */ + if (flags & SYNC_IOWAIT) + vn_iowait(vp); + + xfs_ilock(ip, XFS_ILOCK_SHARED); } - /* - * When freezing, we need to wait ensure all I/O (including direct - * I/O) is complete to ensure no further data modification can take - * place after this point - */ - if (flags & SYNC_IOWAIT) - vn_iowait(vp); if (flags & SYNC_BDFLUSH) { if ((flags & SYNC_ATTR) && -- cgit v1.2.1 From 3db296f341b5902c4f9317022ae5d4da2d59d598 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 14 May 2007 18:24:16 +1000 Subject: [XFS] Fix use-after-free during log unmount. Don't reference the log buffer after running the callbacks as the callback can trigger the log buffers to be freed during unmount. SGI-PV: 964545 SGI-Modid: xfs-linux-melb:xfs-kern:28567a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_log.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 635f99e6302f..5bb902056e61 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -967,14 +967,16 @@ xlog_iodone(xfs_buf_t *bp) } else if (iclog->ic_state & XLOG_STATE_IOERROR) { aborted = XFS_LI_ABORTED; } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); - if (!(XFS_BUF_ISASYNC(bp))) { - /* - * Corresponding psema() will be done in bwrite(). If we don't - * vsema() here, panic. - */ - XFS_BUF_V_IODONESEMA(bp); - } + /* + * do not reference the buffer (bp) here as we could race + * with it being freed after writing the unmount record to the + * log. + */ + } /* xlog_iodone */ /* -- cgit v1.2.1 From ca165b88927e41ad18908d7b37f08ef81eae0bf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 May 2007 15:21:11 +1000 Subject: [XFS] Fix double free in xfs_buf_get_noaddr error handling path SGI-PV: 964983 SGI-Modid: xfs-linux-melb:xfs-kern:28639a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_buf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 208daf58b826..192aa0650919 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -792,8 +792,9 @@ xfs_buf_get_noaddr( fail_free_mem: while (--i >= 0) __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_free(bp); + xfs_buf_deallocate(bp); fail: return NULL; } -- cgit v1.2.1 From 511105b3d7c2440ee84fc3f90d200569aac88162 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 24 May 2007 15:21:57 +1000 Subject: [XFS] Fix vmalloc leak on mount/unmount. When setting the length of the iclogbuf to write out we should just be changing the desired byte count rather completely reassociating the buffer memory with the buffer. Reassociating the buffer memory changes the apparent length of the buffer and hence when we free the buffer, we don't free all the vmap()d space we originally allocated. SGI-PV: 964983 SGI-Modid: xfs-linux-melb:xfs-kern:28640a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5bb902056e61..fb50fd400e53 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1436,7 +1436,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); + XFS_BUF_SET_COUNT(bp, count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); -- cgit v1.2.1 From 92dfe8d266eaf35a50607a0e0dcf525e1d367c80 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 24 May 2007 15:22:19 +1000 Subject: [XFS] Make hole punching at EOF atomic. If hole punching at EOF is done as two steps (i.e. truncate then extend) the file is in a transient state between the two steps where an application can see the incorrect file size. Punching a hole to EOF needs to be treated in teh same way as all other hole punching cases so that the file size is never seen to change. SGI-PV: 962012 SGI-Modid: xfs-linux-melb:xfs-kern:28641a Signed-off-by: David Chinner Signed-off-by: Vlad Apostolov Signed-off-by: Tim Shimmin --- fs/xfs/xfs_rw.h | 14 +++++++++++--- fs/xfs/xfs_vnodeops.c | 28 ++++++++++++++++++---------- 2 files changed, 29 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 188b296ff50c..ba74000931ec 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -71,6 +71,12 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb) XFS_FSB_TO_DADDR((io)->io_mount, (fsb))); } +/* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_LOCK (1<<0) +#define XFS_FREE_EOF_NOLOCK (1<<1) + /* * Prototypes for functions in xfs_rw.c. */ @@ -91,10 +97,12 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags, - cred_t *credp); + cred_t *credp); extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf, - xfs_off_t offset, cred_t *credp, int flags); + xfs_off_t offset, cred_t *credp, int flags); extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state, - cred_t *credp); + cred_t *credp); +extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + int flags); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 70bc82f65311..6420ca8df5ec 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1201,13 +1201,15 @@ xfs_fsync( } /* - * This is called by xfs_inactive to free any blocks beyond eof, - * when the link count isn't zero. + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. */ -STATIC int -xfs_inactive_free_eofblocks( +int +xfs_free_eofblocks( xfs_mount_t *mp, - xfs_inode_t *ip) + xfs_inode_t *ip, + int flags) { xfs_trans_t *tp; int error; @@ -1216,6 +1218,7 @@ xfs_inactive_free_eofblocks( xfs_filblks_t map_len; int nimaps; xfs_bmbt_irec_t imap; + int use_iolock = (flags & XFS_FREE_EOF_LOCK); /* * Figure out if there are any blocks beyond the end @@ -1256,11 +1259,13 @@ xfs_inactive_free_eofblocks( * cache and we can't * do that within a transaction. */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (use_iolock) + xfs_ilock(ip, XFS_IOLOCK_EXCL); error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, ip->i_size); if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (use_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -1297,7 +1302,8 @@ xfs_inactive_free_eofblocks( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) + : XFS_ILOCK_EXCL)); } return error; } @@ -1573,7 +1579,8 @@ xfs_release( (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) + error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + if (error) return error; /* Update linux inode block count after free above */ vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, @@ -1654,7 +1661,8 @@ xfs_inactive( (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || (ip->i_delayed_blks != 0)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) + error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + if (error) return VN_INACTIVE_CACHE; /* Update linux inode block count after free above */ vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, -- cgit v1.2.1 From 3260f78ad6d5b788e78ea709d377f58e569bee41 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 24 May 2007 15:25:42 +1000 Subject: [XFS] Use generic shrinker interfaces in XFS. SGI-PV: 964986 SGI-Modid: xfs-linux-melb:xfs-kern:28642a Signed-Off-By: Andrew Morton Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/kmem.h | 19 ------------------- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- fs/xfs/quota/xfs_qm.c | 6 +++--- 3 files changed, 6 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 9ebabdf7829c..4b6470cf87f0 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -100,25 +100,6 @@ kmem_zone_destroy(kmem_zone_t *zone) extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); -/* - * Low memory cache shrinkers - */ - -typedef struct shrinker *kmem_shaker_t; -typedef int (*kmem_shake_func_t)(int, gfp_t); - -static inline kmem_shaker_t -kmem_shake_register(kmem_shake_func_t sfunc) -{ - return set_shrinker(DEFAULT_SEEKS, sfunc); -} - -static inline void -kmem_shake_deregister(kmem_shaker_t shrinker) -{ - remove_shrinker(shrinker); -} - static inline int kmem_shake_allow(gfp_t gfp_mask) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 192aa0650919..a3eff54a201c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -35,7 +35,7 @@ #include static kmem_zone_t *xfs_buf_zone; -static kmem_shaker_t xfs_buf_shake; +static struct shrinker *xfs_buf_shake; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); @@ -1831,7 +1831,7 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); + xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup); if (!xfs_buf_shake) goto out_destroy_xfsdatad_workqueue; @@ -1853,7 +1853,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - kmem_shake_deregister(xfs_buf_shake); + remove_shrinker(xfs_buf_shake); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 3e4a8ad8a34c..eeb3d9cc0e8e 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -62,7 +62,7 @@ uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -static kmem_shaker_t xfs_qm_shaker; +static struct shrinker *xfs_qm_shaker; static cred_t xfs_zerocr; static xfs_inode_t xfs_zeroino; @@ -150,7 +150,7 @@ xfs_Gqm_init(void) } else xqm->qm_dqzone = qm_dqzone; - xfs_qm_shaker = kmem_shake_register(xfs_qm_shake); + xfs_qm_shaker = set_shrinker(DEFAULT_SEEKS, xfs_qm_shake); /* * The t_dqinfo portion of transactions. @@ -182,7 +182,7 @@ xfs_qm_destroy( ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); - kmem_shake_deregister(xfs_qm_shaker); + remove_shrinker(xfs_qm_shaker); hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); -- cgit v1.2.1 From 92821e2ba4ae26887223326fb0b95cdab963b768 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 24 May 2007 15:26:31 +1000 Subject: [XFS] Lazy Superblock Counters When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? the answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation. One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means the when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_super.c | 3 +- fs/xfs/linux-2.6/xfs_vfs.h | 1 + fs/xfs/xfs_ag.h | 8 ++- fs/xfs/xfs_alloc.c | 48 +++++++++++--- fs/xfs/xfs_alloc.h | 6 +- fs/xfs/xfs_alloc_btree.c | 20 +++--- fs/xfs/xfs_fs.h | 1 + fs/xfs/xfs_fsops.c | 2 + fs/xfs/xfs_ialloc.c | 28 +++++++- fs/xfs/xfs_ialloc.h | 10 +++ fs/xfs/xfs_log.c | 4 +- fs/xfs/xfs_log_recover.c | 8 +++ fs/xfs/xfs_mount.c | 154 +++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_mount.h | 10 ++- fs/xfs/xfs_sb.h | 16 ++++- fs/xfs/xfs_trans.c | 58 ++++++++++++---- fs/xfs/xfs_trans.h | 3 +- fs/xfs/xfs_vfsops.c | 11 ++++ 18 files changed, 337 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bf9a9d5909be..05f188ed1206 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -547,7 +547,8 @@ vfs_sync_worker( if (!(vfsp->vfs_flag & VFS_RDONLY)) error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \ - SYNC_ATTR | SYNC_REFCACHE, NULL); + SYNC_ATTR | SYNC_REFCACHE | SYNC_SUPER, + NULL); vfsp->vfs_sync_seq++; wake_up(&vfsp->vfs_wait_single_sync_task); } diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index e2c2ce98ab5b..cb7b0d62fb96 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -92,6 +92,7 @@ typedef enum { #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ #define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ +#define SYNC_SUPER 0x0200 /* flush superblock to disk */ #define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 9ece7f87ec5b..b1dd0029c60e 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -68,6 +68,7 @@ typedef struct xfs_agf { __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ } xfs_agf_t; #define XFS_AGF_MAGICNUM 0x00000001 @@ -81,7 +82,8 @@ typedef struct xfs_agf { #define XFS_AGF_FLCOUNT 0x00000100 #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_NUM_BITS 11 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_NUM_BITS 12 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) /* disk block (xfs_daddr_t) in the AG */ @@ -186,11 +188,13 @@ typedef struct xfs_perag __uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + int pagb_count; /* pagb slots in use */ #ifdef __KERNEL__ lock_t pagb_lock; /* lock for pagb_list */ #endif - int pagb_count; /* pagb slots in use */ xfs_perag_busy_t *pagb_list; /* unstable blocks */ } xfs_perag_t; diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 8e9a40aa0cd3..98f95d4c4bcc 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1447,7 +1447,8 @@ xfs_alloc_ag_vextent_small( else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { - if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) goto error0; if (fbno != NULLAGBLOCK) { if (args->userdata) { @@ -1923,7 +1924,8 @@ xfs_alloc_fix_freelist( while (be32_to_cpu(agf->agf_flcount) > need) { xfs_buf_t *bp; - if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) return error; if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) return error; @@ -1973,8 +1975,9 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, - bno))) + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) return error; } } @@ -1991,13 +1994,15 @@ int /* error */ xfs_alloc_get_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop) /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ int error; + int logflags; #ifdef XFS_ALLOC_TRACE static char fname[] = "xfs_alloc_get_freelist"; #endif @@ -2032,8 +2037,16 @@ xfs_alloc_get_freelist( be32_add(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; - TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; /* @@ -2071,6 +2084,7 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_flcount), offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), sizeof(xfs_agf_t) }; @@ -2106,12 +2120,14 @@ xfs_alloc_put_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer for a.g. freelist header */ xfs_buf_t *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno) /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ xfs_agfl_t *agfl; /* a.g. free block array */ __be32 *blockp;/* pointer to array entry */ int error; + int logflags; #ifdef XFS_ALLOC_TRACE static char fname[] = "xfs_alloc_put_freelist"; #endif @@ -2132,11 +2148,22 @@ xfs_alloc_put_freelist( be32_add(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); - TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + TRACE_MODAGF(NULL, agf, logflags); + xfs_alloc_log_agf(tp, agbp, logflags); xfs_trans_log_buf(tp, agflbp, (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + @@ -2196,6 +2223,7 @@ xfs_alloc_read_agf( pag = &mp->m_perag[agno]; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); pag->pagf_levels[XFS_BTNUM_BNOi] = diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 5a4256120ccc..5aec15d0651e 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -136,7 +136,8 @@ int /* error */ xfs_alloc_get_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop); /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ /* * Log the given fields from the agf structure. @@ -165,7 +166,8 @@ xfs_alloc_put_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for a.g. freelist header */ struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno); /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ /* * Read in the allocation group header (free/alloc section). diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 74cadf95d4e8..1603ce595853 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -226,8 +226,9 @@ xfs_alloc_delrec( /* * Put this buffer/block on the ag's freelist. */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, - cur->bc_private.a.agbp, NULL, bno))) + error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, bno, 1); + if (error) return error; /* * Since blocks move to the free list without the @@ -549,8 +550,9 @@ xfs_alloc_delrec( /* * Free the deleting block by putting it on the freelist. */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, - NULL, rbno))) + error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, rbno, 1); + if (error) return error; /* * Since blocks move to the free list without the coordination @@ -1320,8 +1322,9 @@ xfs_alloc_newroot( /* * Get a buffer from the freelist blocks, for the new root. */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &nbno))) + error = xfs_alloc_get_freelist(cur->bc_tp, + cur->bc_private.a.agbp, &nbno, 1); + if (error) return error; /* * None available, we fail. @@ -1604,8 +1607,9 @@ xfs_alloc_split( * Allocate the new block from the freelist. * If we can't do it, we're toast. Give up. */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &rbno))) + error = xfs_alloc_get_freelist(cur->bc_tp, + cur->bc_private.a.agbp, &rbno, 1); + if (error) return error; if (rbno == NULLAGBLOCK) { *stat = 0; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 1335449841cd..1b60cfc28be5 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -238,6 +238,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 25e5eae8a976..27d01afe8465 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -94,6 +94,8 @@ xfs_fs_geometry( XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index b5feb3e77116..f943368c9b93 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -123,6 +123,7 @@ xfs_ialloc_ag_alloc( int blks_per_cluster; /* fs blocks per inode cluster */ xfs_btree_cur_t *cur; /* inode btree cursor */ xfs_daddr_t d; /* disk addr of buffer */ + xfs_agnumber_t agno; int error; xfs_buf_t *fbuf; /* new free inodes' buffer */ xfs_dinode_t *free; /* new free inode structure */ @@ -302,15 +303,15 @@ xfs_ialloc_ag_alloc( } be32_add(&agi->agi_count, newlen); be32_add(&agi->agi_freecount, newlen); + agno = be32_to_cpu(agi->agi_seqno); down_read(&args.mp->m_peraglock); - args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen; + args.mp->m_perag[agno].pagi_freecount += newlen; up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); /* * Insert records describing the new inode chunk into the btree. */ - cur = xfs_btree_init_cursor(args.mp, tp, agbp, - be32_to_cpu(agi->agi_seqno), + cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, XFS_BTNUM_INO, (xfs_inode_t *)0, 0); for (thisino = newino; thisino < newino + newlen; @@ -1387,6 +1388,7 @@ xfs_ialloc_read_agi( pag = &mp->m_perag[agno]; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_init = 1; } else { /* @@ -1410,3 +1412,23 @@ xfs_ialloc_read_agi( *bpp = bp; return 0; } + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7f5debe1acb6..97f4040931ca 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -149,6 +149,16 @@ xfs_ialloc_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp); /* allocation group hdr buf */ +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + #endif /* __KERNEL__ */ #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fb50fd400e53..9d4c4fbeb3ee 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -817,10 +817,8 @@ xfs_log_need_covered(xfs_mount_t *mp) SPLDECL(s); int needed = 0, gen; xlog_t *log = mp->m_log; - bhv_vfs_t *vfsp = XFS_MTOVFS(mp); - if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || - (vfsp->vfs_flag & VFS_RDONLY)) + if (!xfs_fs_writable(mp)) return 0; s = LOG_LOCK(log); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 080fabf61c92..fddbb091a86f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -927,6 +927,14 @@ xlog_find_tail( ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5de1f392e632..f6fe47d8c4dc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -643,6 +643,64 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; } + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +STATIC int +xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + int s; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the inforamtion we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = &mp->m_perag[index]; + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + } + /* + * Overwrite incore superblock counters with just-read data + */ + s = XFS_SB_LOCK(mp); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + XFS_SB_UNLOCK(mp, s); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + /* * xfs_mountfs * @@ -986,6 +1044,34 @@ xfs_mountfs( goto error2; } + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) { + goto error2; + } + } /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -1049,6 +1135,7 @@ xfs_mountfs( goto error4; } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -1111,10 +1198,9 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_binval(mp->m_rtdev_targp); } + xfs_log_sbcount(mp, 1); xfs_unmountfs_writesb(mp); - xfs_unmountfs_wait(mp); /* wait for async bufs */ - xfs_log_unmount(mp); /* Done! No more fs ops. */ xfs_freesb(mp); @@ -1160,6 +1246,62 @@ xfs_unmountfs_wait(xfs_mount_t *mp) xfs_wait_buftarg(mp->m_ddev_targp); } +int +xfs_fs_writable(xfs_mount_t *mp) +{ + bhv_vfs_t *vfsp = XFS_MTOVFS(mp); + + return !(vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || + (vfsp->vfs_flag & VFS_RDONLY)); +} + +/* + * xfs_log_sbcount + * + * Called either periodically to keep the on disk superblock values + * roughly up to date or from unmount to make sure the values are + * correct on a clean unmount. + * + * Note this code can be called during the process of freezing, so + * we may need to use the transaction allocator which does not not + * block when the transaction subsystem is in its frozen state. + */ +int +xfs_log_sbcount( + xfs_mount_t *mp, + uint sync) +{ + xfs_trans_t *tp; + int error; + + if (!xfs_fs_writable(mp)) + return 0; + + xfs_icsb_sync_counters(mp); + + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); + if (sync) + xfs_trans_set_sync(tp); + xfs_trans_commit(tp, 0); + + return 0; +} + int xfs_unmountfs_writesb(xfs_mount_t *mp) { @@ -1171,16 +1313,15 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) * skip superblock write if fs is read-only, or * if we are doing a forced umount. */ - sbp = xfs_getsb(mp, 0); if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || XFS_FORCED_SHUTDOWN(mp))) { - xfs_icsb_sync_counters(mp); + sbp = xfs_getsb(mp, 0); + sb = XFS_BUF_TO_SBP(sbp); /* * mark shared-readonly if desired */ - sb = XFS_BUF_TO_SBP(sbp); if (mp->m_mk_sharedro) { if (!(sb->sb_flags & XFS_SBF_READONLY)) sb->sb_flags |= XFS_SBF_READONLY; @@ -1189,6 +1330,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Unmounting, marking shared read-only"); } + XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); XFS_BUF_UNDELAYWRITE(sbp); @@ -1203,8 +1345,8 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) mp, sbp, XFS_BUF_ADDR(sbp)); if (error && mp->m_mk_sharedro) xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); + xfs_buf_relse(sbp); } - xfs_buf_relse(sbp); return error; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 871a5bfd8617..0bca2d422719 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -429,12 +429,12 @@ typedef struct xfs_mount { /* * Flags for m_flags. */ -#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_INO64 (1ULL << 1) +#define XFS_MOUNT_INO64 (1ULL << 1) /* (1ULL << 2) -- currently unused */ - /* (1ULL << 3) -- currently unused */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ @@ -511,6 +511,8 @@ xfs_preferred_iosize(xfs_mount_t *mp) #define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) #define xfs_force_shutdown(m,f) \ bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__) @@ -602,6 +604,7 @@ typedef struct xfs_mod_sb { extern xfs_mount_t *xfs_mount_init(void); extern void xfs_mod_sb(xfs_trans_t *, __int64_t); +extern int xfs_log_sbcount(xfs_mount_t *, uint); extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); extern int xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); @@ -618,6 +621,7 @@ extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); +extern int xfs_fs_writable(xfs_mount_t *); extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); extern int xfs_syncsub(xfs_mount_t *, int, int *); extern int xfs_sync_inodes(xfs_mount_t *, int, int *); diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 467854b45c8f..ef42537a607a 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -74,12 +74,13 @@ struct xfs_mount; */ #define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 -#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_OKREALFBITS \ - (XFS_SB_VERSION2_ATTR2BIT) + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT) #define XFS_SB_VERSION2_OKSASHFBITS \ (0) #define XFS_SB_VERSION2_OKREALBITS \ @@ -181,6 +182,9 @@ typedef enum { #define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) #define XFS_SB_UNIT XFS_SB_MVAL(UNIT) #define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) @@ -188,7 +192,7 @@ typedef enum { (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_FEATURES2) + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2) /* @@ -414,6 +418,12 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) */ +static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_HASMOREBITS(sbp) && \ + ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + #define XFS_SB_VERSION_HASATTR2(sbp) xfs_sb_version_hasattr2(sbp) static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index cc2d60951e21..7133fd9ab868 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -427,6 +427,14 @@ undo_blocks: * * Mark the transaction structure to indicate that the superblock * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. */ void xfs_trans_mod_sb( @@ -434,13 +442,19 @@ xfs_trans_mod_sb( uint field, int64_t delta) { + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; switch (field) { case XFS_TRANS_SB_ICOUNT: tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_IFREE: tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FDBLOCKS: /* @@ -453,6 +467,8 @@ xfs_trans_mod_sb( ASSERT(tp->t_blk_res_used <= tp->t_blk_res); } tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FDBLOCKS: /* @@ -462,6 +478,8 @@ xfs_trans_mod_sb( */ ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FREXTENTS: /* @@ -544,18 +562,23 @@ xfs_trans_apply_sb_deltas( (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); - if (tp->t_icount_delta != 0) { - INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); - } - if (tp->t_ifree_delta != 0) { - INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); - } + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta != 0) { + INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); + } + if (tp->t_ifree_delta != 0) { + INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); + } - if (tp->t_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); - } - if (tp->t_res_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); + if (tp->t_fdblocks_delta != 0) { + INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); + } + if (tp->t_res_fdblocks_delta != 0) { + INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); + } } if (tp->t_frextents_delta != 0) { @@ -627,6 +650,7 @@ xfs_trans_unreserve_and_mod_sb( { xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ xfs_mod_sb_t *msbp; + xfs_mount_t *mp = tp->t_mountp; /* REFERENCED */ int error; int rsvd; @@ -659,8 +683,15 @@ xfs_trans_unreserve_and_mod_sb( * The t_res_fdblocks_delta and t_res_frextents_delta fields are * explicitly NOT applied to the in-core superblock. * The idea is that that has already been done. + * + * If we are not logging superblock counters, then the inode + * allocated/free and used block counts are not updated in the + * on disk superblock. In this case, XFS_TRANS_SB_DIRTY will + * not be set when the transaction is updated but we still need + * to update the incore superblock with the changes. */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { if (tp->t_icount_delta != 0) { msbp->msb_field = XFS_SBS_ICOUNT; msbp->msb_delta = tp->t_icount_delta; @@ -676,6 +707,9 @@ xfs_trans_unreserve_and_mod_sb( msbp->msb_delta = tp->t_fdblocks_delta; msbp++; } + } + + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { if (tp->t_frextents_delta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; msbp->msb_delta = tp->t_frextents_delta; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7dfcc450366f..0e26e729023e 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -94,7 +94,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_ZERO 38 #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_TYPE_MAX 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_TYPE_MAX 41 /* new transaction types need to be reflected in xfs_logprint(8) */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 92c1425d06ce..3a647339f40e 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -672,6 +672,7 @@ xfs_mntupdate( } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL); xfs_quiesce_fs(mp); + xfs_log_sbcount(mp, 1); xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); vfsp->vfs_flag |= VFS_RDONLY; @@ -1496,6 +1497,15 @@ xfs_syncsub( xfs_refcache_purge_some(mp); } + /* + * If asked, update the disk superblock with incore counter values if we + * are using non-persistent counters so that they don't get too far out + * of sync if we crash or get a forced shutdown. We don't want to force + * this to disk, just get a transaction into the iclogs.... + */ + if (flags & SYNC_SUPER) + xfs_log_sbcount(mp, 0); + /* * Now check to see if the log needs a "dummy" transaction. */ @@ -1962,6 +1972,7 @@ xfs_freeze( ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); /* Push the superblock and write an unmount record */ + xfs_log_sbcount(mp, 1); xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); xfs_fs_log_dummy(mp); -- cgit v1.2.1 From 210c6f1caa451623e14a7cd71000d2c2e0d9cc43 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 24 May 2007 15:26:51 +1000 Subject: [XFS] Fix the transaction flags to make lazy superblock counters work. SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28653a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7133fd9ab868..2caa0783049b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -533,7 +533,7 @@ xfs_trans_mod_sb( return; } - tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY); + tp->t_flags |= flags; } /* -- cgit v1.2.1 From 4e5ae8386b55677bde05bbd38b8fc82c67ad4564 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 5 Jun 2007 16:24:15 +1000 Subject: [XFS] xfs_bmapi fails to update the previous extent pointer When processing multiple extent maps, xfs_bmapi needs to keep track of the extent behind the one it is currently working on to be able to trim extent ranges correctly. Failing to update the previous pointer can result in corrupted extent lists in memory and this will result in panics or assert failures. Update the previous pointer correctly when we move to the next extent to process. SGI-PV: 965631 SGI-Modid: xfs-linux-melb:xfs-kern:28773a Signed-off-by: David Chinner Signed-off-by: Vlad Apostolov Signed-off-by: Tim Shimmin --- fs/xfs/xfs_bmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b1ea26e40aaf..9a654faf0c08 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5219,10 +5219,10 @@ xfs_bmapi( * Else go on to the next record. */ ep = xfs_iext_get_ext(ifp, ++lastx); - if (lastx >= nextents) { + prev = got; + if (lastx >= nextents) eof = 1; - prev = got; - } else + else xfs_bmbt_get_all(ep, &got); } ifp->if_lastex = lastx; -- cgit v1.2.1 From f4a9f28a909debe97cd3f6ca30e82e5811125bff Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 5 Jun 2007 16:24:27 +1000 Subject: [XFS] Flush the block device before closing it on unmount. SGI-PV: 965630 SGI-Modid: xfs-linux-melb:xfs-kern:28774a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_buf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a3eff54a201c..2df63622354e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1447,6 +1447,7 @@ xfs_free_buftarg( int external) { xfs_flush_buftarg(btp, 1); + xfs_blkdev_issue_flush(btp); if (external) xfs_blkdev_put(btp->bt_bdev); xfs_free_bufhash(btp); -- cgit v1.2.1 From e927af90aaa7d75543edbbd9c2810e6963d0443f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 5 Jun 2007 16:24:36 +1000 Subject: [XFS] Block on unwritten extent conversion during synchronous direct I/O. Currently we do not wait on extent conversion to occur, and hence we can return to userspace from a synchronous direct I/O write without having completed all the actions in the write. Hence a read after the write may see zeroes (unwritten extent) rather than the data that was written. Block the I/O completion by triggering a synchronous workqueue flush to ensure that the conversion has occurred before we return to userspace. SGI-PV: 964092 SGI-Modid: xfs-linux-melb:xfs-kern:28775a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_aops.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7361861e3aac..80a321462315 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -108,14 +108,19 @@ xfs_page_trace( /* * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. */ STATIC void xfs_finish_ioend( - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int wait) { - if (atomic_dec_and_test(&ioend->io_remaining)) + if (atomic_dec_and_test(&ioend->io_remaining)) { queue_work(xfsdatad_workqueue, &ioend->io_work); + if (wait) + flush_workqueue(xfsdatad_workqueue); + } } /* @@ -334,7 +339,7 @@ xfs_end_bio( bio->bi_end_io = NULL; bio_put(bio); - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, 0); return 0; } @@ -470,7 +475,7 @@ xfs_submit_ioend( } if (bio) xfs_submit_ioend_bio(ioend, bio); - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, 0); } while ((ioend = next) != NULL); } @@ -1416,6 +1421,13 @@ xfs_end_io_direct( * This is not necessary for synchronous direct I/O, but we do * it anyway to keep the code uniform and simpler. * + * Well, if only it were that simple. Because synchronous direct I/O + * requires extent conversion to occur *before* we return to userspace, + * we have to wait for extent conversion to complete. Look at the + * iocb that has been passed to us to determine if this is AIO or + * not. If it is synchronous, tell xfs_finish_ioend() to kick the + * workqueue and wait for it to complete. + * * The core direct I/O code might be changed to always call the * completion handler in the future, in which case all this can * go away. @@ -1423,9 +1435,9 @@ xfs_end_io_direct( ioend->io_offset = offset; ioend->io_size = size; if (ioend->io_type == IOMAP_READ) { - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); } else { /* * A direct I/O write ioend starts it's life in unwritten @@ -1434,7 +1446,7 @@ xfs_end_io_direct( * handler. */ INIT_WORK(&ioend->io_work, xfs_end_bio_written); - xfs_finish_ioend(ioend); + xfs_finish_ioend(ioend, 0); } /* -- cgit v1.2.1 From b2826136a1fc3ea451bcbb73a75ca50b3231aa8f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 5 Jun 2007 16:24:44 +1000 Subject: [XFS] Handle null returned from xfs_vtoi() in xfs_setfilesize(). SGI-PV: 965636 SGI-Modid: xfs-linux-melb:xfs-kern:28777a Signed-off-by: David Chinner Signed-off-by: Olaf Weber Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_aops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 80a321462315..c097e4e69768 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -161,6 +161,8 @@ xfs_setfilesize( xfs_fsize_t bsize; ip = xfs_vtoi(ioend->io_vnode); + if (!ip) + return; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); -- cgit v1.2.1 From 45c34141126a89da07197d5b89c04c6847f1171a Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:49:44 +1000 Subject: [XFS] Apply transaction delta counts atomically to incore counters With the per-cpu superblock counters, batch updates are no longer atomic across the entire batch of changes. This is not an issue if each individual change in the batch is applied atomically. Unfortunately, free block count changes are not applied atomically, and they are applied in a manner guaranteed to cause problems. Essentially, the free block count reservation that the transaction took initially is returned to the in core counters before a second delta takes away what is used. because these two operations are not atomic, we can race with another thread that can use the returned transaction reservation before the transaction takes the space away again and we can then get ENOSPC being reported in a spot where we don't have an ENOSPC condition, nor should we ever see one there. Fix it up by rolling the two deltas into the one so it can be applied safely (i.e. atomically) to the incore counters. SGI-PV: 964465 SGI-Modid: xfs-linux-melb:xfs-kern:28796a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_trans.c | 77 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2caa0783049b..356d6627f581 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -638,11 +638,23 @@ xfs_trans_apply_sb_deltas( } /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused - * reservations and apply superblock counter changes to the in-core - * superblock. + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. * * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + * However, we have to ensure that we only modify each superblock field only + * once because the application of the delta values may not be atomic. That can + * lead to ENOSPC races occurring if we have two separate modifcations of the + * free space counter to put back the entire reservation and then take away + * what we used. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. */ STATIC void xfs_trans_unreserve_and_mod_sb( @@ -654,42 +666,43 @@ xfs_trans_unreserve_and_mod_sb( /* REFERENCED */ int error; int rsvd; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; msbp = msb; rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* - * Release any reserved blocks. Any that were allocated - * will be taken back again by fdblocks_delta below. - */ - if (tp->t_blk_res > 0) { + /* calculate free blocks delta */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (blkdelta != 0) { msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = tp->t_blk_res; + msbp->msb_delta = blkdelta; msbp++; } - /* - * Release any reserved real time extents . Any that were - * allocated will be taken back again by frextents_delta below. - */ - if (tp->t_rtx_res > 0) { + /* calculate free realtime extents delta */ + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (rtxdelta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = tp->t_rtx_res; + msbp->msb_delta = rtxdelta; msbp++; } - /* - * Apply any superblock modifications to the in-core version. - * The t_res_fdblocks_delta and t_res_frextents_delta fields are - * explicitly NOT applied to the in-core superblock. - * The idea is that that has already been done. - * - * If we are not logging superblock counters, then the inode - * allocated/free and used block counts are not updated in the - * on disk superblock. In this case, XFS_TRANS_SB_DIRTY will - * not be set when the transaction is updated but we still need - * to update the incore superblock with the changes. - */ + /* apply remaining deltas */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { if (tp->t_icount_delta != 0) { @@ -702,19 +715,9 @@ xfs_trans_unreserve_and_mod_sb( msbp->msb_delta = tp->t_ifree_delta; msbp++; } - if (tp->t_fdblocks_delta != 0) { - msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = tp->t_fdblocks_delta; - msbp++; - } } if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_frextents_delta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = tp->t_frextents_delta; - msbp++; - } if (tp->t_dblocks_delta != 0) { msbp->msb_field = XFS_SBS_DBLOCKS; msbp->msb_delta = tp->t_dblocks_delta; -- cgit v1.2.1 From effd120edb7609069cca9f3d1cb4bfae464b2f85 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:49:58 +1000 Subject: [XFS] Map unwritten extents correctly for I/o completion processing If we have multiple unwritten extents within a single page, we fail to tell the I/o completion construction handlers we need a new handle for the second and subsequent blocks in the page. While we still issue the I/O correctly, we do not have the correct ranges recorded in the ioend structures and hence when we go to convert the unwritten extents we screw it up. Make sure we start a new ioend every time the mapping changes so that we convert the correct ranges on I/O completion. SGI-PV: 964647 SGI-Modid: xfs-linux-melb:xfs-kern:28797a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_aops.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c097e4e69768..fd4105d662e0 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1010,6 +1010,8 @@ xfs_page_state_convert( if (buffer_unwritten(bh) || buffer_delay(bh) || ((buffer_uptodate(bh) || PageUptodate(page)) && !buffer_mapped(bh) && (unmapped || startio))) { + int new_ioend = 0; + /* * Make sure we don't use a read-only iomap */ @@ -1028,6 +1030,15 @@ xfs_page_state_convert( } if (!iomap_valid) { + /* + * if we didn't have a valid mapping then we + * need to ensure that we put the new mapping + * in a new ioend structure. This needs to be + * done to ensure that the ioends correctly + * reflect the block mappings at io completion + * for unwritten extent conversion. + */ + new_ioend = 1; if (type == IOMAP_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); @@ -1047,7 +1058,7 @@ xfs_page_state_convert( if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, - !iomap_valid); + new_ioend); } else { set_buffer_dirty(bh); unlock_buffer(bh); -- cgit v1.2.1 From 0164af51cedf46e1d58fd53854373f544150c597 Mon Sep 17 00:00:00 2001 From: Tim Shimmin Date: Mon, 18 Jun 2007 16:50:08 +1000 Subject: [XFS] Log the agf_length change in xfs_growfs_data_private(). SGI-PV: 963528 SGI-Modid: xfs-linux-melb:xfs-kern:28856a Signed-off-by: Tim Shimmin Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_fsops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 27d01afe8465..ddd45e5b9383 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -332,6 +332,7 @@ xfs_growfs_data_private( be32_add(&agf->agf_length, new); ASSERT(be32_to_cpu(agf->agf_length) == be32_to_cpu(agi->agi_length)); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. */ -- cgit v1.2.1 From 641c56fbfeae85d5ec87fee90a752f7b7224f236 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:50:17 +1000 Subject: [XFS] Prevent deadlock when flushing inodes on unmount When we are unmounting the filesystem, we flush all the inodes to disk. Unfortunately, if we have an inode cluster that has just been freed and marked stale sitting in an incore log buffer (i.e. hasn't been flushed to disk), it will be holding all the flush locks on the inodes in that cluster. xfs_iflush_all() which is called during unmount walks all the inodes trying to reclaim them, and it doing so calls xfs_finish_reclaim() on each inode. If the inode is dirty, if grabs the flush lock and flushes it. Unfortunately, find dirty inodes that already have their flush lock held and so we sleep. At this point in the unmount process, we are running single-threaded. There is nothing more that can push on the log to force the transaction holding the inode flush locks to disk and hence we deadlock. The fix is to issue a log force before flushing the inodes on unmount so that all the flush locks will be released before we start flushing the inodes. SGI-PV: 964538 SGI-Modid: xfs-linux-melb:xfs-kern:28862a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_mount.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f6fe47d8c4dc..39cf6f3267c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1182,6 +1182,17 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) int64_t fsid; #endif + /* + * We can potentially deadlock here if we have an inode cluster + * that has been freed has it's buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. + */ + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); xfs_iflush_all(mp); XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); -- cgit v1.2.1 From 84e1e99f112dead8f9ba036c02d24a9f5ce7f544 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:50:27 +1000 Subject: [XFS] Prevent ENOSPC from aborting transactions that need to succeed During delayed allocation extent conversion or unwritten extent conversion, we need to reserve some blocks for transactions reservations. We need to reserve these blocks in case a btree split occurs and we need to allocate some blocks. Unfortunately, we've only ever reserved the number of data blocks we are allocating, so in both the unwritten and delalloc case we can get ENOSPC to the transaction reservation. This is bad because in both cases we cannot report the failure to the writing application. The fix is two-fold: 1 - leverage the reserved block infrastructure XFS already has to reserve a small pool of blocks by default to allow specially marked transactions to dip into when we are at ENOSPC. Default setting is min(5%, 1024 blocks). 2 - convert critical transaction reservations to be allowed to dip into this pool. Spots changed are delalloc conversion, unwritten extent conversion and growing a filesystem at ENOSPC. This also allows growing the filesytsem to succeed at ENOSPC. SGI-PV: 964468 SGI-Modid: xfs-linux-melb:xfs-kern:28865a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_fsops.c | 10 +++++++--- fs/xfs/xfs_iomap.c | 22 ++++++++-------------- fs/xfs/xfs_mount.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ddd45e5b9383..2251a49f3e17 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -177,6 +177,7 @@ xfs_growfs_data_private( up_write(&mp->m_peraglock); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + tp->t_flags |= XFS_TRANS_RESERVE; if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { xfs_trans_cancel(tp, 0); @@ -499,8 +500,9 @@ xfs_reserve_blocks( unsigned long s; /* If inval is null, report current values and return */ - if (inval == (__uint64_t *)NULL) { + if (!outval) + return EINVAL; outval->resblks = mp->m_resblks; outval->resblks_avail = mp->m_resblks_avail; return 0; @@ -563,8 +565,10 @@ retry: } } out: - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } XFS_SB_UNLOCK(mp, s); if (fdblks_delta) { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3f2b9f2a7b94..ab5062199f55 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -489,13 +489,13 @@ xfs_iomap_write_direct( if (unlikely(rt)) { resrtextents = qblocks = resaligned; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; - } else { - resrtextents = 0; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; - } + quota_flag = XFS_QMOPT_RES_REGBLKS; + } /* * Allocate and setup the transaction @@ -788,18 +788,12 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_reserve(tp, nres, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - if (error == ENOSPC) { - error = xfs_trans_reserve(tp, 0, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - } if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); @@ -917,8 +911,8 @@ xfs_iomap_write_unwritten( * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 39cf6f3267c3..31453ca0f3dd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -725,7 +725,7 @@ xfs_mountfs( bhv_vnode_t *rvp = NULL; int readio_log, writeio_log; xfs_daddr_t d; - __uint64_t ret64; + __uint64_t resblks; __int64_t update_flags; uint quotamount, quotaflags; int agno; @@ -842,6 +842,7 @@ xfs_mountfs( */ if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { + __uint64_t ret64; if (xfs_uuid_mount(mp)) { error = XFS_ERROR(EINVAL); goto error1; @@ -1135,13 +1136,27 @@ xfs_mountfs( goto error4; } - /* * Complete the quota initialisation, post-log-replay component. */ if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags))) goto error4; + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. + */ + resblks = min_t(__uint64_t, mp->m_sb.sb_dblocks / 20, 1024); + xfs_reserve_blocks(mp, &resblks, NULL); + return 0; error4: @@ -1181,6 +1196,7 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) #if defined(DEBUG) || defined(INDUCE_IO_ERROR) int64_t fsid; #endif + __uint64_t resblks; /* * We can potentially deadlock here if we have an inode cluster @@ -1209,6 +1225,23 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_binval(mp->m_rtdev_targp); } + /* + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be aboslutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... + */ + resblks = 0; + xfs_reserve_blocks(mp, &resblks, NULL); + xfs_log_sbcount(mp, 1); xfs_unmountfs_writesb(mp); xfs_unmountfs_wait(mp); /* wait for async bufs */ -- cgit v1.2.1 From 957d0ebed04239b734552c7da3fae9094b6f090c Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:50:37 +1000 Subject: [XFS] Cleanup inode extent size hint extraction SGI-PV: 966004 SGI-Modid: xfs-linux-melb:xfs-kern:28866a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_bmap.c | 23 +++++++---------------- fs/xfs/xfs_iomap.c | 19 ++++--------------- fs/xfs/xfs_rw.h | 22 ++++++++++++++++++++++ fs/xfs/xfs_vnodeops.c | 17 +++++------------ 4 files changed, 38 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 9a654faf0c08..183add2f21b7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2609,8 +2609,7 @@ xfs_bmap_rtalloc( xfs_rtblock_t rtb; mp = ap->ip->i_mount; - align = ap->ip->i_d.di_extsize ? - ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize; + align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, align, 1, ap->eof, 0, @@ -2715,9 +2714,7 @@ xfs_bmap_btalloc( int error; mp = ap->ip->i_mount; - align = (ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ? - ap->ip->i_d.di_extsize : 0; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, align, 0, ap->eof, 0, ap->conv, @@ -2817,9 +2814,9 @@ xfs_bmap_btalloc( args.total = ap->total; args.minlen = ap->minlen; } - if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { - args.prod = ap->ip->i_d.di_extsize; + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= NBPP) { @@ -4868,12 +4865,7 @@ xfs_bmapi( xfs_extlen_t extsz; /* Figure out the extent size, adjust alen */ - if (rt) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, &got, &prev, extsz, @@ -5813,8 +5805,7 @@ xfs_getbmap( ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) return XFS_ERROR(EINVAL); if (whichfork == XFS_DATA_FORK) { - if ((ip->i_d.di_extsize && (ip->i_d.di_flags & - (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) || + if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; fixlen = XFS_MAXIOFFSET(mp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ab5062199f55..bf57b75acb90 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -451,19 +451,14 @@ xfs_iomap_write_direct( return XFS_ERROR(error); rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } + extsz = xfs_get_extsz_hint(ip); isize = ip->i_size; if (io->io_new_size > isize) isize = io->io_new_size; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); if ((offset + count) > isize) { error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, &last_fsb); @@ -666,13 +661,7 @@ xfs_iomap_write_delay( if (error) return XFS_ERROR(error); - if (XFS_IS_REALTIME_INODE(ip)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - + extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); retry: diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index ba74000931ec..fcf28dbded7c 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -77,6 +77,28 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb) #define XFS_FREE_EOF_LOCK (1<<0) #define XFS_FREE_EOF_NOLOCK (1<<1) + +/* + * helper function to extract extent size hint from inode + */ +STATIC_INLINE xfs_extlen_t +xfs_get_extsz_hint( + xfs_inode_t *ip) +{ + xfs_extlen_t extsz; + + if (unlikely(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize + : ip->i_mount->m_sb.sb_rextsize; + ASSERT(extsz); + } else { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize : 0; + } + return extsz; +} + /* * Prototypes for functions in xfs_rw.c. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6420ca8df5ec..8c830a48165a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -183,9 +183,8 @@ xfs_getattr( * realtime extent size or the realtime volume's * extent size. */ - vap->va_blocksize = ip->i_d.di_extsize ? - (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) : - (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog); + vap->va_blocksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; } break; } @@ -4055,22 +4054,16 @@ xfs_alloc_file_space( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; if (len <= 0) return XFS_ERROR(EINVAL); + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + count = len; - error = 0; imapp = &imaps[0]; nimaps = 1; bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); -- cgit v1.2.1 From 516b2e7c2661615ba5d5ad9fb584f068363502d3 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 18 Jun 2007 16:50:48 +1000 Subject: [XFS] Fix remount,readonly path to flush everything correctly. The remount readonly path can fail to writeback properly because we still have active transactions after calling xfs_quiesce_fs(). Further investigation shows that this path is broken in the same ways that the xfs freeze path was broken so fix it the same way. SGI-PV: 964464 SGI-Modid: xfs-linux-melb:xfs-kern:28869a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_vfs.h | 14 ++++++++++++ fs/xfs/xfs_vfsops.c | 53 ++++++++++++++++++++++++++------------------ 3 files changed, 46 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 05f188ed1206..06894cf00b12 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -664,7 +664,7 @@ xfs_fs_sync_super( * occur here so don't bother flushing the buftarg (i.e * SYNC_QUIESCE) because it'll just get dirty again. */ - flags = SYNC_FSDATA | SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT; + flags = SYNC_DATA_QUIESCE; } else flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index cb7b0d62fb96..dca3481aaafa 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -94,6 +94,20 @@ typedef enum { #define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ #define SYNC_SUPER 0x0200 /* flush superblock to disk */ +/* + * When remounting a filesystem read-only or freezing the filesystem, + * we have two phases to execute. This first phase is syncing the data + * before we quiesce the fielsystem, and the second is flushing all the + * inodes out after we've waited for all the transactions created by + * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE + * to ensure that the inodes are written to their location on disk + * rather than just existing in transactions in the log. This means + * after a quiesce there is no log replay required to write the inodes + * to disk (this is the main difference between a sync and a quiesce). + */ +#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT) +#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT) + #define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 3a647339f40e..c343fde10ef9 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -640,7 +640,7 @@ xfs_quiesce_fs( * we can write the unmount record. */ do { - xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, NULL); + xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { delay(50); @@ -651,6 +651,30 @@ xfs_quiesce_fs( return 0; } +/* + * Second stage of a quiesce. The data is already synced, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceding. + */ +STATIC void +xfs_attr_quiesce( + xfs_mount_t *mp) +{ + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + + /* Push the superblock and write an unmount record */ + xfs_log_sbcount(mp, 1); + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + STATIC int xfs_mntupdate( bhv_desc_t *bdp, @@ -670,11 +694,8 @@ xfs_mntupdate( mp->m_flags &= ~XFS_MOUNT_BARRIER; } } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ - bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL); - xfs_quiesce_fs(mp); - xfs_log_sbcount(mp, 1); - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL); + xfs_attr_quiesce(mp); vfsp->vfs_flag |= VFS_RDONLY; } return 0; @@ -1952,9 +1973,9 @@ xfs_showargs( } /* - * Second stage of a freeze. The data is already frozen, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceding. + * Second stage of a freeze. The data is already frozen so we only + * need to take care of themetadata. Once that's done write a dummy + * record to dirty the log in case of a crash while frozen. */ STATIC void xfs_freeze( @@ -1962,19 +1983,7 @@ xfs_freeze( { xfs_mount_t *mp = XFS_BHVTOM(bdp); - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); - - ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); - - /* Push the superblock and write an unmount record */ - xfs_log_sbcount(mp, 1); - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + xfs_attr_quiesce(mp); xfs_fs_log_dummy(mp); } -- cgit v1.2.1 From 39726be2a2e6e61f352852da2c3a807773e33346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jun 2007 17:57:45 +1000 Subject: [XFS] Use do_div() on 64 bit types. SGI-PV: 966145 SGI-Modid: xfs-linux-melb:xfs-kern:28889a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_mount.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 31453ca0f3dd..a66b39805176 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1154,7 +1154,9 @@ xfs_mountfs( * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. */ - resblks = min_t(__uint64_t, mp->m_sb.sb_dblocks / 20, 1024); + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 1024); xfs_reserve_blocks(mp, &resblks, NULL); return 0; -- cgit v1.2.1 From 87ae3c2411cfd280e8289e232b718fae9f63950b Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 28 Jun 2007 16:43:14 +1000 Subject: [XFS] Cancel transactions on xfs_itruncate_start error. SGI-PV: 966502 SGI-Modid: xfs-linux-melb:xfs-kern:28943a Signed-off-by: Jesper Juhl Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_vnodeops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8c830a48165a..5dbca95598e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1263,6 +1263,7 @@ xfs_free_eofblocks( error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, ip->i_size); if (error) { + xfs_trans_cancel(tp, 0); if (use_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1687,6 +1688,7 @@ xfs_inactive( error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); if (error) { + xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return VN_INACTIVE_CACHE; } -- cgit v1.2.1 From 24ad33ff714bd117cab30e71e2ad41e4e1185108 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 28 Jun 2007 16:43:30 +1000 Subject: [XFS] Kill off xfs_count_bits xfs_count_bits is only called once, and is then compared to 0. IOW, what it really wants to know is, is the bitmap empty. This can be done more simply, certainly. SGI-PV: 966503 SGI-Modid: xfs-linux-melb:xfs-kern:28944a Signed-off-by: Eric Sandeen Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_bit.c | 91 +++++---------------------------------------------- fs/xfs/xfs_bit.h | 4 +-- fs/xfs/xfs_buf_item.c | 4 +-- 3 files changed, 13 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 1afe07f67e3b..fab0b6d5a41b 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -65,44 +65,6 @@ static const char xfs_highbit[256] = { }; #endif -/* - * Count of bits set in byte, 0..8. - */ -static const char xfs_countbit[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */ - 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */ - 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */ - 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */ - 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */ - 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */ - 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */ -}; - /* * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. */ @@ -167,56 +129,21 @@ xfs_highbit64( /* - * Count the number of bits set in the bitmap starting with bit - * start_bit. Size is the size of the bitmap in words. - * - * Do the counting by mapping a byte value to the number of set - * bits for that value using the xfs_countbit array, i.e. - * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1, - * xfs_countbit[3] == 2, etc. + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. */ int -xfs_count_bits(uint *map, uint size, uint start_bit) +xfs_bitmap_empty(uint *map, uint size) { - register int bits; - register unsigned char *bytep; - register unsigned char *end_map; - int byte_bit; - - bits = 0; - end_map = (char*)(map + size); - bytep = (char*)(map + (start_bit & ~0x7)); - byte_bit = start_bit & 0x7; - - /* - * If the caller fell off the end of the map, return 0. - */ - if (bytep >= end_map) { - return (0); - } - - /* - * If start_bit is not byte aligned, then process the - * first byte separately. - */ - if (byte_bit != 0) { - /* - * Shift off the bits we don't want to look at, - * before indexing into xfs_countbit. - */ - bits += xfs_countbit[(*bytep >> byte_bit)]; - bytep++; - } + uint i; + uint ret = 0; - /* - * Count the bits in each byte until the end of the bitmap. - */ - while (bytep < end_map) { - bits += xfs_countbit[*bytep]; - bytep++; + for (i = 0; i < size; i++) { + ret |= map[i]; } - return (bits); + return (ret == 0); } /* diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 0bbe56817542..082641a9782c 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -55,8 +55,8 @@ extern int xfs_lowbit64(__uint64_t v); /* Get high bit set out of 64-bit argument, -1 if none set */ extern int xfs_highbit64(__uint64_t); -/* Count set bits in map starting with start_bit */ -extern int xfs_count_bits(uint *map, uint size, uint start_bit); +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); /* Count continuous one bits in map starting with start_bit */ extern int xfs_contig_bits(uint *map, uint size, uint start_bit); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 6c1bddc04e31..b0667cb27d66 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -580,8 +580,8 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it. * Otherwise, if XFS_BLI_HOLD is set clear it. */ - if (xfs_count_bits(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0) == 0) { + if (xfs_bitmap_empty(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size)) { xfs_buf_item_relse(bp); } else if (hold) { bip->bli_flags &= ~XFS_BLI_HOLD; -- cgit v1.2.1 From 54aa8e26e90da882b145fcd33ed752431d6b318b Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 28 Jun 2007 16:43:39 +1000 Subject: [XFS] Simplify XFS min/max macros. SGI-PV: 964547 SGI-Modid: xfs-linux-melb:xfs-kern:28945a Signed-off-by: David Chinner Signed-off-by: Nathan Scott Signed-off-by: Tim Shimmin --- fs/xfs/xfs_btree.h | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 4e27d55a1e73..6e40a0a198ff 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -444,30 +444,14 @@ xfs_btree_setbuf( /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ -#define XFS_EXTLEN_MIN(a,b) \ - ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_EXTLEN_MAX(a,b) \ - ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_AGBLOCK_MIN(a,b) \ - ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_AGBLOCK_MAX(a,b) \ - ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_FILEOFF_MIN(a,b) \ - ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILEOFF_MAX(a,b) \ - ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILBLKS_MIN(a,b) \ - ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) -#define XFS_FILBLKS_MAX(a,b) \ - ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ -- cgit v1.2.1 From bbaaf53808c778bda24f8245a440c5ceacc1a37d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jun 2007 16:43:50 +1000 Subject: [XFS] Reduce shouting by removing unnecessary macros from dir2 code. SGI-PV: 966505 SGI-Modid: xfs-linux-melb:xfs-kern:28947a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_dir2.c | 12 +-- fs/xfs/xfs_dir2_block.c | 98 +++++++++++------------ fs/xfs/xfs_dir2_block.h | 2 - fs/xfs/xfs_dir2_data.c | 54 ++++++------- fs/xfs/xfs_dir2_data.h | 12 +-- fs/xfs/xfs_dir2_leaf.c | 106 ++++++++++++------------- fs/xfs/xfs_dir2_leaf.h | 29 ++----- fs/xfs/xfs_dir2_node.c | 66 ++++++++-------- fs/xfs/xfs_dir2_node.h | 4 +- fs/xfs/xfs_dir2_sf.c | 204 ++++++++++++++++++++++++------------------------ fs/xfs/xfs_dir2_sf.h | 20 +---- 11 files changed, 283 insertions(+), 324 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 8e8e5279334a..29e091914df4 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -55,9 +55,9 @@ xfs_dir_mount( XFS_MAX_BLOCKSIZE); mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / (uint)sizeof(xfs_da_node_entry_t); @@ -554,7 +554,7 @@ xfs_dir2_grow_inode( */ if (mapp != &map) kmem_free(mapp, sizeof(*mapp) * count); - *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); + *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. */ @@ -706,7 +706,7 @@ xfs_dir2_shrink_inode( dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = XFS_DIR2_DB_TO_DA(mp, db); + da = xfs_dir2_db_to_da(mp, db); /* * Unmap the fsblock(s). */ @@ -742,7 +742,7 @@ xfs_dir2_shrink_inode( /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 3accc1dcd6c9..e4df1aaae2a2 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -115,13 +115,13 @@ xfs_dir2_block_addname( xfs_da_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); } - len = XFS_DIR2_DATA_ENTSIZE(args->namelen); + len = xfs_dir2_data_entsize(args->namelen); /* * Set up pointers to parts of the block. */ bf = block->hdr.bestfree; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * No stale entries? Need space for entry and new leaf. */ @@ -396,7 +396,7 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* @@ -411,7 +411,7 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); /* * Clean up the bestfree array and log the header, tail, and entry. @@ -455,7 +455,7 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, uio->uio_offset) > mp->m_dirdatablk) { *eofp = 1; return 0; } @@ -471,15 +471,15 @@ xfs_dir2_block_getdents( * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset); + wantoff = xfs_dir2_dataptr_to_off(mp, uio->uio_offset); block = bp->data; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + endptr = (char *)xfs_dir2_block_leaf_p(btp); p.dbp = dbp; p.put = put; p.uio = uio; @@ -502,7 +502,7 @@ xfs_dir2_block_getdents( /* * Bump pointer for the next iteration. */ - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += xfs_dir2_data_entsize(dep->namelen); /* * The entry is before the desired starting point, skip it. */ @@ -513,7 +513,7 @@ xfs_dir2_block_getdents( */ p.namelen = dep->namelen; - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, ptr - (char *)block); p.ino = be64_to_cpu(dep->inumber); #if XFS_BIG_INUMS @@ -531,7 +531,7 @@ xfs_dir2_block_getdents( */ if (!p.done) { uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (char *)dep - (char *)block); xfs_da_brelse(tp, bp); return error; @@ -545,7 +545,7 @@ xfs_dir2_block_getdents( *eofp = 1; uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0); xfs_da_brelse(tp, bp); @@ -569,8 +569,8 @@ xfs_dir2_block_log_leaf( mp = tp->t_mountp; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), (uint)((char *)&blp[last + 1] - (char *)block - 1)); } @@ -589,7 +589,7 @@ xfs_dir2_block_log_tail( mp = tp->t_mountp; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), (uint)((char *)(btp + 1) - (char *)block - 1)); } @@ -623,13 +623,13 @@ xfs_dir2_block_lookup( mp = dp->i_mount; block = bp->data; xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* * Fill in inode number, release the block. */ @@ -675,8 +675,8 @@ xfs_dir2_block_lookup_int( ASSERT(bp != NULL); block = bp->data; xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. * Find our entry, ENOENT if it's not there. @@ -713,7 +713,7 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); /* * Compare, if it's right give back buffer & entry number. */ @@ -768,20 +768,20 @@ xfs_dir2_block_removename( tp = args->trans; mp = dp->i_mount; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; xfs_dir2_data_make_free(tp, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ @@ -843,13 +843,13 @@ xfs_dir2_block_replace( dp = args->dp; mp = dp->i_mount; block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. @@ -912,7 +912,7 @@ xfs_dir2_leaf_to_block( mp = dp->i_mount; leaf = lbp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have @@ -920,7 +920,7 @@ xfs_dir2_leaf_to_block( * These will show up in the leaf bests table. */ while (dp->i_d.di_size > mp->m_dirblksize) { - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == mp->m_dirblksize - (uint)sizeof(block->hdr)) { if ((error = @@ -974,14 +974,14 @@ xfs_dir2_leaf_to_block( /* * Initialize the block tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ - lep = XFS_DIR2_BLOCK_LEAF_P(btp); + lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) continue; @@ -1067,7 +1067,7 @@ xfs_dir2_sf_to_block( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Copy the directory into the stack buffer. * Then pitch the incore inode data so we can make extents. @@ -1119,10 +1119,10 @@ xfs_dir2_sf_to_block( /* * Fill in the tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ btp->stale = 0; - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + blp = xfs_dir2_block_leaf_p(btp); endoffset = (uint)((char *)blp - (char *)block); /* * Remove the freespace, we'll manage it. @@ -1138,25 +1138,25 @@ xfs_dir2_sf_to_block( dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); /* * Create entry for .. */ dep = (xfs_dir2_data_entry_t *) ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); offset = XFS_DIR2_DATA_FIRST_OFFSET; /* @@ -1165,7 +1165,7 @@ xfs_dir2_sf_to_block( if ((i = 0) == sfp->hdr.count) sfep = NULL; else - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. @@ -1177,7 +1177,7 @@ xfs_dir2_sf_to_block( if (sfep == NULL) newoffset = endoffset; else - newoffset = XFS_DIR2_SF_GET_OFFSET(sfep); + newoffset = xfs_dir2_sf_get_offset(sfep); /* * There should be a hole here, make one. */ @@ -1186,7 +1186,7 @@ xfs_dir2_sf_to_block( ((char *)block + offset); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16( + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( ((char *)dup - (char *)block)); xfs_dir2_data_log_unused(tp, bp, dup); (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, @@ -1198,22 +1198,22 @@ xfs_dir2_sf_to_block( * Copy a real entry. */ dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep))); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep))); dep->namelen = sfep->namelen; memcpy(dep->name, sfep->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)block); xfs_dir2_data_log_entry(tp, bp, dep); blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( (char *)sfep->name, sfep->namelen)); - blp[2 + i].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)block)); offset = (int)((char *)(tagp + 1) - (char *)block); if (++i == sfp->hdr.count) sfep = NULL; else - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ kmem_free(buf, buf_len); diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h index 6722effd0b20..e7c2606161e9 100644 --- a/fs/xfs/xfs_dir2_block.h +++ b/fs/xfs/xfs_dir2_block.h @@ -60,7 +60,6 @@ typedef struct xfs_dir2_block { /* * Pointer to the leaf header embedded in a data block (1-block format) */ -#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block) static inline xfs_dir2_block_tail_t * xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) { @@ -71,7 +70,6 @@ xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) /* * Pointer to the leaf entries embedded in a data block (1-block format) */ -#define XFS_DIR2_BLOCK_LEAF_P(btp) xfs_dir2_block_leaf_p(btp) static inline struct xfs_dir2_leaf_entry * xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) { diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index c211c37ef67c..7ebe295bd6d3 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -72,8 +72,8 @@ xfs_dir2_data_check( bf = d->hdr.bestfree; p = (char *)d->u; if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - lep = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; } else endp = (char *)d + mp->m_dirblksize; @@ -107,7 +107,7 @@ xfs_dir2_data_check( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup)) == + ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)d); dfp = xfs_dir2_data_freefind(d, dup); if (dfp) { @@ -131,12 +131,12 @@ xfs_dir2_data_check( dep = (xfs_dir2_data_entry_t *)p; ASSERT(dep->namelen != 0); ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) == + ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == (char *)dep - (char *)d); count++; lastfree = 0; if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)d)); hash = xfs_da_hashname((char *)dep->name, dep->namelen); @@ -147,7 +147,7 @@ xfs_dir2_data_check( } ASSERT(i < be32_to_cpu(btp->count)); } - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. @@ -346,8 +346,8 @@ xfs_dir2_data_freescan( */ p = (char *)d->u; if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endp = (char *)xfs_dir2_block_leaf_p(btp); } else endp = (char *)d + mp->m_dirblksize; /* @@ -360,7 +360,7 @@ xfs_dir2_data_freescan( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT((char *)dup - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); xfs_dir2_data_freeinsert(d, dup, loghead); p += be16_to_cpu(dup->length); } @@ -370,8 +370,8 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep))); - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); + p += xfs_dir2_data_entsize(dep->namelen); } } } @@ -402,7 +402,7 @@ xfs_dir2_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp, + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); if (error) { return error; @@ -427,7 +427,7 @@ xfs_dir2_data_init( t=mp->m_dirblksize - (uint)sizeof(d->hdr); d->hdr.bestfree[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16((char *)dup - (char *)d); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d); /* * Log it and return it. */ @@ -452,7 +452,7 @@ xfs_dir2_data_log_entry( ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), - (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) - + (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - (char *)d - 1)); } @@ -497,8 +497,8 @@ xfs_dir2_data_log_unused( * Log the end (tag) of the unused entry. */ xfs_da_log_buf(tp, bp, - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d), - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d + + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d + sizeof(xfs_dir2_data_off_t) - 1)); } @@ -535,8 +535,8 @@ xfs_dir2_data_make_free( xfs_dir2_block_tail_t *btp; /* block tail */ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to @@ -587,7 +587,7 @@ xfs_dir2_data_make_free( * Fix up the new big freespace. */ be16_add(&prevdup->length, len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = + *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, prevdup); if (!needscan) { @@ -621,7 +621,7 @@ xfs_dir2_data_make_free( else if (prevdup) { dfp = xfs_dir2_data_freefind(d, prevdup); be16_add(&prevdup->length, len); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = + *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, prevdup); /* @@ -649,7 +649,7 @@ xfs_dir2_data_make_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -676,7 +676,7 @@ xfs_dir2_data_make_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); @@ -712,7 +712,7 @@ xfs_dir2_data_use_free( ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)d); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); - ASSERT((char *)dup - (char *)d == be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); + ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); /* * Look up the entry in the bestfree table. */ @@ -745,7 +745,7 @@ xfs_dir2_data_use_free( newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(oldlen - len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -772,7 +772,7 @@ xfs_dir2_data_use_free( else if (matchback) { newdup = dup; newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); /* @@ -799,13 +799,13 @@ xfs_dir2_data_use_free( else { newdup = dup; newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = + *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup); newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup2) = + *xfs_dir2_data_unused_tag_p(newdup2) = cpu_to_be16((char *)newdup2 - (char *)d); xfs_dir2_data_log_unused(tp, bp, newdup2); /* diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h index c94c9099cfb1..b816e0252739 100644 --- a/fs/xfs/xfs_dir2_data.h +++ b/fs/xfs/xfs_dir2_data.h @@ -44,7 +44,7 @@ struct xfs_trans; #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_DATA_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* * Offsets of . and .. in data space (always block 0) @@ -52,9 +52,9 @@ struct xfs_trans; #define XFS_DIR2_DATA_DOT_OFFSET \ ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) #define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1)) + (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) #define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2)) + (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) /* * Structures. @@ -123,7 +123,6 @@ typedef struct xfs_dir2_data { /* * Size of a data entry. */ -#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n) static inline int xfs_dir2_data_entsize(int n) { return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ @@ -133,19 +132,16 @@ static inline int xfs_dir2_data_entsize(int n) /* * Pointer to an entry's tag word. */ -#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep) static inline __be16 * xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) { return (__be16 *)((char *)dep + - XFS_DIR2_DATA_ENTSIZE(dep->namelen) - sizeof(__be16)); + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); } /* * Pointer to a freespace's tag word. */ -#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \ - xfs_dir2_data_unused_tag_p(dup) static inline __be16 * xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index db14ea71459f..1b73c9ad646a 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -92,7 +92,7 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = XFS_DIR2_DA_TO_DB(mp, blkno); + ldb = xfs_dir2_da_to_db(mp, blkno); ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); /* * Initialize the leaf block, get a buffer for it. @@ -104,8 +104,8 @@ xfs_dir2_block_to_leaf( leaf = lbp->data; block = dbp->data; xfs_dir2_data_check(dp, dbp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Set the counts in the leaf header. */ @@ -137,9 +137,9 @@ xfs_dir2_block_to_leaf( /* * Set up leaf tail and bests table. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + bestsp = xfs_dir2_leaf_bests_p(ltp); bestsp[0] = block->hdr.bestfree[0].length; /* * Log the data header and leaf bests table. @@ -209,9 +209,9 @@ xfs_dir2_leaf_addname( */ index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = xfs_dir2_data_entsize(args->namelen); /* * See if there are any entries with the same hash value * and space in their block for the new entry. @@ -223,7 +223,7 @@ xfs_dir2_leaf_addname( index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); if (be16_to_cpu(bestsp[i]) >= length) { @@ -378,7 +378,7 @@ xfs_dir2_leaf_addname( */ else { if ((error = - xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block), + xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp, XFS_DATA_FORK))) { xfs_da_brelse(tp, lbp); return error; @@ -407,7 +407,7 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)data); /* * Need to scan fix up the bestfree table. @@ -529,7 +529,7 @@ xfs_dir2_leaf_addname( * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. @@ -567,13 +567,13 @@ xfs_dir2_leaf_check( * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Leaves and bests don't overlap. */ ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)XFS_DIR2_LEAF_BESTS_P(ltp)); + (char *)xfs_dir2_leaf_bests_p(ltp)); /* * Check hash value order, count stale entries. */ @@ -815,12 +815,12 @@ xfs_dir2_leaf_getdents( * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset); + curoff = xfs_dir2_dataptr_to_byte(mp, uio->uio_offset); /* * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff)); + map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. @@ -870,7 +870,7 @@ xfs_dir2_leaf_getdents( */ if (1 + ra_want > map_blocks && map_off < - XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) { + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { /* * Get more bmaps, fill in after the ones * we already have in the table. @@ -878,7 +878,7 @@ xfs_dir2_leaf_getdents( nmap = map_size - map_valid; error = xfs_bmapi(tp, dp, map_off, - XFS_DIR2_BYTE_TO_DA(mp, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, XFS_BMAPI_METADATA, NULL, 0, &map[map_valid], &nmap, NULL, NULL); @@ -903,7 +903,7 @@ xfs_dir2_leaf_getdents( map[map_valid + nmap - 1].br_blockcount; else map_off = - XFS_DIR2_BYTE_TO_DA(mp, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET); /* * Look for holes in the mapping, and @@ -931,14 +931,14 @@ xfs_dir2_leaf_getdents( * No valid mappings, so no more data blocks. */ if (!map_valid) { - curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off); + curoff = xfs_dir2_da_to_byte(mp, map_off); break; } /* * Read the directory block starting at the first * mapping. */ - curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff); + curdb = xfs_dir2_da_to_db(mp, map->br_startoff); error = xfs_da_read_buf(tp, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : @@ -1014,7 +1014,7 @@ xfs_dir2_leaf_getdents( /* * Having done a read, we need to set a new offset. */ - newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0); + newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); /* * Start of the current block. */ @@ -1024,7 +1024,7 @@ xfs_dir2_leaf_getdents( * Make sure we're in the right block. */ else if (curoff > newoff) - ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) == + ASSERT(xfs_dir2_byte_to_db(mp, curoff) == curdb); data = bp->data; xfs_dir2_data_check(dp, bp); @@ -1032,7 +1032,7 @@ xfs_dir2_leaf_getdents( * Find our position in the block. */ ptr = (char *)&data->u; - byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff); + byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ @@ -1054,15 +1054,15 @@ xfs_dir2_leaf_getdents( } dep = (xfs_dir2_data_entry_t *)ptr; length = - XFS_DIR2_DATA_ENTSIZE(dep->namelen); + xfs_dir2_data_entsize(dep->namelen); ptr += length; } /* * Now set our real offset. */ curoff = - XFS_DIR2_DB_OFF_TO_BYTE(mp, - XFS_DIR2_BYTE_TO_DB(mp, curoff), + xfs_dir2_db_off_to_byte(mp, + xfs_dir2_byte_to_db(mp, curoff), (char *)ptr - (char *)data); if (ptr >= (char *)data + mp->m_dirblksize) { continue; @@ -1091,9 +1091,9 @@ xfs_dir2_leaf_getdents( p->namelen = dep->namelen; - length = XFS_DIR2_DATA_ENTSIZE(p->namelen); + length = xfs_dir2_data_entsize(p->namelen); - p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length); + p->cook = xfs_dir2_byte_to_dataptr(mp, curoff + length); p->ino = be64_to_cpu(dep->inumber); #if XFS_BIG_INUMS @@ -1121,10 +1121,10 @@ xfs_dir2_leaf_getdents( * All done. Set output offset value to current offset. */ *eofp = eof; - if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR)) + if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) uio->uio_offset = XFS_DIR2_MAX_DATAPTR; else - uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff); + uio->uio_offset = xfs_dir2_byte_to_dataptr(mp, curoff); kmem_free(map, map_size * sizeof(*map)); kmem_free(p, sizeof(*p)); if (bp) @@ -1159,7 +1159,7 @@ xfs_dir2_leaf_init( /* * Get the buffer for the block. */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp, + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, XFS_DATA_FORK); if (error) { return error; @@ -1181,7 +1181,7 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); } @@ -1206,9 +1206,9 @@ xfs_dir2_leaf_log_bests( leaf = bp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf); - firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first; - lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last; + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1268,7 +1268,7 @@ xfs_dir2_leaf_log_tail( mp = tp->t_mountp; leaf = bp->data; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); } @@ -1312,7 +1312,7 @@ xfs_dir2_leaf_lookup( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* * Return the found inode number. */ @@ -1381,7 +1381,7 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. @@ -1391,7 +1391,7 @@ xfs_dir2_leaf_lookup_int( xfs_da_brelse(tp, dbp); if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp, + xfs_dir2_db_to_da(mp, newdb), -1, &dbp, XFS_DATA_FORK))) { xfs_da_brelse(tp, lbp); return error; @@ -1404,7 +1404,7 @@ xfs_dir2_leaf_lookup_int( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * If it matches then return it. */ @@ -1469,20 +1469,20 @@ xfs_dir2_leaf_removename( * Point to the leaf entry, use that to point to the data entry. */ lep = &leaf->ents[index]; - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) - ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(data->hdr.bestfree[0].length); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ xfs_dir2_data_make_free(tp, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ @@ -1602,7 +1602,7 @@ xfs_dir2_leaf_replace( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. @@ -1698,7 +1698,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp, + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, XFS_DATA_FORK))) { return error; } @@ -1712,7 +1712,7 @@ xfs_dir2_leaf_trim_data( */ leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == mp->m_dirblksize - (uint)sizeof(data->hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); @@ -1727,7 +1727,7 @@ xfs_dir2_leaf_trim_data( /* * Eliminate the last bests entry from the table. */ - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); xfs_dir2_leaf_log_tail(tp, lbp); @@ -1838,12 +1838,12 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf tail from the freespace block. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = free->hdr.nvalid; /* * Set up the leaf bests table. */ - memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests, + memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); xfs_dir2_leaf_log_tail(tp, lbp); diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h index f57ca1162412..70c97f3f815e 100644 --- a/fs/xfs/xfs_dir2_leaf.h +++ b/fs/xfs/xfs_dir2_leaf.h @@ -32,7 +32,7 @@ struct xfs_trans; #define XFS_DIR2_LEAF_SPACE 1 #define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_LEAF_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) /* * Offset in data space of a data entry. @@ -82,7 +82,6 @@ typedef struct xfs_dir2_leaf { * DB blocks here are logical directory block numbers, not filesystem blocks. */ -#define XFS_DIR2_MAX_LEAF_ENTS(mp) xfs_dir2_max_leaf_ents(mp) static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) { return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / @@ -92,7 +91,6 @@ static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) /* * Get address of the bestcount field in the single-leaf block. */ -#define XFS_DIR2_LEAF_TAIL_P(mp,lp) xfs_dir2_leaf_tail_p(mp, lp) static inline xfs_dir2_leaf_tail_t * xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) { @@ -104,7 +102,6 @@ xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) /* * Get address of the bests array in the single-leaf block. */ -#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp) static inline __be16 * xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) { @@ -114,7 +111,6 @@ xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) /* * Convert dataptr to byte in file space */ -#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp) static inline xfs_dir2_off_t xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { @@ -124,7 +120,6 @@ xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) /* * Convert byte in file space to dataptr. It had better be aligned. */ -#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by) static inline xfs_dir2_dataptr_t xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -134,7 +129,6 @@ xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert byte in space to (DB) block */ -#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by) static inline xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -145,17 +139,15 @@ xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert dataptr to a block number */ -#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp) static inline xfs_dir2_db_t xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { - return XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); + return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); } /* * Convert byte in space to offset in a block */ -#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by) static inline xfs_dir2_data_aoff_t xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) { @@ -166,18 +158,15 @@ xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) /* * Convert dataptr to a byte offset in a block */ -#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp) static inline xfs_dir2_data_aoff_t xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) { - return XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); + return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); } /* * Convert block and offset to byte in space */ -#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \ - xfs_dir2_db_off_to_byte(mp, db, o) static inline xfs_dir2_off_t xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, xfs_dir2_data_aoff_t o) @@ -189,7 +178,6 @@ xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, /* * Convert block (DB) to block (dablk) */ -#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db) static inline xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) { @@ -199,29 +187,25 @@ xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) /* * Convert byte in space to (DA) block */ -#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by) static inline xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) { - return XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by)); + return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); } /* * Convert block and offset to dataptr */ -#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \ - xfs_dir2_db_off_to_dataptr(mp, db, o) static inline xfs_dir2_dataptr_t xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, xfs_dir2_data_aoff_t o) { - return XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o)); + return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); } /* * Convert block (dablk) to block (DB) */ -#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da) static inline xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) { @@ -231,11 +215,10 @@ xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) /* * Convert block (dablk) to byte offset in space */ -#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da) static inline xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) { - return XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0); + return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); } /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index d083c3819934..91c61d9632c8 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -136,14 +136,14 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp, + if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, XFS_DATA_FORK))) { return error; } ASSERT(fbp != NULL); free = fbp->data; leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Initialize the freespace block header. */ @@ -155,7 +155,7 @@ xfs_dir2_leaf_to_node( * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests; + for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; @@ -215,7 +215,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == XFS_DIR2_MAX_LEAF_ENTS(mp)) { + if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { if (!leaf->hdr.stale) return XFS_ERROR(ENOSPC); compact = be16_to_cpu(leaf->hdr.stale) > 1; @@ -327,7 +327,7 @@ xfs_dir2_leafn_add( * Insert the new entry, log everything. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); xfs_dir2_leaf_log_header(tp, bp); xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); @@ -352,7 +352,7 @@ xfs_dir2_leafn_check( leaf = bp->data; mp = dp->i_mount; ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { if (i + 1 < be16_to_cpu(leaf->hdr.count)) { ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= @@ -440,7 +440,7 @@ xfs_dir2_leafn_lookup_int( if (args->addname) { curfdb = curbp ? state->extrablk.blkno : -1; curdb = -1; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + length = xfs_dir2_data_entsize(args->namelen); if ((free = (curbp ? curbp->data : NULL))) ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); } @@ -465,7 +465,7 @@ xfs_dir2_leafn_lookup_int( /* * Pull the data block number from the entry. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal @@ -482,7 +482,7 @@ xfs_dir2_leafn_lookup_int( * Convert the data block to the free block * holding its freespace information. */ - newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb); + newfdb = xfs_dir2_db_to_fdb(mp, newdb); /* * If it's not the one we have in hand, * read it in. @@ -497,7 +497,7 @@ xfs_dir2_leafn_lookup_int( * Read the free block. */ if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, + xfs_dir2_db_to_da(mp, newfdb), -1, &curbp, XFS_DATA_FORK))) { @@ -517,7 +517,7 @@ xfs_dir2_leafn_lookup_int( /* * Get the index for our entry. */ - fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb); + fi = xfs_dir2_db_to_fdindex(mp, curdb); /* * If it has room, return it. */ @@ -561,7 +561,7 @@ xfs_dir2_leafn_lookup_int( */ if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, + xfs_dir2_db_to_da(mp, newdb), -1, &curbp, XFS_DATA_FORK))) { return error; } @@ -573,7 +573,7 @@ xfs_dir2_leafn_lookup_int( */ dep = (xfs_dir2_data_entry_t *) ((char *)curbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare the entry, return it if it matches. */ @@ -876,9 +876,9 @@ xfs_dir2_leafn_remove( /* * Extract the data block and offset from the entry. */ - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* * Kill the leaf entry by marking it stale. @@ -898,7 +898,7 @@ xfs_dir2_leafn_remove( longest = be16_to_cpu(data->hdr.bestfree[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. @@ -924,8 +924,8 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = XFS_DIR2_DB_TO_FDB(mp, db); - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), + fdb = xfs_dir2_db_to_fdb(mp, db); + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, XFS_DATA_FORK))) { return error; } @@ -937,7 +937,7 @@ xfs_dir2_leafn_remove( /* * Calculate which entry we need to fix. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, db); + findex = xfs_dir2_db_to_fdindex(mp, db); longest = be16_to_cpu(data->hdr.bestfree[0].length); /* * If the data block is now empty we can get rid of it @@ -1073,7 +1073,7 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno), + error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), &newblk->bp, XFS_DIR2_LEAFN_MAGIC); if (error) { return error; @@ -1385,7 +1385,7 @@ xfs_dir2_node_addname_int( dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + length = xfs_dir2_data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1438,7 +1438,7 @@ xfs_dir2_node_addname_int( if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1474,7 +1474,7 @@ xfs_dir2_node_addname_int( * to avoid it. */ if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, XFS_DATA_FORK))) { return error; } @@ -1550,9 +1550,9 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = XFS_DIR2_DB_TO_FDB(mp, dbno); + fbno = xfs_dir2_db_to_fdb(mp, dbno); if (unlikely(error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, XFS_DATA_FORK))) { xfs_da_buf_done(dbp); return error; @@ -1567,14 +1567,14 @@ xfs_dir2_node_addname_int( return error; } - if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) { + if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { cmn_err(CE_ALERT, "xfs_dir2_node_addname_int: dir ino " "%llu needed freesp block %lld for\n" " data block %lld, got %lld\n" " ifbno %llu lastfbno %d\n", (unsigned long long)dp->i_ino, - (long long)XFS_DIR2_DB_TO_FDB(mp, dbno), + (long long)xfs_dir2_db_to_fdb(mp, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { @@ -1598,7 +1598,7 @@ xfs_dir2_node_addname_int( * Get a buffer for the new block. */ if ((error = xfs_da_get_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), + xfs_dir2_db_to_da(mp, fbno), -1, &fbp, XFS_DATA_FORK))) { return error; } @@ -1623,7 +1623,7 @@ xfs_dir2_node_addname_int( /* * Set the freespace block index from the data block number. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno); + findex = xfs_dir2_db_to_fdindex(mp, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. @@ -1669,7 +1669,7 @@ xfs_dir2_node_addname_int( * Read the data block in. */ if (unlikely( - error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno), + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), -1, &dbp, XFS_DATA_FORK))) { if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) xfs_da_buf_done(fbp); @@ -1698,7 +1698,7 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + tagp = xfs_dir2_data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)data); xfs_dir2_data_log_entry(tp, dbp, dep); /* @@ -1904,7 +1904,7 @@ xfs_dir2_node_replace( ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); dep = (xfs_dir2_data_entry_t *) ((char *)data + - XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. @@ -1980,7 +1980,7 @@ xfs_dir2_node_trim_free( * Blow the block away. */ if ((error = - xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo), + xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), bp))) { /* * Can't fail with ENOSPC since that only happens with no diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h index c7c870ee7857..dde72db3d695 100644 --- a/fs/xfs/xfs_dir2_node.h +++ b/fs/xfs/xfs_dir2_node.h @@ -36,7 +36,7 @@ struct xfs_trans; #define XFS_DIR2_FREE_SPACE 2 #define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) #define XFS_DIR2_FREE_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET) + xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) #define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ @@ -60,7 +60,6 @@ typedef struct xfs_dir2_free { /* * Convert data space db to the corresponding free db. */ -#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db) static inline xfs_dir2_db_t xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) { @@ -70,7 +69,6 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) /* * Convert data space db to the corresponding index in a free db. */ -#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db) static inline int xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) { diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 0cd77b17bf92..38fc4f22b76d 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -89,8 +89,8 @@ xfs_dir2_block_sfsize( mp = dp->i_mount; count = i8count = namelen = 0; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); /* * Iterate over the block's data entries by using the leaf pointers. @@ -102,7 +102,7 @@ xfs_dir2_block_sfsize( * Calculate the pointer to the entry at hand. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -124,7 +124,7 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ count + /* namelen */ count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ namelen + /* name */ @@ -139,7 +139,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); + xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); return size; } @@ -199,15 +199,15 @@ xfs_dir2_block_to_sf( * Copy the header into the newly allocate local space. */ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count)); + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); dp->i_d.di_size = size; /* * Set up to loop over the block's entries. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + btp = xfs_dir2_block_tail_p(mp, block); ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Loop over the active and unused entries. * Stop when we reach the leaf/tail portion of the block. @@ -233,22 +233,22 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); /* * Normal entry, copy it into shortform. */ else { sfep->namelen = dep->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, + xfs_dir2_sf_put_offset(sfep, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)block)); memcpy(sfep->name, dep->name, dep->namelen); temp = be64_to_cpu(dep->inumber); - XFS_DIR2_SF_PUT_INUMBER(sfp, &temp, - XFS_DIR2_SF_INUMBERP(sfep)); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + xfs_dir2_sf_put_inumber(sfp, &temp, + xfs_dir2_sf_inumberp(sfep)); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += xfs_dir2_data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); xfs_dir2_sf_check(args); @@ -294,11 +294,11 @@ xfs_dir2_sf_addname( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Compute entry (and change in) size. */ - add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); incr_isize = add_entsize; objchange = 0; #if XFS_BIG_INUMS @@ -392,7 +392,7 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen), + xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. @@ -403,10 +403,10 @@ xfs_dir2_sf_addname_easy( * Fill in the new entry. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); /* * Update the header and inode. */ @@ -463,14 +463,14 @@ xfs_dir2_sf_addname_hard( * If it's going to end up at the end then oldsfep will point there. */ for (offset = XFS_DIR2_DATA_FIRST_OFFSET, - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp), - add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen), + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = xfs_dir2_data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep), + offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { - new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep); + new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) break; } @@ -495,10 +495,10 @@ xfs_dir2_sf_addname_hard( * Fill in the new entry, and update the header counts. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); sfp->hdr.count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) @@ -508,7 +508,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. */ if (!eof) { - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf, old_isize); @@ -544,9 +544,9 @@ xfs_dir2_sf_addname_pick( mp = dp->i_mount; sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - size = XFS_DIR2_DATA_ENTSIZE(args->namelen); + size = xfs_dir2_data_entsize(args->namelen); offset = XFS_DIR2_DATA_FIRST_OFFSET; - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* * Loop over sf entries. @@ -555,10 +555,10 @@ xfs_dir2_sf_addname_pick( */ for (i = 0; i < sfp->hdr.count; i++) { if (!holefit) - holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep); - offset = XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -617,18 +617,18 @@ xfs_dir2_sf_check( sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; offset = XFS_DIR2_DATA_FIRST_OFFSET; - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset); - ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); + xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); } ASSERT(i8count == sfp->hdr.i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); @@ -671,7 +671,7 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_flags & XFS_IFINLINE); ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; - size = XFS_DIR2_SF_HDR_SIZE(i8count); + size = xfs_dir2_sf_hdr_size(i8count); /* * Make a buffer for the data. */ @@ -684,7 +684,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent); sfp->hdr.count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -727,12 +727,12 @@ xfs_dir2_sf_getdents( sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * If the block number in the offset is out of range, we're done. */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, dir_offset) > mp->m_dirdatablk) { *eofp = 1; return 0; } @@ -747,9 +747,9 @@ xfs_dir2_sf_getdents( * Put . entry unless we're starting past it. */ if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, XFS_DIR2_DATA_DOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0, + p.cook = xfs_dir2_db_off_to_dataptr(mp, 0, XFS_DIR2_DATA_DOTDOT_OFFSET); p.ino = dp->i_ino; #if XFS_BIG_INUMS @@ -762,7 +762,7 @@ xfs_dir2_sf_getdents( if (!p.done) { uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, XFS_DIR2_DATA_DOT_OFFSET); return error; } @@ -772,11 +772,11 @@ xfs_dir2_sf_getdents( * Put .. entry unless we're starting past it. */ if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, XFS_DIR2_DATA_DOTDOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, XFS_DIR2_DATA_FIRST_OFFSET); - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + p.ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); #if XFS_BIG_INUMS p.ino += mp->m_inoadd; #endif @@ -787,7 +787,7 @@ xfs_dir2_sf_getdents( if (!p.done) { uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, XFS_DIR2_DATA_DOTDOT_OFFSET); return error; } @@ -796,23 +796,23 @@ xfs_dir2_sf_getdents( /* * Loop while there are more entries and put'ing works. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { - off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep)); + off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep)); if (dir_offset > off) continue; p.namelen = sfep->namelen; - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(p.namelen)); + p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(p.namelen)); - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + p.ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); #if XFS_BIG_INUMS p.ino += mp->m_inoadd; #endif @@ -832,7 +832,7 @@ xfs_dir2_sf_getdents( *eofp = 1; uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); + xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0); return 0; } @@ -865,7 +865,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Special case for . */ @@ -878,21 +878,21 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); return XFS_ERROR(EEXIST); } /* * Loop over all the entries trying to match ours. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { if (sfep->namelen == args->namelen && sfep->name[0] == args->name[0] && memcmp(args->name, sfep->name, args->namelen) == 0) { args->inumber = - XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); return XFS_ERROR(EEXIST); } } @@ -934,19 +934,19 @@ xfs_dir2_sf_removename( ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); /* * Loop over the old directory entries. * Find the one we're deleting. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { if (sfep->namelen == args->namelen && sfep->name[0] == args->name[0] && memcmp(sfep->name, args->name, args->namelen) == 0) { - ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)) == + ASSERT(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)) == args->inumber); break; } @@ -961,7 +961,7 @@ xfs_dir2_sf_removename( * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -1027,7 +1027,7 @@ xfs_dir2_sf_replace( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); #if XFS_BIG_INUMS /* * New inode number is large, and need to convert to 8-byte inodes. @@ -1067,28 +1067,28 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent); } /* * Normal entry, look for the name. */ else { - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { if (sfep->namelen == args->namelen && sfep->name[0] == args->name[0] && memcmp(args->name, sfep->name, args->namelen) == 0) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); break; } } @@ -1189,22 +1189,22 @@ xfs_dir2_sf_toino4( */ sfp->hdr.count = oldsfp->hdr.count; sfp->hdr.i8count = 0; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); } /* * Clean up the inode. @@ -1266,22 +1266,22 @@ xfs_dir2_sf_toino8( */ sfp->hdr.count = oldsfp->hdr.count; sfp->hdr.i8count = 1; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); } /* * Clean up the inode. diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h index 42f015b70018..11e503209afa 100644 --- a/fs/xfs/xfs_dir2_sf.h +++ b/fs/xfs/xfs_dir2_sf.h @@ -90,7 +90,6 @@ typedef struct xfs_dir2_sf { xfs_dir2_sf_entry_t list[1]; /* shortform entries */ } xfs_dir2_sf_t; -#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count) static inline int xfs_dir2_sf_hdr_size(int i8count) { return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ @@ -98,14 +97,11 @@ static inline int xfs_dir2_sf_hdr_size(int i8count) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep) static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) { return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; } -#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \ - xfs_dir2_sf_get_inumber(sfp, from) static inline xfs_intino_t xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) { @@ -114,8 +110,6 @@ xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); } -#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \ - xfs_dir2_sf_put_inumber(sfp,from,to) static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to) { @@ -125,24 +119,18 @@ static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, XFS_PUT_DIR_INO8(*(from), (to)->i8); } -#define XFS_DIR2_SF_GET_OFFSET(sfep) \ - xfs_dir2_sf_get_offset(sfep) static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); } -#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \ - xfs_dir2_sf_put_offset(sfep,off) static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); } -#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \ - xfs_dir2_sf_entsize_byname(sfp,len) static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) { return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ @@ -150,8 +138,6 @@ static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \ - xfs_dir2_sf_entsize_byentry(sfp,sfep) static inline int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) { @@ -160,19 +146,17 @@ xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); } -#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp) static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) { return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count))); + ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count))); } -#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep) static inline xfs_dir2_sf_entry_t * xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) { return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep))); + ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep))); } /* -- cgit v1.2.1 From 16a087d8e1af9b974125870dceb9e4a35249ad1d Mon Sep 17 00:00:00 2001 From: Vignesh Babu Date: Thu, 28 Jun 2007 16:46:37 +1000 Subject: [XFS] Use is_power_of_2 instead of open coding checks SGI-PV: 966576 SGI-Modid: xfs-linux-melb:xfs-kern:28950a Signed-off-by: Vignesh Babu Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3ca5d43b8345..8fdd30d9ba56 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@ #include "xfs_quota.h" #include "xfs_acl.h" +#include kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; @@ -4184,7 +4185,7 @@ xfs_iext_realloc_direct( ifp->if_bytes = new_size; return; } - if ((new_size & (new_size - 1)) != 0) { + if (!is_power_of_2(new_size)){ rnew_size = xfs_iroundup(new_size); } if (rnew_size != ifp->if_real_bytes) { @@ -4207,7 +4208,7 @@ xfs_iext_realloc_direct( */ else { new_size += ifp->if_bytes; - if ((new_size & (new_size - 1)) != 0) { + if (!is_power_of_2(new_size)) { rnew_size = xfs_iroundup(new_size); } xfs_iext_inline_to_direct(ifp, rnew_size); -- cgit v1.2.1 From fbf3ce8d8ec508f6bd99b36de034d2ae3e1ae7ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jun 2007 16:46:47 +1000 Subject: [XFS] XFS should not be looking at filp reference counts A check for file_count is always a bad idea. Linux has the ->release method to deal with cleanups on last close and ->flush is only for the very rare case where we want to perform an operation on every drop of a reference to a file struct. This patch gets rid of vop_close and surrounding code in favour of simply doing the page flushing from ->release. SGI-PV: 966562 SGI-Modid: xfs-linux-melb:xfs-kern:28952a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_file.c | 11 ----------- fs/xfs/linux-2.6/xfs_vnode.h | 5 ----- fs/xfs/xfs_vnodeops.c | 47 +++++++++++++++----------------------------- 3 files changed, 16 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 8c43cd2e237a..cbcd40c8c2a0 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -183,15 +183,6 @@ xfs_file_open( return -bhv_vop_open(vn_from_inode(inode), NULL); } -STATIC int -xfs_file_close( - struct file *filp, - fl_owner_t id) -{ - return -bhv_vop_close(vn_from_inode(filp->f_path.dentry->d_inode), 0, - file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); -} - STATIC int xfs_file_release( struct inode *inode, @@ -436,7 +427,6 @@ const struct file_operations xfs_file_operations = { #endif .mmap = xfs_file_mmap, .open = xfs_file_open, - .flush = xfs_file_close, .release = xfs_file_release, .fsync = xfs_file_fsync, #ifdef HAVE_FOP_OPEN_EXEC @@ -458,7 +448,6 @@ const struct file_operations xfs_invis_file_operations = { #endif .mmap = xfs_file_mmap, .open = xfs_file_open, - .flush = xfs_file_close, .release = xfs_file_release, .fsync = xfs_file_fsync, }; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 013048a92643..5742d65f0785 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -129,10 +129,7 @@ typedef enum bhv_vchange { VCHANGE_FLAGS_IOEXCL_COUNT = 4 } bhv_vchange_t; -typedef enum { L_FALSE, L_TRUE } lastclose_t; - typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); -typedef int (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *); typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, const struct iovec *, unsigned int, loff_t *, int, struct cred *); @@ -200,7 +197,6 @@ typedef int (*vop_iflush_t)(bhv_desc_t *, int); typedef struct bhv_vnodeops { bhv_position_t vn_position; /* position within behavior chain */ vop_open_t vop_open; - vop_close_t vop_close; vop_read_t vop_read; vop_write_t vop_write; vop_splice_read_t vop_splice_read; @@ -245,7 +241,6 @@ typedef struct bhv_vnodeops { #define VNHEAD(vp) ((vp)->v_bh.bh_first) #define VOP(op, vp) (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op) #define bhv_vop_open(vp, cr) VOP(vop_open, vp)(VNHEAD(vp),cr) -#define bhv_vop_close(vp, f,last,cr) VOP(vop_close, vp)(VNHEAD(vp),f,last,cr) #define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr) \ VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) #define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 5dbca95598e0..2067d0b0a10e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -77,36 +77,6 @@ xfs_open( return 0; } -STATIC int -xfs_close( - bhv_desc_t *bdp, - int flags, - lastclose_t lastclose, - cred_t *credp) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - xfs_inode_t *ip = XFS_BHVTOI(bdp); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - if (lastclose != L_TRUE || !VN_ISREG(vp)) - return 0; - - /* - * If we previously truncated this file and removed old data in - * the process, we want to initiate "early" writeout on the last - * close. This is an attempt to combat the notorious NULL files - * problem which is particularly noticable from a truncate down, - * buffered (re-)write (delalloc), followed by a crash. What we - * are effectively doing here is significantly reducing the time - * window where we'd otherwise be exposed to that problem. - */ - if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0) - return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); - return 0; -} - /* * xfs_getattr */ @@ -1566,6 +1536,22 @@ xfs_release( if (vp->v_vfsp->vfs_flag & VFS_RDONLY) return 0; + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0) + bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); + } + + #ifdef HAVE_REFCACHE /* If we are in the NFS reference cache then don't do this now */ if (ip->i_refcache) @@ -4681,7 +4667,6 @@ xfs_change_file_space( bhv_vnodeops_t xfs_vnodeops = { BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS), .vop_open = xfs_open, - .vop_close = xfs_close, .vop_read = xfs_read, #ifdef HAVE_SPLICE .vop_splice_read = xfs_splice_read, -- cgit v1.2.1 From 0892ccd6fe13e08ad9e57007afbb78fe02d66005 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 28 Jun 2007 16:46:56 +1000 Subject: [XFS] Use uninitialized_var macro to stop warning about rtx Appease gcc in regards to "warning: 'rtx' is used uninitialized in this function". SGI-PV: 907752 SGI-Modid: xfs-linux-melb:xfs-kern:29007a Signed-off-by: Andrew Morton Signed-off-by: Tim Shimmin --- fs/xfs/xfs_bmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 183add2f21b7..09d86388bb71 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2605,7 +2605,6 @@ xfs_bmap_rtalloc( xfs_extlen_t prod = 0; /* product factor for allocators */ xfs_extlen_t ralen = 0; /* realtime allocation length */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtx; /* realtime extent number */ xfs_rtblock_t rtb; mp = ap->ip->i_mount; @@ -2643,6 +2642,8 @@ xfs_bmap_rtalloc( * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->off == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; -- cgit v1.2.1 From 2a82b8be8a8dacb48cb7371449a7a9daa558b4a8 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 11 Jul 2007 11:09:12 +1000 Subject: [XFS] Concurrent Multi-File Data Streams In media spaces, video is often stored in a frame-per-file format. When dealing with uncompressed realtime HD video streams in this format, it is crucial that files do not get fragmented and that multiple files a placed contiguously on disk. When multiple streams are being ingested and played out at the same time, it is critical that the filesystem does not cross the streams and interleave them together as this creates seek and readahead cache miss latency and prevents both ingest and playout from meeting frame rate targets. This patch set creates a "stream of files" concept into the allocator to place all the data from a single stream contiguously on disk so that RAID array readahead can be used effectively. Each additional stream gets placed in different allocation groups within the filesystem, thereby ensuring that we don't cross any streams. When an AG fills up, we select a new AG for the stream that is not in use. The core of the functionality is the stream tracking - each inode that we create in a directory needs to be associated with the directories' stream. Hence every time we create a file, we look up the directories' stream object and associate the new file with that object. Once we have a stream object for a file, we use the AG that the stream object point to for allocations. If we can't allocate in that AG (e.g. it is full) we move the entire stream to another AG. Other inodes in the same stream are moved to the new AG on their next allocation (i.e. lazy update). Stream objects are kept in a cache and hold a reference on the inode. Hence the inode cannot be reclaimed while there is an outstanding stream reference. This means that on unlink we need to remove the stream association and we also need to flush all the associations on certain events that want to reclaim all unreferenced inodes (e.g. filesystem freeze). SGI-PV: 964469 SGI-Modid: xfs-linux-melb:xfs-kern:29096a Signed-off-by: David Chinner Signed-off-by: Barry Naujok Signed-off-by: Donald Douwsma Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin Signed-off-by: Vlad Apostolov --- fs/xfs/Makefile-linux-2.6 | 2 + fs/xfs/linux-2.6/xfs_globals.c | 1 + fs/xfs/linux-2.6/xfs_linux.h | 1 + fs/xfs/linux-2.6/xfs_sysctl.c | 11 + fs/xfs/linux-2.6/xfs_sysctl.h | 2 + fs/xfs/xfs.h | 1 + fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_bmap.c | 69 +++- fs/xfs/xfs_clnt.h | 2 + fs/xfs/xfs_dinode.h | 4 +- fs/xfs/xfs_filestream.c | 771 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_filestream.h | 136 ++++++++ fs/xfs/xfs_fs.h | 1 + fs/xfs/xfs_fsops.c | 2 + fs/xfs/xfs_inode.c | 17 +- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_mount.h | 4 + fs/xfs/xfs_mru_cache.c | 608 ++++++++++++++++++++++++++++++++ fs/xfs/xfs_mru_cache.h | 57 +++ fs/xfs/xfs_vfsops.c | 26 ++ fs/xfs/xfs_vnodeops.c | 25 +- 21 files changed, 1730 insertions(+), 12 deletions(-) create mode 100644 fs/xfs/xfs_filestream.c create mode 100644 fs/xfs/xfs_filestream.h create mode 100644 fs/xfs/xfs_mru_cache.c create mode 100644 fs/xfs/xfs_mru_cache.h (limited to 'fs') diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6 index b49989bb89ad..e7a9a83f0087 100644 --- a/fs/xfs/Makefile-linux-2.6 +++ b/fs/xfs/Makefile-linux-2.6 @@ -64,6 +64,7 @@ xfs-y += xfs_alloc.o \ xfs_dir2_sf.o \ xfs_error.o \ xfs_extfree_item.o \ + xfs_filestream.o \ xfs_fsops.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ @@ -77,6 +78,7 @@ xfs-y += xfs_alloc.o \ xfs_log.o \ xfs_log_recover.o \ xfs_mount.o \ + xfs_mru_cache.o \ xfs_rename.o \ xfs_trans.o \ xfs_trans_ail.o \ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index ed3a5e1b4b67..bb72c3d4141f 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -46,6 +46,7 @@ xfs_param_t xfs_params = { .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 50, 3600*100}, }; /* diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index af24a457d3a3..330c4ba9d404 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -123,6 +123,7 @@ #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index cd6eaa44aa2b..bb997d75c05c 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -210,6 +210,17 @@ static ctl_table xfs_table[] = { .extra1 = &xfs_params.inherit_nodfrg.min, .extra2 = &xfs_params.inherit_nodfrg.max }, + { + .ctl_name = XFS_FILESTREAM_TIMER, + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index a631fb8cc5ac..98b97e399d6f 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ } xfs_param_t; /* @@ -86,6 +87,7 @@ enum { XFS_INHERIT_NOSYM = 19, XFS_ROTORSTEP = 20, XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, }; extern xfs_param_t xfs_params; diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index bf0a12040b13..b5a7d92c6843 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -38,6 +38,7 @@ #define XFS_RW_TRACE 1 #define XFS_BUF_TRACE 1 #define XFS_VNODE_TRACE 1 +#define XFS_FILESTREAMS_TRACE 1 #endif #include diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index b1dd0029c60e..51c09c114a20 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -196,6 +196,7 @@ typedef struct xfs_perag lock_t pagb_lock; /* lock for pagb_list */ #endif xfs_perag_busy_t *pagb_list; /* unstable blocks */ + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ } xfs_perag_t; #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 09d86388bb71..51ba689a4552 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -52,6 +52,7 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" +#include "xfs_filestream.h" #ifdef DEBUG @@ -2725,9 +2726,15 @@ xfs_bmap_btalloc( } nullfb = ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); - if (nullfb) - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - else + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else ap->rval = ap->firstblock; xfs_bmap_adjacent(ap); @@ -2751,13 +2758,22 @@ xfs_bmap_btalloc( args.firstblock = ap->firstblock; blen = 0; if (nullfb) { - args.type = XFS_ALLOCTYPE_START_BNO; + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_NEAR_BNO; + else + args.type = XFS_ALLOCTYPE_START_BNO; args.total = ap->total; + /* - * Find the longest available space. - * We're going to try for the whole allocation at once. + * Search for an allocation group with a single extent + * large enough for the request. + * + * If one isn't found, then adjust the minimum allocation + * size to the largest space found. */ startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; notinit = 0; down_read(&mp->m_peraglock); while (blen < ap->alen) { @@ -2783,6 +2799,35 @@ xfs_bmap_btalloc( blen = longest; } else notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (blen >= ap->alen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. + */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + if (error) { + up_read(&mp->m_peraglock); + return error; + } + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + continue; + } + } if (++ag == mp->m_sb.sb_agcount) ag = 0; if (ag == startag) @@ -2807,8 +2852,18 @@ xfs_bmap_btalloc( */ else args.minlen = ap->alen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); } else if (ap->low) { - args.type = XFS_ALLOCTYPE_START_BNO; + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; args.total = args.minlen = ap->minlen; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 5b7eb81453be..f89196cb08d2 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -99,5 +99,7 @@ struct xfs_mount_args { */ #define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred * I/O size in stat(2) */ +#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams + * allocator */ #endif /* __XFS_CLNT_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index b33826961c45..fefd0116bac9 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -257,6 +257,7 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) @@ -271,12 +272,13 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) #endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 000000000000..ce2278611bb7 --- /dev/null +++ b/fs/xfs/xfs_filestream.c @@ -0,0 +1,771 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bmap_btree.h" +#include "xfs_inum.h" +#include "xfs_dir2.h" +#include "xfs_dir2_sf.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_utils.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" + +#ifdef XFS_FILESTREAMS_TRACE + +ktrace_t *xfs_filestreams_trace_buf; + +STATIC void +xfs_filestreams_trace( + xfs_mount_t *mp, /* mount point */ + int type, /* type of trace */ + const char *func, /* source function */ + int line, /* source line number */ + __psunsigned_t arg0, + __psunsigned_t arg1, + __psunsigned_t arg2, + __psunsigned_t arg3, + __psunsigned_t arg4, + __psunsigned_t arg5) +{ + ktrace_enter(xfs_filestreams_trace_buf, + (void *)(__psint_t)(type | (line << 16)), + (void *)func, + (void *)(__psunsigned_t)current_pid(), + (void *)mp, + (void *)(__psunsigned_t)arg0, + (void *)(__psunsigned_t)arg1, + (void *)(__psunsigned_t)arg2, + (void *)(__psunsigned_t)arg3, + (void *)(__psunsigned_t)arg4, + (void *)(__psunsigned_t)arg5, + NULL, NULL, NULL, NULL, NULL, NULL); +} + +#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) +#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) +#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) +#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) +#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) +#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) +#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ + xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \ + (__psunsigned_t)a0, (__psunsigned_t)a1, \ + (__psunsigned_t)a2, (__psunsigned_t)a3, \ + (__psunsigned_t)a4, (__psunsigned_t)a5) + +#define TRACE_AG_SCAN(mp, ag, ag2) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); +#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ + TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ + cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ + TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ + TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) \ + TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); + + +#else +#define TRACE_AG_SCAN(mp, ag, ag2) +#define TRACE_AG_PICK1(mp, max_ag, maxfree) +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) +#endif + +static kmem_zone_t *item_zone; + +/* + * Structure for associating a file or a directory with an allocation group. + * The parent directory pointer is only needed for files, but since there will + * generally be vastly more files than directories in the cache, using the same + * data structure simplifies the code with very little memory overhead. + */ +typedef struct fstrm_item +{ + xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ + xfs_inode_t *ip; /* inode self-pointer. */ + xfs_inode_t *pip; /* Parent directory inode pointer. */ +} fstrm_item_t; + + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +_xfs_filestream_pick_ag( + xfs_mount_t *mp, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + int err, trylock, nscan; + xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + struct xfs_perag *pag; + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + + TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); + + pag = mp->m_perag + ag; + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) + return err; + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + TRACE_AG_PICK1(mp, max_ag, maxfree); + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + TRACE_AG_PICK1(mp, max_ag, maxfree); + *agp = 0; + return 0; + } + + TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), + free, nscan, flags); + + return 0; +} + +/* + * Set the allocation group number for a file or a directory, updating inode + * references and per-AG references as appropriate. Must be called with the + * m_peraglock held in read mode. + */ +static int +_xfs_filestream_update_ag( + xfs_inode_t *ip, + xfs_inode_t *pip, + xfs_agnumber_t ag) +{ + int err = 0; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t old_ag; + xfs_inode_t *old_pip; + + /* + * Either ip is a regular file and pip is a directory, or ip is a + * directory and pip is NULL. + */ + ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && + (pip->i_d.di_mode & S_IFDIR)) || + ((ip->i_d.di_mode & S_IFDIR) && !pip))); + + mp = ip->i_mount; + cache = mp->m_filestream; + + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (item) { + ASSERT(item->ip == ip); + old_ag = item->ag; + item->ag = ag; + old_pip = item->pip; + item->pip = pip; + xfs_mru_cache_done(cache); + + /* + * If the AG has changed, drop the old ref and take a new one, + * effectively transferring the reference from old to new AG. + */ + if (ag != old_ag) { + xfs_filestream_put_ag(mp, old_ag); + xfs_filestream_get_ag(mp, ag); + } + + /* + * If ip is a file and its pip has changed, drop the old ref and + * take a new one. + */ + if (pip && pip != old_pip) { + IRELE(old_pip); + IHOLD(pip); + } + + TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), + ag, xfs_filestream_peek_ag(mp, ag)); + return 0; + } + + item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + if (!item) + return ENOMEM; + + item->ag = ag; + item->ip = ip; + item->pip = pip; + + err = xfs_mru_cache_insert(cache, ip->i_ino, item); + if (err) { + kmem_zone_free(item_zone, item); + return err; + } + + /* Take a reference on the AG. */ + xfs_filestream_get_ag(mp, ag); + + /* + * Take a reference on the inode itself regardless of whether it's a + * regular file or a directory. + */ + IHOLD(ip); + + /* + * In the case of a regular file, take a reference on the parent inode + * as well to ensure it remains in-core. + */ + if (pip) + IHOLD(pip); + + TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), + ag, xfs_filestream_peek_ag(mp, ag)); + + return 0; +} + +/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ +void +xfs_fstrm_free_func( + xfs_ino_t ino, + fstrm_item_t *item) +{ + xfs_inode_t *ip = item->ip; + int ref; + + ASSERT(ip->i_ino == ino); + + xfs_iflags_clear(ip, XFS_IFILESTREAM); + + /* Drop the reference taken on the AG when the item was added. */ + ref = xfs_filestream_put_ag(ip->i_mount, item->ag); + + ASSERT(ref >= 0); + TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, + xfs_filestream_peek_ag(ip->i_mount, item->ag)); + + /* + * _xfs_filestream_update_ag() always takes a reference on the inode + * itself, whether it's a file or a directory. Release it here. + * This can result in the inode being freed and so we must + * not hold any inode locks when freeing filesstreams objects + * otherwise we can deadlock here. + */ + IRELE(ip); + + /* + * In the case of a regular file, _xfs_filestream_update_ag() also + * takes a ref on the parent inode to keep it in-core. Release that + * too. + */ + if (item->pip) + IRELE(item->pip); + + /* Finally, free the memory allocated for the item. */ + kmem_zone_free(item_zone, item); +} + +/* + * xfs_filestream_init() is called at xfs initialisation time to set up the + * memory zone that will be used for filestream data structure allocation. + */ +int +xfs_filestream_init(void) +{ + item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); +#ifdef XFS_FILESTREAMS_TRACE + xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); +#endif + return item_zone ? 0 : -ENOMEM; +} + +/* + * xfs_filestream_uninit() is called at xfs termination time to destroy the + * memory zone that was used for filestream data structure allocation. + */ +void +xfs_filestream_uninit(void) +{ +#ifdef XFS_FILESTREAMS_TRACE + ktrace_free(xfs_filestreams_trace_buf); +#endif + kmem_zone_destroy(item_zone); +} + +/* + * xfs_filestream_mount() is called when a file system is mounted with the + * filestream option. It is responsible for allocating the data structures + * needed to track the new file system's file streams. + */ +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + int err; + unsigned int lifetime, grp_count; + + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + lifetime = xfs_fstrm_centisecs * 10; + grp_count = 10; + + err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, + (xfs_mru_cache_free_func_t)xfs_fstrm_free_func); + + return err; +} + +/* + * xfs_filestream_unmount() is called when a file system that was mounted with + * the filestream option is unmounted. It drains the data structures created + * to track the file system's file streams and frees all the memory that was + * allocated. + */ +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} + +/* + * If the mount point's m_perag array is going to be reallocated, all + * outstanding cache entries must be flushed to avoid accessing reference count + * addresses that have been freed. The call to xfs_filestream_flush() must be + * made inside the block that holds the m_peraglock in write mode to do the + * reallocation. + */ +void +xfs_filestream_flush( + xfs_mount_t *mp) +{ + /* point in time flush, so keep the reaper running */ + xfs_mru_cache_flush(mp->m_filestream, 1); +} + +/* + * Return the AG of the filestream the file or directory belongs to, or + * NULLAGNUMBER otherwise. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag; + int ref; + + if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + ASSERT(0); + return NULLAGNUMBER; + } + + cache = ip->i_mount->m_filestream; + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (!item) { + TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); + return NULLAGNUMBER; + } + + ASSERT(ip == item->ip); + ag = item->ag; + ref = xfs_filestream_peek_ag(ip->i_mount, ag); + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); + return ag; +} + +/* + * xfs_filestream_associate() should only be called to associate a regular file + * with its parent directory. Calling it with a child directory isn't + * appropriate because filestreams don't apply to entire directory hierarchies. + * Creating a file in a child directory of an existing filestream directory + * starts a new filestream with its own allocation group association. + * + * Returns < 0 on error, 0 if successful association occurred, > 0 if + * we failed to get an association because of locking issues. + */ +int +xfs_filestream_associate( + xfs_inode_t *pip, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag, rotorstep, startag; + int err = 0; + + ASSERT(pip->i_d.di_mode & S_IFDIR); + ASSERT(ip->i_d.di_mode & S_IFREG); + if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + return -EINVAL; + + mp = pip->i_mount; + cache = mp->m_filestream; + down_read(&mp->m_peraglock); + + /* + * We have a problem, Houston. + * + * Taking the iolock here violates inode locking order - we already + * hold the ilock. Hence if we block getting this lock we may never + * wake. Unfortunately, that means if we can't get the lock, we're + * screwed in terms of getting a stream association - we can't spin + * waiting for the lock because someone else is waiting on the lock we + * hold and we cannot drop that as we are in a transaction here. + * + * Lucky for us, this inversion is rarely a problem because it's a + * directory inode that we are trying to lock here and that means the + * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is + * used. i.e. freeze, remount-ro, quotasync or unmount. + * + * So, if we can't get the iolock without sleeping then just give up + */ + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { + up_read(&mp->m_peraglock); + return 1; + } + + /* If the parent directory is already in the cache, use its AG. */ + item = xfs_mru_cache_lookup(cache, pip->i_ino); + if (item) { + ASSERT(item->ip == pip); + ag = item->ag; + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + err = _xfs_filestream_update_ag(ip, pip, ag); + + goto exit; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + /* Pick a new AG for the parent inode starting at startag. */ + err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); + if (err || ag == NULLAGNUMBER) + goto exit_did_pick; + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, ag); + if (err) + goto exit_did_pick; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, ag); + if (err) + goto exit_did_pick; + + TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + +exit_did_pick: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (ag != NULLAGNUMBER) + xfs_filestream_put_ag(mp, ag); + +exit: + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + up_read(&mp->m_peraglock); + return -err; +} + +/* + * Pick a new allocation group for the current file and its file stream. This + * function is called by xfs_bmap_filestreams() with the mount point's per-ag + * lock held. + */ +int +xfs_filestream_new_ag( + xfs_bmalloca_t *ap, + xfs_agnumber_t *agp) +{ + int flags, err; + xfs_inode_t *ip, *pip = NULL; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + xfs_extlen_t minlen; + fstrm_item_t *dir, *file; + xfs_agnumber_t ag = NULLAGNUMBER; + + ip = ap->ip; + mp = ip->i_mount; + cache = mp->m_filestream; + minlen = ap->alen; + *agp = NULLAGNUMBER; + + /* + * Look for the file in the cache, removing it if it's found. Doing + * this allows it to be held across the dir lookup that follows. + */ + file = xfs_mru_cache_remove(cache, ip->i_ino); + if (file) { + ASSERT(ip == file->ip); + + /* Save the file's parent inode and old AG number for later. */ + pip = file->pip; + ag = file->ag; + + /* Look for the file's directory in the cache. */ + dir = xfs_mru_cache_lookup(cache, pip->i_ino); + if (dir) { + ASSERT(pip == dir->ip); + + /* + * If the directory has already moved on to a new AG, + * use that AG as the new AG for the file. Don't + * forget to twiddle the AG refcounts to match the + * movement. + */ + if (dir->ag != file->ag) { + xfs_filestream_put_ag(mp, file->ag); + xfs_filestream_get_ag(mp, dir->ag); + *agp = file->ag = dir->ag; + } + + xfs_mru_cache_done(cache); + } + + /* + * Put the file back in the cache. If this fails, the free + * function needs to be called to tidy up in the same way as if + * the item had simply expired from the cache. + */ + err = xfs_mru_cache_insert(cache, ip->i_ino, file); + if (err) { + xfs_fstrm_free_func(ip->i_ino, file); + return err; + } + + /* + * If the file's AG was moved to the directory's new AG, there's + * nothing more to be done. + */ + if (*agp != NULLAGNUMBER) { + TRACE_MOVEAG(mp, ip, pip, + ag, xfs_filestream_peek_ag(mp, ag), + *agp, xfs_filestream_peek_ag(mp, *agp)); + return 0; + } + } + + /* + * If the file's parent directory is known, take its iolock in exclusive + * mode to prevent two sibling files from racing each other to migrate + * themselves and their parent to different AGs. + */ + if (pip) + xfs_ilock(pip, XFS_IOLOCK_EXCL); + + /* + * A new AG needs to be found for the file. If the file's parent + * directory is also known, it will be moved to the new AG as well to + * ensure that files created inside it in future use the new AG. + */ + ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->low ? XFS_PICK_LOWSPACE : 0); + + err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); + if (err || *agp == NULLAGNUMBER) + goto exit; + + /* + * If the file wasn't found in the file cache, then its parent directory + * inode isn't known. For this to have happened, the file must either + * be pre-existing, or it was created long enough ago that its cache + * entry has expired. This isn't the sort of usage that the filestreams + * allocator is trying to optimise, so there's no point trying to track + * its new AG somehow in the filestream data structures. + */ + if (!pip) { + TRACE_ORPHAN(mp, ip, *agp); + goto exit; + } + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, *agp); + if (err) + goto exit; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, *agp); + if (err) + goto exit; + + TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, + *agp, xfs_filestream_peek_ag(mp, *agp)); + +exit: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (*agp != NULLAGNUMBER) + xfs_filestream_put_ag(mp, *agp); + else + *agp = 0; + + if (pip) + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + + return err; +} + +/* + * Remove an association between an inode and a filestream object. + * Typically this is done on last close of an unlinked file. + */ +void +xfs_filestream_deassociate( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + + xfs_mru_cache_delete(cache, ip->i_ino); +} diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 000000000000..f655f7dc334c --- /dev/null +++ b/fs/xfs/xfs_filestream.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +#ifdef __KERNEL__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_perag; +struct xfs_bmalloca; + +#ifdef XFS_FILESTREAMS_TRACE +#define XFS_FSTRM_KTRACE_INFO 1 +#define XFS_FSTRM_KTRACE_AGSCAN 2 +#define XFS_FSTRM_KTRACE_AGPICK1 3 +#define XFS_FSTRM_KTRACE_AGPICK2 4 +#define XFS_FSTRM_KTRACE_UPDATE 5 +#define XFS_FSTRM_KTRACE_FREE 6 +#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 +#define XFS_FSTRM_KTRACE_ASSOCIATE 8 +#define XFS_FSTRM_KTRACE_MOVEAG 9 +#define XFS_FSTRM_KTRACE_ORPHAN 10 + +#define XFS_FSTRM_KTRACE_SIZE 16384 +extern ktrace_t *xfs_filestreams_trace_buf; + +#endif + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +STATIC_INLINE int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_read(&mp->m_perag[agno].pagf_fstrms); +} + +STATIC_INLINE int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); +} + +STATIC_INLINE int +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); +} + +/* allocation selection flags */ +typedef enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +} xfs_fstrm_alloc_t; + +/* prototypes for filestream.c */ +int xfs_filestream_init(void); +void xfs_filestream_uninit(void); +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +void xfs_filestream_flush(struct xfs_mount *mp); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); +void xfs_filestream_deassociate(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); + + +/* filestreams for the inode? */ +STATIC_INLINE int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + xfs_iflags_test(ip, XFS_IFILESTREAM) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __KERNEL__ */ + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 1b60cfc28be5..ec3c9c27e0de 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -66,6 +66,7 @@ struct fsxattr { #define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2251a49f3e17..432e82347ed6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -44,6 +44,7 @@ #include "xfs_trans_space.h" #include "xfs_rtalloc.h" #include "xfs_rw.h" +#include "xfs_filestream.h" /* * File system operations @@ -165,6 +166,7 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + xfs_filestream_flush(mp); down_write(&mp->m_peraglock); mp->m_perag = kmem_realloc(mp->m_perag, sizeof(xfs_perag_t) * nagcount, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8fdd30d9ba56..2ef100be6c4f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -48,6 +48,7 @@ #include "xfs_dir2_trace.h" #include "xfs_quota.h" #include "xfs_acl.h" +#include "xfs_filestream.h" #include @@ -818,6 +819,8 @@ _xfs_dic2xflags( flags |= XFS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; } return flags; @@ -1151,7 +1154,7 @@ xfs_ialloc( /* * Project ids won't be stored on disk if we are using a version 1 inode. */ - if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) + if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) xfs_bump_ino_vers2(tp, ip); if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { @@ -1196,8 +1199,16 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: + if (xfs_inode_is_filestream(pip)) { + error = xfs_filestream_associate(pip, ip); + if (error < 0) + return -error; + if (!error) + xfs_iflags_set(ip, XFS_IFILESTREAM); + } + /* fall through */ case S_IFDIR: - if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + if (pip->i_d.di_flags & XFS_DIFLAG_ANY) { uint di_flags = 0; if ((mode & S_IFMT) == S_IFDIR) { @@ -1234,6 +1245,8 @@ xfs_ialloc( if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; ip->i_d.di_flags |= di_flags; } /* FALLTHROUGH */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f75afecef8e7..d418eeed4ebd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -379,6 +379,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) #define XFS_ISTALE 0x0010 /* inode has been staled */ #define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ #define XFS_INEW 0x0040 +#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */ /* * Flags for inode locking. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0bca2d422719..76ad74758696 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -66,6 +66,7 @@ struct xfs_bmbt_irec; struct xfs_bmap_free; struct xfs_extdelta; struct xfs_swapext; +struct xfs_mru_cache; extern struct bhv_vfsops xfs_vfsops; extern struct bhv_vnodeops xfs_vnodeops; @@ -424,6 +425,7 @@ typedef struct xfs_mount { struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ struct mutex m_icsb_mutex; /* balancer sync lock */ #endif + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ } xfs_mount_t; /* @@ -463,6 +465,8 @@ typedef struct xfs_mount { * I/O size in stat() */ #define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock counters */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ /* diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 000000000000..7deb9e3cbbd3 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. + * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +typedef struct xfs_mru_cache_elem +{ + struct list_head list_node; + unsigned long key; + void *value; +} xfs_mru_cache_elem_t; + +static kmem_zone_t *xfs_mru_elem_zone; +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + xfs_mru_cache_t *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + xfs_mru_cache_t *mru, + xfs_mru_cache_elem_t *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->next_reap) + mru->next_reap = mru->grp_count * mru->grp_time; + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + xfs_mru_cache_t *mru) +{ + xfs_mru_cache_elem_t *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + mutex_spinunlock(&mru->lock, 0); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + + /* Remove the element from the reap list. */ + list_del_init(&elem->list_node); + + /* Call the client's free function with the key and value pointer. */ + mru->free_func(elem->key, elem->value); + + /* Free the element structure. */ + kmem_zone_free(xfs_mru_elem_zone, elem); + } + + mutex_spinlock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + unsigned long now; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + mutex_spinlock(&mru->lock); + now = jiffies; + if (mru->reap_all || + (mru->next_reap && time_after(now, mru->next_reap))) { + if (mru->reap_all) + now += mru->grp_count * mru->grp_time * 2; + mru->next_reap = _xfs_mru_cache_migrate(mru, now); + _xfs_mru_cache_clear_reap_list(mru); + } + + /* + * the process that triggered the reap_all is responsible + * for restating the periodic reap if it is required. + */ + if (!mru->reap_all) + queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); + mru->reap_all = 0; + mutex_spinunlock(&mru->lock, 0); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), + "xfs_mru_cache_elem"); + if (!xfs_mru_elem_zone) + return ENOMEM; + + xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); + if (!xfs_mru_reap_wq) { + kmem_zone_destroy(xfs_mru_elem_zone); + return ENOMEM; + } + + return 0; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); + kmem_zone_destroy(xfs_mru_elem_zone); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + xfs_mru_cache_t **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + xfs_mru_cache_t *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spinlock_init(&mru->lock, "xfs_mru_cache"); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + /* start up the reaper event */ + mru->next_reap = 0; + mru->reap_all = 0; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); + if (err && mru) + kmem_free(mru, sizeof(*mru)); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing. + * + * While we are flushing, we stop the periodic reaper event from triggering. + * Normally, we want to restart this periodic event, but if we are shutting + * down the cache we do not want it restarted. hence the restart parameter + * where 0 = do not restart reaper and 1 = restart reaper. + */ +void +xfs_mru_cache_flush( + xfs_mru_cache_t *mru, + int restart) +{ + if (!mru || !mru->lists) + return; + + cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); + + mutex_spinlock(&mru->lock); + mru->reap_all = 1; + mutex_spinunlock(&mru->lock, 0); + + queue_work(xfs_mru_reap_wq, &mru->work.work); + flush_workqueue(xfs_mru_reap_wq); + + mutex_spinlock(&mru->lock); + WARN_ON_ONCE(mru->reap_all != 0); + mru->reap_all = 0; + if (restart) + queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); + mutex_spinunlock(&mru->lock, 0); +} + +void +xfs_mru_cache_destroy( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + /* we don't want the reaper to restart here */ + xfs_mru_cache_flush(mru, 0); + + kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); + kmem_free(mru, sizeof(*mru)); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + xfs_mru_cache_t *mru, + unsigned long key, + void *value) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return EINVAL; + + elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); + if (!elem) + return ENOMEM; + + if (radix_tree_preload(GFP_KERNEL)) { + kmem_zone_free(xfs_mru_elem_zone, elem); + return ENOMEM; + } + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + elem->value = value; + + mutex_spinlock(&mru->lock); + + radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + _xfs_mru_cache_list_insert(mru, elem); + + mutex_spinunlock(&mru->lock, 0); + + return 0; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. + */ +void * +xfs_mru_cache_remove( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + void *value = NULL; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + mutex_spinlock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) { + value = elem->value; + list_del(&elem->list_node); + } + + mutex_spinunlock(&mru->lock, 0); + + if (elem) + kmem_zone_free(xfs_mru_elem_zone, elem); + + return value; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + xfs_mru_cache_t *mru, + unsigned long key) +{ + void *value = xfs_mru_cache_remove(mru, key); + + if (value) + mru->free_func(key, value); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + */ +void * +xfs_mru_cache_lookup( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + mutex_spinlock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + } + else + mutex_spinunlock(&mru->lock, 0); + + return elem ? elem->value : NULL; +} + +/* + * To look up an element using its key, but leave its location in the internal + * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this + * function returns NULL. + * + * See the comments above the declaration of the xfs_mru_cache_lookup() function + * for important locking information pertaining to this call. + */ +void * +xfs_mru_cache_peek( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + mutex_spinlock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (!elem) + mutex_spinunlock(&mru->lock, 0); + + return elem ? elem->value : NULL; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + xfs_mru_cache_t *mru) +{ + mutex_spinunlock(&mru->lock, 0); +} diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 000000000000..624fd10ee8e5 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); + +typedef struct xfs_mru_cache +{ + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + unsigned long next_reap; /* Time that the reaper should + next do something. */ + unsigned int reap_all; /* if set, reap all lists */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ +} xfs_mru_cache_t; + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + void *value); +void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index c343fde10ef9..11f5ea29a038 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -51,6 +51,8 @@ #include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_clnt.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" #include "xfs_fsops.h" STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); @@ -81,6 +83,8 @@ xfs_init(void) xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); + xfs_mru_cache_init(); + xfs_filestream_init(); /* * The size of the zone allocated buf log item is the maximum @@ -164,6 +168,8 @@ xfs_cleanup(void) xfs_cleanup_procfs(); xfs_sysctl_unregister(); xfs_refcache_destroy(); + xfs_filestream_uninit(); + xfs_mru_cache_uninit(); xfs_acl_zone_destroy(xfs_acl_zone); #ifdef XFS_DIR2_TRACE @@ -320,6 +326,9 @@ xfs_start_flags( else mp->m_flags &= ~XFS_MOUNT_BARRIER; + if (ap->flags2 & XFSMNT2_FILESTREAMS) + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + return 0; } @@ -518,6 +527,9 @@ xfs_mount( if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); + if ((error = xfs_filestream_mount(mp))) + goto error2; + error = XFS_IOINIT(vfsp, args, flags); if (error) goto error2; @@ -575,6 +587,13 @@ xfs_unmount( */ xfs_refcache_purge_mp(mp); + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + XFS_bflush(mp->m_ddev_targp); error = xfs_unmount_flush(mp, 0); if (error) @@ -694,6 +713,7 @@ xfs_mntupdate( mp->m_flags &= ~XFS_MOUNT_BARRIER; } } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ + xfs_filestream_flush(mp); bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL); xfs_attr_quiesce(mp); vfsp->vfs_flag |= VFS_RDONLY; @@ -909,6 +929,9 @@ xfs_sync( { xfs_mount_t *mp = XFS_BHVTOM(bdp); + if (flags & SYNC_IOWAIT) + xfs_filestream_flush(mp); + return xfs_syncsub(mp, flags, NULL); } @@ -1659,6 +1682,7 @@ xfs_vget( * in stat(). */ #define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ #define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ STATIC unsigned long suffix_strtoul(char *s, char **endp, unsigned int base) @@ -1845,6 +1869,8 @@ xfs_parseargs( args->flags |= XFSMNT_ATTR2; } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { args->flags &= ~XFSMNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + args->flags2 |= XFSMNT2_FILESTREAMS; } else if (!strcmp(this_char, "osyncisdsync")) { /* no-op, this is now the default */ cmn_err(CE_WARN, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2067d0b0a10e..60fd0be90a16 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -51,6 +51,7 @@ #include "xfs_refcache.h" #include "xfs_trans_space.h" #include "xfs_log_priv.h" +#include "xfs_filestream.h" STATIC int xfs_open( @@ -783,6 +784,8 @@ xfs_setattr( di_flags |= XFS_DIFLAG_PROJINHERIT; if (vap->va_xflags & XFS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; + if (vap->va_xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { if (vap->va_xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; @@ -1536,7 +1539,17 @@ xfs_release( if (vp->v_vfsp->vfs_flag & VFS_RDONLY) return 0; - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (!XFS_FORCED_SHUTDOWN(mp)) { + /* + * If we are using filestreams, and we have an unlinked + * file that we are processing the last close on, then nothing + * will be able to reopen and write to this file. Purge this + * inode from the filestreams cache so that it doesn't delay + * teardown of the inode. + */ + if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + /* * If we previously truncated this file and removed old data * in the process, we want to initiate "early" writeout on @@ -1551,7 +1564,6 @@ xfs_release( bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); } - #ifdef HAVE_REFCACHE /* If we are in the NFS reference cache then don't do this now */ if (ip->i_refcache) @@ -2541,6 +2553,15 @@ xfs_remove( */ xfs_refcache_purge_ip(ip); + /* + * If we are using filestreams, kill the stream association. + * If the file is still open it may get a new one but that + * will get killed on last close in xfs_close() so we don't + * have to worry about that. + */ + if (link_zero && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); /* -- cgit v1.2.1 From b11f94d537e6b69f13770143fd7ded3d09fdbaab Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 11 Jul 2007 11:09:33 +1000 Subject: [XFS] Quota inode has no parent. Avoid using a special "zero inode" as the parent of the quota inode as this can confuse the filestreams code into thinking the quota inode has a parent. We do not want the quota inode to follow filestreams allocation rules, so pass a NULL as the parent inode and detect this condition when doing stream associations. SGI-PV: 964469 SGI-Modid: xfs-linux-melb:xfs-kern:29098a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/quota/xfs_qm.c | 3 +-- fs/xfs/xfs_inode.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index eeb3d9cc0e8e..7def4c699343 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -65,7 +65,6 @@ kmem_zone_t *qm_dqtrxzone; static struct shrinker *xfs_qm_shaker; static cred_t xfs_zerocr; -static xfs_inode_t xfs_zeroino; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); @@ -1415,7 +1414,7 @@ xfs_qm_qino_alloc( return error; } - if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, + if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, &xfs_zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2ef100be6c4f..59829523ab07 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1078,6 +1078,11 @@ xfs_iread_extents( * also returns the [locked] bp pointing to the head of the freelist * as ialloc_context. The caller should hold this buffer across * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. */ int xfs_ialloc( @@ -1103,7 +1108,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, ialloc_context, call_again, &ino); if (error != 0) { return error; @@ -1157,7 +1162,7 @@ xfs_ialloc( if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) xfs_bump_ino_vers2(tp, ip); - if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { + if (pip && XFS_INHERIT_GID(pip, vp->v_vfsp)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { ip->i_d.di_mode |= S_ISGID; @@ -1199,7 +1204,7 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: - if (xfs_inode_is_filestream(pip)) { + if (pip && xfs_inode_is_filestream(pip)) { error = xfs_filestream_associate(pip, ip); if (error < 0) return -error; @@ -1208,7 +1213,7 @@ xfs_ialloc( } /* fall through */ case S_IFDIR: - if (pip->i_d.di_flags & XFS_DIFLAG_ANY) { + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; if ((mode & S_IFMT) == S_IFDIR) { -- cgit v1.2.1 From 3a59c94c4b48878c6af047cdfc8c137d0fa7a0f0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 11 Jul 2007 11:09:47 +1000 Subject: [XFS] Clean up function name handling in tracing code Remove the hardcoded "fnames" for tracing, and just embed them in tracing macros via __FUNCTION__. Kills a lot of #ifdefs too. SGI-PV: 967353 SGI-Modid: xfs-linux-melb:xfs-kern:29099a Signed-off-by: Eric Sandeen Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_alloc.c | 53 +++------- fs/xfs/xfs_bmap.c | 268 ++++++++++++++++++++---------------------------- fs/xfs/xfs_bmap.h | 6 +- fs/xfs/xfs_bmap_btree.c | 88 +++------------- fs/xfs/xfs_inode.c | 8 +- 5 files changed, 150 insertions(+), 273 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 98f95d4c4bcc..012a649a19c3 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -55,17 +55,17 @@ xfs_alloc_search_busy(xfs_trans_t *tp, ktrace_t *xfs_alloc_trace_buf; #define TRACE_ALLOC(s,a) \ - xfs_alloc_trace_alloc(fname, s, a, __LINE__) + xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__) #define TRACE_FREE(s,a,b,x,f) \ - xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) + xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__) #define TRACE_MODAGF(s,a,f) \ - xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) -#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) -#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) + xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__) +#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) +#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \ + xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) +#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) #else #define TRACE_ALLOC(s,a) #define TRACE_FREE(s,a,b,x,f) @@ -420,7 +420,7 @@ xfs_alloc_read_agfl( */ STATIC void xfs_alloc_trace_alloc( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_alloc_arg_t *args, /* allocation argument structure */ int line) /* source line number */ @@ -453,7 +453,7 @@ xfs_alloc_trace_alloc( */ STATIC void xfs_alloc_trace_free( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ @@ -479,7 +479,7 @@ xfs_alloc_trace_free( */ STATIC void xfs_alloc_trace_modagf( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agf_t *agf, /* new agf value */ @@ -507,7 +507,7 @@ xfs_alloc_trace_modagf( STATIC void xfs_alloc_trace_busy( - char *name, /* function tag string */ + const char *name, /* function tag string */ char *str, /* additional string */ xfs_mount_t *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ @@ -549,9 +549,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent"; -#endif ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -635,9 +632,6 @@ xfs_alloc_ag_vextent_exact( xfs_agblock_t fbno; /* start block of found extent */ xfs_agblock_t fend; /* end block of found extent */ xfs_extlen_t flen; /* length of found extent */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_exact"; -#endif int i; /* success/failure of operation */ xfs_agblock_t maxend; /* end of maximal extent */ xfs_agblock_t minend; /* end of minimal extent */ @@ -737,9 +731,6 @@ xfs_alloc_ag_vextent_near( xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_near"; -#endif xfs_agblock_t gtbno; /* start bno of right side entry */ xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ @@ -1270,9 +1261,6 @@ xfs_alloc_ag_vextent_size( int error; /* error result */ xfs_agblock_t fbno; /* start of found freespace */ xfs_extlen_t flen; /* length of found freespace */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_size"; -#endif int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ @@ -1427,9 +1415,6 @@ xfs_alloc_ag_vextent_small( int error; xfs_agblock_t fbno; xfs_extlen_t flen; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_small"; -#endif int i; if ((error = xfs_alloc_decrement(ccur, 0, &i))) @@ -1516,9 +1501,6 @@ xfs_free_ag_extent( xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ int error; /* error return value */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_free_ag_extent"; -#endif xfs_agblock_t gtbno; /* start of right neighbor block */ xfs_extlen_t gtlen; /* length of right neighbor block */ int haveleft; /* have a left neighbor block */ @@ -2003,9 +1985,6 @@ xfs_alloc_get_freelist( xfs_agblock_t bno; /* block number returned */ int error; int logflags; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_get_freelist"; -#endif xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ @@ -2128,9 +2107,6 @@ xfs_alloc_put_freelist( __be32 *blockp;/* pointer to array entry */ int error; int logflags; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_put_freelist"; -#endif xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ @@ -2263,9 +2239,6 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_vextent"; -#endif xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 51ba689a4552..94b5c5fe2681 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -278,7 +278,7 @@ xfs_bmap_isaeof( STATIC void xfs_bmap_trace_addentry( int opcode, /* operation */ - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(ies) */ @@ -292,7 +292,7 @@ xfs_bmap_trace_addentry( */ STATIC void xfs_bmap_trace_delete( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) deleted */ @@ -305,7 +305,7 @@ xfs_bmap_trace_delete( */ STATIC void xfs_bmap_trace_insert( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) inserted */ @@ -319,7 +319,7 @@ xfs_bmap_trace_insert( */ STATIC void xfs_bmap_trace_post_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry updated */ @@ -330,17 +330,25 @@ xfs_bmap_trace_post_update( */ STATIC void xfs_bmap_trace_pre_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry to be updated */ int whichfork); /* data or attr fork */ +#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ + xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w) +#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ + xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w) +#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ + xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w) +#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ + xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w) #else -#define xfs_bmap_trace_delete(f,d,ip,i,c,w) -#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) -#define xfs_bmap_trace_post_update(f,d,ip,i,w) -#define xfs_bmap_trace_pre_update(f,d,ip,i,w) +#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) +#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) +#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) +#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) #endif /* XFS_BMAP_TRACE */ /* @@ -532,9 +540,6 @@ xfs_bmap_add_extent( xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent"; -#endif xfs_ifork_t *ifp; /* inode fork ptr */ int logflags; /* returned value */ xfs_extnum_t nextents; /* number of extents in file now */ @@ -552,8 +557,8 @@ xfs_bmap_add_extent( * already extents in the list. */ if (nextents == 0) { - xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, - NULL, whichfork); + XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL, + whichfork); xfs_iext_insert(ifp, 0, 1, new); ASSERT(cur == NULL); ifp->if_lastex = 0; @@ -711,9 +716,6 @@ xfs_bmap_add_extent_delay_real( int diff; /* temp value */ xfs_bmbt_rec_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_delay_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -809,15 +811,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 2); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents--; @@ -856,15 +857,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -893,16 +893,13 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx + 1, 1); if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -932,11 +929,9 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) @@ -964,17 +959,14 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); ip->i_df.if_lastex = idx - 1; if (cur == NULL) @@ -996,8 +988,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), STARTBLOCKVAL(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = LEFT.br_startoff; @@ -1010,11 +1001,11 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -1047,8 +1038,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx + 1); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1061,17 +1051,14 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1092,8 +1079,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), STARTBLOCKVAL(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = PREV.br_startoff; @@ -1107,10 +1093,10 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, + XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 1, new); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; @@ -1142,7 +1128,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1156,7 +1142,7 @@ xfs_bmap_add_extent_delay_real( * This case is avoided almost all the time. */ temp = new->br_startoff - PREV.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, temp); r[0] = *new; r[1].br_state = PREV.br_state; @@ -1164,7 +1150,7 @@ xfs_bmap_add_extent_delay_real( r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_blockcount = temp2; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 2, &r[0]); ip->i_df.if_lastex = idx + 1; @@ -1223,13 +1209,11 @@ xfs_bmap_add_extent_delay_real( } ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), NULLSTARTBLOCK((int)temp2)); - xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); *dnew = temp + temp2; /* DELTA: One in-core extent is split in three. */ temp = PREV.br_startoff; @@ -1288,9 +1272,6 @@ xfs_bmap_add_extent_unwritten_real( xfs_btree_cur_t *cur; /* btree cursor */ xfs_bmbt_rec_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_unwritten_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -1391,15 +1372,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 2); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents -= 2; @@ -1442,15 +1422,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); ip->i_d.di_nextents--; if (cur == NULL) @@ -1485,16 +1464,15 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx + 1, 1); ip->i_d.di_nextents--; if (cur == NULL) @@ -1530,10 +1508,10 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; if (cur == NULL) @@ -1560,21 +1538,21 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; if (cur == NULL) @@ -1611,15 +1589,15 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -1654,18 +1632,18 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx + 1; if (cur == NULL) @@ -1701,12 +1679,12 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, + XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 1, new); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; @@ -1745,17 +1723,17 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], XFS_DATA_FORK); xfs_iext_insert(ifp, idx + 1, 2, &r[0]); ip->i_df.if_lastex = idx + 1; @@ -1846,9 +1824,6 @@ xfs_bmap_add_extent_hole_delay( int rsvd) /* OK to allocate reserved blocks */ { xfs_bmbt_rec_t *ep; /* extent record for idx */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_delay"; -#endif xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1920,7 +1895,7 @@ xfs_bmap_add_extent_hole_delay( */ temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = STARTBLOCKVAL(left.br_startblock) + @@ -1929,10 +1904,9 @@ xfs_bmap_add_extent_hole_delay( newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1, + XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); + XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); xfs_iext_remove(ifp, idx, 1); ip->i_df.if_lastex = idx - 1; /* DELTA: Two in-core extents were replaced by one. */ @@ -1947,7 +1921,7 @@ xfs_bmap_add_extent_hole_delay( * Merge the new allocation with the left neighbor. */ temp = left.br_blockcount + new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = STARTBLOCKVAL(left.br_startblock) + @@ -1955,7 +1929,7 @@ xfs_bmap_add_extent_hole_delay( newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; /* DELTA: One in-core extent grew into a hole. */ @@ -1969,14 +1943,14 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); temp = new->br_blockcount + right.br_blockcount; oldlen = STARTBLOCKVAL(new->br_startblock) + STARTBLOCKVAL(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_allf(ep, new->br_startoff, NULLSTARTBLOCK((int)newlen), temp, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); + XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; /* DELTA: One in-core extent grew into a hole. */ temp2 = temp; @@ -1990,7 +1964,7 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, XFS_DATA_FORK); xfs_iext_insert(ifp, idx, 1, new); ip->i_df.if_lastex = idx; @@ -2040,9 +2014,6 @@ xfs_bmap_add_extent_hole_real( { xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -2119,15 +2090,14 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, whichfork); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount + right.br_blockcount); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, whichfork); - xfs_bmap_trace_delete(fname, "LC|RC", ip, - idx, 1, whichfork); + XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork); xfs_iext_remove(ifp, idx, 1); ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, @@ -2169,10 +2139,10 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); ifp->if_lastex = idx - 1; if (cur == NULL) { rval = XFS_ILOG_FEXT(whichfork); @@ -2203,11 +2173,11 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork); xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); ifp->if_lastex = idx; if (cur == NULL) { rval = XFS_ILOG_FEXT(whichfork); @@ -2238,8 +2208,7 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, - whichfork); + XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork); xfs_iext_insert(ifp, idx, 1, new); ifp->if_lastex = idx; XFS_IFORK_NEXT_SET(ip, whichfork, @@ -3104,9 +3073,6 @@ xfs_bmap_del_extent( xfs_bmbt_rec_t *ep; /* current extent entry pointer */ int error; /* error return value */ int flags; /* inode logging flags */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_del_extent"; -#endif xfs_bmbt_irec_t got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -3200,7 +3166,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); + XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork); xfs_iext_remove(ifp, idx, 1); ifp->if_lastex = idx; if (delay) @@ -3221,7 +3187,7 @@ xfs_bmap_del_extent( /* * Deleting the first part of the extent. */ - xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); @@ -3230,13 +3196,13 @@ xfs_bmap_del_extent( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "2", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); if (!cur) { flags |= XFS_ILOG_FEXT(whichfork); break; @@ -3252,19 +3218,19 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork); xfs_bmbt_set_blockcount(ep, temp); ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "1", ip, idx, + XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); da_new = temp; break; } - xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); if (!cur) { flags |= XFS_ILOG_FEXT(whichfork); break; @@ -3281,7 +3247,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. */ temp = del->br_startoff - got.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); + XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3368,8 +3334,8 @@ xfs_bmap_del_extent( } } } - xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, + XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork); + XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL, whichfork); xfs_iext_insert(ifp, idx + 1, 1, &new); ifp->if_lastex = idx + 1; @@ -3609,9 +3575,6 @@ xfs_bmap_local_to_extents( { int error; /* error return value */ int flags; /* logging flags returned */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_local_to_extents"; -#endif xfs_ifork_t *ifp; /* inode fork pointer */ /* @@ -3666,7 +3629,7 @@ xfs_bmap_local_to_extents( xfs_iext_add(ifp, 0, 1); ep = xfs_iext_get_ext(ifp, 0); xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); + XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, 1); ip->i_d.di_nblocks = 1; XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, @@ -3789,7 +3752,7 @@ ktrace_t *xfs_bmap_trace_buf; STATIC void xfs_bmap_trace_addentry( int opcode, /* operation */ - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(ies) */ @@ -3848,7 +3811,7 @@ xfs_bmap_trace_addentry( */ STATIC void xfs_bmap_trace_delete( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) deleted */ @@ -3870,7 +3833,7 @@ xfs_bmap_trace_delete( */ STATIC void xfs_bmap_trace_insert( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry(entries) inserted */ @@ -3899,7 +3862,7 @@ xfs_bmap_trace_insert( */ STATIC void xfs_bmap_trace_post_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry updated */ @@ -3917,7 +3880,7 @@ xfs_bmap_trace_post_update( */ STATIC void xfs_bmap_trace_pre_update( - char *fname, /* function name */ + const char *fname, /* function name */ char *desc, /* operation description */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index of entry to be updated */ @@ -4534,9 +4497,6 @@ xfs_bmap_read_extents( xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_read_extents"; -#endif xfs_extnum_t i, j; /* index into the extents list */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ @@ -4653,7 +4613,7 @@ xfs_bmap_read_extents( } ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_bmap_trace_exlist(fname, ip, i, whichfork); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: xfs_trans_brelse(tp, bp); @@ -4666,7 +4626,7 @@ error0: */ void xfs_bmap_trace_exlist( - char *fname, /* function name */ + const char *fname, /* function name */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ int whichfork) /* data or attr fork */ @@ -4681,7 +4641,7 @@ xfs_bmap_trace_exlist( for (idx = 0; idx < cnt; idx++) { ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(ep, &s); - xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL, + XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL, whichfork); } } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 4f24c7e39b31..524b1c9d5246 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -144,12 +144,14 @@ extern ktrace_t *xfs_bmap_trace_buf; */ void xfs_bmap_trace_exlist( - char *fname, /* function name */ + const char *fname, /* function name */ struct xfs_inode *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in list */ int whichfork); /* data or attr fork */ +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w) #else -#define xfs_bmap_trace_exlist(f,ip,c,w) +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 0bf192fea3eb..89b891f51cfb 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -76,7 +76,7 @@ static char EXIT[] = "exit"; */ STATIC void xfs_bmbt_trace_enter( - char *func, + const char *func, xfs_btree_cur_t *cur, char *s, int type, @@ -117,7 +117,7 @@ xfs_bmbt_trace_enter( */ STATIC void xfs_bmbt_trace_argbi( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_buf_t *b, int i, @@ -134,7 +134,7 @@ xfs_bmbt_trace_argbi( */ STATIC void xfs_bmbt_trace_argbii( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_buf_t *b, int i0, @@ -153,7 +153,7 @@ xfs_bmbt_trace_argbii( */ STATIC void xfs_bmbt_trace_argfffi( - char *func, + const char *func, xfs_btree_cur_t *cur, xfs_dfiloff_t o, xfs_dfsbno_t b, @@ -172,7 +172,7 @@ xfs_bmbt_trace_argfffi( */ STATIC void xfs_bmbt_trace_argi( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, int line) @@ -188,7 +188,7 @@ xfs_bmbt_trace_argi( */ STATIC void xfs_bmbt_trace_argifk( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_fsblock_t f, @@ -206,7 +206,7 @@ xfs_bmbt_trace_argifk( */ STATIC void xfs_bmbt_trace_argifr( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_fsblock_t f, @@ -235,7 +235,7 @@ xfs_bmbt_trace_argifr( */ STATIC void xfs_bmbt_trace_argik( - char *func, + const char *func, xfs_btree_cur_t *cur, int i, xfs_bmbt_key_t *k, @@ -255,7 +255,7 @@ xfs_bmbt_trace_argik( */ STATIC void xfs_bmbt_trace_cursor( - char *func, + const char *func, xfs_btree_cur_t *cur, char *s, int line) @@ -274,21 +274,21 @@ xfs_bmbt_trace_cursor( } #define XFS_BMBT_TRACE_ARGBI(c,b,i) \ - xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__) + xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ - xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__) + xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ - xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) + xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGI(c,i) \ - xfs_bmbt_trace_argi(fname, c, i, __LINE__) + xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__) #define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ - xfs_bmbt_trace_argifk(fname, c, i, f, s, __LINE__) + xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ - xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) + xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__) #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ - xfs_bmbt_trace_argik(fname, c, i, k, __LINE__) + xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__) #define XFS_BMBT_TRACE_CURSOR(c,s) \ - xfs_bmbt_trace_cursor(fname, c, s, __LINE__) + xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__) #else #define XFS_BMBT_TRACE_ARGBI(c,b,i) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) @@ -318,9 +318,6 @@ xfs_bmbt_delrec( xfs_fsblock_t bno; /* fs-relative block number */ xfs_buf_t *bp; /* buffer for block */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delrec"; -#endif int i; /* loop counter */ int j; /* temp state */ xfs_bmbt_key_t key; /* bmap btree key */ @@ -694,9 +691,6 @@ xfs_bmbt_insrec( xfs_bmbt_block_t *block; /* bmap btree block */ xfs_buf_t *bp; /* buffer for block */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insrec"; -#endif int i; /* loop index */ xfs_bmbt_key_t key; /* bmap btree key */ xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ @@ -880,9 +874,6 @@ xfs_bmbt_killroot( xfs_bmbt_ptr_t *cpp; #ifdef DEBUG int error; -#endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_killroot"; #endif int i; xfs_bmbt_key_t *kp; @@ -973,9 +964,6 @@ xfs_bmbt_log_keys( int kfirst, int klast) { -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_keys"; -#endif xfs_trans_t *tp; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); @@ -1012,9 +1000,6 @@ xfs_bmbt_log_ptrs( int pfirst, int plast) { -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_ptrs"; -#endif xfs_trans_t *tp; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); @@ -1055,9 +1040,6 @@ xfs_bmbt_lookup( xfs_daddr_t d; xfs_sfiloff_t diff; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lookup"; -#endif xfs_fsblock_t fsbno=0; int high; int i; @@ -1195,9 +1177,6 @@ xfs_bmbt_lshift( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lshift"; -#endif #ifdef DEBUG int i; /* loop counter */ #endif @@ -1331,9 +1310,6 @@ xfs_bmbt_rshift( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_rshift"; -#endif int i; /* loop counter */ xfs_bmbt_key_t key; /* bmap btree key */ xfs_buf_t *lbp; /* left buffer pointer */ @@ -1492,9 +1468,6 @@ xfs_bmbt_split( { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_split"; -#endif int i; /* loop counter */ xfs_fsblock_t lbno; /* left sibling block number */ xfs_buf_t *lbp; /* left buffer pointer */ @@ -1640,9 +1613,6 @@ xfs_bmbt_updkey( xfs_buf_t *bp; #ifdef DEBUG int error; -#endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_updkey"; #endif xfs_bmbt_key_t *kp; int ptr; @@ -1712,9 +1682,6 @@ xfs_bmbt_decrement( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_decrement"; -#endif xfs_fsblock_t fsbno; int lev; xfs_mount_t *mp; @@ -1785,9 +1752,6 @@ xfs_bmbt_delete( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delete"; -#endif int i; int level; @@ -2000,9 +1964,6 @@ xfs_bmbt_increment( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_increment"; -#endif xfs_fsblock_t fsbno; int lev; xfs_mount_t *mp; @@ -2080,9 +2041,6 @@ xfs_bmbt_insert( int *stat) /* success/failure */ { int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insert"; -#endif int i; int level; xfs_fsblock_t nbno; @@ -2142,9 +2100,6 @@ xfs_bmbt_log_block( int fields) { int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_block"; -#endif int last; xfs_trans_t *tp; static const short offsets[] = { @@ -2181,9 +2136,6 @@ xfs_bmbt_log_recs( { xfs_bmbt_block_t *block; int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_recs"; -#endif int last; xfs_bmbt_rec_t *rp; xfs_trans_t *tp; @@ -2245,9 +2197,6 @@ xfs_bmbt_newroot( xfs_bmbt_key_t *ckp; /* child key pointer */ xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ int error; /* error return code */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_newroot"; -#endif #ifdef DEBUG int i; /* loop counter */ #endif @@ -2630,9 +2579,6 @@ xfs_bmbt_update( xfs_bmbt_block_t *block; xfs_buf_t *bp; int error; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_update"; -#endif xfs_bmbt_key_t key; int ptr; xfs_bmbt_rec_t *rp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 59829523ab07..cdc4c28926d0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -645,8 +645,7 @@ xfs_iformat_extents( ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), ARCH_CONVERT); } - xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, - whichfork); + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); if (whichfork != XFS_DATA_FORK || XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) if (unlikely(xfs_check_nostate_extents( @@ -2894,9 +2893,6 @@ xfs_iextents_copy( int copied; xfs_bmbt_rec_t *dest_ep; xfs_bmbt_rec_t *ep; -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_iextents_copy"; -#endif int i; xfs_ifork_t *ifp; int nrecs; @@ -2907,7 +2903,7 @@ xfs_iextents_copy( ASSERT(ifp->if_bytes > 0); nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); /* -- cgit v1.2.1 From 547e00c3c681265b1fe5e34c7643f3ddac748ba0 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Wed, 11 Jul 2007 11:09:57 +1000 Subject: [XFS] Compat ioctl handler for XFS_IOC_FSGEOMETRY_V1. i386 struct xfs_fsop_geom_v1 has no padding after the last member, so the size is different. SGI-PV: 967354 SGI-Modid: xfs-linux-melb:xfs-kern:29100a Signed-off-by: Michal Marek Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_ioctl32.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index b83cebc165f1..5b91335e97dd 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -75,6 +75,42 @@ xfs_ioctl32_flock( return (unsigned long)p; } +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR ('X', 100, struct compat_xfs_fsop_geom_v1) + +STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg) +{ + compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; + xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(p, p32, sizeof(*p32))) + return -EFAULT; + return (unsigned long)p; +} + #else typedef struct xfs_fsop_bulkreq32 { @@ -118,7 +154,6 @@ xfs_compat_ioctl( switch (cmd) { case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V1: case XFS_IOC_FSGEOMETRY: case XFS_IOC_GETVERSION: case XFS_IOC_GETXFLAGS: @@ -166,6 +201,10 @@ xfs_compat_ioctl( arg = xfs_ioctl32_flock(arg); cmd = _NATIVE_IOC(cmd, struct xfs_flock64); break; + case XFS_IOC_FSGEOMETRY_V1_32: + arg = xfs_ioctl32_geom_v1(arg); + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1); + break; #else /* These are handled fine if no alignment issues */ case XFS_IOC_ALLOCSP: @@ -176,6 +215,7 @@ xfs_compat_ioctl( case XFS_IOC_FREESP64: case XFS_IOC_RESVSP64: case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: break; /* xfs_bstat_t still has wrong u32 vs u64 alignment */ -- cgit v1.2.1 From 1fa503df66f7bffc0ff62662626897eec79446c2 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Wed, 11 Jul 2007 11:10:09 +1000 Subject: [XFS] Compat ioctl handler for handle operations 32bit struct xfs_fsop_handlereq has different size and offsets (due to pointers). TODO: case XFS_IOC_{FSSETDM,ATTRLIST,ATTRMULTI}_BY_HANDLE still not handled. SGI-PV: 967354 SGI-Modid: xfs-linux-melb:xfs-kern:29101a Signed-off-by: Michal Marek Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_ioctl32.c | 57 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 5b91335e97dd..87163fb9746b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -141,6 +141,50 @@ xfs_ioctl32_bulkstat( } #endif +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg) +{ + compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; + xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); + u32 addr; + + if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || + get_user(addr, &p32->path) || + put_user(compat_ptr(addr), &p->path) || + copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || + get_user(addr, &p32->ihandle) || + put_user(compat_ptr(addr), &p->ihandle) || + copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || + get_user(addr, &p32->ohandle) || + put_user(compat_ptr(addr), &p->ohandle) || + get_user(addr, &p32->ohandlen) || + put_user(compat_ptr(addr), &p->ohandlen)) + return -EFAULT; + + return (unsigned long)p; +} + + STATIC long xfs_compat_ioctl( int mode, @@ -166,12 +210,7 @@ xfs_compat_ioctl( case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: /* not handled - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_OPEN_BY_HANDLE: case XFS_IOC_FSSETDM_BY_HANDLE: - case XFS_IOC_READLINK_BY_HANDLE: case XFS_IOC_ATTRLIST_BY_HANDLE: case XFS_IOC_ATTRMULTI_BY_HANDLE: */ @@ -228,6 +267,14 @@ xfs_compat_ioctl( arg = xfs_ioctl32_bulkstat(arg); break; #endif + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: + case XFS_IOC_OPEN_BY_HANDLE_32: + case XFS_IOC_READLINK_BY_HANDLE_32: + arg = xfs_ioctl32_fshandle(arg); + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); + break; default: return -ENOIOCTLCMD; } -- cgit v1.2.1 From faa63e9584df41020440756b8b90b7b63f95e4f6 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Wed, 11 Jul 2007 11:10:19 +1000 Subject: [XFS] Fix XFS_IOC_FSBULKSTAT{,_SINGLE} & XFS_IOC_FSINUMBERS in compat mode * 32bit struct xfs_fsop_bulkreq has different size and layout of members, no matter the alignment. Move the code out of the #else branch (why was it there in the first place?). Define _32 variants of the ioctl constants. * 32bit struct xfs_bstat is different because of time_t and on i386 because of different padding. Make xfs_bulkstat_one() accept a custom "output formatter" in the private_data argument which takes care of the xfs_bulkstat_one_compat() that takes care of the different layout in the compat case. * i386 struct xfs_inogrp has different padding. Add a similar "output formatter" mecanism to xfs_inumbers(). SGI-PV: 967354 SGI-Modid: xfs-linux-melb:xfs-kern:29102a Signed-off-by: Michal Marek Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- fs/xfs/linux-2.6/xfs_ioctl32.c | 224 ++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_itable.c | 42 ++++++-- fs/xfs/xfs_itable.h | 20 +++- 4 files changed, 255 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ff5c41ff8d40..5917808abbd6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1019,7 +1019,7 @@ xfs_ioc_bulkstat( if (cmd == XFS_IOC_FSINUMBERS) error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer); + bulkreq.ubuffer, xfs_inumbers_fmt); else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) error = xfs_bulkstat_single(mp, &inlast, bulkreq.ubuffer, &done); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 87163fb9746b..141cf15067c2 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -23,10 +23,25 @@ #include #include #include "xfs.h" -#include "xfs_types.h" #include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir2_sf.h" #include "xfs_vfs.h" #include "xfs_vnode.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" #include "xfs_dfrag.h" #define _NATIVE_IOC(cmd, type) \ @@ -34,6 +49,7 @@ #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) #define BROKEN_X86_ALIGNMENT +#define _PACKED __attribute__((packed)) /* on ia32 l_start is on a 32-bit boundary */ typedef struct xfs_flock64_32 { __s16 l_type; @@ -111,35 +127,196 @@ STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg) return (unsigned long)p; } +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +STATIC int xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -EFAULT; + } + *written = count * sizeof(*p32); + return 0; +} + #else -typedef struct xfs_fsop_bulkreq32 { +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#define _PACKED + +#endif + +/* XFS_IOC_FSBULKSTAT and friends */ + +typedef struct compat_xfs_bstime { + __s32 tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +STATIC int xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -EFAULT; + return 0; +} + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid; /* project id */ + unsigned char bs_pad[14]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} _PACKED compat_xfs_bstat_t; + +STATIC int xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return -EFAULT; + return sizeof(*p32); +} + + + +typedef struct compat_xfs_fsop_bulkreq { compat_uptr_t lastip; /* last inode # pointer */ __s32 icount; /* count of entries in buffer */ compat_uptr_t ubuffer; /* user buffer for inode desc. */ - __s32 ocount; /* output count pointer */ -} xfs_fsop_bulkreq32_t; + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; -STATIC unsigned long -xfs_ioctl32_bulkstat( - unsigned long arg) +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_ioc_bulkstat_compat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) { - xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; - xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p)); + compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg; u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ - if (get_user(addr, &p32->lastip) || - put_user(compat_ptr(addr), &p->lastip) || - copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || - get_user(addr, &p32->ubuffer) || - put_user(compat_ptr(addr), &p->ubuffer) || - get_user(addr, &p32->ocount) || - put_user(compat_ptr(addr), &p->ocount)) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (get_user(addr, &p32->lastip)) + return -EFAULT; + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) return -EFAULT; + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -EFAULT; + bulkreq.ocount = compat_ptr(addr); - return (unsigned long)p; + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + else { + /* declare a var to get a warning in case the type changes */ + bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one, formatter, + sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); + } + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; } -#endif + + typedef struct compat_xfs_fsop_handlereq { __u32 fd; /* fd for FD_TO_HANDLE */ @@ -261,12 +438,13 @@ xfs_compat_ioctl( case XFS_IOC_SWAPEXT: break; - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - arg = xfs_ioctl32_bulkstat(arg); - break; #endif + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); + return xfs_ioc_bulkstat_compat(XFS_BHVTOI(VNHEAD(vp))->i_mount, + cmd, (void*)arg); case XFS_IOC_FD_TO_HANDLE_32: case XFS_IOC_PATH_TO_HANDLE_32: case XFS_IOC_PATH_TO_FSHANDLE_32: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index e725ddd3de5f..4c2454bcc714 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -202,6 +202,16 @@ xfs_bulkstat_one_dinode( return 0; } +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + const xfs_bstat_t *buffer) +{ + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return -EFAULT; + return sizeof(*buffer); +} + /* * Return stat information for one inode. * Return 0 if ok, else errno. @@ -221,6 +231,7 @@ xfs_bulkstat_one( xfs_bstat_t *buf; /* return buffer */ int error = 0; /* error value */ xfs_dinode_t *dip; /* dinode inode pointer */ + bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt; dip = (xfs_dinode_t *)dibuff; *stat = BULKSTAT_RV_NOTHING; @@ -243,14 +254,15 @@ xfs_bulkstat_one( xfs_bulkstat_one_dinode(mp, ino, dip, buf); } - if (copy_to_user(buffer, buf, sizeof(*buf))) { + error = formatter(buffer, buf); + if (error < 0) { error = EFAULT; goto out_free; } *stat = BULKSTAT_RV_DIDONE; if (ubused) - *ubused = sizeof(*buf); + *ubused = error; out_free: kmem_free(buf, sizeof(*buf)); @@ -748,6 +760,19 @@ xfs_bulkstat_single( return 0; } +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + /* * Return inode number table for the filesystem. */ @@ -756,7 +781,8 @@ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) { xfs_buf_t *agbp; xfs_agino_t agino; @@ -835,12 +861,12 @@ xfs_inumbers( bufidx++; left--; if (bufidx == bcount) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) { error = XFS_ERROR(EFAULT); break; } - ubuffer += bufidx; + ubuffer += written; *count += bufidx; bufidx = 0; } @@ -862,8 +888,8 @@ xfs_inumbers( } if (!error) { if (bufidx) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) error = XFS_ERROR(EFAULT); else *count += bufidx; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index f25a28862a17..a1f18fce9b70 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -69,6 +69,10 @@ xfs_bulkstat_single( char __user *buffer, int *done); +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + const xfs_bstat_t *buffer); /* buffer to read from */ + int xfs_bulkstat_one( xfs_mount_t *mp, @@ -86,11 +90,25 @@ xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino); +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + int /* error status */ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *last, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *buffer);/* buffer with inode info */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); #endif /* __XFS_ITABLE_H__ */ -- cgit v1.2.1 From 0f1145cc18e970ebe37da114fc34c297f135e062 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Fri, 29 Jun 2007 17:26:09 +1000 Subject: [XFS] Fix lockdep annotations for xfs_lock_inodes SGI-PV: 967035 SGI-Modid: xfs-linux-melb:xfs-kern:29026a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/xfs_inode.h | 15 +++++++++------ fs/xfs/xfs_vnodeops.c | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d418eeed4ebd..012dfd4a958c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -415,19 +415,22 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) * gets a lockdep subclass of 1 and the second lock will have a lockdep * subclass of 0. * - * XFS_I[O]LOCK_INUMORDER - for locking several inodes at the some time + * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. * So the first lock acquired will have a lockdep subclass of 2, the - * second lock will have a lockdep subclass of 3, and so on. + * second lock will have a lockdep subclass of 3, and so on. It is + * the responsibility of the class builder to shift this to the correct + * portion of the lock_mode lockdep mask. */ +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_INUMORDER 2 + #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT (1 << XFS_IOLOCK_SHIFT) -#define XFS_IOLOCK_INUMORDER (2 << XFS_IOLOCK_SHIFT) +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) #define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT (1 << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_INUMORDER (2 << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) #define XFS_IOLOCK_DEP_MASK 0x00ff0000 #define XFS_ILOCK_DEP_MASK 0xff000000 diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 60fd0be90a16..79b522779aa4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2224,9 +2224,9 @@ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) - lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT; + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; return lock_mode; } -- cgit v1.2.1 From bd238fb431f31989898423c8b6496bc8c4204a86 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Tue, 10 Jul 2007 17:57:28 -0500 Subject: 9p: Reorganization of 9p file system code This patchset moves non-filesystem interfaces of v9fs from fs/9p to net/9p. It moves the transport, packet marshalling and connection layers to net/9p leaving only the VFS related files in fs/9p. This work is being done in preparation for in-kernel 9p servers as well as alternate 9p clients (other than VFS). Signed-off-by: Latchesar Ionkov Signed-off-by: Eric Van Hensbergen --- fs/9p/9p.h | 375 ------------------- fs/9p/Makefile | 6 - fs/9p/conv.c | 845 ------------------------------------------ fs/9p/conv.h | 50 --- fs/9p/debug.h | 77 ---- fs/9p/error.c | 93 ----- fs/9p/error.h | 177 --------- fs/9p/fcall.c | 427 ---------------------- fs/9p/fcprint.c | 345 ------------------ fs/9p/fid.c | 168 +++------ fs/9p/fid.h | 43 +-- fs/9p/mux.c | 1033 ---------------------------------------------------- fs/9p/mux.h | 55 --- fs/9p/trans_fd.c | 308 ---------------- fs/9p/transport.h | 45 --- fs/9p/v9fs.c | 288 +++------------ fs/9p/v9fs.h | 32 +- fs/9p/v9fs_vfs.h | 6 +- fs/9p/vfs_addr.c | 57 +-- fs/9p/vfs_dentry.c | 37 +- fs/9p/vfs_dir.c | 155 ++------ fs/9p/vfs_file.c | 160 ++------ fs/9p/vfs_inode.c | 753 ++++++++++++++------------------------ fs/9p/vfs_super.c | 91 ++--- fs/Kconfig | 2 +- 25 files changed, 553 insertions(+), 5075 deletions(-) delete mode 100644 fs/9p/9p.h delete mode 100644 fs/9p/conv.c delete mode 100644 fs/9p/conv.h delete mode 100644 fs/9p/debug.h delete mode 100644 fs/9p/error.c delete mode 100644 fs/9p/error.h delete mode 100644 fs/9p/fcall.c delete mode 100644 fs/9p/fcprint.c delete mode 100644 fs/9p/mux.c delete mode 100644 fs/9p/mux.h delete mode 100644 fs/9p/trans_fd.c delete mode 100644 fs/9p/transport.h (limited to 'fs') diff --git a/fs/9p/9p.h b/fs/9p/9p.h deleted file mode 100644 index 94e2f92ab2e8..000000000000 --- a/fs/9p/9p.h +++ /dev/null @@ -1,375 +0,0 @@ -/* - * linux/fs/9p/9p.h - * - * 9P protocol definitions. - * - * Copyright (C) 2005 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -/* Message Types */ -enum { - TVERSION = 100, - RVERSION, - TAUTH = 102, - RAUTH, - TATTACH = 104, - RATTACH, - TERROR = 106, - RERROR, - TFLUSH = 108, - RFLUSH, - TWALK = 110, - RWALK, - TOPEN = 112, - ROPEN, - TCREATE = 114, - RCREATE, - TREAD = 116, - RREAD, - TWRITE = 118, - RWRITE, - TCLUNK = 120, - RCLUNK, - TREMOVE = 122, - RREMOVE, - TSTAT = 124, - RSTAT, - TWSTAT = 126, - RWSTAT, -}; - -/* modes */ -enum { - V9FS_OREAD = 0x00, - V9FS_OWRITE = 0x01, - V9FS_ORDWR = 0x02, - V9FS_OEXEC = 0x03, - V9FS_OEXCL = 0x04, - V9FS_OTRUNC = 0x10, - V9FS_OREXEC = 0x20, - V9FS_ORCLOSE = 0x40, - V9FS_OAPPEND = 0x80, -}; - -/* permissions */ -enum { - V9FS_DMDIR = 0x80000000, - V9FS_DMAPPEND = 0x40000000, - V9FS_DMEXCL = 0x20000000, - V9FS_DMMOUNT = 0x10000000, - V9FS_DMAUTH = 0x08000000, - V9FS_DMTMP = 0x04000000, - V9FS_DMSYMLINK = 0x02000000, - V9FS_DMLINK = 0x01000000, - /* 9P2000.u extensions */ - V9FS_DMDEVICE = 0x00800000, - V9FS_DMNAMEDPIPE = 0x00200000, - V9FS_DMSOCKET = 0x00100000, - V9FS_DMSETUID = 0x00080000, - V9FS_DMSETGID = 0x00040000, -}; - -/* qid.types */ -enum { - V9FS_QTDIR = 0x80, - V9FS_QTAPPEND = 0x40, - V9FS_QTEXCL = 0x20, - V9FS_QTMOUNT = 0x10, - V9FS_QTAUTH = 0x08, - V9FS_QTTMP = 0x04, - V9FS_QTSYMLINK = 0x02, - V9FS_QTLINK = 0x01, - V9FS_QTFILE = 0x00, -}; - -#define V9FS_NOTAG (u16)(~0) -#define V9FS_NOFID (u32)(~0) -#define V9FS_MAXWELEM 16 - -/* ample room for Twrite/Rread header (iounit) */ -#define V9FS_IOHDRSZ 24 - -struct v9fs_str { - u16 len; - char *str; -}; - -/* qids are the unique ID for a file (like an inode */ -struct v9fs_qid { - u8 type; - u32 version; - u64 path; -}; - -/* Plan 9 file metadata (stat) structure */ -struct v9fs_stat { - u16 size; - u16 type; - u32 dev; - struct v9fs_qid qid; - u32 mode; - u32 atime; - u32 mtime; - u64 length; - struct v9fs_str name; - struct v9fs_str uid; - struct v9fs_str gid; - struct v9fs_str muid; - struct v9fs_str extension; /* 9p2000.u extensions */ - u32 n_uid; /* 9p2000.u extensions */ - u32 n_gid; /* 9p2000.u extensions */ - u32 n_muid; /* 9p2000.u extensions */ -}; - -/* file metadata (stat) structure used to create Twstat message - The is similar to v9fs_stat, but the strings don't point to - the same memory block and should be freed separately -*/ -struct v9fs_wstat { - u16 size; - u16 type; - u32 dev; - struct v9fs_qid qid; - u32 mode; - u32 atime; - u32 mtime; - u64 length; - char *name; - char *uid; - char *gid; - char *muid; - char *extension; /* 9p2000.u extensions */ - u32 n_uid; /* 9p2000.u extensions */ - u32 n_gid; /* 9p2000.u extensions */ - u32 n_muid; /* 9p2000.u extensions */ -}; - -/* Structures for Protocol Operations */ - -struct Tversion { - u32 msize; - struct v9fs_str version; -}; - -struct Rversion { - u32 msize; - struct v9fs_str version; -}; - -struct Tauth { - u32 afid; - struct v9fs_str uname; - struct v9fs_str aname; -}; - -struct Rauth { - struct v9fs_qid qid; -}; - -struct Rerror { - struct v9fs_str error; - u32 errno; /* 9p2000.u extension */ -}; - -struct Tflush { - u16 oldtag; -}; - -struct Rflush { -}; - -struct Tattach { - u32 fid; - u32 afid; - struct v9fs_str uname; - struct v9fs_str aname; -}; - -struct Rattach { - struct v9fs_qid qid; -}; - -struct Twalk { - u32 fid; - u32 newfid; - u16 nwname; - struct v9fs_str wnames[16]; -}; - -struct Rwalk { - u16 nwqid; - struct v9fs_qid wqids[16]; -}; - -struct Topen { - u32 fid; - u8 mode; -}; - -struct Ropen { - struct v9fs_qid qid; - u32 iounit; -}; - -struct Tcreate { - u32 fid; - struct v9fs_str name; - u32 perm; - u8 mode; - struct v9fs_str extension; -}; - -struct Rcreate { - struct v9fs_qid qid; - u32 iounit; -}; - -struct Tread { - u32 fid; - u64 offset; - u32 count; -}; - -struct Rread { - u32 count; - u8 *data; -}; - -struct Twrite { - u32 fid; - u64 offset; - u32 count; - u8 *data; -}; - -struct Rwrite { - u32 count; -}; - -struct Tclunk { - u32 fid; -}; - -struct Rclunk { -}; - -struct Tremove { - u32 fid; -}; - -struct Rremove { -}; - -struct Tstat { - u32 fid; -}; - -struct Rstat { - struct v9fs_stat stat; -}; - -struct Twstat { - u32 fid; - struct v9fs_stat stat; -}; - -struct Rwstat { -}; - -/* - * fcall is the primary packet structure - * - */ - -struct v9fs_fcall { - u32 size; - u8 id; - u16 tag; - void *sdata; - - union { - struct Tversion tversion; - struct Rversion rversion; - struct Tauth tauth; - struct Rauth rauth; - struct Rerror rerror; - struct Tflush tflush; - struct Rflush rflush; - struct Tattach tattach; - struct Rattach rattach; - struct Twalk twalk; - struct Rwalk rwalk; - struct Topen topen; - struct Ropen ropen; - struct Tcreate tcreate; - struct Rcreate rcreate; - struct Tread tread; - struct Rread rread; - struct Twrite twrite; - struct Rwrite rwrite; - struct Tclunk tclunk; - struct Rclunk rclunk; - struct Tremove tremove; - struct Rremove rremove; - struct Tstat tstat; - struct Rstat rstat; - struct Twstat twstat; - struct Rwstat rwstat; - } params; -}; - -#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \ - fcall?fcall->params.rerror.error.len:0, \ - fcall?fcall->params.rerror.error.str:""); - -int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, - char *version, struct v9fs_fcall **rcall); - -int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, - u32 fid, u32 afid, struct v9fs_fcall **rcall); - -int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid); - -int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcall); - -int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_wstat *wstat, struct v9fs_fcall **rcall); - -int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, - char *name, struct v9fs_fcall **rcall); - -int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, - struct v9fs_fcall **rcall); - -int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcall); - -int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, - u32 perm, u8 mode, char *extension, struct v9fs_fcall **rcall); - -int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, - u64 offset, u32 count, struct v9fs_fcall **rcall); - -int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, const char __user * data, - struct v9fs_fcall **rcall); -int v9fs_printfcall(char *, int, struct v9fs_fcall *, int); diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 87897f84dfb6..bc7f0d1551e6 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -1,18 +1,12 @@ obj-$(CONFIG_9P_FS) := 9p.o 9p-objs := \ - trans_fd.o \ - mux.o \ - fcall.o \ - conv.o \ vfs_super.o \ vfs_inode.o \ vfs_addr.o \ vfs_file.o \ vfs_dir.o \ vfs_dentry.o \ - error.o \ v9fs.o \ fid.o \ - fcprint.o diff --git a/fs/9p/conv.c b/fs/9p/conv.c deleted file mode 100644 index a3ed571eee31..000000000000 --- a/fs/9p/conv.c +++ /dev/null @@ -1,845 +0,0 @@ -/* - * linux/fs/9p/conv.c - * - * 9P protocol conversion functions - * - * Copyright (C) 2004, 2005 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include "debug.h" -#include "v9fs.h" -#include "9p.h" -#include "conv.h" - -/* - * Buffer to help with string parsing - */ -struct cbuf { - unsigned char *sp; - unsigned char *p; - unsigned char *ep; -}; - -static inline void buf_init(struct cbuf *buf, void *data, int datalen) -{ - buf->sp = buf->p = data; - buf->ep = data + datalen; -} - -static inline int buf_check_overflow(struct cbuf *buf) -{ - return buf->p > buf->ep; -} - -static int buf_check_size(struct cbuf *buf, int len) -{ - if (buf->p + len > buf->ep) { - if (buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", - len, (int)(buf->ep - buf->p)); - dump_stack(); - buf->p = buf->ep + 1; - } - - return 0; - } - - return 1; -} - -static void *buf_alloc(struct cbuf *buf, int len) -{ - void *ret = NULL; - - if (buf_check_size(buf, len)) { - ret = buf->p; - buf->p += len; - } - - return ret; -} - -static void buf_put_int8(struct cbuf *buf, u8 val) -{ - if (buf_check_size(buf, 1)) { - buf->p[0] = val; - buf->p++; - } -} - -static void buf_put_int16(struct cbuf *buf, u16 val) -{ - if (buf_check_size(buf, 2)) { - *(__le16 *) buf->p = cpu_to_le16(val); - buf->p += 2; - } -} - -static void buf_put_int32(struct cbuf *buf, u32 val) -{ - if (buf_check_size(buf, 4)) { - *(__le32 *)buf->p = cpu_to_le32(val); - buf->p += 4; - } -} - -static void buf_put_int64(struct cbuf *buf, u64 val) -{ - if (buf_check_size(buf, 8)) { - *(__le64 *)buf->p = cpu_to_le64(val); - buf->p += 8; - } -} - -static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) -{ - char *ret; - - ret = NULL; - if (buf_check_size(buf, slen + 2)) { - buf_put_int16(buf, slen); - ret = buf->p; - memcpy(buf->p, s, slen); - buf->p += slen; - } - - return ret; -} - -static inline void buf_put_string(struct cbuf *buf, const char *s) -{ - buf_put_stringn(buf, s, strlen(s)); -} - -static u8 buf_get_int8(struct cbuf *buf) -{ - u8 ret = 0; - - if (buf_check_size(buf, 1)) { - ret = buf->p[0]; - buf->p++; - } - - return ret; -} - -static u16 buf_get_int16(struct cbuf *buf) -{ - u16 ret = 0; - - if (buf_check_size(buf, 2)) { - ret = le16_to_cpu(*(__le16 *)buf->p); - buf->p += 2; - } - - return ret; -} - -static u32 buf_get_int32(struct cbuf *buf) -{ - u32 ret = 0; - - if (buf_check_size(buf, 4)) { - ret = le32_to_cpu(*(__le32 *)buf->p); - buf->p += 4; - } - - return ret; -} - -static u64 buf_get_int64(struct cbuf *buf) -{ - u64 ret = 0; - - if (buf_check_size(buf, 8)) { - ret = le64_to_cpu(*(__le64 *)buf->p); - buf->p += 8; - } - - return ret; -} - -static void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr) -{ - vstr->len = buf_get_int16(buf); - if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) { - vstr->str = buf->p; - buf->p += vstr->len; - } else { - vstr->len = 0; - vstr->str = NULL; - } -} - -static void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid) -{ - qid->type = buf_get_int8(bufp); - qid->version = buf_get_int32(bufp); - qid->path = buf_get_int64(bufp); -} - -/** - * v9fs_size_wstat - calculate the size of a variable length stat struct - * @stat: metadata (stat) structure - * @extended: non-zero if 9P2000.u - * - */ - -static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended) -{ - int size = 0; - - if (wstat == NULL) { - eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n"); - return 0; - } - - size = /* 2 + *//* size[2] */ - 2 + /* type[2] */ - 4 + /* dev[4] */ - 1 + /* qid.type[1] */ - 4 + /* qid.vers[4] */ - 8 + /* qid.path[8] */ - 4 + /* mode[4] */ - 4 + /* atime[4] */ - 4 + /* mtime[4] */ - 8 + /* length[8] */ - 8; /* minimum sum of string lengths */ - - if (wstat->name) - size += strlen(wstat->name); - if (wstat->uid) - size += strlen(wstat->uid); - if (wstat->gid) - size += strlen(wstat->gid); - if (wstat->muid) - size += strlen(wstat->muid); - - if (extended) { - size += 4 + /* n_uid[4] */ - 4 + /* n_gid[4] */ - 4 + /* n_muid[4] */ - 2; /* string length of extension[4] */ - if (wstat->extension) - size += strlen(wstat->extension); - } - - return size; -} - -/** - * buf_get_stat - safely decode a recieved metadata (stat) structure - * @bufp: buffer to deserialize - * @stat: metadata (stat) structure - * @extended: non-zero if 9P2000.u - * - */ - -static void -buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended) -{ - stat->size = buf_get_int16(bufp); - stat->type = buf_get_int16(bufp); - stat->dev = buf_get_int32(bufp); - stat->qid.type = buf_get_int8(bufp); - stat->qid.version = buf_get_int32(bufp); - stat->qid.path = buf_get_int64(bufp); - stat->mode = buf_get_int32(bufp); - stat->atime = buf_get_int32(bufp); - stat->mtime = buf_get_int32(bufp); - stat->length = buf_get_int64(bufp); - buf_get_str(bufp, &stat->name); - buf_get_str(bufp, &stat->uid); - buf_get_str(bufp, &stat->gid); - buf_get_str(bufp, &stat->muid); - - if (extended) { - buf_get_str(bufp, &stat->extension); - stat->n_uid = buf_get_int32(bufp); - stat->n_gid = buf_get_int32(bufp); - stat->n_muid = buf_get_int32(bufp); - } -} - -/** - * v9fs_deserialize_stat - decode a received metadata structure - * @buf: buffer to deserialize - * @buflen: length of received buffer - * @stat: metadata structure to decode into - * @extended: non-zero if 9P2000.u - * - * Note: stat will point to the buf region. - */ - -int -v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - int extended) -{ - struct cbuf buffer; - struct cbuf *bufp = &buffer; - unsigned char *p; - - buf_init(bufp, buf, buflen); - p = bufp->p; - buf_get_stat(bufp, stat, extended); - - if (buf_check_overflow(bufp)) - return 0; - else - return bufp->p - p; -} - -/** - * deserialize_fcall - unmarshal a response - * @buf: recieved buffer - * @buflen: length of received buffer - * @rcall: fcall structure to populate - * @rcalllen: length of fcall structure to populate - * @extended: non-zero if 9P2000.u - * - */ - -int -v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int extended) -{ - - struct cbuf buffer; - struct cbuf *bufp = &buffer; - int i = 0; - - buf_init(bufp, buf, buflen); - - rcall->size = buf_get_int32(bufp); - rcall->id = buf_get_int8(bufp); - rcall->tag = buf_get_int16(bufp); - - dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id, - rcall->tag); - - switch (rcall->id) { - default: - eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id); - return -EPROTO; - case RVERSION: - rcall->params.rversion.msize = buf_get_int32(bufp); - buf_get_str(bufp, &rcall->params.rversion.version); - break; - case RFLUSH: - break; - case RATTACH: - rcall->params.rattach.qid.type = buf_get_int8(bufp); - rcall->params.rattach.qid.version = buf_get_int32(bufp); - rcall->params.rattach.qid.path = buf_get_int64(bufp); - break; - case RWALK: - rcall->params.rwalk.nwqid = buf_get_int16(bufp); - if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) { - eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n", - V9FS_MAXWELEM, rcall->params.rwalk.nwqid); - return -EPROTO; - } - - for (i = 0; i < rcall->params.rwalk.nwqid; i++) - buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]); - break; - case ROPEN: - buf_get_qid(bufp, &rcall->params.ropen.qid); - rcall->params.ropen.iounit = buf_get_int32(bufp); - break; - case RCREATE: - buf_get_qid(bufp, &rcall->params.rcreate.qid); - rcall->params.rcreate.iounit = buf_get_int32(bufp); - break; - case RREAD: - rcall->params.rread.count = buf_get_int32(bufp); - rcall->params.rread.data = bufp->p; - buf_check_size(bufp, rcall->params.rread.count); - break; - case RWRITE: - rcall->params.rwrite.count = buf_get_int32(bufp); - break; - case RCLUNK: - break; - case RREMOVE: - break; - case RSTAT: - buf_get_int16(bufp); - buf_get_stat(bufp, &rcall->params.rstat.stat, extended); - break; - case RWSTAT: - break; - case RERROR: - buf_get_str(bufp, &rcall->params.rerror.error); - if (extended) - rcall->params.rerror.errno = buf_get_int16(bufp); - break; - } - - if (buf_check_overflow(bufp)) { - dprintk(DEBUG_ERROR, "buffer overflow\n"); - return -EIO; - } - - return bufp->p - bufp->sp; -} - -static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p) -{ - *p = val; - buf_put_int8(bufp, val); -} - -static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p) -{ - *p = val; - buf_put_int16(bufp, val); -} - -static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p) -{ - *p = val; - buf_put_int32(bufp, val); -} - -static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) -{ - *p = val; - buf_put_int64(bufp, val); -} - -static void -v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) -{ - int len; - char *s; - - if (data) - len = strlen(data); - else - len = 0; - - s = buf_put_stringn(bufp, data, len); - if (str) { - str->len = len; - str->str = s; - } -} - -static int -v9fs_put_user_data(struct cbuf *bufp, const char __user * data, int count, - unsigned char **pdata) -{ - *pdata = buf_alloc(bufp, count); - return copy_from_user(*pdata, data, count); -} - -static void -v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat, - struct v9fs_stat *stat, int statsz, int extended) -{ - v9fs_put_int16(bufp, statsz, &stat->size); - v9fs_put_int16(bufp, wstat->type, &stat->type); - v9fs_put_int32(bufp, wstat->dev, &stat->dev); - v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type); - v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version); - v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path); - v9fs_put_int32(bufp, wstat->mode, &stat->mode); - v9fs_put_int32(bufp, wstat->atime, &stat->atime); - v9fs_put_int32(bufp, wstat->mtime, &stat->mtime); - v9fs_put_int64(bufp, wstat->length, &stat->length); - - v9fs_put_str(bufp, wstat->name, &stat->name); - v9fs_put_str(bufp, wstat->uid, &stat->uid); - v9fs_put_str(bufp, wstat->gid, &stat->gid); - v9fs_put_str(bufp, wstat->muid, &stat->muid); - - if (extended) { - v9fs_put_str(bufp, wstat->extension, &stat->extension); - v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid); - v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid); - v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid); - } -} - -static struct v9fs_fcall * -v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) -{ - struct v9fs_fcall *fc; - - size += 4 + 1 + 2; /* size[4] id[1] tag[2] */ - fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL); - if (!fc) - return ERR_PTR(-ENOMEM); - - fc->sdata = (char *)fc + sizeof(*fc); - - buf_init(bufp, (char *)fc->sdata, size); - v9fs_put_int32(bufp, size, &fc->size); - v9fs_put_int8(bufp, id, &fc->id); - v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag); - - return fc; -} - -void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) -{ - fc->tag = tag; - *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); -} - -struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 2 + strlen(version); /* msize[4] version[s] */ - fc = v9fs_create_common(bufp, size, TVERSION); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, msize, &fc->params.tversion.msize); - v9fs_put_str(bufp, version, &fc->params.tversion.version); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -#if 0 -struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 2 + strlen(uname) + 2 + strlen(aname); /* afid[4] uname[s] aname[s] */ - fc = v9fs_create_common(bufp, size, TAUTH); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, afid, &fc->params.tauth.afid); - v9fs_put_str(bufp, uname, &fc->params.tauth.uname); - v9fs_put_str(bufp, aname, &fc->params.tauth.aname); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} -#endif /* 0 */ - -struct v9fs_fcall * -v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */ - fc = v9fs_create_common(bufp, size, TATTACH); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tattach.fid); - v9fs_put_int32(bufp, afid, &fc->params.tattach.afid); - v9fs_put_str(bufp, uname, &fc->params.tattach.uname); - v9fs_put_str(bufp, aname, &fc->params.tattach.aname); - - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tflush(u16 oldtag) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 2; /* oldtag[2] */ - fc = v9fs_create_common(bufp, size, TFLUSH); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, - char **wnames) -{ - int i, size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - if (nwname > V9FS_MAXWELEM) { - dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM); - return NULL; - } - - size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... */ - for (i = 0; i < nwname; i++) { - size += 2 + strlen(wnames[i]); /* wname[s] */ - } - - fc = v9fs_create_common(bufp, size, TWALK); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.twalk.fid); - v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid); - v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname); - for (i = 0; i < nwname; i++) { - v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]); - } - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 1; /* fid[4] mode[1] */ - fc = v9fs_create_common(bufp, size, TOPEN); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.topen.fid); - v9fs_put_int8(bufp, mode, &fc->params.topen.mode); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode, - char *extension, int extended) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ - if (extended) { - size += 2 + /* extension[s] */ - (extension == NULL ? 0 : strlen(extension)); - } - - fc = v9fs_create_common(bufp, size, TCREATE); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid); - v9fs_put_str(bufp, name, &fc->params.tcreate.name); - v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm); - v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode); - if (extended) - v9fs_put_str(bufp, extension, &fc->params.tcreate.extension); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */ - fc = v9fs_create_common(bufp, size, TREAD); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tread.fid); - v9fs_put_int64(bufp, offset, &fc->params.tread.offset); - v9fs_put_int32(bufp, count, &fc->params.tread.count); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, - const char __user * data) -{ - int size, err; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */ - fc = v9fs_create_common(bufp, size, TWRITE); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.twrite.fid); - v9fs_put_int64(bufp, offset, &fc->params.twrite.offset); - v9fs_put_int32(bufp, count, &fc->params.twrite.count); - err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data); - if (err) { - kfree(fc); - fc = ERR_PTR(err); - } - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tclunk(u32 fid) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4; /* fid[4] */ - fc = v9fs_create_common(bufp, size, TCLUNK); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tremove(u32 fid) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4; /* fid[4] */ - fc = v9fs_create_common(bufp, size, TREMOVE); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tremove.fid); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_tstat(u32 fid) -{ - int size; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - size = 4; /* fid[4] */ - fc = v9fs_create_common(bufp, size, TSTAT); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.tstat.fid); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} - -struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, - int extended) -{ - int size, statsz; - struct v9fs_fcall *fc; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - statsz = v9fs_size_wstat(wstat, extended); - size = 4 + 2 + 2 + statsz; /* fid[4] stat[n] */ - fc = v9fs_create_common(bufp, size, TWSTAT); - if (IS_ERR(fc)) - goto error; - - v9fs_put_int32(bufp, fid, &fc->params.twstat.fid); - buf_put_int16(bufp, statsz + 2); - v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended); - - if (buf_check_overflow(bufp)) { - kfree(fc); - fc = ERR_PTR(-ENOMEM); - } - error: - return fc; -} diff --git a/fs/9p/conv.h b/fs/9p/conv.h deleted file mode 100644 index dd5b6b1b610f..000000000000 --- a/fs/9p/conv.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * linux/fs/9p/conv.h - * - * 9P protocol conversion definitions. - * - * Copyright (C) 2005 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - int extended); -int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int extended); - -void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag); - -struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version); -struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname, - char *aname); -struct v9fs_fcall *v9fs_create_tflush(u16 oldtag); -struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, - char **wnames); -struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode); -struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode, - char *extension, int extended); -struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count); -struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, - const char __user *data); -struct v9fs_fcall *v9fs_create_tclunk(u32 fid); -struct v9fs_fcall *v9fs_create_tremove(u32 fid); -struct v9fs_fcall *v9fs_create_tstat(u32 fid); -struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, - int extended); diff --git a/fs/9p/debug.h b/fs/9p/debug.h deleted file mode 100644 index 4228c0bb3c32..000000000000 --- a/fs/9p/debug.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * linux/fs/9p/debug.h - V9FS Debug Definitions - * - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#define DEBUG_ERROR (1<<0) -#define DEBUG_CURRENT (1<<1) -#define DEBUG_9P (1<<2) -#define DEBUG_VFS (1<<3) -#define DEBUG_CONV (1<<4) -#define DEBUG_MUX (1<<5) -#define DEBUG_TRANS (1<<6) -#define DEBUG_SLABS (1<<7) -#define DEBUG_FCALL (1<<8) - -#define DEBUG_DUMP_PKT 0 - -extern int v9fs_debug_level; - -#define dprintk(level, format, arg...) \ -do { \ - if((v9fs_debug_level & level)==level) \ - printk(KERN_NOTICE "-- %s (%d): " \ - format , __FUNCTION__, current->pid , ## arg); \ -} while(0) - -#define eprintk(level, format, arg...) \ -do { \ - printk(level "v9fs: %s (%d): " \ - format , __FUNCTION__, current->pid , ## arg); \ -} while(0) - -#if DEBUG_DUMP_PKT -static inline void dump_data(const unsigned char *data, unsigned int datalen) -{ - int i, n; - char buf[5*8]; - - n = 0; - i = 0; - while (i < datalen) { - n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]); - if (i%4 == 0) - n += snprintf(buf+n, sizeof(buf)-n, " "); - - if (i%16 == 0) { - dprintk(DEBUG_ERROR, "%s\n", buf); - n = 0; - } - } - - dprintk(DEBUG_ERROR, "%s\n", buf); -} -#else /* DEBUG_DUMP_PKT */ -static inline void dump_data(const unsigned char *data, unsigned int datalen) -{ - -} -#endif /* DEBUG_DUMP_PKT */ diff --git a/fs/9p/error.c b/fs/9p/error.c deleted file mode 100644 index 0d7fa4e08812..000000000000 --- a/fs/9p/error.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * linux/fs/9p/error.c - * - * Error string handling - * - * Plan 9 uses error strings, Unix uses error numbers. These functions - * try to help manage that and provide for dynamically adding error - * mappings. - * - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include - -#include -#include - -#include "debug.h" -#include "error.h" - -/** - * v9fs_error_init - preload - * @errstr: error string - * - */ - -int v9fs_error_init(void) -{ - struct errormap *c; - int bucket; - - /* initialize hash table */ - for (bucket = 0; bucket < ERRHASHSZ; bucket++) - INIT_HLIST_HEAD(&hash_errmap[bucket]); - - /* load initial error map into hash table */ - for (c = errmap; c->name != NULL; c++) { - c->namelen = strlen(c->name); - bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; - INIT_HLIST_NODE(&c->list); - hlist_add_head(&c->list, &hash_errmap[bucket]); - } - - return 1; -} - -/** - * errstr2errno - convert error string to error number - * @errstr: error string - * - */ - -int v9fs_errstr2errno(char *errstr, int len) -{ - int errno = 0; - struct hlist_node *p = NULL; - struct errormap *c = NULL; - int bucket = jhash(errstr, len, 0) % ERRHASHSZ; - - hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { - if (c->namelen==len && !memcmp(c->name, errstr, len)) { - errno = c->val; - break; - } - } - - if (errno == 0) { - /* TODO: if error isn't found, add it dynamically */ - errstr[len] = 0; - printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__, - errstr); - errno = 1; - } - - return -errno; -} diff --git a/fs/9p/error.h b/fs/9p/error.h deleted file mode 100644 index 5f3ca522b316..000000000000 --- a/fs/9p/error.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * linux/fs/9p/error.h - * - * Huge Nasty Error Table - * - * Plan 9 uses error strings, Unix uses error numbers. This table tries to - * match UNIX strings and Plan 9 strings to unix error numbers. It is used - * to preload the dynamic error table which can also track user-specific error - * strings. - * - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include - -struct errormap { - char *name; - int val; - - int namelen; - struct hlist_node list; -}; - -#define ERRHASHSZ 32 -static struct hlist_head hash_errmap[ERRHASHSZ]; - -/* FixMe - reduce to a reasonable size */ -static struct errormap errmap[] = { - {"Operation not permitted", EPERM}, - {"wstat prohibited", EPERM}, - {"No such file or directory", ENOENT}, - {"directory entry not found", ENOENT}, - {"file not found", ENOENT}, - {"Interrupted system call", EINTR}, - {"Input/output error", EIO}, - {"No such device or address", ENXIO}, - {"Argument list too long", E2BIG}, - {"Bad file descriptor", EBADF}, - {"Resource temporarily unavailable", EAGAIN}, - {"Cannot allocate memory", ENOMEM}, - {"Permission denied", EACCES}, - {"Bad address", EFAULT}, - {"Block device required", ENOTBLK}, - {"Device or resource busy", EBUSY}, - {"File exists", EEXIST}, - {"Invalid cross-device link", EXDEV}, - {"No such device", ENODEV}, - {"Not a directory", ENOTDIR}, - {"Is a directory", EISDIR}, - {"Invalid argument", EINVAL}, - {"Too many open files in system", ENFILE}, - {"Too many open files", EMFILE}, - {"Text file busy", ETXTBSY}, - {"File too large", EFBIG}, - {"No space left on device", ENOSPC}, - {"Illegal seek", ESPIPE}, - {"Read-only file system", EROFS}, - {"Too many links", EMLINK}, - {"Broken pipe", EPIPE}, - {"Numerical argument out of domain", EDOM}, - {"Numerical result out of range", ERANGE}, - {"Resource deadlock avoided", EDEADLK}, - {"File name too long", ENAMETOOLONG}, - {"No locks available", ENOLCK}, - {"Function not implemented", ENOSYS}, - {"Directory not empty", ENOTEMPTY}, - {"Too many levels of symbolic links", ELOOP}, - {"No message of desired type", ENOMSG}, - {"Identifier removed", EIDRM}, - {"No data available", ENODATA}, - {"Machine is not on the network", ENONET}, - {"Package not installed", ENOPKG}, - {"Object is remote", EREMOTE}, - {"Link has been severed", ENOLINK}, - {"Communication error on send", ECOMM}, - {"Protocol error", EPROTO}, - {"Bad message", EBADMSG}, - {"File descriptor in bad state", EBADFD}, - {"Streams pipe error", ESTRPIPE}, - {"Too many users", EUSERS}, - {"Socket operation on non-socket", ENOTSOCK}, - {"Message too long", EMSGSIZE}, - {"Protocol not available", ENOPROTOOPT}, - {"Protocol not supported", EPROTONOSUPPORT}, - {"Socket type not supported", ESOCKTNOSUPPORT}, - {"Operation not supported", EOPNOTSUPP}, - {"Protocol family not supported", EPFNOSUPPORT}, - {"Network is down", ENETDOWN}, - {"Network is unreachable", ENETUNREACH}, - {"Network dropped connection on reset", ENETRESET}, - {"Software caused connection abort", ECONNABORTED}, - {"Connection reset by peer", ECONNRESET}, - {"No buffer space available", ENOBUFS}, - {"Transport endpoint is already connected", EISCONN}, - {"Transport endpoint is not connected", ENOTCONN}, - {"Cannot send after transport endpoint shutdown", ESHUTDOWN}, - {"Connection timed out", ETIMEDOUT}, - {"Connection refused", ECONNREFUSED}, - {"Host is down", EHOSTDOWN}, - {"No route to host", EHOSTUNREACH}, - {"Operation already in progress", EALREADY}, - {"Operation now in progress", EINPROGRESS}, - {"Is a named type file", EISNAM}, - {"Remote I/O error", EREMOTEIO}, - {"Disk quota exceeded", EDQUOT}, -/* errors from fossil, vacfs, and u9fs */ - {"fid unknown or out of range", EBADF}, - {"permission denied", EACCES}, - {"file does not exist", ENOENT}, - {"authentication failed", ECONNREFUSED}, - {"bad offset in directory read", ESPIPE}, - {"bad use of fid", EBADF}, - {"wstat can't convert between files and directories", EPERM}, - {"directory is not empty", ENOTEMPTY}, - {"file exists", EEXIST}, - {"file already exists", EEXIST}, - {"file or directory already exists", EEXIST}, - {"fid already in use", EBADF}, - {"file in use", ETXTBSY}, - {"i/o error", EIO}, - {"file already open for I/O", ETXTBSY}, - {"illegal mode", EINVAL}, - {"illegal name", ENAMETOOLONG}, - {"not a directory", ENOTDIR}, - {"not a member of proposed group", EPERM}, - {"not owner", EACCES}, - {"only owner can change group in wstat", EACCES}, - {"read only file system", EROFS}, - {"no access to special file", EPERM}, - {"i/o count too large", EIO}, - {"unknown group", EINVAL}, - {"unknown user", EINVAL}, - {"bogus wstat buffer", EPROTO}, - {"exclusive use file already open", EAGAIN}, - {"corrupted directory entry", EIO}, - {"corrupted file entry", EIO}, - {"corrupted block label", EIO}, - {"corrupted meta data", EIO}, - {"illegal offset", EINVAL}, - {"illegal path element", ENOENT}, - {"root of file system is corrupted", EIO}, - {"corrupted super block", EIO}, - {"protocol botch", EPROTO}, - {"file system is full", ENOSPC}, - {"file is in use", EAGAIN}, - {"directory entry is not allocated", ENOENT}, - {"file is read only", EROFS}, - {"file has been removed", EIDRM}, - {"only support truncation to zero length", EPERM}, - {"cannot remove root", EPERM}, - {"file too big", EFBIG}, - {"venti i/o error", EIO}, - /* these are not errors */ - {"u9fs rhostsauth: no authentication required", 0}, - {"u9fs authnone: no authentication required", 0}, - {NULL, -1} -}; - -extern int v9fs_error_init(void); diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c deleted file mode 100644 index dc336a67592f..000000000000 --- a/fs/9p/fcall.c +++ /dev/null @@ -1,427 +0,0 @@ -/* - * linux/fs/9p/fcall.c - * - * This file contains functions to perform synchronous 9P calls - * - * Copyright (C) 2004 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include -#include -#include -#include - -#include "debug.h" -#include "v9fs.h" -#include "9p.h" -#include "conv.h" -#include "mux.h" - -/** - * v9fs_t_version - negotiate protocol parameters with sever - * @v9ses: 9P2000 session information - * @msize: requested max size packet - * @version: requested version.extension string - * @fcall: pointer to response fcall pointer - * - */ - -int -v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, - char *version, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); - tc = v9fs_create_tversion(msize, version); - - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_attach - mount the server - * @v9ses: 9P2000 session information - * @uname: user name doing the attach - * @aname: remote name being attached to - * @fid: mount fid to attatch to root node - * @afid: authentication fid (in this case result key) - * @fcall: pointer to response fcall pointer - * - */ - -int -v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, - u32 fid, u32 afid, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall* tc; - - dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, - aname, fid, afid); - - tc = v9fs_create_tattach(fid, afid, uname, aname); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, - struct v9fs_fcall *rc, int err) -{ - int fid, id; - struct v9fs_session_info *v9ses; - - id = 0; - fid = tc->params.tclunk.fid; - if (rc) - id = rc->id; - - kfree(tc); - kfree(rc); - if (id == RCLUNK) { - v9ses = a; - v9fs_put_idpool(fid, &v9ses->fidpool); - } -} - -/** - * v9fs_t_clunk - release a fid (finish a transaction) - * @v9ses: 9P2000 session information - * @fid: fid to release - * @fcall: pointer to response fcall pointer - * - */ - -int -v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) -{ - int ret; - struct v9fs_fcall *tc, *rc; - - dprintk(DEBUG_9P, "fid %d\n", fid); - - rc = NULL; - tc = v9fs_create_tclunk(fid); - if (!IS_ERR(tc)) - ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - else - ret = PTR_ERR(tc); - - if (ret) - dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret); - - v9fs_t_clunk_cb(v9ses, tc, rc, ret); - return ret; -} - -#if 0 -/** - * v9fs_v9fs_t_flush - flush a pending transaction - * @v9ses: 9P2000 session information - * @tag: tag to release - * - */ -int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "oldtag %d\n", oldtag); - - tc = v9fs_create_tflush(oldtag); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} -#endif - -/** - * v9fs_t_stat - read a file's meta-data - * @v9ses: 9P2000 session information - * @fid: fid pointing to file or directory to get info about - * @fcall: pointer to response fcall - * - */ - -int -v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "fid %d\n", fid); - - ret = -ENOMEM; - tc = v9fs_create_tstat(fid); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_wstat - write a file's meta-data - * @v9ses: 9P2000 session information - * @fid: fid pointing to file or directory to write info about - * @stat: metadata - * @fcall: pointer to response fcall - * - */ - -int -v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_wstat *wstat, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "fid %d\n", fid); - - tc = v9fs_create_twstat(fid, wstat, v9ses->extended); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_walk - walk a fid to a new file or directory - * @v9ses: 9P2000 session information - * @fid: fid to walk - * @newfid: new fid (for clone operations) - * @name: path to walk fid to - * @fcall: pointer to response fcall - * - */ - -/* TODO: support multiple walk */ - -int -v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, - char *name, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - int nwname; - - dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name); - - if (name) - nwname = 1; - else - nwname = 0; - - tc = v9fs_create_twalk(fid, newfid, nwname, &name); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_open - open a file - * - * @v9ses - 9P2000 session information - * @fid - fid to open - * @mode - mode to open file (R, RW, etc) - * @fcall - pointer to response fcall - * - */ - -int -v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, - struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - - tc = v9fs_create_topen(fid, mode); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_remove - remove a file or directory - * @v9ses: 9P2000 session information - * @fid: fid to remove - * @fcall: pointer to response fcall - * - */ - -int -v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "fid %d\n", fid); - - tc = v9fs_create_tremove(fid); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_create - create a file or directory - * @v9ses: 9P2000 session information - * @fid: fid to create - * @name: name of the file or directory to create - * @perm: permissions to create with - * @mode: mode to open file (R, RW, etc) - * @fcall: pointer to response fcall - * - */ - -int -v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, u32 perm, - u8 mode, char *extension, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc; - - dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", - fid, name, perm, mode); - - tc = v9fs_create_tcreate(fid, name, perm, mode, extension, - v9ses->extended); - - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_read - read data - * @v9ses: 9P2000 session information - * @fid: fid to read from - * @offset: offset to start read at - * @count: how many bytes to read - * @fcall: pointer to response fcall (with data) - * - */ - -int -v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc, *rc; - - dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, - (long long unsigned) offset, count); - - tc = v9fs_create_tread(fid, offset, count); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - if (!ret) - ret = rc->params.rread.count; - if (rcp) - *rcp = rc; - else - kfree(rc); - - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - -/** - * v9fs_t_write - write data - * @v9ses: 9P2000 session information - * @fid: fid to write to - * @offset: offset to start write at - * @count: how many bytes to write - * @fcall: pointer to response fcall - * - */ - -int -v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, - const char __user *data, struct v9fs_fcall **rcp) -{ - int ret; - struct v9fs_fcall *tc, *rc; - - dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, - (long long unsigned) offset, count); - - tc = v9fs_create_twrite(fid, offset, count, data); - if (!IS_ERR(tc)) { - ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - - if (!ret) - ret = rc->params.rwrite.count; - if (rcp) - *rcp = rc; - else - kfree(rc); - - kfree(tc); - } else - ret = PTR_ERR(tc); - - return ret; -} - diff --git a/fs/9p/fcprint.c b/fs/9p/fcprint.c deleted file mode 100644 index 34b96114a28d..000000000000 --- a/fs/9p/fcprint.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * linux/fs/9p/fcprint.c - * - * Print 9P call. - * - * Copyright (C) 2005 by Latchesar Ionkov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ -#include -#include -#include -#include - -#include "debug.h" -#include "v9fs.h" -#include "9p.h" -#include "mux.h" - -static int -v9fs_printqid(char *buf, int buflen, struct v9fs_qid *q) -{ - int n; - char b[10]; - - n = 0; - if (q->type & V9FS_QTDIR) - b[n++] = 'd'; - if (q->type & V9FS_QTAPPEND) - b[n++] = 'a'; - if (q->type & V9FS_QTAUTH) - b[n++] = 'A'; - if (q->type & V9FS_QTEXCL) - b[n++] = 'l'; - if (q->type & V9FS_QTTMP) - b[n++] = 't'; - if (q->type & V9FS_QTSYMLINK) - b[n++] = 'L'; - b[n] = '\0'; - - return scnprintf(buf, buflen, "(%.16llx %x %s)", (long long int) q->path, - q->version, b); -} - -static int -v9fs_printperm(char *buf, int buflen, int perm) -{ - int n; - char b[15]; - - n = 0; - if (perm & V9FS_DMDIR) - b[n++] = 'd'; - if (perm & V9FS_DMAPPEND) - b[n++] = 'a'; - if (perm & V9FS_DMAUTH) - b[n++] = 'A'; - if (perm & V9FS_DMEXCL) - b[n++] = 'l'; - if (perm & V9FS_DMTMP) - b[n++] = 't'; - if (perm & V9FS_DMDEVICE) - b[n++] = 'D'; - if (perm & V9FS_DMSOCKET) - b[n++] = 'S'; - if (perm & V9FS_DMNAMEDPIPE) - b[n++] = 'P'; - if (perm & V9FS_DMSYMLINK) - b[n++] = 'L'; - b[n] = '\0'; - - return scnprintf(buf, buflen, "%s%03o", b, perm&077); -} - -static int -v9fs_printstat(char *buf, int buflen, struct v9fs_stat *st, int extended) -{ - int n; - - n = scnprintf(buf, buflen, "'%.*s' '%.*s'", st->name.len, - st->name.str, st->uid.len, st->uid.str); - if (extended) - n += scnprintf(buf+n, buflen-n, "(%d)", st->n_uid); - - n += scnprintf(buf+n, buflen-n, " '%.*s'", st->gid.len, st->gid.str); - if (extended) - n += scnprintf(buf+n, buflen-n, "(%d)", st->n_gid); - - n += scnprintf(buf+n, buflen-n, " '%.*s'", st->muid.len, st->muid.str); - if (extended) - n += scnprintf(buf+n, buflen-n, "(%d)", st->n_muid); - - n += scnprintf(buf+n, buflen-n, " q "); - n += v9fs_printqid(buf+n, buflen-n, &st->qid); - n += scnprintf(buf+n, buflen-n, " m "); - n += v9fs_printperm(buf+n, buflen-n, st->mode); - n += scnprintf(buf+n, buflen-n, " at %d mt %d l %lld", - st->atime, st->mtime, (long long int) st->length); - - if (extended) - n += scnprintf(buf+n, buflen-n, " ext '%.*s'", - st->extension.len, st->extension.str); - - return n; -} - -static int -v9fs_dumpdata(char *buf, int buflen, u8 *data, int datalen) -{ - int i, n; - - i = n = 0; - while (i < datalen) { - n += scnprintf(buf + n, buflen - n, "%02x", data[i]); - if (i%4 == 3) - n += scnprintf(buf + n, buflen - n, " "); - if (i%32 == 31) - n += scnprintf(buf + n, buflen - n, "\n"); - - i++; - } - n += scnprintf(buf + n, buflen - n, "\n"); - - return n; -} - -static int -v9fs_printdata(char *buf, int buflen, u8 *data, int datalen) -{ - return v9fs_dumpdata(buf, buflen, data, datalen<16?datalen:16); -} - -int -v9fs_printfcall(char *buf, int buflen, struct v9fs_fcall *fc, int extended) -{ - int i, ret, type, tag; - - if (!fc) - return scnprintf(buf, buflen, ""); - - type = fc->id; - tag = fc->tag; - - ret = 0; - switch (type) { - case TVERSION: - ret += scnprintf(buf+ret, buflen-ret, - "Tversion tag %u msize %u version '%.*s'", tag, - fc->params.tversion.msize, fc->params.tversion.version.len, - fc->params.tversion.version.str); - break; - - case RVERSION: - ret += scnprintf(buf+ret, buflen-ret, - "Rversion tag %u msize %u version '%.*s'", tag, - fc->params.rversion.msize, fc->params.rversion.version.len, - fc->params.rversion.version.str); - break; - - case TAUTH: - ret += scnprintf(buf+ret, buflen-ret, - "Tauth tag %u afid %d uname '%.*s' aname '%.*s'", tag, - fc->params.tauth.afid, fc->params.tauth.uname.len, - fc->params.tauth.uname.str, fc->params.tauth.aname.len, - fc->params.tauth.aname.str); - break; - - case RAUTH: - ret += scnprintf(buf+ret, buflen-ret, "Rauth tag %u qid ", tag); - v9fs_printqid(buf+ret, buflen-ret, &fc->params.rauth.qid); - break; - - case TATTACH: - ret += scnprintf(buf+ret, buflen-ret, - "Tattach tag %u fid %d afid %d uname '%.*s' aname '%.*s'", - tag, fc->params.tattach.fid, fc->params.tattach.afid, - fc->params.tattach.uname.len, fc->params.tattach.uname.str, - fc->params.tattach.aname.len, fc->params.tattach.aname.str); - break; - - case RATTACH: - ret += scnprintf(buf+ret, buflen-ret, "Rattach tag %u qid ", tag); - v9fs_printqid(buf+ret, buflen-ret, &fc->params.rattach.qid); - break; - - case RERROR: - ret += scnprintf(buf+ret, buflen-ret, "Rerror tag %u ename '%.*s'", - tag, fc->params.rerror.error.len, - fc->params.rerror.error.str); - if (extended) - ret += scnprintf(buf+ret, buflen-ret, " ecode %d\n", - fc->params.rerror.errno); - break; - - case TFLUSH: - ret += scnprintf(buf+ret, buflen-ret, "Tflush tag %u oldtag %u", - tag, fc->params.tflush.oldtag); - break; - - case RFLUSH: - ret += scnprintf(buf+ret, buflen-ret, "Rflush tag %u", tag); - break; - - case TWALK: - ret += scnprintf(buf+ret, buflen-ret, - "Twalk tag %u fid %d newfid %d nwname %d", tag, - fc->params.twalk.fid, fc->params.twalk.newfid, - fc->params.twalk.nwname); - for(i = 0; i < fc->params.twalk.nwname; i++) - ret += scnprintf(buf+ret, buflen-ret," '%.*s'", - fc->params.twalk.wnames[i].len, - fc->params.twalk.wnames[i].str); - break; - - case RWALK: - ret += scnprintf(buf+ret, buflen-ret, "Rwalk tag %u nwqid %d", - tag, fc->params.rwalk.nwqid); - for(i = 0; i < fc->params.rwalk.nwqid; i++) - ret += v9fs_printqid(buf+ret, buflen-ret, - &fc->params.rwalk.wqids[i]); - break; - - case TOPEN: - ret += scnprintf(buf+ret, buflen-ret, - "Topen tag %u fid %d mode %d", tag, - fc->params.topen.fid, fc->params.topen.mode); - break; - - case ROPEN: - ret += scnprintf(buf+ret, buflen-ret, "Ropen tag %u", tag); - ret += v9fs_printqid(buf+ret, buflen-ret, &fc->params.ropen.qid); - ret += scnprintf(buf+ret, buflen-ret," iounit %d", - fc->params.ropen.iounit); - break; - - case TCREATE: - ret += scnprintf(buf+ret, buflen-ret, - "Tcreate tag %u fid %d name '%.*s' perm ", tag, - fc->params.tcreate.fid, fc->params.tcreate.name.len, - fc->params.tcreate.name.str); - - ret += v9fs_printperm(buf+ret, buflen-ret, fc->params.tcreate.perm); - ret += scnprintf(buf+ret, buflen-ret, " mode %d", - fc->params.tcreate.mode); - break; - - case RCREATE: - ret += scnprintf(buf+ret, buflen-ret, "Rcreate tag %u", tag); - ret += v9fs_printqid(buf+ret, buflen-ret, &fc->params.rcreate.qid); - ret += scnprintf(buf+ret, buflen-ret, " iounit %d", - fc->params.rcreate.iounit); - break; - - case TREAD: - ret += scnprintf(buf+ret, buflen-ret, - "Tread tag %u fid %d offset %lld count %u", tag, - fc->params.tread.fid, - (long long int) fc->params.tread.offset, - fc->params.tread.count); - break; - - case RREAD: - ret += scnprintf(buf+ret, buflen-ret, - "Rread tag %u count %u data ", tag, - fc->params.rread.count); - ret += v9fs_printdata(buf+ret, buflen-ret, fc->params.rread.data, - fc->params.rread.count); - break; - - case TWRITE: - ret += scnprintf(buf+ret, buflen-ret, - "Twrite tag %u fid %d offset %lld count %u data ", - tag, fc->params.twrite.fid, - (long long int) fc->params.twrite.offset, - fc->params.twrite.count); - ret += v9fs_printdata(buf+ret, buflen-ret, fc->params.twrite.data, - fc->params.twrite.count); - break; - - case RWRITE: - ret += scnprintf(buf+ret, buflen-ret, "Rwrite tag %u count %u", - tag, fc->params.rwrite.count); - break; - - case TCLUNK: - ret += scnprintf(buf+ret, buflen-ret, "Tclunk tag %u fid %d", - tag, fc->params.tclunk.fid); - break; - - case RCLUNK: - ret += scnprintf(buf+ret, buflen-ret, "Rclunk tag %u", tag); - break; - - case TREMOVE: - ret += scnprintf(buf+ret, buflen-ret, "Tremove tag %u fid %d", - tag, fc->params.tremove.fid); - break; - - case RREMOVE: - ret += scnprintf(buf+ret, buflen-ret, "Rremove tag %u", tag); - break; - - case TSTAT: - ret += scnprintf(buf+ret, buflen-ret, "Tstat tag %u fid %d", - tag, fc->params.tstat.fid); - break; - - case RSTAT: - ret += scnprintf(buf+ret, buflen-ret, "Rstat tag %u ", tag); - ret += v9fs_printstat(buf+ret, buflen-ret, &fc->params.rstat.stat, - extended); - break; - - case TWSTAT: - ret += scnprintf(buf+ret, buflen-ret, "Twstat tag %u fid %d ", - tag, fc->params.twstat.fid); - ret += v9fs_printstat(buf+ret, buflen-ret, &fc->params.twstat.stat, - extended); - break; - - case RWSTAT: - ret += scnprintf(buf+ret, buflen-ret, "Rwstat tag %u", tag); - break; - - default: - ret += scnprintf(buf+ret, buflen-ret, "unknown type %d", type); - break; - } - - return ret; -} diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 90419715c7e9..08fa320b7e6d 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -26,10 +26,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -40,67 +40,29 @@ * */ -int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { - struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; - dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid, - dentry->d_iname, dentry); - if (dentry->d_fsdata == NULL) { - dentry->d_fsdata = - kmalloc(sizeof(struct list_head), GFP_KERNEL); - if (dentry->d_fsdata == NULL) { - dprintk(DEBUG_ERROR, "Out of memory\n"); - return -ENOMEM; - } - fid_list = (struct list_head *)dentry->d_fsdata; - INIT_LIST_HEAD(fid_list); /* Initialize list head */ - } + struct v9fs_dentry *dent; - fid->uid = current->uid; - list_add(&fid->list, fid_list); - return 0; -} + P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_iname); -/** - * v9fs_fid_create - allocate a FID structure - * @dentry - dentry to link newly created fid to - * - */ - -struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid) -{ - struct v9fs_fid *new; + dent = dentry->d_fsdata; + if (!dent) { + dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL); + if (!dent) + return -ENOMEM; - dprintk(DEBUG_9P, "fid create fid %d\n", fid); - new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); - if (new == NULL) { - dprintk(DEBUG_ERROR, "Out of Memory\n"); - return ERR_PTR(-ENOMEM); + spin_lock_init(&dent->lock); + INIT_LIST_HEAD(&dent->fidlist); + dentry->d_fsdata = dent; } - new->fid = fid; - new->v9ses = v9ses; - new->fidopen = 0; - new->fidclunked = 0; - new->iounit = 0; - new->rdir_pos = 0; - new->rdir_fcall = NULL; - init_MUTEX(&new->lock); - INIT_LIST_HEAD(&new->list); - - return new; -} - -/** - * v9fs_fid_destroy - deallocate a FID structure - * @fid: fid to destroy - * - */ + spin_lock(&dent->lock); + list_add(&fid->dlist, &dent->fidlist); + spin_unlock(&dent->lock); -void v9fs_fid_destroy(struct v9fs_fid *fid) -{ - list_del(&fid->list); - kfree(fid); + return 0; } /** @@ -114,30 +76,42 @@ void v9fs_fid_destroy(struct v9fs_fid *fid) * */ -struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) { - struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; - struct v9fs_fid *return_fid = NULL; - - dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); - - if (fid_list) - return_fid = list_entry(fid_list->next, struct v9fs_fid, list); + struct v9fs_dentry *dent; + struct p9_fid *fid; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + dent = dentry->d_fsdata; + if (dent) + fid = list_entry(dent->fidlist.next, struct p9_fid, dlist); + else + fid = ERR_PTR(-EBADF); + + P9_DPRINTK(P9_DEBUG_VFS, " fid: %p\n", fid); + return fid; +} - if (!return_fid) { - dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n"); - return_fid = ERR_PTR(-EBADF); +struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry) +{ + struct p9_fid *fid; + struct v9fs_dentry *dent; + + dent = dentry->d_fsdata; + fid = v9fs_fid_lookup(dentry); + if (!IS_ERR(fid)) { + spin_lock(&dent->lock); + list_del(&fid->dlist); + spin_unlock(&dent->lock); } - if(down_interruptible(&return_fid->lock)) - return ERR_PTR(-EINTR); - - return return_fid; + return fid; } + /** * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and - * release it + * release it * @dentry: dentry to look for fid in * * find a fid in the dentry and then clone to a new private fid @@ -146,49 +120,15 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) * */ -struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry) +struct p9_fid *v9fs_fid_clone(struct dentry *dentry) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *base_fid, *new_fid = ERR_PTR(-EBADF); - struct v9fs_fcall *fcall = NULL; - int fid, err; - - base_fid = v9fs_fid_lookup(dentry); - - if(IS_ERR(base_fid)) - return base_fid; - - if(base_fid) { /* clone fid */ - fid = v9fs_get_idpool(&v9ses->fidpool); - if (fid < 0) { - eprintk(KERN_WARNING, "newfid fails!\n"); - new_fid = ERR_PTR(-ENOSPC); - goto Release_Fid; - } - - err = v9fs_t_walk(v9ses, base_fid->fid, fid, NULL, &fcall); - if (err < 0) { - dprintk(DEBUG_ERROR, "clone walk didn't work\n"); - v9fs_put_idpool(fid, &v9ses->fidpool); - new_fid = ERR_PTR(err); - goto Free_Fcall; - } - new_fid = v9fs_fid_create(v9ses, fid); - if (new_fid == NULL) { - dprintk(DEBUG_ERROR, "out of memory\n"); - new_fid = ERR_PTR(-ENOMEM); - } -Free_Fcall: - kfree(fcall); - } + struct p9_fid *ofid, *fid; -Release_Fid: - up(&base_fid->lock); - return new_fid; -} + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + ofid = v9fs_fid_lookup(dentry); + if (IS_ERR(ofid)) + return ofid; -void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid) -{ - v9fs_t_clunk(v9ses, fid->fid); - v9fs_fid_destroy(fid); + fid = p9_client_walk(ofid, 0, NULL, 1); + return fid; } diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 48fc170c26c8..47a0ba742872 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -22,41 +22,12 @@ #include -#define FID_OP 0 -#define FID_WALK 1 -#define FID_CREATE 2 - -struct v9fs_fid { - struct list_head list; /* list of fids associated with a dentry */ - struct list_head active; /* XXX - debug */ - - struct semaphore lock; - - u32 fid; - unsigned char fidopen; /* set when fid is opened */ - unsigned char fidclunked; /* set when fid has already been clunked */ - - struct v9fs_qid qid; - u32 iounit; - - /* readdir stuff */ - int rdir_fpos; - loff_t rdir_pos; - struct v9fs_fcall *rdir_fcall; - - /* management stuff */ - uid_t uid; /* user associated with this fid */ - - /* private data */ - struct file *filp; /* backpointer to File struct for open files */ - struct v9fs_session_info *v9ses; /* session info for this FID */ +struct v9fs_dentry { + spinlock_t lock; /* protect fidlist */ + struct list_head fidlist; }; -struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry); -struct v9fs_fid *v9fs_fid_get_created(struct dentry *); -void v9fs_fid_destroy(struct v9fs_fid *fid); -struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid); -int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry); -struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry); -void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid); - +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); +struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry); +struct p9_fid *v9fs_fid_clone(struct dentry *dentry); +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); diff --git a/fs/9p/mux.c b/fs/9p/mux.c deleted file mode 100644 index c783874a9caf..000000000000 --- a/fs/9p/mux.c +++ /dev/null @@ -1,1033 +0,0 @@ -/* - * linux/fs/9p/mux.c - * - * Protocol Multiplexer - * - * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2004-2005 by Latchesar Ionkov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "debug.h" -#include "v9fs.h" -#include "9p.h" -#include "conv.h" -#include "transport.h" -#include "mux.h" - -#define ERREQFLUSH 1 -#define SCHED_TIMEOUT 10 -#define MAXPOLLWADDR 2 - -enum { - Rworksched = 1, /* read work scheduled or running */ - Rpending = 2, /* can read */ - Wworksched = 4, /* write work scheduled or running */ - Wpending = 8, /* can write */ -}; - -enum { - None, - Flushing, - Flushed, -}; - -struct v9fs_mux_poll_task; - -struct v9fs_req { - spinlock_t lock; - int tag; - struct v9fs_fcall *tcall; - struct v9fs_fcall *rcall; - int err; - v9fs_mux_req_callback cb; - void *cba; - int flush; - struct list_head req_list; -}; - -struct v9fs_mux_data { - spinlock_t lock; - struct list_head mux_list; - struct v9fs_mux_poll_task *poll_task; - int msize; - unsigned char *extended; - struct v9fs_transport *trans; - struct v9fs_idpool tagpool; - int err; - wait_queue_head_t equeue; - struct list_head req_list; - struct list_head unsent_req_list; - struct v9fs_fcall *rcall; - int rpos; - char *rbuf; - int wpos; - int wsize; - char *wbuf; - wait_queue_t poll_wait[MAXPOLLWADDR]; - wait_queue_head_t *poll_waddr[MAXPOLLWADDR]; - poll_table pt; - struct work_struct rq; - struct work_struct wq; - unsigned long wsched; -}; - -struct v9fs_mux_poll_task { - struct task_struct *task; - struct list_head mux_list; - int muxnum; -}; - -struct v9fs_mux_rpc { - struct v9fs_mux_data *m; - int err; - struct v9fs_fcall *tcall; - struct v9fs_fcall *rcall; - wait_queue_head_t wqueue; -}; - -static int v9fs_poll_proc(void *); -static void v9fs_read_work(struct work_struct *work); -static void v9fs_write_work(struct work_struct *work); -static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, - poll_table * p); -static u16 v9fs_mux_get_tag(struct v9fs_mux_data *); -static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16); - -static DEFINE_MUTEX(v9fs_mux_task_lock); -static struct workqueue_struct *v9fs_mux_wq; - -static int v9fs_mux_num; -static int v9fs_mux_poll_task_num; -static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; - -int v9fs_mux_global_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) - v9fs_mux_poll_tasks[i].task = NULL; - - v9fs_mux_wq = create_workqueue("v9fs"); - if (!v9fs_mux_wq) { - printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n"); - return -ENOMEM; - } - - return 0; -} - -void v9fs_mux_global_exit(void) -{ - destroy_workqueue(v9fs_mux_wq); -} - -/** - * v9fs_mux_calc_poll_procs - calculates the number of polling procs - * based on the number of mounted v9fs filesystems. - * - * The current implementation returns sqrt of the number of mounts. - */ -static int v9fs_mux_calc_poll_procs(int muxnum) -{ - int n; - - if (v9fs_mux_poll_task_num) - n = muxnum / v9fs_mux_poll_task_num + - (muxnum % v9fs_mux_poll_task_num ? 1 : 0); - else - n = 1; - - if (n > ARRAY_SIZE(v9fs_mux_poll_tasks)) - n = ARRAY_SIZE(v9fs_mux_poll_tasks); - - return n; -} - -static int v9fs_mux_poll_start(struct v9fs_mux_data *m) -{ - int i, n; - struct v9fs_mux_poll_task *vpt, *vptlast; - struct task_struct *pproc; - - dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, - v9fs_mux_poll_task_num); - mutex_lock(&v9fs_mux_task_lock); - - n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1); - if (n > v9fs_mux_poll_task_num) { - for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { - if (v9fs_mux_poll_tasks[i].task == NULL) { - vpt = &v9fs_mux_poll_tasks[i]; - dprintk(DEBUG_MUX, "create proc %p\n", vpt); - pproc = kthread_create(v9fs_poll_proc, vpt, - "v9fs-poll"); - - if (!IS_ERR(pproc)) { - vpt->task = pproc; - INIT_LIST_HEAD(&vpt->mux_list); - vpt->muxnum = 0; - v9fs_mux_poll_task_num++; - wake_up_process(vpt->task); - } - break; - } - } - - if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) - dprintk(DEBUG_ERROR, "warning: no free poll slots\n"); - } - - n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num + - ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 1 : 0); - - vptlast = NULL; - for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { - vpt = &v9fs_mux_poll_tasks[i]; - if (vpt->task != NULL) { - vptlast = vpt; - if (vpt->muxnum < n) { - dprintk(DEBUG_MUX, "put in proc %d\n", i); - list_add(&m->mux_list, &vpt->mux_list); - vpt->muxnum++; - m->poll_task = vpt; - memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); - init_poll_funcptr(&m->pt, v9fs_pollwait); - break; - } - } - } - - if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { - if (vptlast == NULL) - return -ENOMEM; - - dprintk(DEBUG_MUX, "put in proc %d\n", i); - list_add(&m->mux_list, &vptlast->mux_list); - vptlast->muxnum++; - m->poll_task = vptlast; - memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); - init_poll_funcptr(&m->pt, v9fs_pollwait); - } - - v9fs_mux_num++; - mutex_unlock(&v9fs_mux_task_lock); - - return 0; -} - -static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) -{ - int i; - struct v9fs_mux_poll_task *vpt; - - mutex_lock(&v9fs_mux_task_lock); - vpt = m->poll_task; - list_del(&m->mux_list); - for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { - if (m->poll_waddr[i] != NULL) { - remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]); - m->poll_waddr[i] = NULL; - } - } - vpt->muxnum--; - if (!vpt->muxnum) { - dprintk(DEBUG_MUX, "destroy proc %p\n", vpt); - kthread_stop(vpt->task); - vpt->task = NULL; - v9fs_mux_poll_task_num--; - } - v9fs_mux_num--; - mutex_unlock(&v9fs_mux_task_lock); -} - -/** - * v9fs_mux_init - allocate and initialize the per-session mux data - * Creates the polling task if this is the first session. - * - * @trans - transport structure - * @msize - maximum message size - * @extended - pointer to the extended flag - */ -struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, - unsigned char *extended) -{ - int i, n; - struct v9fs_mux_data *m, *mtmp; - - dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); - m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL); - if (!m) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&m->lock); - INIT_LIST_HEAD(&m->mux_list); - m->msize = msize; - m->extended = extended; - m->trans = trans; - idr_init(&m->tagpool.pool); - init_MUTEX(&m->tagpool.lock); - m->err = 0; - init_waitqueue_head(&m->equeue); - INIT_LIST_HEAD(&m->req_list); - INIT_LIST_HEAD(&m->unsent_req_list); - m->rcall = NULL; - m->rpos = 0; - m->rbuf = NULL; - m->wpos = m->wsize = 0; - m->wbuf = NULL; - INIT_WORK(&m->rq, v9fs_read_work); - INIT_WORK(&m->wq, v9fs_write_work); - m->wsched = 0; - memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); - m->poll_task = NULL; - n = v9fs_mux_poll_start(m); - if (n) - return ERR_PTR(n); - - n = trans->poll(trans, &m->pt); - if (n & POLLIN) { - dprintk(DEBUG_MUX, "mux %p can read\n", m); - set_bit(Rpending, &m->wsched); - } - - if (n & POLLOUT) { - dprintk(DEBUG_MUX, "mux %p can write\n", m); - set_bit(Wpending, &m->wsched); - } - - for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { - if (IS_ERR(m->poll_waddr[i])) { - v9fs_mux_poll_stop(m); - mtmp = (void *)m->poll_waddr; /* the error code */ - kfree(m); - m = mtmp; - break; - } - } - - return m; -} - -/** - * v9fs_mux_destroy - cancels all pending requests and frees mux resources - */ -void v9fs_mux_destroy(struct v9fs_mux_data *m) -{ - dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m, - m->mux_list.prev, m->mux_list.next); - v9fs_mux_cancel(m, -ECONNRESET); - - if (!list_empty(&m->req_list)) { - /* wait until all processes waiting on this session exit */ - dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n", - m); - wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000); - dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m, - list_empty(&m->req_list)); - } - - v9fs_mux_poll_stop(m); - m->trans = NULL; - - kfree(m); -} - -/** - * v9fs_pollwait - called by files poll operation to add v9fs-poll task - * to files wait queue - */ -static void -v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, - poll_table * p) -{ - int i; - struct v9fs_mux_data *m; - - m = container_of(p, struct v9fs_mux_data, pt); - for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) - if (m->poll_waddr[i] == NULL) - break; - - if (i >= ARRAY_SIZE(m->poll_waddr)) { - dprintk(DEBUG_ERROR, "not enough wait_address slots\n"); - return; - } - - m->poll_waddr[i] = wait_address; - - if (!wait_address) { - dprintk(DEBUG_ERROR, "no wait_address\n"); - m->poll_waddr[i] = ERR_PTR(-EIO); - return; - } - - init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task); - add_wait_queue(wait_address, &m->poll_wait[i]); -} - -/** - * v9fs_poll_mux - polls a mux and schedules read or write works if necessary - */ -static void v9fs_poll_mux(struct v9fs_mux_data *m) -{ - int n; - - if (m->err < 0) - return; - - n = m->trans->poll(m->trans, NULL); - if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) { - dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n); - if (n >= 0) - n = -ECONNRESET; - v9fs_mux_cancel(m, n); - } - - if (n & POLLIN) { - set_bit(Rpending, &m->wsched); - dprintk(DEBUG_MUX, "mux %p can read\n", m); - if (!test_and_set_bit(Rworksched, &m->wsched)) { - dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); - queue_work(v9fs_mux_wq, &m->rq); - } - } - - if (n & POLLOUT) { - set_bit(Wpending, &m->wsched); - dprintk(DEBUG_MUX, "mux %p can write\n", m); - if ((m->wsize || !list_empty(&m->unsent_req_list)) - && !test_and_set_bit(Wworksched, &m->wsched)) { - dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); - queue_work(v9fs_mux_wq, &m->wq); - } - } -} - -/** - * v9fs_poll_proc - polls all v9fs transports for new events and queues - * the appropriate work to the work queue - */ -static int v9fs_poll_proc(void *a) -{ - struct v9fs_mux_data *m, *mtmp; - struct v9fs_mux_poll_task *vpt; - - vpt = a; - dprintk(DEBUG_MUX, "start %p %p\n", current, vpt); - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) { - v9fs_poll_mux(m); - } - - dprintk(DEBUG_MUX, "sleeping...\n"); - schedule_timeout(SCHED_TIMEOUT * HZ); - } - - __set_current_state(TASK_RUNNING); - dprintk(DEBUG_MUX, "finish\n"); - return 0; -} - -/** - * v9fs_write_work - called when a transport can send some data - */ -static void v9fs_write_work(struct work_struct *work) -{ - int n, err; - struct v9fs_mux_data *m; - struct v9fs_req *req; - - m = container_of(work, struct v9fs_mux_data, wq); - - if (m->err < 0) { - clear_bit(Wworksched, &m->wsched); - return; - } - - if (!m->wsize) { - if (list_empty(&m->unsent_req_list)) { - clear_bit(Wworksched, &m->wsched); - return; - } - - spin_lock(&m->lock); -again: - req = list_entry(m->unsent_req_list.next, struct v9fs_req, - req_list); - list_move_tail(&req->req_list, &m->req_list); - if (req->err == ERREQFLUSH) - goto again; - - m->wbuf = req->tcall->sdata; - m->wsize = req->tcall->size; - m->wpos = 0; - dump_data(m->wbuf, m->wsize); - spin_unlock(&m->lock); - } - - dprintk(DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos, m->wsize); - clear_bit(Wpending, &m->wsched); - err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos); - dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err); - if (err == -EAGAIN) { - clear_bit(Wworksched, &m->wsched); - return; - } - - if (err <= 0) - goto error; - - m->wpos += err; - if (m->wpos == m->wsize) - m->wpos = m->wsize = 0; - - if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { - if (test_and_clear_bit(Wpending, &m->wsched)) - n = POLLOUT; - else - n = m->trans->poll(m->trans, NULL); - - if (n & POLLOUT) { - dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); - queue_work(v9fs_mux_wq, &m->wq); - } else - clear_bit(Wworksched, &m->wsched); - } else - clear_bit(Wworksched, &m->wsched); - - return; - - error: - v9fs_mux_cancel(m, err); - clear_bit(Wworksched, &m->wsched); -} - -static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) -{ - int ecode; - struct v9fs_str *ename; - - if (!req->err && req->rcall->id == RERROR) { - ecode = req->rcall->params.rerror.errno; - ename = &req->rcall->params.rerror.error; - - dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str); - - if (*m->extended) - req->err = -ecode; - - if (!req->err) { - req->err = v9fs_errstr2errno(ename->str, ename->len); - - if (!req->err) { /* string match failed */ - PRINT_FCALL_ERROR("unknown error", req->rcall); - } - - if (!req->err) - req->err = -ESERVERFAULT; - } - } else if (req->tcall && req->rcall->id != req->tcall->id + 1) { - dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n", - req->tcall->id + 1, req->rcall->id); - if (!req->err) - req->err = -EIO; - } -} - -/** - * v9fs_read_work - called when there is some data to be read from a transport - */ -static void v9fs_read_work(struct work_struct *work) -{ - int n, err; - struct v9fs_mux_data *m; - struct v9fs_req *req, *rptr, *rreq; - struct v9fs_fcall *rcall; - char *rbuf; - - m = container_of(work, struct v9fs_mux_data, rq); - - if (m->err < 0) - return; - - rcall = NULL; - dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos); - - if (!m->rcall) { - m->rcall = - kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL); - if (!m->rcall) { - err = -ENOMEM; - goto error; - } - - m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); - m->rpos = 0; - } - - clear_bit(Rpending, &m->wsched); - err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); - dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); - if (err == -EAGAIN) { - clear_bit(Rworksched, &m->wsched); - return; - } - - if (err <= 0) - goto error; - - m->rpos += err; - while (m->rpos > 4) { - n = le32_to_cpu(*(__le32 *) m->rbuf); - if (n >= m->msize) { - dprintk(DEBUG_ERROR, - "requested packet size too big: %d\n", n); - err = -EIO; - goto error; - } - - if (m->rpos < n) - break; - - dump_data(m->rbuf, n); - err = - v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended); - if (err < 0) { - goto error; - } - - if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) { - char buf[150]; - - v9fs_printfcall(buf, sizeof(buf), m->rcall, - *m->extended); - printk(KERN_NOTICE ">>> %p %s\n", m, buf); - } - - rcall = m->rcall; - rbuf = m->rbuf; - if (m->rpos > n) { - m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize, - GFP_KERNEL); - if (!m->rcall) { - err = -ENOMEM; - goto error; - } - - m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); - memmove(m->rbuf, rbuf + n, m->rpos - n); - m->rpos -= n; - } else { - m->rcall = NULL; - m->rbuf = NULL; - m->rpos = 0; - } - - dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, - rcall->tag); - - req = NULL; - spin_lock(&m->lock); - list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { - if (rreq->tag == rcall->tag) { - req = rreq; - if (req->flush != Flushing) - list_del(&req->req_list); - break; - } - } - spin_unlock(&m->lock); - - if (req) { - req->rcall = rcall; - process_request(m, req); - - if (req->flush != Flushing) { - if (req->cb) - (*req->cb) (req, req->cba); - else - kfree(req->rcall); - - wake_up(&m->equeue); - } - } else { - if (err >= 0 && rcall->id != RFLUSH) - dprintk(DEBUG_ERROR, - "unexpected response mux %p id %d tag %d\n", - m, rcall->id, rcall->tag); - kfree(rcall); - } - } - - if (!list_empty(&m->req_list)) { - if (test_and_clear_bit(Rpending, &m->wsched)) - n = POLLIN; - else - n = m->trans->poll(m->trans, NULL); - - if (n & POLLIN) { - dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); - queue_work(v9fs_mux_wq, &m->rq); - } else - clear_bit(Rworksched, &m->wsched); - } else - clear_bit(Rworksched, &m->wsched); - - return; - - error: - v9fs_mux_cancel(m, err); - clear_bit(Rworksched, &m->wsched); -} - -/** - * v9fs_send_request - send 9P request - * The function can sleep until the request is scheduled for sending. - * The function can be interrupted. Return from the function is not - * a guarantee that the request is sent successfully. Can return errors - * that can be retrieved by PTR_ERR macros. - * - * @m: mux data - * @tc: request to be sent - * @cb: callback function to call when response is received - * @cba: parameter to pass to the callback function - */ -static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, - struct v9fs_fcall *tc, - v9fs_mux_req_callback cb, void *cba) -{ - int n; - struct v9fs_req *req; - - dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current, - tc, tc->id); - if (m->err < 0) - return ERR_PTR(m->err); - - req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - if (tc->id == TVERSION) - n = V9FS_NOTAG; - else - n = v9fs_mux_get_tag(m); - - if (n < 0) - return ERR_PTR(-ENOMEM); - - v9fs_set_tag(tc, n); - if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) { - char buf[150]; - - v9fs_printfcall(buf, sizeof(buf), tc, *m->extended); - printk(KERN_NOTICE "<<< %p %s\n", m, buf); - } - - spin_lock_init(&req->lock); - req->tag = n; - req->tcall = tc; - req->rcall = NULL; - req->err = 0; - req->cb = cb; - req->cba = cba; - req->flush = None; - - spin_lock(&m->lock); - list_add_tail(&req->req_list, &m->unsent_req_list); - spin_unlock(&m->lock); - - if (test_and_clear_bit(Wpending, &m->wsched)) - n = POLLOUT; - else - n = m->trans->poll(m->trans, NULL); - - if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) - queue_work(v9fs_mux_wq, &m->wq); - - return req; -} - -static void v9fs_mux_free_request(struct v9fs_mux_data *m, struct v9fs_req *req) -{ - v9fs_mux_put_tag(m, req->tag); - kfree(req); -} - -static void v9fs_mux_flush_cb(struct v9fs_req *freq, void *a) -{ - v9fs_mux_req_callback cb; - int tag; - struct v9fs_mux_data *m; - struct v9fs_req *req, *rreq, *rptr; - - m = a; - dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, - freq->tcall, freq->rcall, freq->err, - freq->tcall->params.tflush.oldtag); - - spin_lock(&m->lock); - cb = NULL; - tag = freq->tcall->params.tflush.oldtag; - req = NULL; - list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { - if (rreq->tag == tag) { - req = rreq; - list_del(&req->req_list); - break; - } - } - spin_unlock(&m->lock); - - if (req) { - spin_lock(&req->lock); - req->flush = Flushed; - spin_unlock(&req->lock); - - if (req->cb) - (*req->cb) (req, req->cba); - else - kfree(req->rcall); - - wake_up(&m->equeue); - } - - kfree(freq->tcall); - kfree(freq->rcall); - v9fs_mux_free_request(m, freq); -} - -static int -v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) -{ - struct v9fs_fcall *fc; - struct v9fs_req *rreq, *rptr; - - dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); - - /* if a response was received for a request, do nothing */ - spin_lock(&req->lock); - if (req->rcall || req->err) { - spin_unlock(&req->lock); - dprintk(DEBUG_MUX, "mux %p req %p response already received\n", m, req); - return 0; - } - - req->flush = Flushing; - spin_unlock(&req->lock); - - spin_lock(&m->lock); - /* if the request is not sent yet, just remove it from the list */ - list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) { - if (rreq->tag == req->tag) { - dprintk(DEBUG_MUX, "mux %p req %p request is not sent yet\n", m, req); - list_del(&rreq->req_list); - req->flush = Flushed; - spin_unlock(&m->lock); - if (req->cb) - (*req->cb) (req, req->cba); - return 0; - } - } - spin_unlock(&m->lock); - - clear_thread_flag(TIF_SIGPENDING); - fc = v9fs_create_tflush(req->tag); - v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); - return 1; -} - -static void -v9fs_mux_rpc_cb(struct v9fs_req *req, void *a) -{ - struct v9fs_mux_rpc *r; - - dprintk(DEBUG_MUX, "req %p r %p\n", req, a); - r = a; - r->rcall = req->rcall; - r->err = req->err; - - if (req->flush!=None && !req->err) - r->err = -ERESTARTSYS; - - wake_up(&r->wqueue); -} - -/** - * v9fs_mux_rpc - sends 9P request and waits until a response is available. - * The function can be interrupted. - * @m: mux data - * @tc: request to be sent - * @rc: pointer where a pointer to the response is stored - */ -int -v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, - struct v9fs_fcall **rc) -{ - int err, sigpending; - unsigned long flags; - struct v9fs_req *req; - struct v9fs_mux_rpc r; - - r.err = 0; - r.tcall = tc; - r.rcall = NULL; - r.m = m; - init_waitqueue_head(&r.wqueue); - - if (rc) - *rc = NULL; - - sigpending = 0; - if (signal_pending(current)) { - sigpending = 1; - clear_thread_flag(TIF_SIGPENDING); - } - - req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); - if (IS_ERR(req)) { - err = PTR_ERR(req); - dprintk(DEBUG_MUX, "error %d\n", err); - return err; - } - - err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); - if (r.err < 0) - err = r.err; - - if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { - if (v9fs_mux_flush_request(m, req)) { - /* wait until we get response of the flush message */ - do { - clear_thread_flag(TIF_SIGPENDING); - err = wait_event_interruptible(r.wqueue, - r.rcall || r.err); - } while (!r.rcall && !r.err && err==-ERESTARTSYS && - m->trans->status==Connected && !m->err); - - err = -ERESTARTSYS; - } - sigpending = 1; - } - - if (sigpending) { - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - } - - if (rc) - *rc = r.rcall; - else - kfree(r.rcall); - - v9fs_mux_free_request(m, req); - if (err > 0) - err = -EIO; - - return err; -} - -#if 0 -/** - * v9fs_mux_rpcnb - sends 9P request without waiting for response. - * @m: mux data - * @tc: request to be sent - * @cb: callback function to be called when response arrives - * @cba: value to pass to the callback function - */ -int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, - v9fs_mux_req_callback cb, void *a) -{ - int err; - struct v9fs_req *req; - - req = v9fs_send_request(m, tc, cb, a); - if (IS_ERR(req)) { - err = PTR_ERR(req); - dprintk(DEBUG_MUX, "error %d\n", err); - return PTR_ERR(req); - } - - dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag); - return 0; -} -#endif /* 0 */ - -/** - * v9fs_mux_cancel - cancel all pending requests with error - * @m: mux data - * @err: error code - */ -void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) -{ - struct v9fs_req *req, *rtmp; - LIST_HEAD(cancel_list); - - dprintk(DEBUG_ERROR, "mux %p err %d\n", m, err); - m->err = err; - spin_lock(&m->lock); - list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { - list_move(&req->req_list, &cancel_list); - } - list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { - list_move(&req->req_list, &cancel_list); - } - spin_unlock(&m->lock); - - list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { - list_del(&req->req_list); - if (!req->err) - req->err = err; - - if (req->cb) - (*req->cb) (req, req->cba); - else - kfree(req->rcall); - } - - wake_up(&m->equeue); -} - -static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m) -{ - int tag; - - tag = v9fs_get_idpool(&m->tagpool); - if (tag < 0) - return V9FS_NOTAG; - else - return (u16) tag; -} - -static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag) -{ - if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tagpool)) - v9fs_put_idpool(tag, &m->tagpool); -} diff --git a/fs/9p/mux.h b/fs/9p/mux.h deleted file mode 100644 index fb10c50186a1..000000000000 --- a/fs/9p/mux.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * linux/fs/9p/mux.h - * - * Multiplexer Definitions - * - * Copyright (C) 2005 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -struct v9fs_mux_data; -struct v9fs_req; - -/** - * v9fs_mux_req_callback - callback function that is called when the - * response of a request is received. The callback is called from - * a workqueue and shouldn't block. - * - * @a - the pointer that was specified when the request was send to be - * passed to the callback - * @tc - request call - * @rc - response call - * @err - error code (non-zero if error occured) - */ -typedef void (*v9fs_mux_req_callback)(struct v9fs_req *req, void *a); - -int v9fs_mux_global_init(void); -void v9fs_mux_global_exit(void); - -struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, - unsigned char *extended); -void v9fs_mux_destroy(struct v9fs_mux_data *); - -int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc); -struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m); -int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc); - -void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); -void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); -int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c deleted file mode 100644 index 34d43355beb7..000000000000 --- a/fs/9p/trans_fd.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * linux/fs/9p/trans_fd.c - * - * Fd transport layer. Includes deprecated socket layer. - * - * Copyright (C) 2006 by Russ Cox - * Copyright (C) 2004-2005 by Latchesar Ionkov - * Copyright (C) 2004-2005 by Eric Van Hensbergen - * Copyright (C) 1997-2002 by Ron Minnich - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "debug.h" -#include "v9fs.h" -#include "transport.h" - -#define V9FS_PORT 564 - -struct v9fs_trans_fd { - struct file *rd; - struct file *wr; -}; - -/** - * v9fs_fd_read- read from a fd - * @v9ses: session information - * @v: buffer to receive data into - * @len: size of receive buffer - * - */ -static int v9fs_fd_read(struct v9fs_transport *trans, void *v, int len) -{ - int ret; - struct v9fs_trans_fd *ts; - - if (!trans || trans->status == Disconnected || !(ts = trans->priv)) - return -EREMOTEIO; - - if (!(ts->rd->f_flags & O_NONBLOCK)) - dprintk(DEBUG_ERROR, "blocking read ...\n"); - - ret = kernel_read(ts->rd, ts->rd->f_pos, v, len); - if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) - trans->status = Disconnected; - return ret; -} - -/** - * v9fs_fd_write - write to a socket - * @v9ses: session information - * @v: buffer to send data from - * @len: size of send buffer - * - */ -static int v9fs_fd_write(struct v9fs_transport *trans, void *v, int len) -{ - int ret; - mm_segment_t oldfs; - struct v9fs_trans_fd *ts; - - if (!trans || trans->status == Disconnected || !(ts = trans->priv)) - return -EREMOTEIO; - - if (!(ts->wr->f_flags & O_NONBLOCK)) - dprintk(DEBUG_ERROR, "blocking write ...\n"); - - oldfs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos); - set_fs(oldfs); - - if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) - trans->status = Disconnected; - return ret; -} - -static unsigned int -v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt) -{ - int ret, n; - struct v9fs_trans_fd *ts; - mm_segment_t oldfs; - - if (!trans || trans->status != Connected || !(ts = trans->priv)) - return -EREMOTEIO; - - if (!ts->rd->f_op || !ts->rd->f_op->poll) - return -EIO; - - if (!ts->wr->f_op || !ts->wr->f_op->poll) - return -EIO; - - oldfs = get_fs(); - set_fs(get_ds()); - - ret = ts->rd->f_op->poll(ts->rd, pt); - if (ret < 0) - goto end; - - if (ts->rd != ts->wr) { - n = ts->wr->f_op->poll(ts->wr, pt); - if (n < 0) { - ret = n; - goto end; - } - ret = (ret & ~POLLOUT) | (n & ~POLLIN); - } - - end: - set_fs(oldfs); - return ret; -} - -static int v9fs_fd_open(struct v9fs_session_info *v9ses, int rfd, int wfd) -{ - struct v9fs_transport *trans = v9ses->transport; - struct v9fs_trans_fd *ts = kmalloc(sizeof(struct v9fs_trans_fd), - GFP_KERNEL); - if (!ts) - return -ENOMEM; - - ts->rd = fget(rfd); - ts->wr = fget(wfd); - if (!ts->rd || !ts->wr) { - if (ts->rd) - fput(ts->rd); - if (ts->wr) - fput(ts->wr); - kfree(ts); - return -EIO; - } - - trans->priv = ts; - trans->status = Connected; - - return 0; -} - -static int v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, - char *data) -{ - if (v9ses->rfdno == ~0 || v9ses->wfdno == ~0) { - printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n"); - return -ENOPROTOOPT; - } - - return v9fs_fd_open(v9ses, v9ses->rfdno, v9ses->wfdno); -} - -static int v9fs_socket_open(struct v9fs_session_info *v9ses, - struct socket *csocket) -{ - int fd, ret; - - csocket->sk->sk_allocation = GFP_NOIO; - if ((fd = sock_map_fd(csocket)) < 0) { - eprintk(KERN_ERR, "v9fs_socket_open: failed to map fd\n"); - ret = fd; - release_csocket: - sock_release(csocket); - return ret; - } - - if ((ret = v9fs_fd_open(v9ses, fd, fd)) < 0) { - sockfd_put(csocket); - eprintk(KERN_ERR, "v9fs_socket_open: failed to open fd\n"); - goto release_csocket; - } - - ((struct v9fs_trans_fd *)v9ses->transport->priv)->rd->f_flags |= - O_NONBLOCK; - return 0; -} - -static int v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, - char *data) -{ - int ret; - struct socket *csocket = NULL; - struct sockaddr_in sin_server; - - sin_server.sin_family = AF_INET; - sin_server.sin_addr.s_addr = in_aton(addr); - sin_server.sin_port = htons(v9ses->port); - sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket); - - if (!csocket) { - eprintk(KERN_ERR, "v9fs_trans_tcp: problem creating socket\n"); - return -1; - } - - ret = csocket->ops->connect(csocket, - (struct sockaddr *)&sin_server, - sizeof(struct sockaddr_in), 0); - if (ret < 0) { - eprintk(KERN_ERR, - "v9fs_trans_tcp: problem connecting socket to %s\n", - addr); - return ret; - } - - return v9fs_socket_open(v9ses, csocket); -} - -static int -v9fs_unix_init(struct v9fs_session_info *v9ses, const char *addr, char *data) -{ - int ret; - struct socket *csocket; - struct sockaddr_un sun_server; - - if (strlen(addr) > UNIX_PATH_MAX) { - eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n", - addr); - return -ENAMETOOLONG; - } - - sun_server.sun_family = PF_UNIX; - strcpy(sun_server.sun_path, addr); - sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket); - ret = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, - sizeof(struct sockaddr_un) - 1, 0); - if (ret < 0) { - eprintk(KERN_ERR, - "v9fs_trans_unix: problem connecting socket: %s: %d\n", - addr, ret); - return ret; - } - - return v9fs_socket_open(v9ses, csocket); -} - -/** - * v9fs_sock_close - shutdown socket - * @trans: private socket structure - * - */ -static void v9fs_fd_close(struct v9fs_transport *trans) -{ - struct v9fs_trans_fd *ts; - - if (!trans) - return; - - ts = xchg(&trans->priv, NULL); - - if (!ts) - return; - - trans->status = Disconnected; - if (ts->rd) - fput(ts->rd); - if (ts->wr) - fput(ts->wr); - kfree(ts); -} - -struct v9fs_transport v9fs_trans_fd = { - .init = v9fs_fd_init, - .write = v9fs_fd_write, - .read = v9fs_fd_read, - .close = v9fs_fd_close, - .poll = v9fs_fd_poll, -}; - -struct v9fs_transport v9fs_trans_tcp = { - .init = v9fs_tcp_init, - .write = v9fs_fd_write, - .read = v9fs_fd_read, - .close = v9fs_fd_close, - .poll = v9fs_fd_poll, -}; - -struct v9fs_transport v9fs_trans_unix = { - .init = v9fs_unix_init, - .write = v9fs_fd_write, - .read = v9fs_fd_read, - .close = v9fs_fd_close, - .poll = v9fs_fd_poll, -}; diff --git a/fs/9p/transport.h b/fs/9p/transport.h deleted file mode 100644 index b38a4b8a41ce..000000000000 --- a/fs/9p/transport.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * linux/fs/9p/transport.h - * - * Transport Definition - * - * Copyright (C) 2005 by Latchesar Ionkov - * Copyright (C) 2004 by Eric Van Hensbergen - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -enum v9fs_transport_status { - Connected, - Disconnected, - Hung, -}; - -struct v9fs_transport { - enum v9fs_transport_status status; - void *priv; - - int (*init) (struct v9fs_session_info *, const char *, char *); - int (*write) (struct v9fs_transport *, void *, int); - int (*read) (struct v9fs_transport *, void *, int); - void (*close) (struct v9fs_transport *); - unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *); -}; - -extern struct v9fs_transport v9fs_trans_tcp; -extern struct v9fs_transport v9fs_trans_unix; -extern struct v9fs_transport v9fs_trans_fd; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6ad6f192b6e4..4feb5ae63ecf 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -29,16 +29,12 @@ #include #include #include - -#include "debug.h" +#include +#include +#include +#include #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" -#include "transport.h" -#include "mux.h" - -/* TODO: sysfs or debugfs interface */ -int v9fs_debug_level = 0; /* feature-rific global debug level */ /* * Option Parsing (code inspired by NFS code) @@ -47,12 +43,12 @@ int v9fs_debug_level = 0; /* feature-rific global debug level */ enum { /* Options that take integer arguments */ - Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_debug, + Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_rfdno, Opt_wfdno, /* String options */ Opt_uname, Opt_remotename, /* Options that take no arguments */ - Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd, + Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd, Opt_pci, /* Cache options */ Opt_cache_loose, /* Error token */ @@ -67,12 +63,14 @@ static match_table_t tokens = { {Opt_afid, "afid=%u"}, {Opt_rfdno, "rfdno=%u"}, {Opt_wfdno, "wfdno=%u"}, - {Opt_debug, "debug=%x"}, {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_unix, "proto=unix"}, {Opt_tcp, "proto=tcp"}, {Opt_fd, "proto=fd"}, +#ifdef CONFIG_PCI_9P + {Opt_pci, "proto=pci"}, +#endif {Opt_tcp, "tcp"}, {Opt_unix, "unix"}, {Opt_fd, "fd"}, @@ -83,6 +81,8 @@ static match_table_t tokens = { {Opt_err, NULL} }; +extern struct p9_transport *p9pci_trans_create(void); + /* * Parse option string. */ @@ -122,7 +122,7 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) token = match_token(p, tokens, args); if (token < Opt_uname) { if ((ret = match_int(&args[0], &option)) < 0) { - dprintk(DEBUG_ERROR, + P9_DPRINTK(P9_DEBUG_ERROR, "integer field, but no integer?\n"); continue; } @@ -149,15 +149,15 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) case Opt_wfdno: v9ses->wfdno = option; break; - case Opt_debug: - v9ses->debug = option; - break; case Opt_tcp: v9ses->proto = PROTO_TCP; break; case Opt_unix: v9ses->proto = PROTO_UNIX; break; + case Opt_pci: + v9ses->proto = PROTO_PCI; + break; case Opt_fd: v9ses->proto = PROTO_FD; break; @@ -182,82 +182,6 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) } } -/** - * v9fs_inode2v9ses - safely extract v9fs session info from super block - * @inode: inode to extract information from - * - * Paranoid function to extract v9ses information from superblock, - * if anything is missing it will report an error. - * - */ - -struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) -{ - return (inode->i_sb->s_fs_info); -} - -/** - * v9fs_get_idpool - allocate numeric id from pool - * @p - pool to allocate from - * - * XXX - This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -int v9fs_get_idpool(struct v9fs_idpool *p) -{ - int i = 0; - int error; - -retry: - if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) - return 0; - - if (down_interruptible(&p->lock) == -EINTR) { - eprintk(KERN_WARNING, "Interrupted while locking\n"); - return -1; - } - - /* no need to store exactly p, we just need something non-null */ - error = idr_get_new(&p->pool, p, &i); - up(&p->lock); - - if (error == -EAGAIN) - goto retry; - else if (error) - return -1; - - return i; -} - -/** - * v9fs_put_idpool - release numeric id from pool - * @p - pool to allocate from - * - * XXX - This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -void v9fs_put_idpool(int id, struct v9fs_idpool *p) -{ - if (down_interruptible(&p->lock) == -EINTR) { - eprintk(KERN_WARNING, "Interrupted while locking\n"); - return; - } - idr_remove(&p->pool, id); - up(&p->lock); -} - -/** - * v9fs_check_idpool - check if the specified id is available - * @id - id to check - * @p - pool - */ -int v9fs_check_idpool(int id, struct v9fs_idpool *p) -{ - return idr_find(&p->pool, id) != NULL; -} - /** * v9fs_session_init - initialize session * @v9ses: session information structure @@ -266,25 +190,21 @@ int v9fs_check_idpool(int id, struct v9fs_idpool *p) * */ -int -v9fs_session_init(struct v9fs_session_info *v9ses, +struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - struct v9fs_fcall *fcall = NULL; - struct v9fs_transport *trans_proto; - int n = 0; - int newfid = -1; int retval = -EINVAL; - struct v9fs_str *version; + struct p9_transport *trans; + struct p9_fid *fid; v9ses->name = __getname(); if (!v9ses->name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); v9ses->remotename = __getname(); if (!v9ses->remotename) { __putname(v9ses->name); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } strcpy(v9ses->name, V9FS_DEFUSER); @@ -292,130 +212,60 @@ v9fs_session_init(struct v9fs_session_info *v9ses, v9fs_parse_options(data, v9ses); - /* set global debug level */ - v9fs_debug_level = v9ses->debug; - - /* id pools that are session-dependent: fids and tags */ - idr_init(&v9ses->fidpool.pool); - init_MUTEX(&v9ses->fidpool.lock); - switch (v9ses->proto) { case PROTO_TCP: - trans_proto = &v9fs_trans_tcp; + trans = p9_trans_create_tcp(dev_name, v9ses->port); break; case PROTO_UNIX: - trans_proto = &v9fs_trans_unix; + trans = p9_trans_create_unix(dev_name); *v9ses->remotename = 0; break; case PROTO_FD: - trans_proto = &v9fs_trans_fd; + trans = p9_trans_create_fd(v9ses->rfdno, v9ses->wfdno); + *v9ses->remotename = 0; + break; +#ifdef CONFIG_PCI_9P + case PROTO_PCI: + trans = p9pci_trans_create(); *v9ses->remotename = 0; break; +#endif default: printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto); retval = -ENOPROTOOPT; - goto SessCleanUp; + goto error; }; - v9ses->transport = kmalloc(sizeof(*v9ses->transport), GFP_KERNEL); - if (!v9ses->transport) { - retval = -ENOMEM; - goto SessCleanUp; + if (IS_ERR(trans)) { + retval = PTR_ERR(trans); + trans = NULL; + goto error; } - memmove(v9ses->transport, trans_proto, sizeof(*v9ses->transport)); + v9ses->clnt = p9_client_create(trans, v9ses->maxdata + P9_IOHDRSZ, + v9ses->extended); - if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) { - eprintk(KERN_ERR, "problem initializing transport\n"); - goto SessCleanUp; + if (IS_ERR(v9ses->clnt)) { + retval = PTR_ERR(v9ses->clnt); + v9ses->clnt = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + goto error; } - v9ses->inprogress = 0; - v9ses->shutdown = 0; - v9ses->session_hung = 0; - - v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ, - &v9ses->extended); - - if (IS_ERR(v9ses->mux)) { - retval = PTR_ERR(v9ses->mux); - v9ses->mux = NULL; - dprintk(DEBUG_ERROR, "problem initializing mux\n"); - goto SessCleanUp; + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->name, + v9ses->remotename); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + goto error; } - if (v9ses->afid == ~0) { - if (v9ses->extended) - retval = - v9fs_t_version(v9ses, v9ses->maxdata, "9P2000.u", - &fcall); - else - retval = v9fs_t_version(v9ses, v9ses->maxdata, "9P2000", - &fcall); - - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_version failed\n"); - goto FreeFcall; - } - - version = &fcall->params.rversion.version; - if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) { - dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); - v9ses->extended = 1; - } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) { - dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n"); - v9ses->extended = 0; - } else { - retval = -EREMOTEIO; - goto FreeFcall; - } + return fid; - n = fcall->params.rversion.msize; - kfree(fcall); - - if (n < v9ses->maxdata) - v9ses->maxdata = n; - } - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { - eprintk(KERN_WARNING, "couldn't allocate FID\n"); - retval = -ENOMEM; - goto SessCleanUp; - } - /* it is a little bit ugly, but we have to prevent newfid */ - /* being the same as afid, so if it is, get a new fid */ - if (v9ses->afid != ~0 && newfid == v9ses->afid) { - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { - eprintk(KERN_WARNING, "couldn't allocate FID\n"); - retval = -ENOMEM; - goto SessCleanUp; - } - } - - if ((retval = - v9fs_t_attach(v9ses, v9ses->name, v9ses->remotename, newfid, - v9ses->afid, NULL)) - < 0) { - dprintk(DEBUG_ERROR, "cannot attach\n"); - goto SessCleanUp; - } - - if (v9ses->afid != ~0) { - dprintk(DEBUG_ERROR, "afid not equal to ~0\n"); - if (v9fs_t_clunk(v9ses, v9ses->afid)) - dprintk(DEBUG_ERROR, "clunk failed\n"); - } - - return newfid; - - FreeFcall: - kfree(fcall); - - SessCleanUp: +error: v9fs_session_close(v9ses); - return retval; + return ERR_PTR(retval); } /** @@ -426,15 +276,9 @@ v9fs_session_init(struct v9fs_session_info *v9ses, void v9fs_session_close(struct v9fs_session_info *v9ses) { - if (v9ses->mux) { - v9fs_mux_destroy(v9ses->mux); - v9ses->mux = NULL; - } - - if (v9ses->transport) { - v9ses->transport->close(v9ses->transport); - kfree(v9ses->transport); - v9ses->transport = NULL; + if (v9ses->clnt) { + p9_client_destroy(v9ses->clnt); + v9ses->clnt = NULL; } __putname(v9ses->name); @@ -446,9 +290,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) * and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { - dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses); - v9ses->transport->status = Disconnected; - v9fs_mux_cancel(v9ses->mux, -EIO); + P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_client_disconnect(v9ses->clnt); } extern int v9fs_error_init(void); @@ -460,24 +303,9 @@ extern int v9fs_error_init(void); static int __init init_v9fs(void) { - int ret; - - v9fs_error_init(); - printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); - ret = v9fs_mux_global_init(); - if (ret) { - printk(KERN_WARNING "v9fs: starting mux failed\n"); - return ret; - } - ret = register_filesystem(&v9fs_fs_type); - if (ret) { - printk(KERN_WARNING "v9fs: registering file system failed\n"); - v9fs_mux_global_exit(); - } - - return ret; + return register_filesystem(&v9fs_fs_type); } /** @@ -487,13 +315,13 @@ static int __init init_v9fs(void) static void __exit exit_v9fs(void) { - v9fs_mux_global_exit(); unregister_filesystem(&v9fs_fs_type); } module_init(init_v9fs) module_exit(exit_v9fs) +MODULE_AUTHOR("Latchesar Ionkov "); MODULE_AUTHOR("Eric Van Hensbergen "); MODULE_AUTHOR("Ron Minnich "); MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 820bf5ca35d8..abc4b1668ace 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -21,16 +21,6 @@ * */ -/* - * Idpool structure provides lock and id management - * - */ - -struct v9fs_idpool { - struct semaphore lock; - struct idr pool; -}; - /* * Session structure provides information for an opened session * @@ -54,15 +44,7 @@ struct v9fs_session_info { unsigned int uid; /* default uid/muid for legacy support */ unsigned int gid; /* default gid for legacy support */ - /* book keeping */ - struct v9fs_idpool fidpool; /* The FID pool for file descriptors */ - - struct v9fs_transport *transport; - struct v9fs_mux_data *mux; - - int inprogress; /* session in progress => true */ - int shutdown; /* session shutting down. no more attaches. */ - unsigned char session_hung; + struct p9_client *clnt; /* 9p client */ struct dentry *debugfs_dir; }; @@ -71,6 +53,7 @@ enum { PROTO_TCP, PROTO_UNIX, PROTO_FD, + PROTO_PCI, }; /* possible values of ->cache */ @@ -82,12 +65,9 @@ enum { extern struct dentry *v9fs_debugfs_root; -int v9fs_session_init(struct v9fs_session_info *, const char *, char *); -struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); +struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, + char *); void v9fs_session_close(struct v9fs_session_info *v9ses); -int v9fs_get_idpool(struct v9fs_idpool *p); -void v9fs_put_idpool(int id, struct v9fs_idpool *p); -int v9fs_check_idpool(int id, struct v9fs_idpool *p); void v9fs_session_cancel(struct v9fs_session_info *v9ses); #define V9FS_MAGIC 0x01021997 @@ -97,3 +77,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses); #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" +static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) +{ + return (inode->i_sb->s_fs_info); +} diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 6a82d39dc498..fd01d90cada5 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -45,10 +45,10 @@ extern struct dentry_operations v9fs_dentry_operations; extern struct dentry_operations v9fs_cached_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); -ino_t v9fs_qid2ino(struct v9fs_qid *qid); -void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *); +ino_t v9fs_qid2ino(struct p9_qid *qid); +void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); +void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9ac4ffe9ac7d..6248f0e727a3 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,10 +33,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -50,55 +50,26 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) { - char *buffer = NULL; - int retval = -EIO; - loff_t offset = page_offset(page); - int count = PAGE_CACHE_SIZE; - struct inode *inode = filp->f_path.dentry->d_inode; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - int rsize = v9ses->maxdata - V9FS_IOHDRSZ; - struct v9fs_fid *v9f = filp->private_data; - struct v9fs_fcall *fcall = NULL; - int fid = v9f->fid; - int total = 0; - int result = 0; - - dprintk(DEBUG_VFS, "\n"); + int retval; + loff_t offset; + char *buffer; + struct p9_fid *fid; + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + fid = filp->private_data; buffer = kmap(page); - do { - if (count < rsize) - rsize = count; - - result = v9fs_t_read(v9ses, fid, offset, rsize, &fcall); - - if (result < 0) { - printk(KERN_ERR "v9fs_t_read returned %d\n", - result); - - kfree(fcall); - goto UnmapAndUnlock; - } else - offset += result; - - memcpy(buffer, fcall->params.rread.data, result); - - count -= result; - buffer += result; - total += result; - - kfree(fcall); + offset = page_offset(page); - if (result < rsize) - break; - } while (count); + retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE); + if (retval < 0) + goto done; - memset(buffer, 0, count); + memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); flush_dcache_page(page); SetPageUptodate(page); retval = 0; -UnmapAndUnlock: +done: kunmap(page); unlock_page(page); return retval; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index d93960429c09..f9534f18df0a 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -34,10 +34,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -52,7 +52,7 @@ static int v9fs_dentry_delete(struct dentry *dentry) { - dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); return 1; } @@ -69,7 +69,7 @@ static int v9fs_dentry_delete(struct dentry *dentry) static int v9fs_cached_dentry_delete(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); if(!inode) return 1; @@ -85,26 +85,19 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry) void v9fs_dentry_release(struct dentry *dentry) { - int err; - - dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); - - if (dentry->d_fsdata != NULL) { - struct list_head *fid_list = dentry->d_fsdata; - struct v9fs_fid *temp = NULL; - struct v9fs_fid *current_fid = NULL; - - list_for_each_entry_safe(current_fid, temp, fid_list, list) { - err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); - - if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n", - err, dentry->d_iname); - - v9fs_fid_destroy(current_fid); + struct v9fs_dentry *dent; + struct p9_fid *temp, *current_fid; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + dent = dentry->d_fsdata; + if (dent) { + list_for_each_entry_safe(current_fid, temp, &dent->fidlist, + dlist) { + p9_client_clunk(current_fid); } - kfree(dentry->d_fsdata); /* free the list_head */ + kfree(dent); + dentry->d_fsdata = NULL; } } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 1dd86ee90bc5..0924d4477da3 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -32,11 +32,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" -#include "conv.h" #include "v9fs_vfs.h" #include "fid.h" @@ -46,14 +45,14 @@ * */ -static inline int dt_type(struct v9fs_stat *mistat) +static inline int dt_type(struct p9_stat *mistat) { unsigned long perm = mistat->mode; int rettype = DT_REG; - if (perm & V9FS_DMDIR) + if (perm & P9_DMDIR) rettype = DT_DIR; - if (perm & V9FS_DMSYMLINK) + if (perm & P9_DMSYMLINK) rettype = DT_LNK; return rettype; @@ -69,106 +68,36 @@ static inline int dt_type(struct v9fs_stat *mistat) static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct v9fs_fcall *fcall = NULL; - struct inode *inode = filp->f_path.dentry->d_inode; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *file = filp->private_data; - unsigned int i, n, s; - int fid = -1; - int ret = 0; - struct v9fs_stat stat; - int over = 0; - - dprintk(DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - - fid = file->fid; - - if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { - kfree(file->rdir_fcall); - file->rdir_fcall = NULL; - } - - if (file->rdir_fcall) { - n = file->rdir_fcall->params.rread.count; - i = file->rdir_fpos; - while (i < n) { - s = v9fs_deserialize_stat( - file->rdir_fcall->params.rread.data + i, - n - i, &stat, v9ses->extended); - - if (s == 0) { - dprintk(DEBUG_ERROR, - "error while deserializing stat\n"); - ret = -EIO; - goto FreeStructs; - } - - over = filldir(dirent, stat.name.str, stat.name.len, - filp->f_pos, v9fs_qid2ino(&stat.qid), - dt_type(&stat)); - - if (over) { - file->rdir_fpos = i; - file->rdir_pos = filp->f_pos; - break; - } - - i += s; - filp->f_pos += s; - } - - if (!over) { - kfree(file->rdir_fcall); - file->rdir_fcall = NULL; - } - } - - while (!over) { - ret = v9fs_t_read(v9ses, fid, filp->f_pos, - v9ses->maxdata-V9FS_IOHDRSZ, &fcall); - if (ret < 0) { - dprintk(DEBUG_ERROR, "error while reading: %d: %p\n", - ret, fcall); - goto FreeStructs; - } else if (ret == 0) + int over; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + struct inode *inode; + struct p9_stat *st; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + inode = filp->f_path.dentry->d_inode; + v9ses = v9fs_inode2v9ses(inode); + fid = filp->private_data; + while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) { + if (IS_ERR(st)) + return PTR_ERR(st); + + over = filldir(dirent, st->name.str, st->name.len, filp->f_pos, + v9fs_qid2ino(&st->qid), dt_type(st)); + + if (over) break; - n = ret; - i = 0; - while (i < n) { - s = v9fs_deserialize_stat(fcall->params.rread.data + i, - n - i, &stat, v9ses->extended); - - if (s == 0) { - dprintk(DEBUG_ERROR, - "error while deserializing stat\n"); - return -EIO; - } - - over = filldir(dirent, stat.name.str, stat.name.len, - filp->f_pos, v9fs_qid2ino(&stat.qid), - dt_type(&stat)); - - if (over) { - file->rdir_fcall = fcall; - file->rdir_fpos = i; - file->rdir_pos = filp->f_pos; - fcall = NULL; - break; - } - - i += s; - filp->f_pos += s; - } - - kfree(fcall); + filp->f_pos += st->size; + kfree(st); + st = NULL; } - FreeStructs: - kfree(fcall); - return ret; + kfree(st); + return 0; } + /** * v9fs_dir_release - close a directory * @inode: inode of the directory @@ -178,29 +107,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int v9fs_dir_release(struct inode *inode, struct file *filp) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *fid = filp->private_data; - int fidnum = -1; - - dprintk(DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp, - fid->fid); - fidnum = fid->fid; + struct p9_fid *fid; + fid = filp->private_data; + P9_DPRINTK(P9_DEBUG_VFS, + "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid); filemap_write_and_wait(inode->i_mapping); - - if (fidnum >= 0) { - dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, - fid->fid); - - if (v9fs_t_clunk(v9ses, fidnum)) - dprintk(DEBUG_ERROR, "clunk failed\n"); - - kfree(fid->rdir_fcall); - kfree(fid); - - filp->private_data = NULL; - } - + p9_client_clunk(fid); return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6e7678e4852f..c1f7c027cfeb 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -34,10 +34,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -52,48 +52,36 @@ static const struct file_operations v9fs_cached_file_operations; int v9fs_file_open(struct inode *inode, struct file *file) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *vfid; - struct v9fs_fcall *fcall = NULL; - int omode; int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + int omode; - dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); - - vfid = v9fs_fid_clone(file->f_path.dentry); - if (IS_ERR(vfid)) - return PTR_ERR(vfid); - + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); + v9ses = v9fs_inode2v9ses(inode); omode = v9fs_uflags2omode(file->f_flags); - err = v9fs_t_open(v9ses, vfid->fid, omode, &fcall); + fid = file->private_data; + if (!fid) { + fid = v9fs_fid_clone(file->f_path.dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + err = p9_client_open(fid, omode); if (err < 0) { - PRINT_FCALL_ERROR("open failed", fcall); - goto Clunk_Fid; + p9_client_clunk(fid); + return err; + } } - file->private_data = vfid; - vfid->fidopen = 1; - vfid->fidclunked = 0; - vfid->iounit = fcall->params.ropen.iounit; - vfid->rdir_pos = 0; - vfid->rdir_fcall = NULL; - vfid->filp = file; - kfree(fcall); - - if((vfid->qid.version) && (v9ses->cache)) { - dprintk(DEBUG_VFS, "cached"); + file->private_data = fid; + if ((fid->qid.version) && (v9ses->cache)) { + P9_DPRINTK(P9_DEBUG_VFS, "cached"); /* enable cached file options */ if(file->f_op == &v9fs_file_operations) file->f_op = &v9fs_cached_file_operations; } return 0; - -Clunk_Fid: - v9fs_fid_clunk(v9ses, vfid); - kfree(fcall); - - return err; } /** @@ -110,7 +98,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) int res = 0; struct inode *inode = filp->f_path.dentry->d_inode; - dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) @@ -136,55 +124,16 @@ static ssize_t v9fs_file_read(struct file *filp, char __user * data, size_t count, loff_t * offset) { - struct inode *inode = filp->f_path.dentry->d_inode; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *v9f = filp->private_data; - struct v9fs_fcall *fcall = NULL; - int fid = v9f->fid; - int rsize = 0; - int result = 0; - int total = 0; - int n; - - dprintk(DEBUG_VFS, "\n"); - - rsize = v9ses->maxdata - V9FS_IOHDRSZ; - if (v9f->iounit != 0 && rsize > v9f->iounit) - rsize = v9f->iounit; - - do { - if (count < rsize) - rsize = count; + int ret; + struct p9_fid *fid; - result = v9fs_t_read(v9ses, fid, *offset, rsize, &fcall); + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + fid = filp->private_data; + ret = p9_client_uread(fid, data, *offset, count); + if (ret > 0) + *offset += ret; - if (result < 0) { - printk(KERN_ERR "9P2000: v9fs_t_read returned %d\n", - result); - - kfree(fcall); - return total; - } else - *offset += result; - - n = copy_to_user(data, fcall->params.rread.data, result); - if (n) { - dprintk(DEBUG_ERROR, "Problem copying to user %d\n", n); - kfree(fcall); - return -EFAULT; - } - - count -= result; - data += result; - total += result; - - kfree(fcall); - - if (result < rsize) - break; - } while (count); - - return total; + return ret; } /** @@ -200,50 +149,19 @@ static ssize_t v9fs_file_write(struct file *filp, const char __user * data, size_t count, loff_t * offset) { - struct inode *inode = filp->f_path.dentry->d_inode; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *v9fid = filp->private_data; - struct v9fs_fcall *fcall; - int fid = v9fid->fid; - int result = -EIO; - int rsize = 0; - int total = 0; - - dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, - (int)*offset); - rsize = v9ses->maxdata - V9FS_IOHDRSZ; - if (v9fid->iounit != 0 && rsize > v9fid->iounit) - rsize = v9fid->iounit; - - do { - if (count < rsize) - rsize = count; + int ret; + struct p9_fid *fid; - result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("error while writing", fcall); - kfree(fcall); - return result; - } else - *offset += result; - - kfree(fcall); - fcall = NULL; - - if (result != rsize) { - eprintk(KERN_ERR, - "short write: v9fs_t_write returned %d\n", - result); - break; - } + P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, + (int)count, (int)*offset); - count -= result; - data += result; - total += result; - } while (count); + fid = filp->private_data; + ret = p9_client_uwrite(fid, data, *offset, count); + if (ret > 0) + *offset += ret; - invalidate_inode_pages2(inode->i_mapping); - return total; + invalidate_inode_pages2(filp->f_path.dentry->d_inode->i_mapping); + return ret; } static const struct file_operations v9fs_cached_file_operations = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c76cd8fa3f6c..c602c0e054a2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -34,10 +34,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -58,27 +58,27 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) int res; res = mode & 0777; if (S_ISDIR(mode)) - res |= V9FS_DMDIR; + res |= P9_DMDIR; if (v9ses->extended) { if (S_ISLNK(mode)) - res |= V9FS_DMSYMLINK; + res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { if (S_ISSOCK(mode)) - res |= V9FS_DMSOCKET; + res |= P9_DMSOCKET; if (S_ISFIFO(mode)) - res |= V9FS_DMNAMEDPIPE; + res |= P9_DMNAMEDPIPE; if (S_ISBLK(mode)) - res |= V9FS_DMDEVICE; + res |= P9_DMDEVICE; if (S_ISCHR(mode)) - res |= V9FS_DMDEVICE; + res |= P9_DMDEVICE; } if ((mode & S_ISUID) == S_ISUID) - res |= V9FS_DMSETUID; + res |= P9_DMSETUID; if ((mode & S_ISGID) == S_ISGID) - res |= V9FS_DMSETGID; - if ((mode & V9FS_DMLINK)) - res |= V9FS_DMLINK; + res |= P9_DMSETGID; + if ((mode & P9_DMLINK)) + res |= P9_DMLINK; } return res; @@ -97,27 +97,27 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) res = mode & 0777; - if ((mode & V9FS_DMDIR) == V9FS_DMDIR) + if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; - else if ((mode & V9FS_DMSYMLINK) && (v9ses->extended)) + else if ((mode & P9_DMSYMLINK) && (v9ses->extended)) res |= S_IFLNK; - else if ((mode & V9FS_DMSOCKET) && (v9ses->extended) + else if ((mode & P9_DMSOCKET) && (v9ses->extended) && (v9ses->nodev == 0)) res |= S_IFSOCK; - else if ((mode & V9FS_DMNAMEDPIPE) && (v9ses->extended) + else if ((mode & P9_DMNAMEDPIPE) && (v9ses->extended) && (v9ses->nodev == 0)) res |= S_IFIFO; - else if ((mode & V9FS_DMDEVICE) && (v9ses->extended) + else if ((mode & P9_DMDEVICE) && (v9ses->extended) && (v9ses->nodev == 0)) res |= S_IFBLK; else res |= S_IFREG; if (v9ses->extended) { - if ((mode & V9FS_DMSETUID) == V9FS_DMSETUID) + if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; - if ((mode & V9FS_DMSETGID) == V9FS_DMSETGID) + if ((mode & P9_DMSETGID) == P9_DMSETGID) res |= S_ISGID; } @@ -132,26 +132,26 @@ int v9fs_uflags2omode(int uflags) switch (uflags&3) { default: case O_RDONLY: - ret = V9FS_OREAD; + ret = P9_OREAD; break; case O_WRONLY: - ret = V9FS_OWRITE; + ret = P9_OWRITE; break; case O_RDWR: - ret = V9FS_ORDWR; + ret = P9_ORDWR; break; } if (uflags & O_EXCL) - ret |= V9FS_OEXCL; + ret |= P9_OEXCL; if (uflags & O_TRUNC) - ret |= V9FS_OTRUNC; + ret |= P9_OTRUNC; if (uflags & O_APPEND) - ret |= V9FS_OAPPEND; + ret |= P9_OAPPEND; return ret; } @@ -164,7 +164,7 @@ int v9fs_uflags2omode(int uflags) */ static void -v9fs_blank_wstat(struct v9fs_wstat *wstat) +v9fs_blank_wstat(struct p9_wstat *wstat) { wstat->type = ~0; wstat->dev = ~0; @@ -197,7 +197,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); if (inode) { @@ -215,7 +215,8 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFCHR: case S_IFSOCK: if(!v9ses->extended) { - dprintk(DEBUG_ERROR, "special files without extended mode\n"); + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); return ERR_PTR(-EINVAL); } init_special_inode(inode, inode->i_mode, @@ -227,7 +228,8 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) break; case S_IFLNK: if(!v9ses->extended) { - dprintk(DEBUG_ERROR, "extended modes used w/o 9P2000.u\n"); + P9_DPRINTK(P9_DEBUG_ERROR, + "extended modes used w/o 9P2000.u\n"); return ERR_PTR(-EINVAL); } inode->i_op = &v9fs_symlink_inode_operations; @@ -241,71 +243,19 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode->i_fop = &v9fs_dir_operations; break; default: - dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + P9_DPRINTK(P9_DEBUG_ERROR, + "BAD mode 0x%x S_IFMT 0x%x\n", mode, mode & S_IFMT); return ERR_PTR(-EINVAL); } } else { - eprintk(KERN_WARNING, "Problem allocating inode\n"); + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } return inode; } -static int -v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm, - u8 mode, char *extension, u32 *fidp, struct v9fs_qid *qid, u32 *iounit) -{ - int fid; - int err; - struct v9fs_fcall *fcall; - - fid = v9fs_get_idpool(&v9ses->fidpool); - if (fid < 0) { - eprintk(KERN_WARNING, "no free fids available\n"); - return -ENOSPC; - } - - err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); - if (err < 0) { - PRINT_FCALL_ERROR("clone error", fcall); - if (fcall && fcall->id == RWALK) - goto clunk_fid; - else - goto put_fid; - } - kfree(fcall); - - err = v9fs_t_create(v9ses, fid, name, perm, mode, extension, &fcall); - if (err < 0) { - PRINT_FCALL_ERROR("create fails", fcall); - goto clunk_fid; - } - - if (iounit) - *iounit = fcall->params.rcreate.iounit; - - if (qid) - *qid = fcall->params.rcreate.qid; - - if (fidp) - *fidp = fid; - - kfree(fcall); - return 0; - -clunk_fid: - v9fs_t_clunk(v9ses, fid); - fid = V9FS_NOFID; - -put_fid: - if (fid != V9FS_NOFID) - v9fs_put_idpool(fid, &v9ses->fidpool); - - kfree(fcall); - return err; -} - +/* static struct v9fs_fid* v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) { @@ -355,23 +305,25 @@ error: kfree(fcall); return ERR_PTR(err); } +*/ static struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid, +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { int err, umode; struct inode *ret; - struct v9fs_fcall *fcall; + struct p9_stat *st; ret = NULL; - err = v9fs_t_stat(v9ses, fid, &fcall); - if (err) { - PRINT_FCALL_ERROR("stat error", fcall); + st = p9_client_stat(fid); + if (IS_ERR(st)) { + err = PTR_ERR(st); + st = NULL; goto error; } - umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode); + umode = p9mode2unixmode(v9ses, st->mode); ret = v9fs_get_inode(sb, umode); if (IS_ERR(ret)) { err = PTR_ERR(ret); @@ -379,12 +331,13 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid, goto error; } - v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb); - kfree(fcall); + v9fs_stat2inode(st, ret, sb); + ret->i_ino = v9fs_qid2ino(&st->qid); + kfree(st); return ret; error: - kfree(fcall); + kfree(st); if (ret) iput(ret); @@ -401,43 +354,20 @@ error: static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { - struct v9fs_fcall *fcall = NULL; - struct super_block *sb = NULL; - struct v9fs_session_info *v9ses = NULL; - struct v9fs_fid *v9fid = NULL; - struct inode *file_inode = NULL; - int fid = -1; - int result = 0; + struct inode *file_inode; + struct v9fs_session_info *v9ses; + struct p9_fid *v9fid; - dprintk(DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); file_inode = file->d_inode; - sb = file_inode->i_sb; v9ses = v9fs_inode2v9ses(file_inode); v9fid = v9fs_fid_clone(file); if(IS_ERR(v9fid)) return PTR_ERR(v9fid); - fid = v9fid->fid; - if (fid < 0) { - dprintk(DEBUG_ERROR, "inode #%lu, no fid!\n", - file_inode->i_ino); - return -EBADF; - } - - result = v9fs_t_remove(v9ses, fid, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("remove fails", fcall); - goto Error; - } - - v9fs_put_idpool(fid, &v9ses->fidpool); - v9fs_fid_destroy(v9fid); - -Error: - kfree(fcall); - return result; + return p9_client_remove(v9fid); } static int @@ -446,61 +376,59 @@ v9fs_open_created(struct inode *inode, struct file *file) return 0; } + /** - * v9fs_vfs_create - VFS hook to create files - * @inode: directory inode that is being deleted - * @dentry: dentry that is being deleted - * @mode: create permissions - * @nd: path information + * v9fs_create - Create a file + * @dentry: dentry that is being created + * @perm: create permissions + * @mode: open mode * */ - -static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static struct p9_fid * +v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, + struct dentry *dentry, char *extension, u32 perm, u8 mode) { int err; - u32 fid, perm, iounit; - int flags; - struct v9fs_session_info *v9ses; - struct v9fs_fid *dfid, *vfid, *ffid; + char *name; + struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - struct v9fs_qid qid; - struct file *filp; - inode = NULL; - vfid = NULL; - v9ses = v9fs_inode2v9ses(dir); + err = 0; + ofid = NULL; + fid = NULL; + name = (char *) dentry->d_name.name; dfid = v9fs_fid_clone(dentry->d_parent); if(IS_ERR(dfid)) { err = PTR_ERR(dfid); + dfid = NULL; goto error; } - perm = unixmode2p9mode(v9ses, mode); - if (nd && nd->flags & LOOKUP_OPEN) - flags = nd->intent.open.flags - 1; - else - flags = O_RDWR; - - err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, - perm, v9fs_uflags2omode(flags), NULL, &fid, &qid, &iounit); + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + ofid = NULL; + goto error; + } - if (err) - goto clunk_dfid; + err = p9_client_fcreate(ofid, name, perm, mode, extension); + if (err < 0) + goto error; - vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); - v9fs_fid_clunk(v9ses, dfid); - if (IS_ERR(vfid)) { - err = PTR_ERR(vfid); - vfid = NULL; + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 0); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; goto error; - } + } else + dfid = NULL; - inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + /* instantiate inode and assign the unopened fid to the dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - inode = NULL; goto error; } @@ -508,35 +436,78 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_op = &v9fs_cached_dentry_operations; else dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + v9fs_fid_add(dentry, fid); + return ofid; - if (nd && nd->flags & LOOKUP_OPEN) { - ffid = v9fs_fid_create(v9ses, fid); - if (!ffid) - return -ENOMEM; +error: + if (dfid) + p9_client_clunk(dfid); + + if (ofid) + p9_client_clunk(ofid); + + if (fid) + p9_client_clunk(fid); + + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create - VFS hook to create files + * @inode: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ +static int +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err; + u32 perm; + int flags; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct file *filp; + + err = 0; + fid = NULL; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags)); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); if (IS_ERR(filp)) { - v9fs_fid_destroy(ffid); - return PTR_ERR(filp); + err = PTR_ERR(filp); + goto error; } - ffid->rdir_pos = 0; - ffid->rdir_fcall = NULL; - ffid->fidopen = 1; - ffid->iounit = iounit; - ffid->filp = filp; - filp->private_data = ffid; - } + filp->private_data = fid; + } else + p9_client_clunk(fid); return 0; -clunk_dfid: - v9fs_fid_clunk(v9ses, dfid); - error: - if (vfid) - v9fs_fid_destroy(vfid); + if (fid) + p9_client_clunk(fid); return err; } @@ -552,57 +523,23 @@ error: static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { int err; - u32 fid, perm; + u32 perm; struct v9fs_session_info *v9ses; - struct v9fs_fid *dfid, *vfid; - struct inode *inode; + struct p9_fid *fid; - inode = NULL; - vfid = NULL; + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; v9ses = v9fs_inode2v9ses(dir); - dfid = v9fs_fid_clone(dentry->d_parent); - if(IS_ERR(dfid)) { - err = PTR_ERR(dfid); - goto error; - } - perm = unixmode2p9mode(v9ses, mode | S_IFDIR); - - err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, - perm, V9FS_OREAD, NULL, &fid, NULL, NULL); - - if (err) { - dprintk(DEBUG_ERROR, "create error %d\n", err); - goto clean_up_dfid; + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; } - vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); - if (IS_ERR(vfid)) { - err = PTR_ERR(vfid); - vfid = NULL; - goto clean_up_dfid; - } + if (fid) + p9_client_clunk(fid); - v9fs_fid_clunk(v9ses, dfid); - inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - v9fs_fid_destroy(vfid); - goto error; - } - - if(v9ses->cache) - dentry->d_op = &v9fs_cached_dentry_operations; - else - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - return 0; - -clean_up_dfid: - v9fs_fid_clunk(v9ses, dfid); - -error: return err; } @@ -619,104 +556,54 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, { struct super_block *sb; struct v9fs_session_info *v9ses; - struct v9fs_fid *dirfid; - struct v9fs_fid *fid; + struct p9_fid *dfid, *fid; struct inode *inode; - struct v9fs_fcall *fcall = NULL; - int dirfidnum = -1; - int newfid = -1; + char *name; int result = 0; - dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", dir, dentry->d_name.name, dentry, nameidata); sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); - dirfid = v9fs_fid_lookup(dentry->d_parent); - - if(IS_ERR(dirfid)) - return ERR_PTR(PTR_ERR(dirfid)); - - dirfidnum = dirfid->fid; - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { - eprintk(KERN_WARNING, "newfid fails!\n"); - result = -ENOSPC; - goto Release_Dirfid; - } - - result = v9fs_t_walk(v9ses, dirfidnum, newfid, - (char *)dentry->d_name.name, &fcall); - - up(&dirfid->lock); - - if (result < 0) { - if (fcall && fcall->id == RWALK) - v9fs_t_clunk(v9ses, newfid); - else - v9fs_put_idpool(newfid, &v9ses->fidpool); - + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) + return ERR_PTR(PTR_ERR(dfid)); + + name = (char *) dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + result = PTR_ERR(fid); if (result == -ENOENT) { d_add(dentry, NULL); - dprintk(DEBUG_VFS, - "Return negative dentry %p count %d\n", - dentry, atomic_read(&dentry->d_count)); - kfree(fcall); return NULL; } - dprintk(DEBUG_ERROR, "walk error:%d\n", result); - goto FreeFcall; - } - kfree(fcall); - - result = v9fs_t_stat(v9ses, newfid, &fcall); - if (result < 0) { - dprintk(DEBUG_ERROR, "stat error\n"); - goto FreeFcall; - } - - inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses, - fcall->params.rstat.stat.mode)); - if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { - eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n", - PTR_ERR(inode)); - - result = -ENOSPC; - goto FreeFcall; + return ERR_PTR(result); } - inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); - - fid = v9fs_fid_create(v9ses, newfid); - if (fid == NULL) { - dprintk(DEBUG_ERROR, "couldn't insert\n"); - result = -ENOMEM; - goto FreeFcall; + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + result = PTR_ERR(inode); + inode = NULL; + goto error; } - result = v9fs_fid_insert(fid, dentry); + result = v9fs_fid_add(dentry, fid); if (result < 0) - goto FreeFcall; + goto error; - fid->qid = fcall->params.rstat.stat.qid; - v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); if((fid->qid.version)&&(v9ses->cache)) dentry->d_op = &v9fs_cached_dentry_operations; else dentry->d_op = &v9fs_dentry_operations; d_add(dentry, inode); - kfree(fcall); - return NULL; -Release_Dirfid: - up(&dirfid->lock); - -FreeFcall: - kfree(fcall); +error: + if (fid) + p9_client_clunk(fid); return ERR_PTR(result); } @@ -758,73 +645,54 @@ static int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode); - struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); - struct v9fs_fid *olddirfid; - struct v9fs_fid *newdirfid; - struct v9fs_wstat wstat; - struct v9fs_fcall *fcall = NULL; - int fid = -1; - int olddirfidnum = -1; - int newdirfidnum = -1; - int retval = 0; - - dprintk(DEBUG_VFS, "\n"); + struct inode *old_inode; + struct v9fs_session_info *v9ses; + struct p9_fid *oldfid; + struct p9_fid *olddirfid; + struct p9_fid *newdirfid; + struct p9_wstat wstat; + int retval; + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = 0; + old_inode = old_dentry->d_inode; + v9ses = v9fs_inode2v9ses(old_inode); + oldfid = v9fs_fid_lookup(old_dentry); if(IS_ERR(oldfid)) return PTR_ERR(oldfid); olddirfid = v9fs_fid_clone(old_dentry->d_parent); if(IS_ERR(olddirfid)) { retval = PTR_ERR(olddirfid); - goto Release_lock; + goto done; } newdirfid = v9fs_fid_clone(new_dentry->d_parent); if(IS_ERR(newdirfid)) { retval = PTR_ERR(newdirfid); - goto Clunk_olddir; + goto clunk_olddir; } /* 9P can only handle file rename in the same directory */ if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { - dprintk(DEBUG_ERROR, "old dir and new dir are different\n"); + P9_DPRINTK(P9_DEBUG_ERROR, + "old dir and new dir are different\n"); retval = -EXDEV; - goto Clunk_newdir; - } - - fid = oldfid->fid; - olddirfidnum = olddirfid->fid; - newdirfidnum = newdirfid->fid; - - if (fid < 0) { - dprintk(DEBUG_ERROR, "no fid for old file #%lu\n", - old_inode->i_ino); - retval = -EBADF; - goto Clunk_newdir; + goto clunk_newdir; } v9fs_blank_wstat(&wstat); wstat.muid = v9ses->name; wstat.name = (char *) new_dentry->d_name.name; + retval = p9_client_wstat(oldfid, &wstat); - retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); +clunk_newdir: + p9_client_clunk(olddirfid); - if (retval < 0) - PRINT_FCALL_ERROR("wstat error", fcall); - - kfree(fcall); - -Clunk_newdir: - v9fs_fid_clunk(v9ses, newdirfid); - -Clunk_olddir: - v9fs_fid_clunk(v9ses, olddirfid); - -Release_lock: - up(&oldfid->lock); +clunk_olddir: + p9_client_clunk(newdirfid); +done: return retval; } @@ -840,28 +708,27 @@ static int v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct v9fs_fcall *fcall = NULL; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_clone(dentry); - int err = -EPERM; - - dprintk(DEBUG_VFS, "dentry: %p\n", dentry); - if(IS_ERR(fid)) + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) return PTR_ERR(fid); - err = v9fs_t_stat(v9ses, fid->fid, &fcall); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); - if (err < 0) - dprintk(DEBUG_ERROR, "stat error\n"); - else { - v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode, - dentry->d_inode->i_sb); + v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); - } - kfree(fcall); - v9fs_fid_clunk(v9ses, fid); - return err; + kfree(st); + return 0; } /** @@ -873,13 +740,15 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_clone(dentry); - struct v9fs_fcall *fcall = NULL; - struct v9fs_wstat wstat; - int res = -EPERM; + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat wstat; - dprintk(DEBUG_VFS, "\n"); + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); if(IS_ERR(fid)) return PTR_ERR(fid); @@ -904,17 +773,11 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) wstat.n_gid = iattr->ia_gid; } - res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); + retval = p9_client_wstat(fid, &wstat); + if (retval >= 0) + retval = inode_setattr(dentry->d_inode, iattr); - if (res < 0) - PRINT_FCALL_ERROR("wstat error", fcall); - - kfree(fcall); - if (res >= 0) - res = inode_setattr(dentry->d_inode, iattr); - - v9fs_fid_clunk(v9ses, fid); - return res; + return retval; } /** @@ -926,7 +789,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) */ void -v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, +v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, struct super_block *sb) { int n; @@ -967,8 +830,9 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, case 'b': break; default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n", - type, stat->extension.len, stat->extension.str); + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c (%.*s)\n", type, + stat->extension.len, stat->extension.str); }; inode->i_rdev = MKDEV(major, minor); } else @@ -976,8 +840,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, inode->i_size = stat->length; - inode->i_blocks = - (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + /* not real number of blocks, but 512 byte ones ... */ + inode->i_blocks = (inode->i_size + 512 - 1) >> 9; } /** @@ -987,7 +851,7 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, * BUG: potential for inode number collisions? */ -ino_t v9fs_qid2ino(struct v9fs_qid *qid) +ino_t v9fs_qid2ino(struct p9_qid *qid) { u64 path = qid->path + 2; ino_t i = 0; @@ -1010,56 +874,46 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid) static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) { - int retval = -EPERM; + int retval; - struct v9fs_fcall *fcall = NULL; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); - struct v9fs_fid *fid = v9fs_fid_clone(dentry); + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat *st; + P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); if(IS_ERR(fid)) return PTR_ERR(fid); - if (!v9ses->extended) { - retval = -EBADF; - dprintk(DEBUG_ERROR, "not extended\n"); - goto ClunkFid; - } - - dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name); - retval = v9fs_t_stat(v9ses, fid->fid, &fcall); - - if (retval < 0) { - dprintk(DEBUG_ERROR, "stat error\n"); - goto FreeFcall; - } + if (!v9ses->extended) + return -EBADF; - if (!fcall) { - retval = -EIO; - goto ClunkFid; - } + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); - if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { + if (!(st->mode & P9_DMSYMLINK)) { retval = -EINVAL; - goto FreeFcall; + goto done; } /* copy extension buffer into buffer */ - if (fcall->params.rstat.stat.extension.len < buflen) - buflen = fcall->params.rstat.stat.extension.len + 1; + if (st->extension.len < buflen) + buflen = st->extension.len + 1; - memmove(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); + memmove(buffer, st->extension.str, buflen - 1); buffer[buflen-1] = 0; - dprintk(DEBUG_ERROR, "%s -> %.*s (%s)\n", dentry->d_name.name, fcall->params.rstat.stat.extension.len, - fcall->params.rstat.stat.extension.str, buffer); - retval = buflen; + P9_DPRINTK(P9_DEBUG_VFS, + "%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len, + st->extension.str, buffer); -FreeFcall: - kfree(fcall); - -ClunkFid: - v9fs_fid_clunk(v9ses, fid); + retval = buflen; +done: + kfree(st); return retval; } @@ -1084,14 +938,14 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer, if (buflen > PATH_MAX) buflen = PATH_MAX; - dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); retval = v9fs_readlink(dentry, link, buflen); if (retval > 0) { if ((ret = copy_to_user(buffer, link, retval)) != 0) { - dprintk(DEBUG_ERROR, "problem copying to user: %d\n", - ret); + P9_DPRINTK(P9_DEBUG_ERROR, + "problem copying to user: %d\n", ret); retval = ret; } } @@ -1112,7 +966,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - dprintk(DEBUG_VFS, "%s n", dentry->d_name.name); + P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); if (!link) link = ERR_PTR(-ENOMEM); @@ -1141,7 +995,7 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void { char *s = nd_get_link(nd); - dprintk(DEBUG_VFS, " %s %s\n", dentry->d_name.name, s); + P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, s); if (!IS_ERR(s)) __putname(s); } @@ -1149,66 +1003,24 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { - int err; - u32 fid, perm; + u32 perm; struct v9fs_session_info *v9ses; - struct v9fs_fid *dfid, *vfid = NULL; - struct inode *inode = NULL; + struct p9_fid *fid; v9ses = v9fs_inode2v9ses(dir); if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); + P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } - dfid = v9fs_fid_clone(dentry->d_parent); - if(IS_ERR(dfid)) { - err = PTR_ERR(dfid); - goto error; - } - perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, + P9_OREAD); + if (IS_ERR(fid)) + return PTR_ERR(fid); - err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, - perm, V9FS_OREAD, (char *) extension, &fid, NULL, NULL); - - if (err) - goto clunk_dfid; - - err = v9fs_t_clunk(v9ses, fid); - if (err) - goto clunk_dfid; - - vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); - if (IS_ERR(vfid)) { - err = PTR_ERR(vfid); - vfid = NULL; - goto clunk_dfid; - } - - inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto free_vfid; - } - - if(v9ses->cache) - dentry->d_op = &v9fs_cached_dentry_operations; - else - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); + p9_client_clunk(fid); return 0; - -free_vfid: - v9fs_fid_destroy(vfid); - -clunk_dfid: - v9fs_fid_clunk(v9ses, dfid); - -error: - return err; - } /** @@ -1224,8 +1036,8 @@ error: static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - symname); + P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, + dentry->d_name.name, symname); return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); } @@ -1247,11 +1059,11 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fid *oldfid; + struct p9_fid *oldfid; char *name; - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); oldfid = v9fs_fid_clone(old_dentry); @@ -1265,11 +1077,11 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, } sprintf(name, "%d\n", oldfid->fid); - retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); + retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); __putname(name); clunk_fid: - v9fs_fid_clunk(v9ses, oldfid); + p9_client_clunk(oldfid); return retval; } @@ -1288,7 +1100,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int retval; char *name; - dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 7bdf8b326841..f6a0519ade8c 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -37,10 +37,10 @@ #include #include #include +#include +#include -#include "debug.h" #include "v9fs.h" -#include "9p.h" #include "v9fs_vfs.h" #include "fid.h" @@ -107,41 +107,48 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct vfsmount *mnt) { struct super_block *sb = NULL; - struct v9fs_fcall *fcall = NULL; struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct v9fs_fid *root_fid = NULL; + struct p9_stat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; uid_t uid = current->fsuid; gid_t gid = current->fsgid; - int stat_result = 0; - int newfid = 0; + struct p9_fid *fid; int retval = 0; - dprintk(DEBUG_VFS, " \n"); + P9_DPRINTK(P9_DEBUG_VFS, " \n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; - if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { - dprintk(DEBUG_ERROR, "problem initiating session\n"); - retval = newfid; - goto out_free_session; + fid = v9fs_session_init(v9ses, dev_name, data); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + kfree(v9ses); + v9ses = NULL; + goto error; + } + + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto error; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); - goto out_close_session; + goto error; } v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { retval = PTR_ERR(inode); - goto put_back_sb; + goto error; } inode->i_uid = uid; @@ -150,54 +157,30 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, root = d_alloc_root(inode); if (!root) { retval = -ENOMEM; - goto put_back_sb; + goto error; } sb->s_root = root; + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + v9fs_fid_add(root, fid); - stat_result = v9fs_t_stat(v9ses, newfid, &fcall); - if (stat_result < 0) { - dprintk(DEBUG_ERROR, "stat error\n"); - v9fs_t_clunk(v9ses, newfid); - } else { - /* Setup the Root Inode */ - root_fid = v9fs_fid_create(v9ses, newfid); - if (root_fid == NULL) { - retval = -ENOMEM; - goto put_back_sb; - } - - retval = v9fs_fid_insert(root_fid, root); - if (retval < 0) { - kfree(fcall); - goto put_back_sb; - } - - root_fid->qid = fcall->params.rstat.stat.qid; - root->d_inode->i_ino = - v9fs_qid2ino(&fcall->params.rstat.stat.qid); - v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); - } + return simple_set_mnt(mnt, sb); - kfree(fcall); +error: + if (fid) + p9_client_clunk(fid); - if (stat_result < 0) { - retval = stat_result; - goto put_back_sb; + if (v9ses) { + v9fs_session_close(v9ses); + kfree(v9ses); } - return simple_set_mnt(mnt, sb); - -out_close_session: - v9fs_session_close(v9ses); -out_free_session: - kfree(v9ses); - return retval; + if (sb) { + up_write(&sb->s_umount); + deactivate_super(sb); + } -put_back_sb: - /* deactivate_super calls v9fs_kill_super which will frees the rest */ - up_write(&sb->s_umount); - deactivate_super(sb); return retval; } @@ -211,7 +194,7 @@ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; - dprintk(DEBUG_VFS, " %p\n", s); + P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); v9fs_dentry_release(s->s_root); /* clunk root */ @@ -219,7 +202,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); - dprintk(DEBUG_VFS, "exiting kill_super\n"); + P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } /** diff --git a/fs/Kconfig b/fs/Kconfig index 0fa0c1193e81..94b9d861bf9b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -2048,7 +2048,7 @@ config AFS_DEBUG config 9P_FS tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" - depends on INET && EXPERIMENTAL + depends on INET && NET_9P && EXPERIMENTAL help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. -- cgit v1.2.1 From 9523a841b109765f8779236d28be6458ee3a6824 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Fri, 13 Jul 2007 13:01:27 -0500 Subject: 9p: cache meta-data when cache=loose This patch expands the impact of the loose cache mode to allow for cached metadata increasing the performance of directory listings and other metadata read operations. Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_file.c | 14 ++++++++++++-- fs/9p/vfs_inode.c | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c1f7c027cfeb..2a40c2946d0a 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -67,10 +67,14 @@ int v9fs_file_open(struct inode *inode, struct file *file) return PTR_ERR(fid); err = p9_client_open(fid, omode); - if (err < 0) { + if (err < 0) { p9_client_clunk(fid); return err; } + if (omode & P9_OTRUNC) { + inode->i_size = 0; + inode->i_blocks = 0; + } } file->private_data = fid; @@ -151,6 +155,7 @@ v9fs_file_write(struct file *filp, const char __user * data, { int ret; struct p9_fid *fid; + struct inode *inode = filp->f_path.dentry->d_inode; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); @@ -160,7 +165,12 @@ v9fs_file_write(struct file *filp, const char __user * data, if (ret > 0) *offset += ret; - invalidate_inode_pages2(filp->f_path.dentry->d_inode->i_mapping); + if (*offset > inode->i_size) { + inode->i_size = *offset; + inode->i_blocks = (inode->i_size + 512 - 1) >> 9; + } + + invalidate_inode_pages2(inode->i_mapping); return ret; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c602c0e054a2..e5c45eed58a9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -716,6 +716,9 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE) + return simple_getattr(mnt, dentry, stat); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); -- cgit v1.2.1 From 9e2f6688c0b52882496aff576b009bc1f7eea0b8 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Fri, 13 Jul 2007 13:05:21 -0500 Subject: 9p: re-enable mount time debug option During reorganization, the mount time debug option was removed in favor of module-load-time parameters. However, the mount time option is still a useful for feature during debug and for user-fault isolation when the module is compiled into the kernel. Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 7 ++++++- fs/9p/vfs_super.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 4feb5ae63ecf..45c35986d49f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -43,7 +43,7 @@ enum { /* Options that take integer arguments */ - Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, + Opt_debug, Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_rfdno, Opt_wfdno, /* String options */ Opt_uname, Opt_remotename, @@ -56,6 +56,7 @@ enum { }; static match_table_t tokens = { + {Opt_debug, "debug=%x"}, {Opt_port, "port=%u"}, {Opt_msize, "msize=%u"}, {Opt_uid, "uid=%u"}, @@ -128,6 +129,10 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) } } switch (token) { + case Opt_debug: + v9ses->debug = option; + p9_debug_level = option; + break; case Opt_port: v9ses->port = option; break; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f6a0519ade8c..ba904371218b 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -217,7 +217,7 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; if (v9ses->debug != 0) - seq_printf(m, ",debug=%u", v9ses->debug); + seq_printf(m, ",debug=%x", v9ses->debug); if (v9ses->port != V9FS_PORT) seq_printf(m, ",port=%u", v9ses->port); if (v9ses->maxdata != 9000) -- cgit v1.2.1 From 7c9e3c2e3b0437d10a09b77769baf325b94aa436 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jul 2007 20:59:31 +0100 Subject: wrong order of arguments of ->readdir() Shows how many people are testing coda - the bug had been there for 5 years and results of stepping on it are not subtle. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/coda/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 9ddf5ed62162..898a86dde8f5 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -470,7 +470,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->readdir(host_file, filldir, dirent); + ret = host_file->f_op->readdir(host_file, dirent, filldir); file_accessed(host_file); } } -- cgit v1.2.1 From 5926c50b83b626991c8c38efbca2020ee96b215f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Jul 2007 09:46:30 +0200 Subject: [PATCH] sched: remove dead code from task_stime() Alexey Dobriyan noticed that task_stime() contains a piece of dead code. (which is a remnant of earlier versions of this code) Remove that code. Signed-off-by: Ingo Molnar --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 98e78e2f18d6..c6977796fafd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -332,7 +332,7 @@ static clock_t task_utime(struct task_struct *p) static clock_t task_stime(struct task_struct *p) { - clock_t stime = cputime_to_clock_t(p->stime); + clock_t stime; /* * Use CFS's precise accounting. (we subtract utime from -- cgit v1.2.1 From 8ea02606681beb41568c62ba060bdf51fc9ba14e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Jul 2007 09:46:31 +0200 Subject: [PATCH] sched: fix up fs/proc/array.c whitespace problems while changing task_stime() i noticed a whitespace style problem in array.c - fix it. While at it, fix all the other style problems too, most of them in the scheduler-stats related portions of array.c. There is no change in functionality: text data bss dec hex filename 4356 28 0 4384 1120 array.o-before 4356 28 0 4384 1120 array.o-after Signed-off-by: Ingo Molnar --- fs/proc/array.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index c6977796fafd..4cb81776a7ff 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #include #include @@ -76,9 +78,7 @@ #include #include -#include #include -#include #include #include "internal.h" @@ -87,10 +87,10 @@ do { memcpy(buffer, string, strlen(string)); \ buffer += strlen(string); } while (0) -static inline char * task_name(struct task_struct *p, char * buf) +static inline char *task_name(struct task_struct *p, char *buf) { int i; - char * name; + char *name; char tcomm[sizeof(p->comm)]; get_task_comm(tcomm, p); @@ -138,7 +138,7 @@ static const char *task_state_array[] = { "X (dead)" /* 32 */ }; -static inline const char * get_task_state(struct task_struct *tsk) +static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | TASK_INTERRUPTIBLE | @@ -156,7 +156,7 @@ static inline const char * get_task_state(struct task_struct *tsk) return *p; } -static inline char * task_state(struct task_struct *p, char *buffer) +static inline char *task_state(struct task_struct *p, char *buffer) { struct group_info *group_info; int g; @@ -172,8 +172,8 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); @@ -191,15 +191,15 @@ static inline char * task_state(struct task_struct *p, char *buffer) get_group_info(group_info); task_unlock(p); - for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++) - buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g)); + for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g)); put_group_info(group_info); buffer += sprintf(buffer, "\n"); return buffer; } -static char * render_sigset_t(const char *header, sigset_t *set, char *buffer) +static char *render_sigset_t(const char *header, sigset_t *set, char *buffer) { int i, len; @@ -239,7 +239,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, } } -static inline char * task_sig(struct task_struct *p, char *buffer) +static inline char *task_sig(struct task_struct *p, char *buffer) { unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; @@ -289,14 +289,14 @@ static inline char *task_cap(struct task_struct *p, char *buffer) cap_t(p->cap_effective)); } -int proc_pid_status(struct task_struct *task, char * buffer) +int proc_pid_status(struct task_struct *task, char *buffer) { - char * orig = buffer; + char *orig = buffer; struct mm_struct *mm = get_task_mm(task); buffer = task_name(task, buffer); buffer = task_state(task, buffer); - + if (mm) { buffer = task_mem(mm, buffer); mmput(mm); @@ -344,8 +344,7 @@ static clock_t task_stime(struct task_struct *p) return stime; } - -static int do_task_stat(struct task_struct *task, char * buffer, int whole) +static int do_task_stat(struct task_struct *task, char *buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; long priority, nice; @@ -353,7 +352,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigset_t sigign, sigcatch; char state; int res; - pid_t ppid = 0, pgid = -1, sid = -1; + pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; struct mm_struct *mm; unsigned long long start_time; @@ -424,7 +423,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) } rcu_read_unlock(); - if (!whole || num_threads<2) + if (!whole || num_threads < 2) wchan = get_wchan(task); if (!whole) { min_flt = task->min_flt; @@ -445,7 +444,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); - res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \ + res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", task->pid, @@ -471,7 +470,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) start_time, vsize, mm ? get_mm_rss(mm) : 0, - rsslim, + rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, mm ? mm->start_stack : 0, @@ -493,17 +492,17 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) task->rt_priority, task->policy, (unsigned long long)delayacct_blkio_ticks(task)); - if(mm) + if (mm) mmput(mm); return res; } -int proc_tid_stat(struct task_struct *task, char * buffer) +int proc_tid_stat(struct task_struct *task, char *buffer) { return do_task_stat(task, buffer, 0); } -int proc_tgid_stat(struct task_struct *task, char * buffer) +int proc_tgid_stat(struct task_struct *task, char *buffer) { return do_task_stat(task, buffer, 1); } @@ -512,12 +511,12 @@ int proc_pid_statm(struct task_struct *task, char *buffer) { int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; struct mm_struct *mm = get_task_mm(task); - + if (mm) { size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - return sprintf(buffer,"%d %d %d %d %d %d %d\n", + return sprintf(buffer, "%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } -- cgit v1.2.1 From d6d281684913dabb878e2f53219eed5df2cd867b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jun 2007 08:38:16 -0400 Subject: KVM: Remove kvmfs in favor of the anonymous inodes source kvm uses a pseudo filesystem, kvmfs, to generate inodes, a job that the new anonymous inodes source does much better. Cc: Davide Libenzi Signed-off-by: Avi Kivity --- fs/anon_inodes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 40fe3a3222e4..edc67486238f 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -139,6 +139,7 @@ err_put_filp: put_filp(file); return error; } +EXPORT_SYMBOL_GPL(anon_inode_getfd); /* * A single inode exist for all anon_inode files. Contrary to pipes, -- cgit v1.2.1 From bcd4f3acbaec102e2b8000c977ecc38dcd0fe367 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Jul 2007 14:41:49 +0200 Subject: splice: direct splicing updates ppos twice OGAWA Hirofumi reported that he's noticed nfsd read corruption in recent kernels, and did the hard work of discovering that it's due to splice updating the file position twice. This means that the next operation would start further ahead than it should. nfsd_vfs_read() splice_direct_to_actor() while(len) { do_splice_to() [update sd->pos] -> generic_file_splice_read() [read from sd->pos] nfsd_direct_splice_actor() -> __splice_from_pipe() [update sd->pos] There's nothing wrong with the core splice code, but the direct splicing is an addon that calls both input and output paths. So it has to take care in locally caching offset so it remains correct. Signed-off-by: Jens Axboe --- fs/splice.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 6c9828651e6f..53fc2082a468 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1061,8 +1061,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, while (len) { size_t read_len; + loff_t pos = sd->pos; - ret = do_splice_to(in, &sd->pos, pipe, len, flags); + ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -1080,6 +1081,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, bytes += ret; len -= ret; + sd->pos = pos; if (ret < read_len) goto out_release; -- cgit v1.2.1 From fe28e42b99173ba088b1d8448e53029e100bff27 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jul 2007 23:37:18 -0700 Subject: jbd commit: fix transaction dropping We have to check that also the second checkpoint list is non-empty before dropping the transaction. Signed-off-by: Jan Kara Cc: Chuck Ebbert Cc: Kirill Korotaev Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 1facfaff97cb..a003d50edcdb 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -887,7 +887,8 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { -- cgit v1.2.1 From f89b779508f457b2af05dd5ac82a425db4a56ee3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jul 2007 23:37:20 -0700 Subject: jbd2 commit: fix transaction dropping We have to check that also the second checkpoint list is non-empty before dropping the transaction. Signed-off-by: Jan Kara Cc: Chuck Ebbert Cc: Kirill Korotaev Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd2/commit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2856e1100a5f..c0f59d1b13dc 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -896,7 +896,8 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { -- cgit v1.2.1 From fc9a07e7bf1a76e710f5df017abb07628db1781d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:38:14 -0700 Subject: invalidate_mapping_pages(): add cond_resched invalidate_mapping_pages() can sometimes take a long time (millions of pages to free). Long enough for the softlockup detector to trigger. We used to have a cond_resched() in there but I took it out because the drop_caches code calls invalidate_mapping_pages() under inode_lock. The patch adds a nasty flag and puts the cond_resched() back. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 03ea7696fe39..59375efcf39d 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_WILL_FREE)) continue; - invalidate_mapping_pages(inode->i_mapping, 0, -1); + __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); } spin_unlock(&inode_lock); } -- cgit v1.2.1 From 786d7e1612f0b0adb6046f19b906609e4fe8b1ba Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:39:00 -0700 Subject: Fix rmmod/read/write races in /proc entries Fix following races: =========================================== 1. Write via ->write_proc sleeps in copy_from_user(). Module disappears meanwhile. Or, more generically, system call done on /proc file, method supplied by module is called, module dissapeares meanwhile. pde = create_proc_entry() if (!pde) return -ENOMEM; pde->write_proc = ... open write copy_from_user pde = create_proc_entry(); if (!pde) { remove_proc_entry(); return -ENOMEM; /* module unloaded */ } *boom* ========================================== 2. bogo-revoke aka proc_kill_inodes() remove_proc_entry vfs_read proc_kill_inodes [check ->f_op validness] [check ->f_op->read validness] [verify_area, security permissions checks] ->f_op = NULL; if (file->f_op->read) /* ->f_op dereference, boom */ NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's see how this scheme behaves, then extend if needed for directories. Directories creators in /proc only set ->owner for them, so proxying for directories may be unneeded. NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write, ->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release. If your in-tree module uses something else, yell on me. Full audit pending. [akpm@linux-foundation.org: build fix] Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 32 ++++++- fs/proc/inode.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 283 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8a40e15f5ecb..4f8e53568b22 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "internal.h" @@ -613,6 +614,9 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent, ent->namelen = len; ent->mode = mode; ent->nlink = nlink; + ent->pde_users = 0; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; out: return ent; } @@ -734,9 +738,35 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = *p; *p = de->next; de->next = NULL; + + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. */ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; + + spin_unlock(&de->pde_unload_lock); + spin_unlock(&proc_subdir_lock); + + wait_for_completion(de->pde_unload_completion); + + spin_lock(&proc_subdir_lock); + goto continue_removing; + } + spin_unlock(&de->pde_unload_lock); + +continue_removing: if (S_ISDIR(de->mode)) parent->nlink--; - proc_kill_inodes(de); + if (!S_ISREG(de->mode)) + proc_kill_inodes(de); de->nlink = 0; WARN_ON(de->subdir); if (!atomic_read(&de->count)) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d5ce65c68d7b..dd28e86ab422 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,251 @@ static const struct super_operations proc_sops = { .remount_fs = proc_remount, }; +static void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + pde->pde_users--; + if (pde->pde_unload_completion && pde->pde_users == 0) + complete(pde->pde_unload_completion); + spin_unlock(&pde->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + loff_t rv = -EINVAL; + loff_t (*llseek)(struct file *, loff_t, int); + + spin_lock(&pde->pde_unload_lock); + /* + * remove_proc_entry() is going to delete PDE (as part of module + * cleanup sequence). No new callers into module allowed. + */ + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + /* + * Bump refcount so that remove_proc_entry will wail for ->llseek to + * complete. + */ + pde->pde_users++; + /* + * Save function pointer under lock, to protect against ->proc_fops + * NULL'ifying right after ->pde_unload_lock is dropped. + */ + llseek = pde->proc_fops->llseek; + spin_unlock(&pde->pde_unload_lock); + + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + read = pde->proc_fops->read; + spin_unlock(&pde->pde_unload_lock); + + if (read) + rv = read(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + write = pde->proc_fops->write; + spin_unlock(&pde->pde_unload_lock); + + if (write) + rv = write(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + unsigned int rv = 0; + unsigned int (*poll)(struct file *, struct poll_table_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + poll = pde->proc_fops->poll; + spin_unlock(&pde->pde_unload_lock); + + if (poll) + rv = poll(file, pts); + + pde_users_dec(pde); + return rv; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); + int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + unlocked_ioctl = pde->proc_fops->unlocked_ioctl; + ioctl = pde->proc_fops->ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (unlocked_ioctl) { + rv = unlocked_ioctl(file, cmd, arg); + if (rv == -ENOIOCTLCMD) + rv = -EINVAL; + } else if (ioctl) { + lock_kernel(); + rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); + unlock_kernel(); + } + + pde_users_dec(pde); + return rv; +} + +#ifdef CONFIG_COMPAT +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + compat_ioctl = pde->proc_fops->compat_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} +#endif + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + int rv = -EIO; + int (*mmap)(struct file *, struct vm_area_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + mmap = pde->proc_fops->mmap; + spin_unlock(&pde->pde_unload_lock); + + if (mmap) + rv = mmap(file, vma); + + pde_users_dec(pde); + return rv; +} + +static int proc_reg_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*open)(struct inode *, struct file *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + open = pde->proc_fops->open; + spin_unlock(&pde->pde_unload_lock); + + if (open) + rv = open(inode, file); + + pde_users_dec(pde); + return rv; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*release)(struct inode *, struct file *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + release = pde->proc_fops->release; + spin_unlock(&pde->pde_unload_lock); + + if (release) + rv = release(inode, file); + + pde_users_dec(pde); + return rv; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_reg_compat_ioctl, +#endif + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; + struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, struct proc_dir_entry *de) { @@ -166,8 +412,12 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, inode->i_nlink = de->nlink; if (de->proc_iops) inode->i_op = de->proc_iops; - if (de->proc_fops) - inode->i_fop = de->proc_fops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) + inode->i_fop = &proc_reg_file_ops; + else + inode->i_fop = de->proc_fops; + } } return inode; -- cgit v1.2.1 From 924b42d5a2dbe508407a0a6290d3751f826bccdd Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Sun, 15 Jul 2007 23:39:42 -0700 Subject: Use boot based time for process start time and boot time in /proc Commit 411187fb05cd11676b0979d9fbf3291db69dbce2 caused boot time to move and process start times to become invalid after suspend. Using boot based time for those restores the old behaviour and fixes the issue. [akpm@linux-foundation.org: little cleanup] Signed-off-by: Tomas Janousek Cc: Tomas Smetana Acked-by: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 5 +++-- fs/proc/proc_misc.c | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 98e78e2f18d6..680c913575f0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -440,8 +440,9 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) /* Temporary variable needed for gcc-2.96 */ /* convert timespec -> nsec*/ - start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC - + task->start_time.tv_nsec; + start_time = + (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC + + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5fd49e47f83a..19c9cbf1c320 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -443,12 +443,12 @@ static int show_stat(struct seq_file *p, void *v) unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; u64 sum = 0; + struct timespec boottime; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; - jif = - wall_to_monotonic.tv_sec; - if (wall_to_monotonic.tv_nsec) - --jif; + getboottime(&boottime); + jif = boottime.tv_sec; for_each_possible_cpu(i) { int j; -- cgit v1.2.1 From d62141414a55ff3f1410b27db2a95224446e77a4 Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Sun, 15 Jul 2007 23:39:42 -0700 Subject: Use boot based time for uptime in /proc Commit 411187fb05cd11676b0979d9fbf3291db69dbce2 caused uptime not to increase during suspend. This may cause confusion so I restore the old behaviour by using the boot based time instead of monotonic for uptime. Signed-off-by: Tomas Janousek Acked-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 19c9cbf1c320..d24b8d46059a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -105,6 +105,7 @@ static int uptime_read_proc(char *page, char **start, off_t off, cputime_t idletime = cputime_add(init_task.utime, init_task.stime); do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); cputime_to_timespec(idletime, &idle); len = sprintf(page,"%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, -- cgit v1.2.1 From c9c64155f5a81b4b41e98f9fb9c464a565c1bf72 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 15 Jul 2007 23:39:43 -0700 Subject: UDF: check for allocated memory for data of new inodes Add checking for granted memory for inode data at the moment of its creation. Signed-off-by: Cyrill Gorcunov Cc: Jan Kara Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/ialloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 8206983f2ebf..10f3188738af 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -50,7 +50,7 @@ void udf_free_inode(struct inode * inode) else UDF_SB_LVIDIU(sb)->numFiles = cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1); - + mark_buffer_dirty(sbi->s_lvidbh); } mutex_unlock(&sbi->s_alloc_mutex); @@ -136,6 +136,13 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err) UDF_I_EFE(inode) = 0; UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); } + if (!UDF_I_DATA(inode)) + { + iput(inode); + *err = -ENOMEM; + mutex_unlock(&sbi->s_alloc_mutex); + return NULL; + } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) -- cgit v1.2.1 From 647bd61a5f3a51a38c670f91af9d861ad66149a3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 15 Jul 2007 23:39:47 -0700 Subject: UDF: check for allocated memory for inode data This patch adds checking for granted memory while filling up inode data to prevent possible NULL pointer usage. If there is not enough memory to fill inode data we just mark it as "bad". Also some whitespace cleanup. Signed-off-by: Cyrill Gorcunov Cc: Jan Kara Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/inode.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index bf7de0bdbab3..5b82e489af78 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -49,6 +49,7 @@ MODULE_LICENSE("GPL"); static mode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); +static int udf_alloc_i_data(struct inode *inode, size_t size); static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, long *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, @@ -734,7 +735,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, int newbl (*c) ++; (*endnum) ++; } - + laarr[curr].extLocation.logicalBlockNum = newblocknum; if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) laarr[curr].extLocation.partitionReferenceNum = @@ -836,7 +837,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, { numalloc -= elen; if (*endnum > (i+1)) - memmove(&laarr[i], &laarr[i+1], + memmove(&laarr[i], &laarr[i+1], sizeof(long_ad) * (*endnum - (i+1))); i --; (*endnum) --; @@ -1024,7 +1025,7 @@ void udf_truncate(struct inode * inode) { block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); udf_truncate_extents(inode); - } + } inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); if (IS_SYNC(inode)) @@ -1087,10 +1088,10 @@ __udf_read_inode(struct inode *inode) { kernel_lb_addr loc; ie = (struct indirectEntry *)ibh->b_data; - + loc = lelb_to_cpu(ie->indirectICB.extLocation); - - if (ie->indirectICB.extLength && + + if (ie->indirectICB.extLength && (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident))) { if (ident == TAG_IDENT_FE || @@ -1156,14 +1157,22 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) { UDF_I_EFE(inode) = 1; UDF_I_USE(inode) = 0; - UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry))) + { + make_bad_inode(inode); + return; + } memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); } else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE) { UDF_I_EFE(inode) = 0; UDF_I_USE(inode) = 0; - UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct fileEntry))) + { + make_bad_inode(inode); + return; + } memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); } else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE) @@ -1173,7 +1182,11 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) UDF_I_LENALLOC(inode) = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs); - UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry), GFP_KERNEL); + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry))) + { + make_bad_inode(inode); + return; + } memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); return; } @@ -1191,7 +1204,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_nlink = le16_to_cpu(fe->fileLinkCount); if (!inode->i_nlink) inode->i_nlink = 1; - + inode->i_size = le64_to_cpu(fe->informationLength); UDF_I_LENEXTENTS(inode) = inode->i_size; @@ -1243,7 +1256,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } else { - inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << + inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); if ( udf_stamp_to_time(&convtime, &convtime_usec, @@ -1374,6 +1387,20 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } } +static int udf_alloc_i_data(struct inode *inode, size_t size) +{ + UDF_I_DATA(inode) = kmalloc(size, GFP_KERNEL); + + if (!UDF_I_DATA(inode)) + { + printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) no free memory\n", + inode->i_ino); + return -ENOMEM; + } + + return 0; +} + static mode_t udf_convert_permissions(struct fileEntry *fe) { @@ -2072,7 +2099,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, mark_buffer_dirty_inode(oepos.bh, inode); } } - + brelse(epos.bh); brelse(oepos.bh); return (elen >> 30); -- cgit v1.2.1 From 85420ccad1610f123365eec89848ef25641bc6b7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:39:50 -0700 Subject: vxfs warning fixes gcc-4.3: fs/freevxfs/vxfs_lookup.c: In function 'vxfs_find_entry': fs/freevxfs/vxfs_lookup.c:139: warning: cast from pointer to integer of different size fs/freevxfs/vxfs_lookup.c: In function 'vxfs_readdir': fs/freevxfs/vxfs_lookup.c:294: warning: cast from pointer to integer of different size Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/freevxfs/vxfs_dir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index 8a4dfef1ddad..3c96d6e63978 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -80,7 +80,7 @@ struct vxfs_direct { * a d_name with size len. */ #define VXFS_DIRPAD 4 -#define VXFS_NAMEMIN ((int)((struct vxfs_direct *)0)->d_name) +#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name) #define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) #define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) -- cgit v1.2.1 From a6a8bd6d2839f7134f191c6e13e2fd2e9e8c91a6 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Sun, 15 Jul 2007 23:39:52 -0700 Subject: Make AFS use seq_list_xxx helpers These proc files show some header before dumping the list, so the seq_list_start_head() is used. Signed-off-by: Pavel Emelianov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/proc.c | 81 ++++++++--------------------------------------------------- 1 file changed, 11 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 13df512aea9e..6edb56683b9a 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -201,23 +201,9 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) { - struct list_head *_p; - loff_t pos = *_pos; - /* lock the list against modification */ down_read(&afs_proc_cells_sem); - - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &afs_proc_cells) - if (!pos--) - break; - - return _p != &afs_proc_cells ? _p : NULL; + return seq_list_start_head(&afs_proc_cells, *_pos); } /* @@ -225,14 +211,7 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) */ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) { - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = v == (void *) 1 ? afs_proc_cells.next : _p->next; - - return _p != &afs_proc_cells ? _p : NULL; + return seq_list_next(v, &afs_proc_cells, pos); } /* @@ -250,7 +229,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) { struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); - if (v == (void *) 1) { + if (v == &afs_proc_cells) { /* display header on line 1 */ seq_puts(m, "USE NAME\n"); return 0; @@ -503,26 +482,13 @@ static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) { - struct list_head *_p; struct afs_cell *cell = m->private; - loff_t pos = *_pos; _enter("cell=%p pos=%Ld", cell, *_pos); /* lock the list against modification */ down_read(&cell->vl_sem); - - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &cell->vl_list) - if (!pos--) - break; - - return _p != &cell->vl_list ? _p : NULL; + return seq_list_start_head(&cell->vl_list, *_pos); } /* @@ -531,17 +497,10 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *_pos) { - struct list_head *_p; struct afs_cell *cell = p->private; _enter("cell=%p pos=%Ld", cell, *_pos); - - (*_pos)++; - - _p = v; - _p = (v == (void *) 1) ? cell->vl_list.next : _p->next; - - return (_p != &cell->vl_list) ? _p : NULL; + return seq_list_next(v, &cell->vl_list, _pos); } /* @@ -569,11 +528,12 @@ const char afs_vlocation_states[][4] = { */ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { + struct afs_cell *cell = m->private; struct afs_vlocation *vlocation = list_entry(v, struct afs_vlocation, link); /* display header on line 1 */ - if (v == (void *) 1) { + if (v == &cell->vl_list) { seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); return 0; } @@ -734,26 +694,13 @@ static int afs_proc_cell_servers_release(struct inode *inode, static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) __acquires(m->private->servers_lock) { - struct list_head *_p; struct afs_cell *cell = m->private; - loff_t pos = *_pos; _enter("cell=%p pos=%Ld", cell, *_pos); /* lock the list against modification */ read_lock(&cell->servers_lock); - - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &cell->servers) - if (!pos--) - break; - - return _p != &cell->servers ? _p : NULL; + return seq_list_start_head(&cell->servers, *_pos); } /* @@ -762,17 +709,10 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, loff_t *_pos) { - struct list_head *_p; struct afs_cell *cell = p->private; _enter("cell=%p pos=%Ld", cell, *_pos); - - (*_pos)++; - - _p = v; - _p = v == (void *) 1 ? cell->servers.next : _p->next; - - return _p != &cell->servers ? _p : NULL; + return seq_list_next(v, &cell->servers, _pos); } /* @@ -791,11 +731,12 @@ static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) */ static int afs_proc_cell_servers_show(struct seq_file *m, void *v) { + struct afs_cell *cell = m->private; struct afs_server *server = list_entry(v, struct afs_server, link); char ipaddr[20]; /* display header on line 1 */ - if (v == (void *) 1) { + if (v == &cell->servers) { seq_puts(m, "USE ADDR STATE\n"); return 0; } -- cgit v1.2.1 From 25216b00395dfb52bfe06e4886a6ad831ede7b4b Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Sun, 15 Jul 2007 23:39:54 -0700 Subject: Make /proc/tty/drivers use seq_list_xxx helpers Simple and stupid like some previous ones. Just use new API. Signed-off-by: Pavel Emelianov Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_tty.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index b3a473b0a191..22846225acfa 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -69,7 +69,7 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p, static int show_tty_driver(struct seq_file *m, void *v) { - struct tty_driver *p = v; + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); dev_t from = MKDEV(p->major, p->minor_start); dev_t to = from + p->num; @@ -106,22 +106,13 @@ static int show_tty_driver(struct seq_file *m, void *v) /* iterator */ static void *t_start(struct seq_file *m, loff_t *pos) { - struct list_head *p; - loff_t l = *pos; - mutex_lock(&tty_mutex); - list_for_each(p, &tty_drivers) - if (!l--) - return list_entry(p, struct tty_driver, tty_drivers); - return NULL; + return seq_list_start(&tty_drivers, *pos); } static void *t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; - (*pos)++; - return p==&tty_drivers ? NULL : - list_entry(p, struct tty_driver, tty_drivers); + return seq_list_next(v, &tty_drivers, pos); } static void t_stop(struct seq_file *m, void *v) -- cgit v1.2.1 From b0765fb85782da9dca98482ebb1ae0d8c1a5e0f7 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Sun, 15 Jul 2007 23:39:55 -0700 Subject: Make /proc/self/mounts(tats) use seq_list_xxx helpers One more simple and stupid switching to the new API. Signed-off-by: Pavel Emelianov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b696e3a0d18f..c811a94e4c88 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -320,22 +320,16 @@ EXPORT_SYMBOL(mnt_unpin); static void *m_start(struct seq_file *m, loff_t *pos) { struct mnt_namespace *n = m->private; - struct list_head *p; - loff_t l = *pos; down_read(&namespace_sem); - list_for_each(p, &n->list) - if (!l--) - return list_entry(p, struct vfsmount, mnt_list); - return NULL; + return seq_list_start(&n->list, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct mnt_namespace *n = m->private; - struct list_head *p = ((struct vfsmount *)v)->mnt_list.next; - (*pos)++; - return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list); + + return seq_list_next(v, &n->list, pos); } static void m_stop(struct seq_file *m, void *v) @@ -350,7 +344,7 @@ static inline void mangle(struct seq_file *m, const char *s) static int show_vfsmnt(struct seq_file *m, void *v) { - struct vfsmount *mnt = v; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); int err = 0; static struct proc_fs_info { int flag; @@ -405,7 +399,7 @@ struct seq_operations mounts_op = { static int show_vfsstat(struct seq_file *m, void *v) { - struct vfsmount *mnt = v; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); int err = 0; /* device */ -- cgit v1.2.1 From 259902ea951008bcbd31a49f667062ff8012ef55 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Sun, 15 Jul 2007 23:39:56 -0700 Subject: Make NFS client use seq_list_xxx helpers This includes /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes entries. Both need to show the header and use the list_head. Signed-off-by: Pavel Emelianov Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/client.c | 54 ++++++------------------------------------------------ 1 file changed, 6 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ccb455053ee4..a49f9feff776 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1206,23 +1206,9 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) { - struct list_head *_p; - loff_t pos = *_pos; - /* lock the list against modification */ spin_lock(&nfs_client_lock); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &nfs_client_list) - if (!pos--) - break; - - return _p != &nfs_client_list ? _p : NULL; + return seq_list_start_head(&nfs_client_list, *_pos); } /* @@ -1230,14 +1216,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next; - - return _p != &nfs_client_list ? _p : NULL; + return seq_list_next(v, &nfs_client_list, pos); } /* @@ -1256,7 +1235,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) struct nfs_client *clp; /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { + if (v == &nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } @@ -1297,23 +1276,9 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) { - struct list_head *_p; - loff_t pos = *_pos; - /* lock the list against modification */ spin_lock(&nfs_client_lock); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &nfs_volume_list) - if (!pos--) - break; - - return _p != &nfs_volume_list ? _p : NULL; + return seq_list_start_head(&nfs_volume_list, *_pos); } /* @@ -1321,14 +1286,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next; - - return _p != &nfs_volume_list ? _p : NULL; + return seq_list_next(v, &nfs_volume_list, pos); } /* @@ -1349,7 +1307,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) char dev[8], fsid[17]; /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { + if (v == &nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID\n"); return 0; } -- cgit v1.2.1 From 9aacd599342fdfc1fb9422f37e900609b7a46249 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 15 Jul 2007 23:39:56 -0700 Subject: fat: gcc 4.3 warning fix This patch fixes the following warnings. fs/fat/dir.c: In function 'fat_parse_long': include/linux/msdos_fs.h:294: warning: array subscript is above array bounds include/linux/msdos_fs.h:295: warning: array subscript is above array bounds include/linux/msdos_fs.h:295: warning: array subscript is above array bounds The ->name is defined as "name[8], ext[3]", but fat_checksum() uses those as name[11]. There is no actual problem, but it's not a good manner. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 31 ++++++++++++++++++------------- fs/fat/inode.c | 3 +-- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ccf161dffb63..72cbcd61bd95 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -313,7 +313,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name, wchar_t bufuname[14]; unsigned char xlate_len, nr_slots; wchar_t *unicode = NULL; - unsigned char work[8], bufname[260]; /* 256 + 4 */ + unsigned char work[MSDOS_NAME], bufname[260]; /* 256 + 4 */ int uni_xlate = sbi->options.unicode_xlate; int utf8 = sbi->options.utf8; int anycase = (sbi->options.name_check != 's'); @@ -351,7 +351,8 @@ parse_record: if (work[0] == 0x05) work[0] = 0xE5; for (i = 0, j = 0, last_u = 0; i < 8;) { - if (!work[i]) break; + if (!work[i]) + break; chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, &bufuname[j++], opt_shortname, de->lcase & CASE_LOWER_BASE); @@ -365,13 +366,15 @@ parse_record: } j = last_u; fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); - for (i = 0; i < 3;) { - if (!de->ext[i]) break; - chl = fat_shortname2uni(nls_disk, &de->ext[i], 3 - i, + for (i = 8; i < MSDOS_NAME;) { + if (!work[i]) + break; + chl = fat_shortname2uni(nls_disk, &work[i], + MSDOS_NAME - i, &bufuname[j++], opt_shortname, de->lcase & CASE_LOWER_EXT); if (chl <= 1) { - if (de->ext[i] != ' ') + if (work[i] != ' ') last_u = j; } else { last_u = j; @@ -445,7 +448,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, int fill_len; wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char c, work[8], bufname[56], *ptname = bufname; + unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; unsigned long lpos, dummy, *furrfu = &lpos; int uni_xlate = sbi->options.unicode_xlate; int isvfat = sbi->options.isvfat; @@ -527,7 +530,8 @@ parse_record: if (work[0] == 0x05) work[0] = 0xE5; for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { - if (!(c = work[i])) break; + if (!(c = work[i])) + break; chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, &bufuname[j++], opt_shortname, de->lcase & CASE_LOWER_BASE); @@ -549,9 +553,10 @@ parse_record: j = last_u; fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); ptname[i++] = '.'; - for (i2 = 0; i2 < 3;) { - if (!(c = de->ext[i2])) break; - chl = fat_shortname2uni(nls_disk, &de->ext[i2], 3 - i2, + for (i2 = 8; i2 < MSDOS_NAME;) { + if (!(c = work[i2])) + break; + chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2, &bufuname[j++], opt_shortname, de->lcase & CASE_LOWER_EXT); if (chl <= 1) { @@ -563,8 +568,8 @@ parse_record: } } else { last_u = j; - for (chi = 0; chi < chl && i2 < 3; chi++) { - ptname[i++] = de->ext[i2++]; + for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) { + ptname[i++] = work[i2++]; last = i; } } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 479722d89667..cfaf5877d98b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -354,8 +354,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = MSDOS_MKMODE(de->attr, - ((sbi->options.showexec && - !is_exec(de->ext)) + ((sbi->options.showexec && !is_exec(de->name + 8)) ? S_IRUGO|S_IWUGO : S_IRWXUGO) & ~sbi->options.fs_fmask) | S_IFREG; MSDOS_I(inode)->i_start = le16_to_cpu(de->start); -- cgit v1.2.1 From d05e96fe4664c0398f1d060c0f6a8ff8fc74a449 Mon Sep 17 00:00:00 2001 From: Denver Gingerich Date: Sun, 15 Jul 2007 23:39:59 -0700 Subject: fix compiler warnings in acorn.c warning: 'adfs_partition' defined but not used warning: 'riscix_partition' defined but not used warning: 'linux_partition' defined but not used Signed-off-by: Denver Gingerich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/acorn.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index e3491328596b..3d3e16631472 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -25,6 +25,8 @@ #define PARTITION_RISCIX_SCSI 2 #define PARTITION_LINUX 9 +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) static struct adfs_discrecord * adfs_partition(struct parsed_partitions *state, char *name, char *data, unsigned long first_sector, int slot) @@ -48,6 +50,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data, put_partition(state, slot, first_sector, nr_sects); return dr; } +#endif #ifdef CONFIG_ACORN_PARTITION_RISCIX @@ -65,6 +68,8 @@ struct riscix_record { struct riscix_part part[8]; }; +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) static int riscix_partition(struct parsed_partitions *state, struct block_device *bdev, unsigned long first_sect, int slot, unsigned long nr_sects) @@ -105,6 +110,7 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev, return slot; } #endif +#endif #define LINUX_NATIVE_MAGIC 0xdeafa1de #define LINUX_SWAP_MAGIC 0xdeafab1e @@ -115,6 +121,8 @@ struct linux_part { __le32 nr_sects; }; +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) static int linux_partition(struct parsed_partitions *state, struct block_device *bdev, unsigned long first_sect, int slot, unsigned long nr_sects) @@ -146,6 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, put_dev_sector(sect); return slot; } +#endif #ifdef CONFIG_ACORN_PARTITION_CUMANA int -- cgit v1.2.1 From c3ed85a36ff5b01f340db67ac5ae6e699d3b8a2b Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 15 Jul 2007 23:40:03 -0700 Subject: isofs: fix up CodingStyle fs/isofs/* had a bunch of CodingStyle issues. * Indentation was a mix of spaces and tabs * "int * foo" instead of "int *foo" * "while ( foo )" instead of "while (foo)" * if (foo) blah; on one line instead of two * Missing printk KERN_ levels * lots of trailing whitespace * lines >80 columns changed to wrap. * Unnecessary prototype removed by shuffling code order in C file. Should be no functional changes other than slight size increase due to printk changes. Further improvement possible, but this is a start.. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/dir.c | 87 ++++++------ fs/isofs/inode.c | 415 ++++++++++++++++++++++++++++-------------------------- fs/isofs/joliet.c | 10 +- fs/isofs/namei.c | 26 ++-- 4 files changed, 277 insertions(+), 261 deletions(-) (limited to 'fs') diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 0e94c31cad9b..1ba407c64df1 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -7,34 +7,18 @@ * * Steve Beynon : Missing last directory entries fixed * (stephen@askone.demon.co.uk) : 21st June 1996 - * + * * isofs directory handling functions */ #include #include "isofs.h" -static int isofs_readdir(struct file *, void *, filldir_t); - -const struct file_operations isofs_dir_operations = -{ - .read = generic_read_dir, - .readdir = isofs_readdir, -}; - -/* - * directories can handle most operations... - */ -const struct inode_operations isofs_dir_inode_operations = -{ - .lookup = isofs_lookup, -}; - int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) { char * old = de->name; int len = de->name_len[0]; int i; - + for (i = 0; i < len; i++) { unsigned char c = old[i]; if (!c) @@ -62,22 +46,27 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod } /* Acorn extensions written by Matthew Wilcox 1998 */ -int get_acorn_filename(struct iso_directory_record * de, - char * retname, struct inode * inode) +int get_acorn_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) { int std; - unsigned char * chr; + unsigned char *chr; int retnamlen = isofs_name_translate(de, retname, inode); - if (retnamlen == 0) return 0; + + if (retnamlen == 0) + return 0; std = sizeof(struct iso_directory_record) + de->name_len[0]; - if (std & 1) std++; - if ((*((unsigned char *) de) - std) != 32) return retnamlen; + if (std & 1) + std++; + if ((*((unsigned char *) de) - std) != 32) + return retnamlen; chr = ((unsigned char *) de) + std; - if (strncmp(chr, "ARCHIMEDES", 10)) return retnamlen; - if ((*retname == '_') && ((chr[19] & 1) == 1)) *retname = '!'; + if (strncmp(chr, "ARCHIMEDES", 10)) + return retnamlen; + if ((*retname == '_') && ((chr[19] & 1) == 1)) + *retname = '!'; if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) - && ((chr[12] & 0xf0) == 0xf0)) - { + && ((chr[12] & 0xf0) == 0xf0)) { retname[retnamlen] = ','; sprintf(retname+retnamlen+1, "%3.3x", ((chr[12] & 0xf) << 8) | chr[11]); @@ -91,7 +80,7 @@ int get_acorn_filename(struct iso_directory_record * de, */ static int do_isofs_readdir(struct inode *inode, struct file *filp, void *dirent, filldir_t filldir, - char * tmpname, struct iso_directory_record * tmpde) + char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); @@ -121,9 +110,11 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, de_len = *(unsigned char *) de; - /* If the length byte is zero, we should move on to the next - CDROM sector. If we are at the end of the directory, we - kick out of the while loop. */ + /* + * If the length byte is zero, we should move on to the next + * CDROM sector. If we are at the end of the directory, we + * kick out of the while loop. + */ if (de_len == 0) { brelse(bh); @@ -157,11 +148,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if (first_de) { isofs_normalize_block_and_offset(de, - &block_saved, - &offset_saved); + &block_saved, + &offset_saved); inode_number = isofs_get_ino(block_saved, - offset_saved, - bufbits); + offset_saved, bufbits); } if (de->flags[-sbi->s_high_sierra] & 0x80) { @@ -199,7 +189,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, */ if ((sbi->s_hide == 'y' && (de->flags[-sbi->s_high_sierra] & 1)) || - (sbi->s_showassoc =='n' && + (sbi->s_showassoc =='n' && (de->flags[-sbi->s_high_sierra] & 4))) { filp->f_pos += de_len; continue; @@ -240,7 +230,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, continue; } - if (bh) brelse(bh); + if (bh) + brelse(bh); return 0; } @@ -253,8 +244,8 @@ static int isofs_readdir(struct file *filp, void *dirent, filldir_t filldir) { int result; - char * tmpname; - struct iso_directory_record * tmpde; + char *tmpname; + struct iso_directory_record *tmpde; struct inode *inode = filp->f_path.dentry->d_inode; tmpname = (char *)__get_free_page(GFP_KERNEL); @@ -270,3 +261,19 @@ static int isofs_readdir(struct file *filp, unlock_kernel(); return result; } + +const struct file_operations isofs_dir_operations = +{ + .read = generic_read_dir, + .readdir = isofs_readdir, +}; + +/* + * directories can handle most operations... + */ +const struct inode_operations isofs_dir_inode_operations = +{ + .lookup = isofs_lookup, +}; + + diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 5c3eecf7542e..15c866f1a1fd 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -73,20 +73,20 @@ static void isofs_destroy_inode(struct inode *inode) kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags) +static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) { struct iso_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", - sizeof(struct iso_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once, NULL); + sizeof(struct iso_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once, NULL); if (isofs_inode_cachep == NULL) return -ENOMEM; return 0; @@ -150,9 +150,9 @@ struct iso9660_options{ uid_t uid; char *iocharset; unsigned char utf8; - /* LVE */ - s32 session; - s32 sbsector; + /* LVE */ + s32 session; + s32 sbsector; }; /* @@ -360,10 +360,12 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->check = 'u'; /* unset */ popt->nocompress = 0; popt->blocksize = 1024; - popt->mode = S_IRUGO | S_IXUGO; /* r-x for all. The disc could - be shared with DOS machines so - virtually anything could be - a valid executable. */ + popt->mode = S_IRUGO | S_IXUGO; /* + * r-x for all. The disc could + * be shared with DOS machines so + * virtually anything could be + * a valid executable. + */ popt->gid = 0; popt->uid = 0; popt->iocharset = NULL; @@ -503,30 +505,30 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) Te.cdte_format=CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); if (!i) { - printk(KERN_DEBUG "Session %d start %d type %d\n", - session, Te.cdte_addr.lba, - Te.cdte_ctrl&CDROM_DATA_TRACK); + printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", + session, Te.cdte_addr.lba, + Te.cdte_ctrl&CDROM_DATA_TRACK); if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) return Te.cdte_addr.lba; } - - printk(KERN_ERR "Invalid session number or type of track\n"); + + printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); } i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); if (session > 0) - printk(KERN_ERR "Invalid session number\n"); + printk(KERN_ERR "ISOFS: Invalid session number\n"); #if 0 - printk("isofs.inode: CDROMMULTISESSION: rc=%d\n",i); + printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); if (i==0) { - printk("isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); - printk("isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); + printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); + printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); } #endif if (i==0) #if WE_OBEY_THE_WRITTEN_STANDARDS - if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ #endif - vol_desc_start=ms_info.addr.lba; + vol_desc_start=ms_info.addr.lba; return vol_desc_start; } @@ -538,20 +540,20 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) */ static int isofs_fill_super(struct super_block *s, void *data, int silent) { - struct buffer_head * bh = NULL, *pri_bh = NULL; - struct hs_primary_descriptor * h_pri = NULL; - struct iso_primary_descriptor * pri = NULL; + struct buffer_head *bh = NULL, *pri_bh = NULL; + struct hs_primary_descriptor *h_pri = NULL; + struct iso_primary_descriptor *pri = NULL; struct iso_supplementary_descriptor *sec = NULL; - struct iso_directory_record * rootp; - int joliet_level = 0; - int iso_blknum, block; - int orig_zonesize; - int table; - unsigned int vol_desc_start; - unsigned long first_data_zone; - struct inode * inode; - struct iso9660_options opt; - struct isofs_sb_info * sbi; + struct iso_directory_record *rootp; + struct inode *inode; + struct iso9660_options opt; + struct isofs_sb_info *sbi; + unsigned long first_data_zone; + int joliet_level = 0; + int iso_blknum, block; + int orig_zonesize; + int table; + unsigned int vol_desc_start; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -577,72 +579,73 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) vol_desc_start = (opt.sbsector != -1) ? opt.sbsector : isofs_get_last_session(s,opt.session); - for (iso_blknum = vol_desc_start+16; - iso_blknum < vol_desc_start+100; iso_blknum++) - { - struct hs_volume_descriptor * hdp; - struct iso_volume_descriptor * vdp; - - block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); - if (!(bh = sb_bread(s, block))) - goto out_no_read; - - vdp = (struct iso_volume_descriptor *)bh->b_data; - hdp = (struct hs_volume_descriptor *)bh->b_data; - - /* Due to the overlapping physical location of the descriptors, - * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure - * proper identification in this case, we first check for ISO. - */ - if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { - if (isonum_711 (vdp->type) == ISO_VD_END) - break; - if (isonum_711 (vdp->type) == ISO_VD_PRIMARY) { - if (pri == NULL) { - pri = (struct iso_primary_descriptor *)vdp; - /* Save the buffer in case we need it ... */ - pri_bh = bh; - bh = NULL; - } - } + for (iso_blknum = vol_desc_start+16; + iso_blknum < vol_desc_start+100; iso_blknum++) { + struct hs_volume_descriptor *hdp; + struct iso_volume_descriptor *vdp; + + block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); + if (!(bh = sb_bread(s, block))) + goto out_no_read; + + vdp = (struct iso_volume_descriptor *)bh->b_data; + hdp = (struct hs_volume_descriptor *)bh->b_data; + + /* + * Due to the overlapping physical location of the descriptors, + * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure + * proper identification in this case, we first check for ISO. + */ + if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { + if (isonum_711(vdp->type) == ISO_VD_END) + break; + if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { + if (pri == NULL) { + pri = (struct iso_primary_descriptor *)vdp; + /* Save the buffer in case we need it ... */ + pri_bh = bh; + bh = NULL; + } + } #ifdef CONFIG_JOLIET - else if (isonum_711 (vdp->type) == ISO_VD_SUPPLEMENTARY) { - sec = (struct iso_supplementary_descriptor *)vdp; - if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { - if (opt.joliet == 'y') { - if (sec->escape[2] == 0x40) { - joliet_level = 1; - } else if (sec->escape[2] == 0x43) { - joliet_level = 2; - } else if (sec->escape[2] == 0x45) { - joliet_level = 3; - } - printk(KERN_DEBUG"ISO 9660 Extensions: Microsoft Joliet Level %d\n", - joliet_level); + else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { + sec = (struct iso_supplementary_descriptor *)vdp; + if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { + if (opt.joliet == 'y') { + if (sec->escape[2] == 0x40) + joliet_level = 1; + else if (sec->escape[2] == 0x43) + joliet_level = 2; + else if (sec->escape[2] == 0x45) + joliet_level = 3; + + printk(KERN_DEBUG "ISO 9660 Extensions: " + "Microsoft Joliet Level %d\n", + joliet_level); + } + goto root_found; + } else { + /* Unknown supplementary volume descriptor */ + sec = NULL; + } } - goto root_found; - } else { - /* Unknown supplementary volume descriptor */ - sec = NULL; - } - } #endif - } else { - if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { - if (isonum_711 (hdp->type) != ISO_VD_PRIMARY) - goto out_freebh; - - sbi->s_high_sierra = 1; - opt.rock = 'n'; - h_pri = (struct hs_primary_descriptor *)vdp; - goto root_found; + } else { + if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { + if (isonum_711(hdp->type) != ISO_VD_PRIMARY) + goto out_freebh; + + sbi->s_high_sierra = 1; + opt.rock = 'n'; + h_pri = (struct hs_primary_descriptor *)vdp; + goto root_found; + } } - } - /* Just skip any volume descriptors we don't recognize */ + /* Just skip any volume descriptors we don't recognize */ - brelse(bh); - bh = NULL; + brelse(bh); + bh = NULL; } /* * If we fall through, either no volume descriptor was found, @@ -657,24 +660,24 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) root_found: if (joliet_level && (pri == NULL || opt.rock == 'n')) { - /* This is the case of Joliet with the norock mount flag. - * A disc with both Joliet and Rock Ridge is handled later - */ - pri = (struct iso_primary_descriptor *) sec; + /* This is the case of Joliet with the norock mount flag. + * A disc with both Joliet and Rock Ridge is handled later + */ + pri = (struct iso_primary_descriptor *) sec; } if(sbi->s_high_sierra){ - rootp = (struct iso_directory_record *) h_pri->root_directory_record; - sbi->s_nzones = isonum_733 (h_pri->volume_space_size); - sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); - sbi->s_max_size = isonum_733(h_pri->volume_space_size); + rootp = (struct iso_directory_record *) h_pri->root_directory_record; + sbi->s_nzones = isonum_733(h_pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); + sbi->s_max_size = isonum_733(h_pri->volume_space_size); } else { - if (!pri) - goto out_freebh; - rootp = (struct iso_directory_record *) pri->root_directory_record; - sbi->s_nzones = isonum_733 (pri->volume_space_size); - sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); - sbi->s_max_size = isonum_733(pri->volume_space_size); + if (!pri) + goto out_freebh; + rootp = (struct iso_directory_record *) pri->root_directory_record; + sbi->s_nzones = isonum_733(pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(pri->logical_block_size); + sbi->s_max_size = isonum_733(pri->volume_space_size); } sbi->s_ninodes = 0; /* No way to figure this out easily */ @@ -687,42 +690,43 @@ root_found: * blocks that were 512 bytes (which should only very rarely * happen.) */ - if(orig_zonesize < opt.blocksize) + if (orig_zonesize < opt.blocksize) goto out_bad_size; /* RDE: convert log zone size to bit shift */ - switch (sbi->s_log_zone_size) - { case 512: sbi->s_log_zone_size = 9; break; - case 1024: sbi->s_log_zone_size = 10; break; - case 2048: sbi->s_log_zone_size = 11; break; + switch (sbi->s_log_zone_size) { + case 512: sbi->s_log_zone_size = 9; break; + case 1024: sbi->s_log_zone_size = 10; break; + case 2048: sbi->s_log_zone_size = 11; break; - default: + default: goto out_bad_zone_size; - } + } s->s_magic = ISOFS_SUPER_MAGIC; s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ - /* The CDROM is read-only, has no nodes (devices) on it, and since - all of the files appear to be owned by root, we really do not want - to allow suid. (suid or devices will not show up unless we have - Rock Ridge extensions) */ + /* + * The CDROM is read-only, has no nodes (devices) on it, and since + * all of the files appear to be owned by root, we really do not want + * to allow suid. (suid or devices will not show up unless we have + * Rock Ridge extensions) + */ s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; /* Set this for reference. Its not currently used except on write which we don't have .. */ - - first_data_zone = isonum_733 (rootp->extent) + - isonum_711 (rootp->ext_attr_length); + + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); sbi->s_firstdatazone = first_data_zone; #ifndef BEQUIET - printk(KERN_DEBUG "Max size:%ld Log zone size:%ld\n", - sbi->s_max_size, - 1UL << sbi->s_log_zone_size); - printk(KERN_DEBUG "First datazone:%ld\n", sbi->s_firstdatazone); + printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", + sbi->s_max_size, 1UL << sbi->s_log_zone_size); + printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); if(sbi->s_high_sierra) - printk(KERN_DEBUG "Disc in High Sierra format.\n"); + printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); #endif /* @@ -737,8 +741,8 @@ root_found: pri = (struct iso_primary_descriptor *) sec; rootp = (struct iso_directory_record *) pri->root_directory_record; - first_data_zone = isonum_733 (rootp->extent) + - isonum_711 (rootp->ext_attr_length); + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); } /* @@ -771,7 +775,7 @@ root_found: #ifdef CONFIG_JOLIET if (joliet_level && opt.utf8 == 0) { - char * p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; + char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; sbi->s_nls_iocharset = load_nls(p); if (! sbi->s_nls_iocharset) { /* Fail only if explicit charset specified */ @@ -821,7 +825,7 @@ root_found: sbi->s_rock = 0; if (sbi->s_firstdatazone != first_data_zone) { sbi->s_firstdatazone = first_data_zone; - printk(KERN_DEBUG + printk(KERN_DEBUG "ISOFS: changing to secondary root\n"); iput(inode); inode = isofs_iget(s, sbi->s_firstdatazone, 0); @@ -830,8 +834,10 @@ root_found: if (opt.check == 'u') { /* Only Joliet is case insensitive by default */ - if (joliet_level) opt.check = 'r'; - else opt.check = 's'; + if (joliet_level) + opt.check = 'r'; + else + opt.check = 's'; } sbi->s_joliet_level = joliet_level; @@ -846,8 +852,10 @@ root_found: goto out_no_root; table = 0; - if (joliet_level) table += 2; - if (opt.check == 'r') table++; + if (joliet_level) + table += 2; + if (opt.check == 'r') + table++; s->s_root->d_op = &isofs_dentry_ops[table]; kfree(opt.iocharset); @@ -858,10 +866,10 @@ root_found: * Display error messages and free resources. */ out_bad_root: - printk(KERN_WARNING "isofs_fill_super: root inode not initialized\n"); + printk(KERN_WARNING "%s: root inode not initialized\n", __func__); goto out_iput; out_no_root: - printk(KERN_WARNING "isofs_fill_super: get root inode failed\n"); + printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_iput: iput(inode); #ifdef CONFIG_JOLIET @@ -870,21 +878,20 @@ out_iput: #endif goto out_freesbi; out_no_read: - printk(KERN_WARNING "isofs_fill_super: " - "bread failed, dev=%s, iso_blknum=%d, block=%d\n", - s->s_id, iso_blknum, block); + printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", + __func__, s->s_id, iso_blknum, block); goto out_freesbi; out_bad_zone_size: - printk(KERN_WARNING "Bad logical zone size %ld\n", + printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); goto out_freebh; out_bad_size: - printk(KERN_WARNING "Logical zone size(%d) < hardware blocksize(%u)\n", + printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", orig_zonesize, opt.blocksize); goto out_freebh; out_unknown_format: if (!silent) - printk(KERN_WARNING "Unable to identify CD-ROM format.\n"); + printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); out_freebh: brelse(bh); @@ -902,7 +909,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones - << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); + << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; @@ -931,20 +938,20 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, rv = 0; if (iblock < 0 || iblock != iblock_s) { - printk("isofs_get_blocks: block number too large\n"); + printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } b_off = iblock; - - offset = 0; - firstext = ei->i_first_extent; + + offset = 0; + firstext = ei->i_first_extent; sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); - nextblk = ei->i_next_section_block; - nextoff = ei->i_next_section_offset; - section = 0; + nextblk = ei->i_next_section_block; + nextoff = ei->i_next_section_offset; + section = 0; - while ( nblocks ) { + while (nblocks) { /* If we are *way* beyond the end of the file, print a message. * Access beyond the end of the file up to the next page boundary * is normal, however because of the way the page cache works. @@ -953,11 +960,11 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, * I/O errors. */ if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { - printk("isofs_get_blocks: block >= EOF (%ld, %ld)\n", - iblock, (unsigned long) inode->i_size); + printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", + __func__, iblock, (unsigned long) inode->i_size); goto abort; } - + /* On the last section, nextblk == 0, section size is likely to * exceed sect_size by a partial block, and access beyond the * end of the file will reach beyond the section size, too. @@ -976,20 +983,21 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, iput(ninode); if (++section > 100) { - printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); - printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " - "nextblk=%lu nextoff=%lu\n", - iblock, firstext, (unsigned) sect_size, - nextblk, nextoff); + printk(KERN_DEBUG "%s: More than 100 file sections ?!?" + " aborting...\n", __func__); + printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " + "nextblk=%lu nextoff=%lu\n", __func__, + iblock, firstext, (unsigned) sect_size, + nextblk, nextoff); goto abort; } } - - if ( *bh ) { + + if (*bh) { map_bh(*bh, inode->i_sb, firstext + b_off - offset); } else { *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); - if ( !*bh ) + if (!*bh) goto abort; } bh++; /* Next buffer head */ @@ -1010,7 +1018,7 @@ static int isofs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { if (create) { - printk("isofs_get_block: Kernel tries to allocate a block\n"); + printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); return -EROFS; } @@ -1070,11 +1078,11 @@ static int isofs_read_level3_size(struct inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; - struct buffer_head * bh = NULL; + struct buffer_head *bh = NULL; unsigned long block, offset, block_saved, offset_saved; int i = 0; int more_entries = 0; - struct iso_directory_record * tmpde = NULL; + struct iso_directory_record *tmpde = NULL; struct iso_inode_info *ei = ISOFS_I(inode); inode->i_size = 0; @@ -1089,7 +1097,7 @@ static int isofs_read_level3_size(struct inode *inode) offset = ei->i_iget5_offset; do { - struct iso_directory_record * de; + struct iso_directory_record *de; unsigned int de_len; if (!bh) { @@ -1163,10 +1171,9 @@ out_noread: return -EIO; out_toomany: - printk(KERN_INFO "isofs_read_level3_size: " - "More than 100 file sections ?!?, aborting...\n" - "isofs_read_level3_size: inode=%lu\n", - inode->i_ino); + printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" + "isofs_read_level3_size: inode=%lu\n", + __func__, inode->i_ino); goto out; } @@ -1177,9 +1184,9 @@ static void isofs_read_inode(struct inode *inode) unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; - struct buffer_head * bh = NULL; - struct iso_directory_record * de; - struct iso_directory_record * tmpde = NULL; + struct buffer_head *bh = NULL; + struct iso_directory_record *de; + struct iso_directory_record *tmpde = NULL; unsigned int de_len; unsigned long offset; struct iso_inode_info *ei = ISOFS_I(inode); @@ -1199,7 +1206,7 @@ static void isofs_read_inode(struct inode *inode) tmpde = kmalloc(de_len, GFP_KERNEL); if (tmpde == NULL) { - printk(KERN_INFO "isofs_read_inode: out of memory\n"); + printk(KERN_INFO "%s: out of memory\n", __func__); goto fail; } memcpy(tmpde, bh->b_data + offset, frag1); @@ -1212,24 +1219,26 @@ static void isofs_read_inode(struct inode *inode) } inode->i_ino = isofs_get_ino(ei->i_iget5_block, - ei->i_iget5_offset, - ISOFS_BUFFER_BITS(inode)); + ei->i_iget5_offset, + ISOFS_BUFFER_BITS(inode)); /* Assume it is a normal-format file unless told otherwise */ ei->i_file_format = isofs_file_normal; if (de->flags[-high_sierra] & 2) { inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR; - inode->i_nlink = 1; /* Set to 1. We know there are 2, but - the find utility tries to optimize - if it is 2, and it screws up. It is - easier to give 1 which tells find to - do it the hard way. */ + inode->i_nlink = 1; /* + * Set to 1. We know there are 2, but + * the find utility tries to optimize + * if it is 2, and it screws up. It is + * easier to give 1 which tells find to + * do it the hard way. + */ } else { - /* Everybody gets to read the file. */ + /* Everybody gets to read the file. */ inode->i_mode = sbi->s_mode; inode->i_nlink = 1; - inode->i_mode |= S_IFREG; + inode->i_mode |= S_IFREG; } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; @@ -1239,13 +1248,14 @@ static void isofs_read_inode(struct inode *inode) ei->i_format_parm[1] = 0; ei->i_format_parm[2] = 0; - ei->i_section_size = isonum_733 (de->size); + ei->i_section_size = isonum_733(de->size); if (de->flags[-high_sierra] & 0x80) { - if(isofs_read_level3_size(inode)) goto fail; + if(isofs_read_level3_size(inode)) + goto fail; } else { ei->i_next_section_block = 0; ei->i_next_section_offset = 0; - inode->i_size = isonum_733 (de->size); + inode->i_size = isonum_733(de->size); } /* @@ -1258,23 +1268,24 @@ static void isofs_read_inode(struct inode *inode) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { - printk("Interleaved files not (yet) supported.\n"); + printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); inode->i_size = 0; } /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { - printk("File unit size != 0 for ISO file (%ld).\n", - inode->i_ino); + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + inode->i_ino); } /* I have no idea what other flag bits are used for, so we will flag it for now */ #ifdef DEBUG if((de->flags[-high_sierra] & ~2)!= 0){ - printk("Unusual flag settings for ISO file (%ld %x).\n", - inode->i_ino, de->flags[-high_sierra]); + printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " + "(%ld %x).\n", + inode->i_ino, de->flags[-high_sierra]); } #endif @@ -1285,11 +1296,11 @@ static void isofs_read_inode(struct inode *inode) inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; - ei->i_first_extent = (isonum_733 (de->extent) + - isonum_711 (de->ext_attr_length)); + ei->i_first_extent = (isonum_733(de->extent) + + isonum_711(de->ext_attr_length)); /* Set the number of blocks for stat() - should be done before RR */ - inode->i_blocks = (inode->i_size + 511) >> 9; + inode->i_blocks = (inode->i_size + 511) >> 9; /* * Now test for possible Rock Ridge extensions which will override @@ -1306,7 +1317,7 @@ static void isofs_read_inode(struct inode *inode) /* Install the inode operations vector */ if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; - switch ( ei->i_file_format ) { + switch (ei->i_file_format) { #ifdef CONFIG_ZISOFS case isofs_file_compressed: inode->i_data.a_ops = &zisofs_aops; @@ -1350,7 +1361,7 @@ static int isofs_iget5_test(struct inode *ino, void *data) struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; return (i->i_iget5_block == d->block) - && (i->i_iget5_offset == d->offset); + && (i->i_iget5_offset == d->offset); } static int isofs_iget5_set(struct inode *ino, void *data) @@ -1384,7 +1395,7 @@ struct inode *isofs_iget(struct super_block *sb, hashval = (block << sb->s_blocksize_bits) | offset; inode = iget5_locked(sb, hashval, &isofs_iget5_test, - &isofs_iget5_set, &data); + &isofs_iget5_set, &data); if (inode && (inode->i_state & I_NEW)) { sb->s_op->read_inode(inode); @@ -1398,7 +1409,7 @@ static int isofs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, - mnt); + mnt); } static struct file_system_type iso9660_fs_type = { diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index fb8fe7a9ddc6..92c14b850e9c 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -80,22 +80,20 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st if (utf8) { len = wcsntombs_be(outname, de->name, - de->name_len[0] >> 1, PAGE_SIZE); + de->name_len[0] >> 1, PAGE_SIZE); } else { len = uni16_to_x8(outname, (__be16 *) de->name, - de->name_len[0] >> 1, nls); + de->name_len[0] >> 1, nls); } - if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) { + if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) len -= 2; - } /* * Windows doesn't like periods at the end of a name, * so neither do we */ - while (len >= 2 && (outname[len-1] == '.')) { + while (len >= 2 && (outname[len-1] == '.')) len--; - } return len; } diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index c04b3a14a3e9..c8c7e5138a01 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -15,7 +15,7 @@ * some sanity tests. */ static int -isofs_cmp(struct dentry * dentry, const char * compare, int dlen) +isofs_cmp(struct dentry *dentry, const char *compare, int dlen) { struct qstr qstr; @@ -48,24 +48,24 @@ isofs_cmp(struct dentry * dentry, const char * compare, int dlen) */ static unsigned long isofs_find_entry(struct inode *dir, struct dentry *dentry, - unsigned long *block_rv, unsigned long* offset_rv, - char * tmpname, struct iso_directory_record * tmpde) + unsigned long *block_rv, unsigned long *offset_rv, + char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); unsigned char bufbits = ISOFS_BUFFER_BITS(dir); unsigned long block, f_pos, offset, block_saved, offset_saved; - struct buffer_head * bh = NULL; + struct buffer_head *bh = NULL; struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); if (!ISOFS_I(dir)->i_first_extent) return 0; - + f_pos = 0; offset = 0; block = 0; while (f_pos < dir->i_size) { - struct iso_directory_record * de; + struct iso_directory_record *de; int de_len, match, i, dlen; char *dpnt; @@ -114,7 +114,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, if (sbi->s_rock && ((i = get_rock_ridge_filename(de, tmpname, dir)))) { - dlen = i; /* possibly -1 */ + dlen = i; /* possibly -1 */ dpnt = tmpname; #ifdef CONFIG_JOLIET } else if (sbi->s_joliet_level) { @@ -145,8 +145,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, isofs_normalize_block_and_offset(de, &block_saved, &offset_saved); - *block_rv = block_saved; - *offset_rv = offset_saved; + *block_rv = block_saved; + *offset_rv = offset_saved; brelse(bh); return 1; } @@ -155,7 +155,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, return 0; } -struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { int found; unsigned long block, offset; @@ -170,9 +170,9 @@ struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct n lock_kernel(); found = isofs_find_entry(dir, dentry, - &block, &offset, - page_address(page), - 1024 + page_address(page)); + &block, &offset, + page_address(page), + 1024 + page_address(page)); __free_page(page); inode = NULL; -- cgit v1.2.1 From 60bfba7e85f88fe834e623ead799cf580de20971 Mon Sep 17 00:00:00 2001 From: Jan Kratochvil Date: Sun, 15 Jul 2007 23:40:06 -0700 Subject: PIE randomization This patch is using mmap()'s randomization functionality in such a way that it maps the main executable of (specially compiled/linked -pie/-fpie) ET_DYN binaries onto a random address (in cases in which mmap() is allowed to perform a randomization). Origin of this patch is in exec-shield (http://people.redhat.com/mingo/exec-shield/) [jkosina@suse.cz: pie randomization: fix BAD_ADDR macro] Signed-off-by: Jan Kratochvil Signed-off-by: Jiri Kosina Cc: Ingo Molnar Cc: Roland McGrath Cc: Jakub Jelinek Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 109 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 86 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 08e4414b8374..5cfa735639ae 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -45,7 +45,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); -static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int); +static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); /* * If we don't support core dumping, then supply a NULL so we @@ -80,7 +80,7 @@ static struct linux_binfmt elf_format = { .hasvdso = 1 }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) IS_ERR_VALUE(x) static int set_brk(unsigned long start, unsigned long end) { @@ -285,33 +285,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, #ifndef elf_map static unsigned long elf_map(struct file *filep, unsigned long addr, - struct elf_phdr *eppnt, int prot, int type) + struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) { unsigned long map_addr; - unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); + unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); + unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); + addr = ELF_PAGESTART(addr); + size = ELF_PAGEALIGN(size); - down_write(¤t->mm->mmap_sem); /* mmap() will return -EINVAL if given a zero size, but a * segment with zero filesize is perfectly valid */ - if (eppnt->p_filesz + pageoffset) - map_addr = do_mmap(filep, ELF_PAGESTART(addr), - eppnt->p_filesz + pageoffset, prot, type, - eppnt->p_offset - pageoffset); - else - map_addr = ELF_PAGESTART(addr); + if (!size) + return addr; + + down_write(¤t->mm->mmap_sem); + /* + * total_size is the size of the ELF (interpreter) image. + * The _first_ mmap needs to know the full size, otherwise + * randomization might put this image into an overlapping + * position with the ELF binary image. (since size < total_size) + * So we first map the 'big' image - and unmap the remainder at + * the end. (which unmap is needed for ELF images with holes.) + */ + if (total_size) { + total_size = ELF_PAGEALIGN(total_size); + map_addr = do_mmap(filep, addr, total_size, prot, type, off); + if (!BAD_ADDR(map_addr)) + do_munmap(current->mm, map_addr+size, total_size-size); + } else + map_addr = do_mmap(filep, addr, size, prot, type, off); + up_write(¤t->mm->mmap_sem); return(map_addr); } #endif /* !elf_map */ +static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) +{ + int i, first_idx = -1, last_idx = -1; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + last_idx = i; + if (first_idx == -1) + first_idx = i; + } + } + if (first_idx == -1) + return 0; + + return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - + ELF_PAGESTART(cmds[first_idx].p_vaddr); +} + + /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function is only provided so that we can read a.out libraries that have an ELF header */ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, - struct file *interpreter, unsigned long *interp_load_addr) + struct file *interpreter, unsigned long *interp_map_addr, + unsigned long no_base) { struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; @@ -319,6 +356,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; unsigned long error = ~0UL; + unsigned long total_size; int retval, i, size; /* First of all, some simple consistency checks */ @@ -357,6 +395,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out_close; } + total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum); + if (!total_size) { + error = -EINVAL; + goto out_close; + } + eppnt = elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { @@ -374,9 +418,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; + else if (no_base && interp_elf_ex->e_type == ET_DYN) + load_addr = -vaddr; map_addr = elf_map(interpreter, load_addr + vaddr, - eppnt, elf_prot, elf_type); + eppnt, elf_prot, elf_type, total_size); + total_size = 0; + if (!*interp_map_addr) + *interp_map_addr = map_addr; error = map_addr; if (BAD_ADDR(map_addr)) goto out_close; @@ -442,8 +491,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out_close; } - *interp_load_addr = load_addr; - error = ((unsigned long)interp_elf_ex->e_entry) + load_addr; + error = load_addr; out_close: kfree(elf_phdata); @@ -540,7 +588,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) int elf_exec_fileno; int retval, i; unsigned int size; - unsigned long elf_entry, interp_load_addr = 0; + unsigned long elf_entry; + unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc = 0; char passed_fileno[6]; @@ -808,9 +857,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->start_stack = bprm->p; /* Now we do a little grungy work by mmaping the ELF image into - the correct location in memory. At this point, we assume that - the image should be loaded at fixed address, not at a variable - address. */ + the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; @@ -864,11 +911,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ +#ifdef CONFIG_X86 + load_bias = 0; +#else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); +#endif } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags); + elf_prot, elf_flags,0); if (BAD_ADDR(error)) { send_sig(SIGKILL, current, 0); retval = IS_ERR((void *)error) ? @@ -944,13 +995,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } if (elf_interpreter) { - if (interpreter_type == INTERPRETER_AOUT) + if (interpreter_type == INTERPRETER_AOUT) { elf_entry = load_aout_interp(&loc->interp_ex, interpreter); - else + } else { + unsigned long interp_map_addr; /* unused */ + elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, - &interp_load_addr); + &interp_map_addr, + load_bias); + if (!BAD_ADDR(elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment + */ + interp_load_addr = elf_entry; + elf_entry += loc->interp_elf_ex.e_entry; + } + } if (BAD_ADDR(elf_entry)) { force_sig(SIGSEGV, current); retval = IS_ERR((void *)elf_entry) ? -- cgit v1.2.1 From 17973f5af741f1758ed57c5115ca394c22bee159 Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Sun, 15 Jul 2007 23:40:08 -0700 Subject: Only send SIGXFSZ when exceeding rlimits. Some users have been having problems with utilities like cp or dd dumping core when they try to copy a file that's too large for the destination filesystem (typically, > 4gb). Apparently, some defunct standards required SIGXFSZ to be sent in such circumstances, but SUS only requires/allows it for when a written file exceeds the process's resource limits. I'd like to limit SIGXFSZs to the bare minimum required by SUS. Patch sent per http://lkml.org/lkml/2007/4/10/302 Signed-off-by: Micah Cowan Acked-by: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ncpfs/file.c | 2 -- fs/reiserfs/file.c | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index d3152f8d95c6..2b145de45b39 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -203,7 +203,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { if (pos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (count > MAX_NON_LFS - (u32)pos) { @@ -212,7 +211,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * } if (pos >= inode->i_sb->s_maxbytes) { if (count || pos > inode->i_sb->s_maxbytes) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 30eebfb1b2d8..2070aeee2a52 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1305,7 +1305,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && *ppos + count > MAX_NON_LFS) { if (*ppos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (count > MAX_NON_LFS - (unsigned long)*ppos) -- cgit v1.2.1 From 99fc06df72fe1c9ad3ec274720dcb5658c40bfd2 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 15 Jul 2007 23:40:09 -0700 Subject: procfs directory entry cleanup Function proc_register() will assign proc_dir_operations and proc_dir_inode_operations to ent's members proc_fops and proc_iops correctly if ent is a directory. So the early assignment isn't necessary. Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4f8e53568b22..b5e7155d30d8 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -530,12 +530,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp return -EAGAIN; dp->low_ino = i; - spin_lock(&proc_subdir_lock); - dp->next = dir->subdir; - dp->parent = dir; - dir->subdir = dp; - spin_unlock(&proc_subdir_lock); - if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { dp->proc_fops = &proc_dir_operations; @@ -551,6 +545,13 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (dp->proc_iops == NULL) dp->proc_iops = &proc_file_inode_operations; } + + spin_lock(&proc_subdir_lock); + dp->next = dir->subdir; + dp->parent = dir; + dir->subdir = dp; + spin_unlock(&proc_subdir_lock); + return 0; } @@ -653,9 +654,6 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, ent = proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { - ent->proc_fops = &proc_dir_operations; - ent->proc_iops = &proc_dir_inode_operations; - if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; @@ -690,10 +688,6 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, ent = proc_create(&parent,name,mode,nlink); if (ent) { - if (S_ISDIR(mode)) { - ent->proc_fops = &proc_dir_operations; - ent->proc_iops = &proc_dir_inode_operations; - } if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; -- cgit v1.2.1 From e8d6c554126b830217c5e9f549e0e21f865a0a8a Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 15 Jul 2007 23:40:12 -0700 Subject: AFS: implement file locking Implement file locking for AFS. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/Makefile | 1 + fs/afs/afs.h | 8 + fs/afs/afs_fs.h | 3 + fs/afs/callback.c | 3 + fs/afs/dir.c | 1 + fs/afs/file.c | 2 + fs/afs/flock.c | 558 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/afs/fsclient.c | 155 ++++++++++++++- fs/afs/internal.h | 30 +++ fs/afs/main.c | 1 + fs/afs/misc.c | 1 + fs/afs/super.c | 3 + fs/afs/vnode.c | 130 +++++++++++-- 13 files changed, 881 insertions(+), 15 deletions(-) create mode 100644 fs/afs/flock.c (limited to 'fs') diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 73ce561f3ea0..a66671082cfb 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-objs := \ cmservice.o \ dir.o \ file.o \ + flock.o \ fsclient.o \ inode.o \ main.o \ diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 245257948140..c548aa346f0d 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -37,6 +37,13 @@ typedef enum { AFS_FTYPE_SYMLINK = 3, } afs_file_type_t; +typedef enum { + AFS_LOCK_READ = 0, /* read lock request */ + AFS_LOCK_WRITE = 1, /* write lock request */ +} afs_lock_type_t; + +#define AFS_LOCKWAIT (5 * 60) /* time until a lock times out (seconds) */ + /* * AFS file identifier */ @@ -120,6 +127,7 @@ struct afs_file_status { struct afs_fid parent; /* parent dir ID for non-dirs only */ time_t mtime_client; /* last time client changed data */ time_t mtime_server; /* last time server changed data */ + s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ }; /* diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index a18c374ebe08..eb647323d8f0 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -31,6 +31,9 @@ enum AFS_FS_Operations { FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSSETLOCK = 156, /* AFS Request a file lock */ + FSEXTENDLOCK = 157, /* AFS Extend a file lock */ + FSRELEASELOCK = 158, /* AFS Release a file lock */ FSLOOKUP = 161, /* AFS lookup file in directory */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index bacf518c6fa8..b8243945818d 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -125,6 +125,9 @@ static void afs_break_callback(struct afs_server *server, spin_unlock(&server->cb_lock); queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + if (list_empty(&vnode->granted_locks) && + !list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); spin_unlock(&vnode->lock); } } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 546c59522eb1..33fe39ad4e03 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -44,6 +44,7 @@ const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, .release = afs_release, .readdir = afs_readdir, + .lock = afs_lock, }; const struct inode_operations afs_dir_inode_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index aede7eb66dd4..525f7c56e068 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -34,6 +34,8 @@ const struct file_operations afs_file_operations = { .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, .fsync = afs_fsync, + .lock = afs_lock, + .flock = afs_flock, }; const struct inode_operations afs_file_inode_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c new file mode 100644 index 000000000000..8f07f8d1bfa9 --- /dev/null +++ b/fs/afs/flock.c @@ -0,0 +1,558 @@ +/* AFS file locking support + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "internal.h" + +#define AFS_LOCK_GRANTED 0 +#define AFS_LOCK_PENDING 1 + +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); +static void afs_fl_release_private(struct file_lock *fl); + +static struct workqueue_struct *afs_lock_manager; + +static struct file_lock_operations afs_lock_ops = { + .fl_copy_lock = afs_fl_copy_lock, + .fl_release_private = afs_fl_release_private, +}; + +/* + * initialise the lock manager thread if it isn't already running + */ +static int afs_init_lock_manager(void) +{ + if (!afs_lock_manager) { + afs_lock_manager = create_singlethread_workqueue("kafs_lockd"); + if (!afs_lock_manager) + return -ENOMEM; + } + return 0; +} + +/* + * destroy the lock manager thread if it's running + */ +void __exit afs_kill_lock_manager(void) +{ + if (afs_lock_manager) + destroy_workqueue(afs_lock_manager); +} + +/* + * if the callback is broken on this vnode, then the lock may now be available + */ +void afs_lock_may_be_available(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); +} + +/* + * the lock will time out in 5 minutes unless we extend it, so schedule + * extension in a bit less than that time + */ +static void afs_schedule_lock_extension(struct afs_vnode *vnode) +{ + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + AFS_LOCKWAIT * HZ / 2); +} + +/* + * do work for a lock, including: + * - probing for a lock we're waiting on but didn't get immediately + * - extending a lock that's close to timing out + */ +void afs_lock_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, lock_work.work); + struct file_lock *fl; + afs_lock_type_t type; + struct key *key; + int ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + spin_lock(&vnode->lock); + + if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { + _debug("unlock"); + spin_unlock(&vnode->lock); + + /* attempt to release the server lock; if it fails, we just + * wait 5 minutes and it'll time out anyway */ + ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + if (ret < 0) + printk(KERN_WARNING "AFS:" + " Failed to release lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + key_put(vnode->unlock_key); + vnode->unlock_key = NULL; + clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); + } + + /* if we've got a lock, then it must be time to extend that lock as AFS + * locks time out after 5 minutes */ + if (!list_empty(&vnode->granted_locks)) { + _debug("extend"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->granted_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + spin_unlock(&vnode->lock); + + ret = afs_vnode_extend_lock(vnode, key); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + key_put(key); + switch (ret) { + case 0: + afs_schedule_lock_extension(vnode); + break; + default: + /* ummm... we failed to extend the lock - retry + * extension shortly */ + printk(KERN_WARNING "AFS:" + " Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + HZ * 10); + break; + } + _leave(" [extend]"); + return; + } + + /* if we don't have a granted lock, then we must've been called back by + * the server, and so if might be possible to get a lock we're + * currently waiting for */ + if (!list_empty(&vnode->pending_locks)) { + _debug("get"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + type = (fl->fl_type == F_RDLCK) ? + AFS_LOCK_READ : AFS_LOCK_WRITE; + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case -EWOULDBLOCK: + _debug("blocked"); + break; + case 0: + _debug("acquired"); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + ret = AFS_LOCK_GRANTED; + default: + spin_lock(&vnode->lock); + /* the pending lock may have been withdrawn due to a + * signal */ + if (list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link) == fl) { + fl->fl_u.afs.state = ret; + if (ret == AFS_LOCK_GRANTED) + list_move_tail(&fl->fl_u.afs.link, + &vnode->granted_locks); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + } else { + _debug("withdrawn"); + clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); + clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + spin_unlock(&vnode->lock); + afs_vnode_release_lock(vnode, key); + if (!list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + } + break; + } + key_put(key); + _leave(" [pend]"); + return; + } + + /* looks like the lock request was withdrawn on a signal */ + spin_unlock(&vnode->lock); + _leave(" [no locks]"); +} + +/* + * pass responsibility for the unlocking of a vnode on the server to the + * manager thread, lest a pending signal in the calling thread interrupt + * AF_RXRPC + * - the caller must hold the vnode lock + */ +static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +{ + cancel_delayed_work(&vnode->lock_work); + if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && + !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + BUG(); + if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) + BUG(); + vnode->unlock_key = key_get(key); + afs_lock_may_be_available(vnode); +} + +/* + * request a lock on a file on the server + */ +static int afs_do_setlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + afs_lock_type_t type; + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file locks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + ret = afs_init_lock_manager(); + if (ret < 0) + return ret; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + + lock_kernel(); + + /* make sure we've got a callback on this file and that our view of the + * data version is up to date */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + + if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { + ret = -EAGAIN; + goto error; + } + + spin_lock(&vnode->lock); + + if (list_empty(&vnode->pending_locks)) { + /* if there's no-one else with a lock on this vnode, then we + * need to ask the server for a lock */ + if (list_empty(&vnode->granted_locks)) { + _debug("not locked"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); + set_bit(AFS_VNODE_LOCKING, &vnode->flags); + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case 0: + goto acquired_server_lock; + case -EWOULDBLOCK: + spin_lock(&vnode->lock); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, + &fl->fl_u.afs.link); + goto wait; + default: + spin_lock(&vnode->lock); + list_del_init(&fl->fl_u.afs.link); + spin_unlock(&vnode->lock); + goto error; + } + } + + /* if we've already got a readlock on the server and no waiting + * writelocks, then we might be able to instantly grant another + * readlock */ + if (type == AFS_LOCK_READ && + vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + _debug("instant readlock"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + ASSERT(!list_empty(&vnode->granted_locks)); + goto sharing_existing_lock; + } + } + + /* otherwise, we need to wait for a local lock to become available */ + _debug("wait local"); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); +wait: + if (!(fl->fl_flags & FL_SLEEP)) { + _debug("noblock"); + ret = -EAGAIN; + goto abort_attempt; + } + spin_unlock(&vnode->lock); + + /* now we need to sleep and wait for the lock manager thread to get the + * lock from the server */ + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state <= AFS_LOCK_GRANTED); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) + goto error; + spin_lock(&vnode->lock); + goto given_lock; + } + + /* we were interrupted, but someone may still be in the throes of + * giving us the lock */ + _debug("intr"); + ASSERTCMP(ret, ==, -ERESTARTSYS); + + spin_lock(&vnode->lock); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) { + spin_unlock(&vnode->lock); + goto error; + } + goto given_lock; + } + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + if (list_empty(&vnode->granted_locks) && + vnode->pending_locks.next == &fl->fl_u.afs.link) { + if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { + /* kick the next pending lock into having a go */ + list_del_init(&fl->fl_u.afs.link); + afs_lock_may_be_available(vnode); + } + } else { + list_del_init(&fl->fl_u.afs.link); + } + spin_unlock(&vnode->lock); + goto error; + +acquired_server_lock: + /* we've acquired a server lock, but it needs to be renewed after 5 + * mins */ + spin_lock(&vnode->lock); + afs_schedule_lock_extension(vnode); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); +sharing_existing_lock: + /* the lock has been granted as far as we're concerned... */ + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); +given_lock: + /* ... but we do still need to get the VFS's blessing */ + ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); + ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED))) != 0); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) + goto vfs_rejected_lock; + spin_unlock(&vnode->lock); + + /* again, make sure we've got a callback on this file and, again, make + * sure that our view of the data version is up to date (we ignore + * errors incurred here and deal with the consequences elsewhere) */ + afs_vnode_fetch_status(vnode, NULL, key); + +error: + unlock_kernel(); + _leave(" = %d", ret); + return ret; + +vfs_rejected_lock: + /* the VFS rejected the lock we just obtained, so we have to discard + * what we just got */ + _debug("vfs refused %d", ret); + list_del_init(&fl->fl_u.afs.link); + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + spin_unlock(&vnode->lock); + goto abort_attempt; +} + +/* + * unlock on a file on the server + */ +static int afs_do_unlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file unlocks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + spin_lock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) { + spin_unlock(&vnode->lock); + _leave(" = %d [vfs]", ret); + return ret; + } + + /* discard the server lock only if all granted locks are gone */ + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + spin_unlock(&vnode->lock); + _leave(" = 0"); + return 0; +} + +/* + * return information about a lock we currently hold, if indeed we hold one + */ +static int afs_do_getlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret, lock_count; + + _enter(""); + + fl->fl_type = F_UNLCK; + + mutex_lock(&vnode->vfs_inode.i_mutex); + + /* check local lock records first */ + ret = 0; + if (posix_test_lock(file, fl) == 0) { + /* no local locks; consult the server */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + lock_count = vnode->status.lock_count; + if (lock_count) { + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + } + } + +error: + mutex_unlock(&vnode->vfs_inode.i_mutex); + _leave(" = %d [%hd]", ret, fl->fl_type); + return ret; +} + +/* + * manage POSIX locks on a file + */ +int afs_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags, + (long long) fl->fl_start, (long long) fl->fl_end); + + /* AFS doesn't support mandatory locks */ + if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID && + fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (IS_GETLK(cmd)) + return afs_do_getlk(file, fl); + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * manage FLOCK locks on a file + */ +int afs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags); + + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by + * using ((u32) filp | 0x80000000) or some such as the pid. + * Not sure whether that would be unique, though, or whether + * that would break in other places. + */ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + /* we're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t) file; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * the POSIX lock management core VFS code copies the lock record and adds the + * copy into its own list, so we need to add that copy to the vnode's lock + * queue in the same place as the original (which will be deleted shortly + * after) + */ +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + _enter(""); + + list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); +} + +/* + * need to remove this lock from the vnode queue when it's removed from the + * VFS's list + */ +static void afs_fl_release_private(struct file_lock *fl) +{ + _enter(""); + + list_del_init(&fl->fl_u.afs.link); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 5dff1308b6f0..023b95b0d9d7 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -67,7 +67,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, EXTRACT(status->group); bp++; /* sync counter */ data_version |= (u64) ntohl(*bp++) << 32; - bp++; /* lock count */ + EXTRACT(status->lock_count); size |= (u64) ntohl(*bp++) << 32; bp++; /* spare 4 */ *_bp = bp; @@ -1748,3 +1748,156 @@ int afs_fs_get_volume_status(struct afs_server *server, return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); } + +/* + * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock + */ +static int afs_deliver_fs_xxxx_lock(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.SetLock operation type + */ +static const struct afs_call_type afs_RXFSSetLock = { + .name = "FS.SetLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ExtendLock operation type + */ +static const struct afs_call_type afs_RXFSExtendLock = { + .name = "FS.ExtendLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ReleaseLock operation type + */ +static const struct afs_call_type afs_RXFSReleaseLock = { + .name = "FS.ReleaseLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * get a lock on a file + */ +int afs_fs_set_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + afs_lock_type_t type, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSETLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(type); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * extend a lock on a file + */ +int afs_fs_extend_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSEXTENDLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * release a lock on a file + */ +int afs_fs_release_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRELEASELOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 2c55dd94a1de..6306438f331f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -351,10 +351,18 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ +#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ +#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ +#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ +#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ long acl_order; /* ACL check count (callback break count) */ struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head pending_locks; /* locks waiting to be granted */ + struct list_head granted_locks; /* locks granted on this file */ + struct delayed_work lock_work; /* work to be done in locking */ + struct key *unlock_key; /* key to be used in unlocking */ /* outstanding callback notification on this file */ struct rb_node server_rb; /* link in server->fs_vnodes */ @@ -473,6 +481,15 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +/* + * flock.c + */ +extern void __exit afs_kill_lock_manager(void); +extern void afs_lock_work(struct work_struct *); +extern void afs_lock_may_be_available(struct afs_vnode *); +extern int afs_lock(struct file *, int, struct file_lock *); +extern int afs_flock(struct file *, int, struct file_lock *); + /* * fsclient.c */ @@ -513,6 +530,15 @@ extern int afs_fs_get_volume_status(struct afs_server *, struct key *, struct afs_vnode *, struct afs_volume_status *, const struct afs_wait_mode *); +extern int afs_fs_set_lock(struct afs_server *, struct key *, + struct afs_vnode *, afs_lock_type_t, + const struct afs_wait_mode *); +extern int afs_fs_extend_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); +extern int afs_fs_release_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); /* * inode.c @@ -681,6 +707,10 @@ extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, struct afs_volume_status *); +extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, + afs_lock_type_t); +extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); +extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); /* * volume.c diff --git a/fs/afs/main.c b/fs/afs/main.c index cd21195bbb24..0f60f6b35769 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -168,6 +168,7 @@ static void __exit afs_exit(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); afs_fs_exit(); + afs_kill_lock_manager(); afs_close_socket(); afs_purge_servers(); afs_callback_update_kill(); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index d1a889c40742..2d33a5f7d218 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -35,6 +35,7 @@ int afs_abort_to_error(u32 abort_code) case VOVERQUOTA: return -EDQUOT; case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; + case 0x2f6df0a: return -EWOULDBLOCK; case 0x2f6df0c: return -EACCES; case 0x2f6df0f: return -EBUSY; case 0x2f6df10: return -EEXIST; diff --git a/fs/afs/super.c b/fs/afs/super.c index 2e8496ba1205..993cdf1cce3a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -460,6 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep, spin_lock_init(&vnode->writeback_lock); spin_lock_init(&vnode->lock); INIT_LIST_HEAD(&vnode->writebacks); + INIT_LIST_HEAD(&vnode->pending_locks); + INIT_LIST_HEAD(&vnode->granted_locks); + INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); } diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 232c55dc245d..ac811e445b2d 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -887,11 +887,6 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, vnode->fid.unique, key_serial(key)); - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - do { /* pick a server to query */ server = afs_volume_pick_fileserver(vnode); @@ -905,20 +900,127 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, } while (!afs_volume_release_fileserver(vnode, server, ret)); /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * get a lock on a file + */ +int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * extend a lock on a file + */ +int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * release a lock on a file + */ +int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } _leave(" = %d", ret); return ret; no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); return PTR_ERR(server); } -- cgit v1.2.1 From 03a9c30c231a61dd7457681aef51cc38ee01f766 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Jul 2007 23:40:16 -0700 Subject: AFS: drop explicit extern Don't use explicit extern specifier and quieten sparse warning: fs/afs/vnode.c:564:12: warning: function 'afs_vnode_link' with external linkage has definition Signed-off-by: Randy Dunlap Acked-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/vnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index ac811e445b2d..2f05c4fc2a70 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -561,7 +561,7 @@ no_server: /* * create a hard link */ -extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, +int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, struct key *key, const char *name) { struct afs_server *server; -- cgit v1.2.1 From f17e121fd055ba60d57a992702e59ae495faba76 Mon Sep 17 00:00:00 2001 From: young dave Date: Sun, 15 Jul 2007 23:40:17 -0700 Subject: remove useless tolower in isofs Remove useless tolower in isofs Signed-off-by: dave young Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 15c866f1a1fd..4f5418be0590 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -197,7 +197,7 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) hash = init_name_hash(); while (len--) { c = tolower(*name++); - hash = partial_name_hash(tolower(c), hash); + hash = partial_name_hash(c, hash); } qstr->hash = end_name_hash(hash); -- cgit v1.2.1 From da58a1617343e345d435953a0f32024997a95164 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:40:21 -0700 Subject: /proc/*/environ: wrong placing of ptrace_may_attach() check It's a bit dopey-looking and can permit a task to cause a pagefault in an mm which it doesn't have permission to read from. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 46ea5d56e1bb..d0921944e68c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -204,12 +204,17 @@ static int proc_pid_environ(struct task_struct *task, char * buffer) int res = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { - unsigned int len = mm->env_end - mm->env_start; + unsigned int len; + + res = -ESRCH; + if (!ptrace_may_attach(task)) + goto out; + + len = mm->env_end - mm->env_start; if (len > PAGE_SIZE) len = PAGE_SIZE; res = access_process_vm(task, mm->env_start, buffer, len, 0); - if (!ptrace_may_attach(task)) - res = -ESRCH; +out: mmput(mm); } return res; -- cgit v1.2.1 From a6739af8b9e8bf0fd1fb3f4f8406a9f650cb733a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jul 2007 23:40:22 -0700 Subject: ext2: fix a comment when ext2_release_file() is called Signed-off-by: Jan Kara Acked-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 04afeecaaef3..ab7961260c49 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -24,9 +24,9 @@ #include "acl.h" /* - * Called when an inode is released. Note that this is different - * from ext2_open_file: open gets called at every open, but release - * gets called only when /all/ the files are closed. + * Called when filp is released. This happens when all file descriptors + * for a single struct file are closed. Note that different open() calls + * for the same file yield different struct file structures. */ static int ext2_release_file (struct inode * inode, struct file * filp) { -- cgit v1.2.1 From 00c5746da9fc6a793b9d94a8001ded5929f3a773 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:40:22 -0700 Subject: mutex_unlock() later in seq_lseek() All manipulations with struct seq_file::version are done under struct seq_file::lock except one introduced in commit d6b7a781c51c91dd054e5c437885205592faac21 aka "[PATCH] Speed up /proc/pid/maps" Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 49194a4e6b91..e8e51db4989d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -260,8 +260,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) } } } - mutex_unlock(&m->lock); file->f_version = m->version; + mutex_unlock(&m->lock); return retval; } EXPORT_SYMBOL(seq_lseek); -- cgit v1.2.1 From 203a2935c734c054bfd4665fb5d8835498af50a8 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Sun, 15 Jul 2007 23:40:23 -0700 Subject: fs/block_dev.c: use list_for_each_entry() fs/block_dev.c: Use list_for_each_entry() instead of list_for_each() in nr_blockdev_pages() Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3e9bfa748cf..da5f0515e8eb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -588,12 +588,10 @@ EXPORT_SYMBOL(bdget); long nr_blockdev_pages(void) { - struct list_head *p; + struct block_device *bdev; long ret = 0; spin_lock(&bdev_lock); - list_for_each(p, &all_bdevs) { - struct block_device *bdev; - bdev = list_entry(p, struct block_device, bd_list); + list_for_each_entry(bdev, &all_bdevs, bd_list) { ret += bdev->bd_inode->i_mapping->nrpages; } spin_unlock(&bdev_lock); -- cgit v1.2.1 From 4a2d44590a603be292addce9c263982043416666 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 15 Jul 2007 23:40:31 -0700 Subject: buffer: kill old incorrect comment Signed-off-by: Eric W. Biederman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index aa68206bd517..424165b569f8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1026,11 +1026,6 @@ failed: /* * Create buffers for the specified block device block's page. If * that page was dirty, the buffers are set dirty also. - * - * Except that's a bug. Attaching dirty buffers to a dirty - * blockdev's page can result in filesystem corruption, because - * some of those buffers may be aliases of filesystem data. - * grow_dev_page() will go BUG() if this happens. */ static int grow_buffers(struct block_device *bdev, sector_t block, int size) -- cgit v1.2.1 From f23513e8d96cf5e6cf8d2ff0cb5dd6bbc33995e4 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 15 Jul 2007 23:40:32 -0700 Subject: Introduce O_CLOEXEC The problem is as follows: in multi-threaded code (or more correctly: all code using clone() with CLONE_FILES) we have a race when exec'ing. thread #1 thread #2 fd=open() fork + exec fcntl(fd,F_SETFD,FD_CLOEXEC) In some applications this can happen frequently. Take a web browser. One thread opens a file and another thread starts, say, an external PDF viewer. The result can even be a security issue if that open file descriptor refers to a sensitive file and the external program can somehow be tricked into using that descriptor. Just adding O_CLOEXEC support to open() doesn't solve the whole set of problems. There are other ways to create file descriptors (socket, epoll_create, Unix domain socket transfer, etc). These can and should be addressed separately though. open() is such an easy case that it makes not much sense putting the fix off. The test program: #include #include #include #include #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif int main (int argc, char *argv[]) { int fd; if (argc > 1) { fd = atol (argv[1]); printf ("child: fd = %d\n", fd); if (fcntl (fd, F_GETFD) == 0 || errno != EBADF) { puts ("file descriptor valid in child"); return 1; } return 0; } fd = open ("/proc/self/exe", O_RDONLY | O_CLOEXEC); printf ("in parent: new fd = %d\n", fd); char buf[20]; snprintf (buf, sizeof (buf), "%d", fd); execl ("/proc/self/exe", argv[0], buf, NULL); puts ("execl failed"); return 1; } [kyle@parisc-linux.org: parisc fix] Signed-off-by: Ulrich Drepper Acked-by: Ingo Molnar Cc: Davide Libenzi Cc: Michael Kerrisk Cc: Chris Zankel Signed-off-by: Kyle McMartin Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 0d515d161974..e6991c1b5874 100644 --- a/fs/open.c +++ b/fs/open.c @@ -855,7 +855,7 @@ EXPORT_SYMBOL(dentry_open); /* * Find an empty file descriptor entry, and mark it busy. */ -int get_unused_fd(void) +static int get_unused_fd_flags(int flags) { struct files_struct * files = current->files; int fd, error; @@ -891,7 +891,10 @@ repeat: } FD_SET(fd, fdt->open_fds); - FD_CLR(fd, fdt->close_on_exec); + if (flags & O_CLOEXEC) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); files->next_fd = fd + 1; #if 1 /* Sanity check */ @@ -907,6 +910,11 @@ out: return error; } +int get_unused_fd(void) +{ + return get_unused_fd_flags(0); +} + EXPORT_SYMBOL(get_unused_fd); static void __put_unused_fd(struct files_struct *files, unsigned int fd) @@ -959,7 +967,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, flags, mode); if (IS_ERR(f)) { -- cgit v1.2.1 From 4a19542e5f694cd408a32c3d9dc593ba9366e2d7 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 15 Jul 2007 23:40:34 -0700 Subject: O_CLOEXEC for SCM_RIGHTS Part two in the O_CLOEXEC saga: adding support for file descriptors received through Unix domain sockets. The patch is once again pretty minimal, it introduces a new flag for recvmsg and passes it just like the existing MSG_CMSG_COMPAT flag. I think this bit is not used otherwise but the networking people will know better. This new flag is not recognized by recvfrom and recv. These functions cannot be used for that purpose and the asymmetry this introduces is not worse than the already existing MSG_CMSG_COMPAT situations. The patch must be applied on the patch which introduced O_CLOEXEC. It has to remove static from the new get_unused_fd_flags function but since scm.c cannot live in a module the function still hasn't to be exported. Here's a test program to make sure the code works. It's so much longer than the actual patch... #include #include #include #include #include #include #include #include #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif #ifndef MSG_CMSG_CLOEXEC # define MSG_CMSG_CLOEXEC 0x40000000 #endif int main (int argc, char *argv[]) { if (argc > 1) { int fd = atol (argv[1]); printf ("child: fd = %d\n", fd); if (fcntl (fd, F_GETFD) == 0 || errno != EBADF) { puts ("file descriptor valid in child"); return 1; } return 0; } struct sockaddr_un sun; strcpy (sun.sun_path, "./testsocket"); sun.sun_family = AF_UNIX; char databuf[] = "hello"; struct iovec iov[1]; iov[0].iov_base = databuf; iov[0].iov_len = sizeof (databuf); union { struct cmsghdr hdr; char bytes[CMSG_SPACE (sizeof (int))]; } buf; struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1, .msg_control = buf.bytes, .msg_controllen = sizeof (buf) }; struct cmsghdr *cmsg = CMSG_FIRSTHDR (&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN (sizeof (int)); msg.msg_controllen = cmsg->cmsg_len; pid_t child = fork (); if (child == -1) error (1, errno, "fork"); if (child == 0) { int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (bind (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "bind"); if (listen (sock, SOMAXCONN) < 0) error (1, errno, "listen"); int conn = accept (sock, NULL, NULL); if (conn == -1) error (1, errno, "accept"); *(int *) CMSG_DATA (cmsg) = sock; if (sendmsg (conn, &msg, MSG_NOSIGNAL) < 0) error (1, errno, "sendmsg"); return 0; } /* For a test suite this should be more robust like a barrier in shared memory. */ sleep (1); int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (connect (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "connect"); unlink (sun.sun_path); *(int *) CMSG_DATA (cmsg) = -1; if (recvmsg (sock, &msg, MSG_CMSG_CLOEXEC) < 0) error (1, errno, "recvmsg"); int fd = *(int *) CMSG_DATA (cmsg); if (fd == -1) error (1, 0, "no descriptor received"); char fdname[20]; snprintf (fdname, sizeof (fdname), "%d", fd); execl ("/proc/self/exe", argv[0], fdname, NULL); puts ("execl failed"); return 1; } [akpm@linux-foundation.org: Fix fastcall inconsistency noted by Michael Buesch] [akpm@linux-foundation.org: build fix] Signed-off-by: Ulrich Drepper Cc: Ingo Molnar Cc: Michael Buesch Cc: Michael Kerrisk Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index e6991c1b5874..be6a457f4226 100644 --- a/fs/open.c +++ b/fs/open.c @@ -855,7 +855,7 @@ EXPORT_SYMBOL(dentry_open); /* * Find an empty file descriptor entry, and mark it busy. */ -static int get_unused_fd_flags(int flags) +int get_unused_fd_flags(int flags) { struct files_struct * files = current->files; int fd, error; -- cgit v1.2.1 From cb510b8172602a66467f3551b4be1911f5a7c8c2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:40:39 -0700 Subject: seq_file: more atomicity in traverse() Original problem: in some circumstances seq_file interface can present infinite proc file to the following script when normally said proc file is finite: while read line; do [do something with $line] done index'es being 0 and 1. Current one is 1, as bash prints second object line by line. Imagine first object being removed right before lseek(). traverse() will be called, because there is negative offset. traverse() will reset ->index to 0 (!). traverse() will call ->next() and get NULL in any usual iterate-over-list code using list_for_each_entry_continue() and such. There is one object in list now after all... traverse() will return 0, lseek() will update file position and pretend everything is OK. So, what we have now: ->f_pos points to place where second object will be printed, but ->index is 0. seq_read() instead of returning EOF, will start printing first line of first object every time it's called, until enough objects are added to ->f_pos return in bounds. Fix is to update ->index only after we're sure we saw enough objects down the road. Signed-off-by: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index e8e51db4989d..bbb19be260ce 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -177,21 +177,23 @@ EXPORT_SYMBOL(seq_read); static int traverse(struct seq_file *m, loff_t offset) { - loff_t pos = 0; + loff_t pos = 0, index; int error = 0; void *p; m->version = 0; - m->index = 0; + index = 0; m->count = m->from = 0; - if (!offset) + if (!offset) { + m->index = index; return 0; + } if (!m->buf) { m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); if (!m->buf) return -ENOMEM; } - p = m->op->start(m, &m->index); + p = m->op->start(m, &index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) @@ -204,15 +206,17 @@ static int traverse(struct seq_file *m, loff_t offset) if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; + m->index = index; break; } pos += m->count; m->count = 0; if (pos == offset) { - m->index++; + index++; + m->index = index; break; } - p = m->op->next(m, p, &m->index); + p = m->op->next(m, p, &index); } m->op->stop(m, p); return error; -- cgit v1.2.1 From aa0ac36518be648dda3a32f0b37a8b2b546e1b24 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:40:39 -0700 Subject: Remove capability.h from mm.h I forgot to remove capability.h from mm.h while removing sched.h! This patch remedies that, because the only inline function which was using CAP_something was made out of line. Cross-compile tested without regressions on: all powerpc defconfigs all mips defconfigs all m68k defconfigs all arm defconfigs all ia64 defconfigs alpha alpha-allnoconfig alpha-defconfig alpha-up arm i386 i386-allnoconfig i386-defconfig i386-up ia64 ia64-allnoconfig ia64-defconfig ia64-up m68k mips parisc parisc-allnoconfig parisc-defconfig parisc-up powerpc powerpc-up s390 s390-allnoconfig s390-defconfig s390-up sparc sparc-allnoconfig sparc-defconfig sparc-up sparc64 sparc64-allnoconfig sparc64-defconfig sparc64-up um-x86_64 x86_64 x86_64-allnoconfig x86_64-defconfig x86_64-up as well as my two usual configs. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/gfs2/eaops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c index c1f44009853f..1ab3e9d73886 100644 --- a/fs/gfs2/eaops.c +++ b/fs/gfs2/eaops.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From 9f7dd93de07420b423336d5d0028959e94778ddb Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 15 Jul 2007 23:40:45 -0700 Subject: ext3/ext4: orphan list check on destroy_inode Customers claims to ext3-related errors, investigation showed that ext3 orphan list has been corrupted and have the reference to non-ext3 inode. The following debug helps to understand the reasons of this issue. [akpm@linux-foundation.org: update for print_hex_dump() changes] Signed-off-by: Vasily Averin Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 8 ++++++++ fs/ext4/super.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e3062913a92..6ee4a95f4934 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -459,6 +459,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) static void ext3_destroy_inode(struct inode *inode) { + if (!list_empty(&(EXT3_I(inode)->i_orphan))) { + printk("EXT3 Inode %p: orphan list check failed!\n", + EXT3_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT3_I(inode), sizeof(struct ext3_inode_info), + false); + dump_stack(); + } kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 175b68c60968..6768c5aa3fee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -510,6 +510,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static void ext4_destroy_inode(struct inode *inode) { + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + printk("EXT4 Inode %p: orphan list check failed!\n", + EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } -- cgit v1.2.1 From a6c15c2b0fbfd5c0a84f5f0e1e3f20f85d2b8692 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 15 Jul 2007 23:40:46 -0700 Subject: ext3/ext4: orphan list corruption due bad inode After ext3 orphan list check has been added into ext3_destroy_inode() (please see my previous patch) the following situation has been detected: EXT3-fs warning (device sda6): ext3_unlink: Deleting nonexistent file (37901290), 0 Inode 00000101a15b7840: orphan list check failed! 00000773 6f665f00 74616d72 00000573 65725f00 06737270 66000000 616d726f ... Call Trace: [] ext3_destroy_inode+0x79/0x90 [] sys_unlink+0x126/0x1a0 [] error_exit+0x0/0x81 [] system_call+0x7e/0x83 First messages said that unlinked inode has i_nlink=0, then ext3_unlink() adds this inode into orphan list. Second message means that this inode has not been removed from orphan list. Inode dump has showed that i_fop = &bad_file_ops and it can be set in make_bad_inode() only. Then I've found that ext3_read_inode() can call make_bad_inode() without any error/warning messages, for example in the following case: ... if (inode->i_nlink == 0) { if (inode->i_mode == 0 || !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { /* this inode is deleted */ brelse (bh); goto bad_inode; ... Bad inode can live some time, ext3_unlink can add it to orphan list, but ext3_delete_inode() do not deleted this inode from orphan list. As result we can have orphan list corruption detected in ext3_destroy_inode(). However it is not clear for me how to fix this issue correctly. As far as i see is_bad_inode() is called after iget() in all places excluding ext3_lookup() and ext3_get_parent(). I believe it makes sense to add bad inode check to these functions too and call iput if bad inode detected. Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 10 ++++++++++ fs/ext4/namei.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 9bb046df827a..1586807b8177 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1019,6 +1019,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (!inode) return ERR_PTR(-EACCES); + + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } } return d_splice_alias(inode, dentry); } @@ -1054,6 +1059,11 @@ struct dentry *ext3_get_parent(struct dentry *child) if (!inode) return ERR_PTR(-EACCES); + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + parent = d_alloc_anon(inode); if (!parent) { iput(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2811e5720ad0..2de339dd7554 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1017,6 +1017,11 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str if (!inode) return ERR_PTR(-EACCES); + + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } } return d_splice_alias(inode, dentry); } @@ -1052,6 +1057,11 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!inode) return ERR_PTR(-EACCES); + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + parent = d_alloc_anon(inode); if (!parent) { iput(inode); -- cgit v1.2.1 From b663a79c191508f27cd885224b592a878c0ba0f6 Mon Sep 17 00:00:00 2001 From: Maxim Uvarov Date: Sun, 15 Jul 2007 23:40:48 -0700 Subject: taskstats: add context-switch counters Make available to the user the following task and process performance statistics: * Involuntary Context Switches (task_struct->nivcsw) * Voluntary Context Switches (task_struct->nvcsw) Statistics information is available from: 1. taskstats interface (Documentation/accounting/) 2. /proc/PID/status (task only). This data is useful for detecting hyperactivity patterns between processes. [akpm@linux-foundation.org: cleanup] Signed-off-by: Maxim Uvarov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Jonathan Lim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 680c913575f0..9cbab7e93557 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -289,6 +289,15 @@ static inline char *task_cap(struct task_struct *p, char *buffer) cap_t(p->cap_effective)); } +static inline char *task_context_switch_counts(struct task_struct *p, + char *buffer) +{ + return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + int proc_pid_status(struct task_struct *task, char * buffer) { char * orig = buffer; @@ -307,6 +316,7 @@ int proc_pid_status(struct task_struct *task, char * buffer) #if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif + buffer = task_context_switch_counts(task, buffer); return buffer - orig; } -- cgit v1.2.1 From a5001a27802723d6de50d9d8a446d594116524b0 Mon Sep 17 00:00:00 2001 From: Wyatt Banks Date: Sun, 15 Jul 2007 23:40:50 -0700 Subject: HFSPlus: change kmalloc/memset to kzalloc Removed kmalloc and memset in favor of kzalloc. To explain the HFSPLUS_SB() macro in the removed memset call: hfsplus_fs.h:#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info) Signed-off-by: Wyatt Banks Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ebd1b380cbbc..42570c95ad62 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -283,11 +283,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) struct nls_table *nls = NULL; int err = -EINVAL; - sbi = kmalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - memset(sbi, 0, sizeof(HFSPLUS_SB(sb))); sb->s_fs_info = sbi; INIT_HLIST_HEAD(&sbi->rsrc_inodes); hfsplus_fill_defaults(sbi); -- cgit v1.2.1 From e73a75fa7f062b52d015f1c961685dcaac57f710 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Jul 2007 23:40:52 -0700 Subject: hugetlbfs: use lib/parser, fix docs Use lib/parser.c to parse hugetlbfs mount options. Correct docs in hugetlbpage.txt. old size of hugetlbfs_fill_super: 675 bytes new size of hugetlbfs_fill_super: 686 bytes (hugetlbfs_parse_options() is inlined) Signed-off-by: Randy Dunlap Cc: Hugh Dickins Cc: David Gibson Cc: Adam Litke Acked-by: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 96 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e6b46b3ac2fe..711b801fbcdf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -13,15 +13,18 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include @@ -47,6 +50,21 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = { int sysctl_hugetlb_shm_group; +enum { + Opt_size, Opt_nr_inodes, + Opt_mode, Opt_uid, Opt_gid, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_size, "size=%s"}, + {Opt_nr_inodes, "nr_inodes=%s"}, + {Opt_mode, "mode=%o"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + static void huge_pagevec_release(struct pagevec *pvec) { int i; @@ -594,46 +612,70 @@ static const struct super_operations hugetlbfs_ops = { static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { - char *opt, *value, *rest; + char *p, *rest; + substring_t args[MAX_OPT_ARGS]; + int option; if (!options) return 0; - while ((opt = strsep(&options, ",")) != NULL) { - if (!*opt) - continue; - value = strchr(opt, '='); - if (!value || !*value) - return -EINVAL; - else - *value++ = '\0'; - - if (!strcmp(opt, "uid")) - pconfig->uid = simple_strtoul(value, &value, 0); - else if (!strcmp(opt, "gid")) - pconfig->gid = simple_strtoul(value, &value, 0); - else if (!strcmp(opt, "mode")) - pconfig->mode = simple_strtoul(value,&value,0) & 0777U; - else if (!strcmp(opt, "size")) { - unsigned long long size = memparse(value, &rest); + while ((p = strsep(&options, ",")) != NULL) { + int token; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->uid = option; + break; + + case Opt_gid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->gid = option; + break; + + case Opt_mode: + if (match_octal(&args[0], &option)) + goto bad_val; + pconfig->mode = option & 0777U; + break; + + case Opt_size: { + unsigned long long size; + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + size = memparse(args[0].from, &rest); if (*rest == '%') { size <<= HPAGE_SHIFT; size *= max_huge_pages; do_div(size, 100); - rest++; } pconfig->nr_blocks = (size >> HPAGE_SHIFT); - value = rest; - } else if (!strcmp(opt,"nr_inodes")) { - pconfig->nr_inodes = memparse(value, &rest); - value = rest; - } else - return -EINVAL; + break; + } - if (*value) - return -EINVAL; + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + pconfig->nr_inodes = memparse(args[0].from, &rest); + break; + + default: + printk(KERN_ERR "hugetlbfs: Bad mount option: %s\n", p); + return 1; + break; + } } return 0; + +bad_val: + printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", + args[0].from, p); + return 1; } static int -- cgit v1.2.1 From b4c07bce796833401317519e44075889c5fd4d5f Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sun, 15 Jul 2007 23:40:54 -0700 Subject: hugetlbfs: handle empty options string I was seeing a null pointer deref in fs/super.c:vfs_kern_mount(). Some file system get_sb() handler was returning NULL mnt_sb with a non-negative return value. I also noticed a "hugetlbfs: Bad mount option:" message in the log. Turns out that hugetlbfs_parse_options() was not checking for an empty option string after call to strsep(). On failure, hugetlbfs_parse_options() returns 1. hugetlbfs_fill_super() just passed this return code back up the call stack where vfs_kern_mount() missed the error and proceeded with a NULL mnt_sb. Apparently introduced by patch: hugetlbfs-use-lib-parser-fix-docs.patch The problem was exposed by this line in my fstab: none /huge hugetlbfs defaults 0 0 It can also be demonstrated by invoking mount of hugetlbfs directly with no options or a bogus option. This patch: 1) adds the check for empty option to hugetlbfs_parse_options(), 2) enhances the error message to bracket any unrecognized option with quotes , 3) modifies hugetlbfs_parse_options() to return -EINVAL on any unrecognized option, 4) adds a BUG_ON() to vfs_kern_mount() to catch any get_sb() handler that returns a NULL mnt->mnt_sb with a return value >= 0. Signed-off-by: Lee Schermerhorn Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +++++--- fs/super.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 711b801fbcdf..d145cb79c30a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -621,6 +621,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) + continue; token = match_token(p, tokens, args); switch (token) { @@ -665,8 +667,9 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; default: - printk(KERN_ERR "hugetlbfs: Bad mount option: %s\n", p); - return 1; + printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", + p); + return -EINVAL; break; } } @@ -693,7 +696,6 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) config.gid = current->fsgid; config.mode = 0755; ret = hugetlbfs_parse_options(data, &config); - if (ret) return ret; diff --git a/fs/super.c b/fs/super.c index 5260d620c555..fc8ebedc6bed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -884,6 +884,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void error = type->get_sb(type, flags, name, data, mnt); if (error < 0) goto out_free_secdata; + BUG_ON(!mnt->mnt_sb); error = security_sb_kern_mount(mnt->mnt_sb, secdata); if (error) -- cgit v1.2.1 From 64d67d21773f1946ddc04aedc201f6c2f3ee1bfb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:41:02 -0700 Subject: revert "vanishing ioctl handler debugging" Revert my do_ioctl() debugging patch: Paul fixed the bug. Cc: Paul Fulghum Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 8c90cbc903fa..479c1038ed4a 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -21,7 +20,6 @@ static long do_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; - void *f; if (!filp->f_op) goto out; @@ -31,16 +29,10 @@ static long do_ioctl(struct file *filp, unsigned int cmd, if (error == -ENOIOCTLCMD) error = -EINVAL; goto out; - } else if ((f = filp->f_op->ioctl)) { + } else if (filp->f_op->ioctl) { lock_kernel(); - if (!filp->f_op->ioctl) { - printk("%s: ioctl %p disappeared\n", __FUNCTION__, f); - print_symbol("symbol: %s\n", (unsigned long)f); - dump_stack(); - } else { - error = filp->f_op->ioctl(filp->f_path.dentry->d_inode, - filp, cmd, arg); - } + error = filp->f_op->ioctl(filp->f_path.dentry->d_inode, + filp, cmd, arg); unlock_kernel(); } -- cgit v1.2.1 From 4d3b573ad9af85b6df104044f6fff05f04349db2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:41:03 -0700 Subject: binfmt_elf warning fix fs/binfmt_elf.c: In function 'load_elf_binary': fs/binfmt_elf.c:1002: warning: 'interp_map_addr' may be used uninitialized in this function The compiler (gcc-4.1.0) is correct, but it failed to notice that we didn't use the resulting value. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5cfa735639ae..a27e42bf3400 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -999,7 +999,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) elf_entry = load_aout_interp(&loc->interp_ex, interpreter); } else { - unsigned long interp_map_addr; /* unused */ + unsigned long uninitialized_var(interp_map_addr); elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, -- cgit v1.2.1 From 467e9f4b5086a60a5cb2e032ccaf4a31abadc4c2 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Sun, 15 Jul 2007 23:41:06 -0700 Subject: fix create_new_namespaces() return value dup_mnt_ns() and clone_uts_ns() return NULL on failure. This is wrong, create_new_namespaces() uses ERR_PTR() to catch an error. This means that the subsequent create_new_namespaces() will hit BUG_ON() in copy_mnt_ns() or copy_utsname(). Modify create_new_namespaces() to also use the errors returned by the copy_*_ns routines and not to systematically return ENOMEM. [oleg@tv-sign.ru: better changelog] Signed-off-by: Cedric Le Goater Cc: Serge E. Hallyn Cc: Badari Pulavarty Cc: Pavel Emelianov Cc: Herbert Poetzl Cc: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index c811a94e4c88..5585623f6252 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1451,7 +1451,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) - return NULL; + return ERR_PTR(-ENOMEM); atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); @@ -1465,7 +1465,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - return NULL; + return ERR_PTR(-ENOMEM);; } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); -- cgit v1.2.1 From 030703e49d4966bd348660e0fdc2699507efb82b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jul 2007 23:41:08 -0700 Subject: ext3: fix deadlock in ext3_remount() and orphan list handling ext3_orphan_add() and ext3_orphan_del() functions lock sb->s_lock with a transaction started with ext3_mark_recovery_complete() waits for a transaction holding sb->s_lock, thus leading to a possible deadlock. At the moment we call ext3_mark_recovery_complete() from ext3_remount() we have done all the work needed for remounting and thus we are safe to drop sb->s_lock before we wait for transactions to commit. Note that at this moment we are still guarded by s_umount lock against other remounts/umounts. Signed-off-by: Jan Kara Cc: Eric Sandeen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6ee4a95f4934..69a59d4580e9 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2147,12 +2147,14 @@ static void ext3_mark_recovery_complete(struct super_block * sb, journal_lock_updates(journal); journal_flush(journal); + lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); sb->s_dirt = 0; ext3_commit_super(sb, es, 1); } + unlock_super(sb); journal_unlock_updates(journal); } @@ -2341,7 +2343,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); + /* + * We have to unlock super so that we can wait for + * transactions. + */ + unlock_super(sb); ext3_mark_recovery_complete(sb, es); + lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, -- cgit v1.2.1 From 32c3773011a9b3522bd4abadc7fad8c27417119f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jul 2007 23:41:09 -0700 Subject: ext4: fix deadlock in ext4_remount() and orphan list handling ext4_orphan_add() and ext4_orphan_del() functions lock sb->s_lock with a transaction started with ext4_mark_recovery_complete() waits for a transaction holding sb->s_lock, thus leading to a possible deadlock. At the moment we call ext4_mark_recovery_complete() from ext4_remount() we have done all the work needed for remounting and thus we are safe to drop sb->s_lock before we wait for transactions to commit. Note that at this moment we are still guarded by s_umount lock against other remounts/umounts. Signed-off-by: Jan Kara Cc: Eric Sandeen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6768c5aa3fee..bd9aff9701c0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2222,12 +2222,14 @@ static void ext4_mark_recovery_complete(struct super_block * sb, jbd2_journal_lock_updates(journal); jbd2_journal_flush(journal); + lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); sb->s_dirt = 0; ext4_commit_super(sb, es, 1); } + unlock_super(sb); jbd2_journal_unlock_updates(journal); } @@ -2416,7 +2418,13 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); + /* + * We have to unlock super so that we can wait for + * transactions. + */ + unlock_super(sb); ext4_mark_recovery_complete(sb, es); + lock_super(sb); } else { __le32 ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, -- cgit v1.2.1 From b716395e2b8e450e294537de0c91476ded2f0395 Mon Sep 17 00:00:00 2001 From: Vasily Tarasov Date: Sun, 15 Jul 2007 23:41:12 -0700 Subject: diskquota: 32bit quota tools on 64bit architectures OpenVZ Linux kernel team has discovered the problem with 32bit quota tools working on 64bit architectures. In 2.6.10 kernel sys32_quotactl() function was replaced by sys_quotactl() with the comment "sys_quotactl seems to be 32/64bit clean, enable it for 32bit" However this isn't right. Look at if_dqblk structure: struct if_dqblk { __u64 dqb_bhardlimit; __u64 dqb_bsoftlimit; __u64 dqb_curspace; __u64 dqb_ihardlimit; __u64 dqb_isoftlimit; __u64 dqb_curinodes; __u64 dqb_btime; __u64 dqb_itime; __u32 dqb_valid; }; For 32 bit quota tools sizeof(if_dqblk) == 0x44. But for 64 bit kernel its size is 0x48, 'cause of alignment! Thus we got a problem. Attached patch reintroduce sys32_quotactl() function, that handles this and related situations. [michal.k.k.piotrowski@gmail.com: build fix] [akpm@linux-foundation.org: Make it link with CONFIG_QUOTA=n] Signed-off-by: Vasily Tarasov Cc: Andi Kleen Cc: "Luck, Tony" Cc: Jan Kara Cc: Signed-off-by: Michal Piotrowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) (limited to 'fs') diff --git a/fs/quota.c b/fs/quota.c index 9f237d6182c9..e6577ac15a6c 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -10,12 +10,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include /* Check validity of generic quotactl commands */ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) @@ -384,3 +386,119 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t return ret; } + +#if defined(CONFIG_X86_64) || defined(CONFIG_IA64) +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} +#endif -- cgit v1.2.1 From e3a68e30d28dbc6981dfc3d6ceddbfa2f885fe4e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sun, 15 Jul 2007 23:41:14 -0700 Subject: ext3: remove extra IS_RDONLY() check ext3_change_inode_journal_flag() is only called from one location: ext3_ioctl(EXT3_IOC_SETFLAGS). That ioctl case already has a IS_RDONLY() call in it so this one is superfluous. Signed-off-by: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2a85ddee4740..de4e3161e479 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3195,7 +3195,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT3_JOURNAL(inode); - if (is_journal_aborted(journal) || IS_RDONLY(inode)) + if (is_journal_aborted(journal)) return -EROFS; journal_lock_updates(journal); -- cgit v1.2.1 From 213dd266d48af90c1eec8688c1ff31aa34d21de2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 15 Jul 2007 23:41:15 -0700 Subject: namespace: ensure clone_flags are always stored in an unsigned long While working on unshare support for the network namespace I noticed we were putting clone flags in an int. Which is weird because the syscall uses unsigned long and we at least need an unsigned to properly hold all of the unshare flags. So to make the code consistent, this patch updates the code to use unsigned long instead of int for the clone flags in those places where we get it wrong today. Signed-off-by: Eric W. Biederman Acked-by: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 5585623f6252..9211da4fef53 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1509,7 +1509,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, return new_ns; } -struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; -- cgit v1.2.1 From 681dcd95431e2258c1174602fcd69393e4139959 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 15 Jul 2007 23:41:16 -0700 Subject: drop obsolete sys_ioctl export sys_ioctl() was only exported for our first version of compat ioctl handling. Now that the whole compat ioctl handling mess is more or less sorted out there are no more modular users left and we can kill it. There's one exception and that's sparc64's solaris compat module, but sparc64 has it's own export predating the generic one by years for that which this patch leaves untouched. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 479c1038ed4a..c2a773e8620b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -174,11 +174,3 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) out: return error; } - -/* - * Platforms implementing 32 bit compatibility ioctl handlers in - * modules need this exported - */ -#ifdef CONFIG_COMPAT -EXPORT_SYMBOL(sys_ioctl); -#endif -- cgit v1.2.1 From 3fc74269c8910573a0e9a7bc303752194ce5cae0 Mon Sep 17 00:00:00 2001 From: vignesh babu Date: Sun, 15 Jul 2007 23:41:17 -0700 Subject: is_power_of_2: ext3/super.c Replace (n & (n-1)) in the context of power of 2 checks with is_power_of_2() Signed-off-by: vignesh babu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 69a59d4580e9..440c35d98993 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -1574,7 +1575,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || - (sbi->s_inode_size & (sbi->s_inode_size - 1)) || + (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { printk (KERN_ERR "EXT3-fs: unsupported inode size: %d\n", -- cgit v1.2.1 From f482394ccbca7234d29cc146d4a2b94f976ce5a1 Mon Sep 17 00:00:00 2001 From: vignesh babu Date: Sun, 15 Jul 2007 23:41:17 -0700 Subject: is_power_of_2(): jbd Replace (n & (n-1)) in the context of power of 2 checks with is_power_of_2(). Signed-off-by: vignesh babu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/revoke.c | 5 +++-- fs/jbd2/revoke.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 824e3b7d4ec1..8db2fa25170b 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -68,6 +68,7 @@ #include #include #endif +#include static struct kmem_cache *revoke_record_cache; static struct kmem_cache *revoke_table_cache; @@ -211,7 +212,7 @@ int journal_init_revoke(journal_t *journal, int hash_size) journal->j_revoke = journal->j_revoke_table[0]; /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); + J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke->hash_size = hash_size; @@ -238,7 +239,7 @@ int journal_init_revoke(journal_t *journal, int hash_size) journal->j_revoke = journal->j_revoke_table[1]; /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); + J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke->hash_size = hash_size; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 9246e763da78..28cac049a56b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -68,6 +68,7 @@ #include #include #endif +#include static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; @@ -212,7 +213,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) journal->j_revoke = journal->j_revoke_table[0]; /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); + J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke->hash_size = hash_size; @@ -239,7 +240,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) journal->j_revoke = journal->j_revoke_table[1]; /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); + J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke->hash_size = hash_size; -- cgit v1.2.1 From 29bc5b4f73a65ef667df50d5ed474e371471d915 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Sun, 15 Jul 2007 23:41:22 -0700 Subject: mistaken ext4_inode_bitmap for ext4_block_bitmap In ext4_new_blocks(), one of two ext4_block_bitmap() calls should be ext4_inode_bitmap() call. It is not harmful in normal processing, but it should be fixed. Signed-off-by: Toshiyuki Okajima Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 3b64bb16c727..9de54ae48dee 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1585,7 +1585,7 @@ allocated: ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || - in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || + in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) || in_range(ret_block, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group) || in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), -- cgit v1.2.1 From 1e96b7ca1e8f17c5117da369daaa7cf2edfdf9b1 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 15 Jul 2007 23:41:22 -0700 Subject: HFS+: refactor ASCII to unicode conversion routine for later reuse The HFS+ filesystem is case-insensitive and does automatic unicode decomposition by default, but does not provide custom dentry operations. This can lead to multiple dentries being cached for lookups on a filename with varying case and/or character (de)composition. These patches add custom dentry hash and comparison operations for case-sensitive and/or automatically decomposing HFS+ filesystems. Unicode decomposition and case-folding are performed as required to ensure equivalent filenames are hashed to the same values and compare as equal. This patch: Refactor existing HFS+ ASCII to unicode string conversion routine to split out character conversion functionality. This will be reused by the custom dentry hash and comparison routines. This approach avoids unnecessary memory allocation compared to using the string conversion routine directly in the new functions. [akpm@linux-foundation.org: avoid use-of-uninitialised] Signed-off-by: Duane Griffin Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/unicode.c | 107 +++++++++++++++++++++++++++++---------------------- 1 file changed, 62 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 689c8bd721fb..5df0052b2775 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -239,58 +239,75 @@ out: return res; } -int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, const char *astr, int len) +/* + * Convert one or more ASCII characters into a single unicode character. + * Returns the number of ASCII characters corresponding to the unicode char. + */ +static inline int asc2unichar(struct super_block *sb, const char *astr, int len, + wchar_t *uc) { - struct nls_table *nls = HFSPLUS_SB(sb).nls; - int size, off, decompose; + int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); + if (size <= 0) { + *uc = '?'; + size = 1; + } + switch (*uc) { + case 0x2400: + *uc = 0; + break; + case ':': + *uc = '/'; + break; + } + return size; +} + +/* Decomposes a single unicode character. */ +static inline u16 *decompose_unichar(wchar_t uc, int *size) +{ + int off; + + off = hfsplus_decompose_table[(uc >> 12) & 0xf]; + if (off == 0 || off == 0xffff) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + (uc & 0xf)]; + *size = off & 3; + if (*size == 0) + return NULL; + return hfsplus_decompose_table + (off / 4); +} + +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + const char *astr, int len) +{ + int size, dsize, decompose; + u16 *dstr, outlen = 0; wchar_t c; - u16 outlen = 0; decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); - while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { - size = nls->char2uni(astr, len, &c); - if (size <= 0) { - c = '?'; - size = 1; - } - astr += size; - len -= size; - switch (c) { - case 0x2400: - c = 0; - break; - case ':': - c = '/'; - break; - } - if (c >= 0xc0 && decompose) { - off = hfsplus_decompose_table[(c >> 12) & 0xf]; - if (!off) - goto done; - if (off == 0xffff) { - goto done; - } - off = hfsplus_decompose_table[off + ((c >> 8) & 0xf)]; - if (!off) - goto done; - off = hfsplus_decompose_table[off + ((c >> 4) & 0xf)]; - if (!off) - goto done; - off = hfsplus_decompose_table[off + (c & 0xf)]; - size = off & 3; - if (!size) - goto done; - off /= 4; - if (outlen + size > HFSPLUS_MAX_STRLEN) + size = asc2unichar(sb, astr, len, &c); + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (outlen + dsize > HFSPLUS_MAX_STRLEN) break; do { - ustr->unicode[outlen++] = cpu_to_be16(hfsplus_decompose_table[off++]); - } while (--size > 0); - continue; - } - done: - ustr->unicode[outlen++] = cpu_to_be16(c); + ustr->unicode[outlen++] = cpu_to_be16(*dstr++); + } while (--dsize > 0); + } else + ustr->unicode[outlen++] = cpu_to_be16(c); + + astr += size; + len -= size; } ustr->length = cpu_to_be16(outlen); if (len > 0) -- cgit v1.2.1 From d45bce8faf55511ec7d7ffc301461d864d67f1af Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 15 Jul 2007 23:41:23 -0700 Subject: HFS+: add custom dentry hash and comparison operations Add custom dentry hash and comparison operations for HFS+ filesystems that are case-insensitive and/or do automatic unicode decomposition. The new operations reuse the existing HFS+ ASCII to unicode conversion, unicode decomposition and case folding functionality. Signed-off-by: Duane Griffin Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/btree.c | 4 +- fs/hfsplus/dir.c | 2 + fs/hfsplus/hfsplus_fs.h | 4 ++ fs/hfsplus/inode.c | 5 ++ fs/hfsplus/super.c | 1 + fs/hfsplus/unicode.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 90ebab753d30..050d29c0a5b5 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -62,8 +62,10 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && (head->key_type == HFSPLUS_KEY_BINARY)) tree->keycmp = hfsplus_cat_bin_cmp_key; - else + else { tree->keycmp = hfsplus_cat_case_cmp_key; + HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; + } } else { printk(KERN_ERR "hfs: unknown B*Tree requested\n"); goto fail_page; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 80b5682a2273..1955ee61251c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -36,6 +36,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, u16 type; sb = dir->i_sb; + + dentry->d_op = &hfsplus_dentry_operations; dentry->d_fsdata = NULL; hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 3915635b4470..d9f5eda6d039 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -150,6 +150,7 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_NODECOMPOSE 0x0002 #define HFSPLUS_SB_FORCE 0x0004 #define HFSPLUS_SB_HFSX 0x0008 +#define HFSPLUS_SB_CASEFOLD 0x0010 struct hfsplus_inode_info { @@ -321,6 +322,7 @@ void hfsplus_file_truncate(struct inode *); /* inode.c */ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; +extern struct dentry_operations hfsplus_dentry_operations; void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); @@ -353,6 +355,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); +int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); +int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 409ce5429c91..6f7c662174db 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -131,6 +131,11 @@ const struct address_space_operations hfsplus_aops = { .writepages = hfsplus_writepages, }; +struct dentry_operations hfsplus_dentry_operations = { + .d_hash = hfsplus_hash_dentry, + .d_compare = hfsplus_compare_dentry, +}; + static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 42570c95ad62..6d87a2a9534d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -380,6 +380,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) iput(root); goto cleanup; } + sb->s_root->d_op = &hfsplus_dentry_operations; str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 5df0052b2775..9e10f9444b64 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -314,3 +314,126 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, return -ENAMETOOLONG; return 0; } + +/* + * Hash a string to an integer as appropriate for the HFS+ filesystem. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) +{ + struct super_block *sb = dentry->d_sb; + const char *astr; + const u16 *dstr; + int casefold, decompose, size, dsize, len; + unsigned long hash; + wchar_t c; + u16 c2; + + casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); + decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + hash = init_name_hash(); + astr = str->name; + len = str->len; + while (len > 0) { + size = asc2unichar(sb, astr, len, &c); + astr += size; + len -= size; + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + do { + c2 = *dstr++; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } while (--dsize > 0); + } else { + c2 = c; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } + } + str->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare strings with HFS+ filename ordering. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +{ + struct super_block *sb = dentry->d_sb; + int casefold, decompose, size; + int dsize1, dsize2, len1, len2; + const u16 *dstr1, *dstr2; + const char *astr1, *astr2; + u16 c1, c2; + wchar_t c; + + casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); + decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + astr1 = s1->name; + len1 = s1->len; + astr2 = s2->name; + len2 = s2->len; + dsize1 = dsize2 = 0; + dstr1 = dstr2 = NULL; + + while (len1 > 0 && len2 > 0) { + if (!dsize1) { + size = asc2unichar(sb, astr1, len1, &c); + astr1 += size; + len1 -= size; + + if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) { + c1 = c; + dstr1 = &c1; + dsize1 = 1; + } + } + + if (!dsize2) { + size = asc2unichar(sb, astr2, len2, &c); + astr2 += size; + len2 -= size; + + if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) { + c2 = c; + dstr2 = &c2; + dsize2 = 1; + } + } + + c1 = *dstr1; + c2 = *dstr2; + if (casefold) { + if (!(c1 = case_fold(c1))) { + dstr1++; + dsize1--; + continue; + } + if (!(c2 = case_fold(c2))) { + dstr2++; + dsize2--; + continue; + } + } + if (c1 < c2) + return -1; + else if (c1 > c2) + return 1; + + dstr1++; + dsize1--; + dstr2++; + dsize2--; + } + + if (len1 < len2) + return -1; + if (len1 > len2) + return 1; + return 0; +} -- cgit v1.2.1 From 948730b0e39fb4cba4a5ed0fc40e0f017cce2dfa Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 15 Jul 2007 23:41:25 -0700 Subject: fs/namespace.c should #include "internal.h" Every file should include the headers containing the prototypes for its global functions. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 9211da4fef53..4198003d7e18 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -28,6 +28,7 @@ #include #include #include "pnode.h" +#include "internal.h" /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); -- cgit v1.2.1 From 4e91672c76319aaed24ea3e784e238cf445c57cb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sun, 15 Jul 2007 23:41:25 -0700 Subject: Replace obscure constructs in fs/block_dev.c Replace some funky codepaths in fs/block_dev.c with cleaner versions of the affected places. [akpm@linux-foundation.org: fix return value] Signed-off-by: Johannes Weiner Cc: Bjorn Steinbrink Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index da5f0515e8eb..75c47a21b2f0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -872,7 +872,7 @@ static struct bd_holder *find_bd_holder(struct block_device *bdev, */ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) { - int ret; + int err; if (!bo) return -EINVAL; @@ -880,15 +880,18 @@ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) if (!bd_holder_grab_dirs(bdev, bo)) return -EBUSY; - ret = add_symlink(bo->sdir, bo->sdev); - if (ret == 0) { - ret = add_symlink(bo->hdir, bo->hdev); - if (ret) - del_symlink(bo->sdir, bo->sdev); + err = add_symlink(bo->sdir, bo->sdev); + if (err) + return err; + + err = add_symlink(bo->hdir, bo->hdev); + if (err) { + del_symlink(bo->sdir, bo->sdev); + return err; } - if (ret == 0) - list_add_tail(&bo->list, &bdev->bd_holder_list); - return ret; + + list_add_tail(&bo->list, &bdev->bd_holder_list); + return 0; } /** @@ -946,7 +949,7 @@ static struct bd_holder *del_bd_holder(struct block_device *bdev, static int bd_claim_by_kobject(struct block_device *bdev, void *holder, struct kobject *kobj) { - int res; + int err; struct bd_holder *bo, *found; if (!kobj) @@ -957,21 +960,24 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder, return -ENOMEM; mutex_lock(&bdev->bd_mutex); - res = bd_claim(bdev, holder); - if (res == 0) { - found = find_bd_holder(bdev, bo); - if (found == NULL) { - res = add_bd_holder(bdev, bo); - if (res) - bd_release(bdev); - } - } - if (res || found) + err = bd_claim(bdev, holder); + if (err) + goto out; + + found = find_bd_holder(bdev, bo); + if (found) + goto out; + + err = add_bd_holder(bdev, bo); + if (err) + bd_release(bdev); + +out: + if (err || found) free_bd_holder(bo); mutex_unlock(&bdev->bd_mutex); - - return res; + return err; } /** -- cgit v1.2.1 From 4210df283cc703bc494f450c91b8311bdf7fe5ee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:41:28 -0700 Subject: bd_claim_by_disk: fix warning Fix this: fs/block_dev.c: In function 'bd_claim_by_disk': fs/block_dev.c:970: warning: 'found' may be used uninitialized in this function and given that free_bd_holder() now needs free(NULL)-is-legal behaviour, we can simplify bd_release_from_kobject(). Cc: Bjorn Steinbrink Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 75c47a21b2f0..3635315e3b99 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -963,20 +963,20 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder, err = bd_claim(bdev, holder); if (err) - goto out; + goto fail; found = find_bd_holder(bdev, bo); if (found) - goto out; + goto fail; err = add_bd_holder(bdev, bo); if (err) bd_release(bdev); - -out: - if (err || found) - free_bd_holder(bo); + else + bo = NULL; +fail: mutex_unlock(&bdev->bd_mutex); + free_bd_holder(bo); return err; } @@ -991,15 +991,12 @@ out: static void bd_release_from_kobject(struct block_device *bdev, struct kobject *kobj) { - struct bd_holder *bo; - if (!kobj) return; mutex_lock(&bdev->bd_mutex); bd_release(bdev); - if ((bo = del_bd_holder(bdev, kobj))) - free_bd_holder(bo); + free_bd_holder(del_bd_holder(bdev, kobj)); mutex_unlock(&bdev->bd_mutex); } -- cgit v1.2.1 From 1d9d02feeee89e9132034d504c9a45eeaf618a3d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 15 Jul 2007 23:41:32 -0700 Subject: move seccomp from /proc to a prctl This reduces the memory footprint and it enforces that only the current task can enable seccomp on itself (this is a requirement for a strightforward [modulo preempt ;) ] TIF_NOTSC implementation). Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 72 ---------------------------------------------------------- 1 file changed, 72 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d0921944e68c..ae3627337a92 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include @@ -817,71 +816,6 @@ static const struct file_operations proc_loginuid_operations = { }; #endif -#ifdef CONFIG_SECCOMP -static ssize_t seccomp_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); - char __buf[20]; - size_t len; - - if (!tsk) - return -ESRCH; - /* no need to print the trailing zero, so use only len */ - len = sprintf(__buf, "%u\n", tsk->seccomp.mode); - put_task_struct(tsk); - - return simple_read_from_buffer(buf, count, ppos, __buf, len); -} - -static ssize_t seccomp_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); - char __buf[20], *end; - unsigned int seccomp_mode; - ssize_t result; - - result = -ESRCH; - if (!tsk) - goto out_no_task; - - /* can set it only once to be even more secure */ - result = -EPERM; - if (unlikely(tsk->seccomp.mode)) - goto out; - - result = -EFAULT; - memset(__buf, 0, sizeof(__buf)); - count = min(count, sizeof(__buf) - 1); - if (copy_from_user(__buf, buf, count)) - goto out; - - seccomp_mode = simple_strtoul(__buf, &end, 0); - if (*end == '\n') - end++; - result = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - tsk->seccomp.mode = seccomp_mode; - set_tsk_thread_flag(tsk, TIF_SECCOMP); - } else - goto out; - result = -EIO; - if (unlikely(!(end - __buf))) - goto out; - result = end - __buf; -out: - put_task_struct(tsk); -out_no_task: - return result; -} - -static const struct file_operations proc_seccomp_operations = { - .read = seccomp_read, - .write = seccomp_write, -}; -#endif /* CONFIG_SECCOMP */ - #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) @@ -2042,9 +1976,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("numa_maps", S_IRUGO, numa_maps), #endif REG("mem", S_IRUSR|S_IWUSR, mem), -#ifdef CONFIG_SECCOMP - REG("seccomp", S_IRUSR|S_IWUSR, seccomp), -#endif LNK("cwd", cwd), LNK("root", root), LNK("exe", exe), @@ -2329,9 +2260,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("numa_maps", S_IRUGO, numa_maps), #endif REG("mem", S_IRUSR|S_IWUSR, mem), -#ifdef CONFIG_SECCOMP - REG("seccomp", S_IRUSR|S_IWUSR, seccomp), -#endif LNK("cwd", cwd), LNK("root", root), LNK("exe", exe), -- cgit v1.2.1 From 9e8c4273ef4f631a896650bd2ade4c1b6487131b Mon Sep 17 00:00:00 2001 From: vignesh babu Date: Sun, 15 Jul 2007 23:41:36 -0700 Subject: is_power_of_2: ufs/super.c Replace (n & (n-1)) with is_power_of_2 Signed-off-by: vignesh babu Acked-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 22ff6ed55ce9..2b3011689e89 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -87,6 +87,7 @@ #include #include #include +#include #include "swab.h" #include "util.h" @@ -854,7 +855,7 @@ magic_found: uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); - if (uspi->s_fsize & (uspi->s_fsize - 1)) { + if (!is_power_of_2(uspi->s_fsize)) { printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", uspi->s_fsize); goto failed; @@ -869,7 +870,7 @@ magic_found: uspi->s_fsize); goto failed; } - if (uspi->s_bsize & (uspi->s_bsize - 1)) { + if (!is_power_of_2(uspi->s_bsize)) { printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", uspi->s_bsize); goto failed; -- cgit v1.2.1 From d375b97037c40d51b41d3b00b15729f6730c2cfe Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 15 Jul 2007 23:41:40 -0700 Subject: UDF: fix function name from udf_crc16 to udf_crc We have to change udf_crc16() name to udf_crc() to be able to play with CRC test. Signed-off-by: Cyrill Gorcunov Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/crc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/crc.c b/fs/udf/crc.c index 1b82a4adc2f7..ef2bfaa19d75 100644 --- a/fs/udf/crc.c +++ b/fs/udf/crc.c @@ -106,8 +106,8 @@ int main(void) { unsigned short x; - x = udf_crc16(bytes, sizeof bytes); - printf("udf_crc16: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); + x = udf_crc(bytes, sizeof bytes); + printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); return 0; } -- cgit v1.2.1 From 952d9de116ad87261de106464a9eeec038c4cd14 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 15 Jul 2007 23:41:45 -0700 Subject: ext3: fix error handling in ext3_create_journal() Fix error handling in ext3_create_journal according to kernel conventions. Signed-off-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 440c35d98993..8b75f73ba3b4 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2084,6 +2084,7 @@ static int ext3_create_journal(struct super_block * sb, unsigned int journal_inum) { journal_t *journal; + int err; if (sb->s_flags & MS_RDONLY) { printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " @@ -2091,13 +2092,15 @@ static int ext3_create_journal(struct super_block * sb, return -EROFS; } - if (!(journal = ext3_get_journal(sb, journal_inum))) + journal = ext3_get_journal(sb, journal_inum); + if (!journal) return -EINVAL; printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", journal_inum); - if (journal_create(journal)) { + err = journal_create(journal); + if (err) { printk(KERN_ERR "EXT3-fs: error creating journal.\n"); journal_destroy(journal); return -EIO; -- cgit v1.2.1 From 6c675bd43ccc36927c855d53d2e0042cdd1074ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 15 Jul 2007 23:41:45 -0700 Subject: ext4: fix error handling in ext4_create_journal Fix error handling in ext4_create_journal according to kernel conventions. Signed-off-by: Borislav Petkov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bd9aff9701c0..adcbfadfcb4c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2158,6 +2158,7 @@ static int ext4_create_journal(struct super_block * sb, unsigned int journal_inum) { journal_t *journal; + int err; if (sb->s_flags & MS_RDONLY) { printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " @@ -2165,13 +2166,15 @@ static int ext4_create_journal(struct super_block * sb, return -EROFS; } - if (!(journal = ext4_get_journal(sb, journal_inum))) + journal = ext4_get_journal(sb, journal_inum); + if (!journal) return -EINVAL; printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", journal_inum); - if (jbd2_journal_create(journal)) { + err = jbd2_journal_create(journal); + if (err) { printk(KERN_ERR "EXT4-fs: error creating journal.\n"); jbd2_journal_destroy(journal); return -EIO; -- cgit v1.2.1 From 4b4e5a1411c8b970983fb6022db1da31c4f5c301 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 15 Jul 2007 23:41:53 -0700 Subject: Fix trivial typos in anon_inodes.c comments Trivial typo and grammar fixes. Signed-off-by: "J. Bruce Fields" Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 40fe3a3222e4..a260198306c2 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -53,7 +53,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = { }; /** - * anon_inode_getfd - creates a new file instance by hooking it up to and + * anon_inode_getfd - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * @@ -66,7 +66,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. - * All the files created with anon_inode_getfd() will share a single inode, by + * All the files created with anon_inode_getfd() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. */ @@ -141,9 +141,9 @@ err_put_filp: } /* - * A single inode exist for all anon_inode files. Contrary to pipes, - * anon_inode inodes has no per-instance data associated, so we can avoid - * the allocation of multiple of them. + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. */ static struct inode *anon_inode_mkinode(void) { -- cgit v1.2.1 From 2235219b7721b8e74de6841e79240936561a2b63 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 15 Jul 2007 23:41:58 -0700 Subject: ext2: statfs speed up This is a patch that speeds up statfs. It is very simple - the "overhead" calculation, which takes a huge amount of time for large filesystems, never changes unless the size of the filesystem itself changes. That means we can store it in memory and only recalculate if the filesystem has been resized (almost never). It also fixes a minor problem that we never update the on-disk superblock free blocks/inodes counts until the filesystem is unmounted. While not fatal, we may as well update that on disk when we have the information, and it makes things like debugfs and dumpe2fs report a bit more accurate info. Signed-off-by: Badari Pulavarty Signed-off-by: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5de5061eb331..b2efd9083b9b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1099,15 +1099,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - unsigned long overhead; - int i; u64 fsid; if (test_opt (sb, MINIX_DF)) - overhead = 0; - else { + sbi->s_overhead_last = 0; + else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long i, overhead = 0; + smp_rmb(); + /* - * Compute the overhead (FS structures) + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. */ /* @@ -1131,17 +1134,22 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) */ overhead += (sbi->s_groups_count * (2 + sbi->s_itb_per_group)); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); } buf->f_type = EXT2_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; buf->f_bfree = ext2_count_free_blocks(sb); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = ext2_count_free_inodes(sb); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT2_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); -- cgit v1.2.1 From a71ce8c6c9bf269b192f352ea555217815cf027e Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 15 Jul 2007 23:41:59 -0700 Subject: ext3: statfs speed up This is a patch that speeds up statfs. It is very simple - the "overhead" calculation, which takes a huge amount of time for large filesystems, never changes unless the size of the filesystem itself changes. That means we can store it in memory and only recalculate if the filesystem has been resized (almost never). It also fixes a minor problem that we never update the on-disk superblock free blocks/inodes counts until the filesystem is unmounted. While not fatal, we may as well update that on disk when we have the information, and it makes things like debugfs and dumpe2fs report a bit more accurate info. Signed-off-by: Badari Pulavarty Signed-off-by: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8b75f73ba3b4..51d1c456cdab 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2426,19 +2426,19 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; - ext3_fsblk_t overhead; - int i; u64 fsid; - if (test_opt (sb, MINIX_DF)) - overhead = 0; - else { - unsigned long ngroups; - ngroups = EXT3_SB(sb)->s_groups_count; + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long ngroups = sbi->s_groups_count, i; + ext3_fsblk_t overhead = 0; smp_rmb(); /* - * Compute the overhead (FS structures) + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. */ /* @@ -2462,18 +2462,23 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Every block group has an inode bitmap, a block * bitmap, and an inode table. */ - overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); } buf->f_type = EXT3_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT3_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); -- cgit v1.2.1 From 5e70030d4cf91613530a23b40ad9919bb9ee114f Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 15 Jul 2007 23:42:00 -0700 Subject: ext4: statfs speed up This is a patch that speeds up statfs. It is very simple - the "overhead" calculation, which takes a huge amount of time for large filesystems, never changes unless the size of the filesystem itself changes. That means we can store it in memory and only recalculate if the filesystem has been resized (almost never). It also fixes a minor problem that we never update the on-disk superblock free blocks/inodes counts until the filesystem is unmounted. While not fatal, we may as well update that on disk when we have the information, and it makes things like debugfs and dumpe2fs report a bit more accurate info. Signed-off-by: Badari Pulavarty Signed-off-by: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/super.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index adcbfadfcb4c..d0d8c76c7edb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2500,19 +2500,19 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t overhead; - int i; u64 fsid; - if (test_opt (sb, MINIX_DF)) - overhead = 0; - else { - unsigned long ngroups; - ngroups = EXT4_SB(sb)->s_groups_count; + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long ngroups = sbi->s_groups_count, i; + ext4_fsblk_t overhead = 0; smp_rmb(); /* - * Compute the overhead (FS structures) + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. */ /* @@ -2536,18 +2536,23 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) * Every block group has an inode bitmap, a block * bitmap, and an inode table. */ - overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group)); + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); } buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - overhead; + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); -- cgit v1.2.1 From 29e3f347779088a79d832bfbd2d17c2f04050308 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sun, 15 Jul 2007 23:42:01 -0700 Subject: NLS: Remove obsolete Makefile entries Since the corresponding source files no longer exist, remove the irrelevant Makefile entries for them. Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nls/Makefile | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nls/Makefile b/fs/nls/Makefile index a7ade138d684..f499dd7c3905 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -36,11 +36,9 @@ obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o -obj-$(CONFIG_NLS_ISO8859_10) += nls_iso8859-10.o obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o -obj-$(CONFIG_NLS_ABC) += nls_abc.o obj-$(CONFIG_NLS_UTF8) += nls_utf8.o -- cgit v1.2.1 From 98701dc19e0cf358d48208e994180bb8abad5079 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 15 Jul 2007 23:42:01 -0700 Subject: compat32: ignore the LOOP_CLR_FD ioctl compat32: Ignore the LOOP_CLR_FD ioctl for the loop block device, to kill an annoying kernel message when e.g. busybox umount is used. Signed-off-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6b44cdc96fac..e440a7b95d02 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -3489,6 +3490,9 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) + +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) }; #define IOCTL_HASHSIZE 256 -- cgit v1.2.1 From 98283bb49c6c8c070ebde9f47489d3e9a83c1323 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Mon, 16 Jul 2007 09:40:05 +0900 Subject: fat: Fix the race of read/write the FAT12 entry FAT12 entry is 12bits, so it needs 2 phase to update the value. And writer and reader access it without any lock, so reader can get the half updated value. This fixes the long standing race condition by adding a global spinlock to only FAT12 for avoiding any impact against FAT16/32. Signed-off-by: OGAWA Hirofumi Signed-off-by: Linus Torvalds --- fs/fat/fatent.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index ab171ea8e869..2c1b73fb82ae 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -17,6 +17,8 @@ struct fatent_operations { int (*ent_next)(struct fat_entry *); }; +static DEFINE_SPINLOCK(fat12_entry_lock); + static void fat12_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { @@ -116,10 +118,13 @@ static int fat12_ent_get(struct fat_entry *fatent) u8 **ent12_p = fatent->u.ent12_p; int next; + spin_lock(&fat12_entry_lock); if (fatent->entry & 1) next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); else next = (*ent12_p[1] << 8) | *ent12_p[0]; + spin_unlock(&fat12_entry_lock); + next &= 0x0fff; if (next >= BAD_FAT12) next = FAT_ENT_EOF; @@ -151,6 +156,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) if (new == FAT_ENT_EOF) new = EOF_FAT12; + spin_lock(&fat12_entry_lock); if (fatent->entry & 1) { *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); *ent12_p[1] = new >> 4; @@ -158,6 +164,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) *ent12_p[0] = new & 0xff; *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); } + spin_unlock(&fat12_entry_lock); mark_buffer_dirty(fatent->bhs[0]); if (fatent->nr_bhs == 2) -- cgit v1.2.1 From 959bc220df38317bed9a677600b3945a8571fc1c Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Mon, 16 Jul 2007 19:39:02 +0100 Subject: Fix LDM for new field in the VOL5 VBLK. Teach LDM about a new field encountered with Windows Vista. This fixes LDM for people using Vista who have disabled drive letter assignment from one or more volumes. Doing this introduces a so far unknown field in the LDM database in the VOL5 VBLK structure which causes the LDM driver to fail to parse the VBLK structure and hence LDM fails to parse the disk altogether. This patch teaches the driver about this field. Thanks got to Ashton Mills for reporting the problem and working with me on getting it fixed. It is now working for him. Signed-off-by: Anton Altaparmakov CC: Richard Russon Signed-off-by: Linus Torvalds --- fs/partitions/ldm.c | 137 +++++++++++++++++++++++++++++++++++----------------- fs/partitions/ldm.h | 2 +- 2 files changed, 93 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 99873a2b4cbc..e7dd1d4e3473 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -677,15 +677,24 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, * Return: -1 Error, the calculated offset exceeded the size of the buffer * n OK, a range-checked offset into buffer */ -static int ldm_relative (const u8 *buffer, int buflen, int base, int offset) +static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) { base += offset; - if ((!buffer) || (offset < 0) || (base > buflen)) + if (!buffer || offset < 0 || base > buflen) { + if (!buffer) + ldm_error("!buffer"); + if (offset < 0) + ldm_error("offset (%d) < 0", offset); + if (base > buflen) + ldm_error("base (%d) > buflen (%d)", base, buflen); return -1; - if ((base + buffer[base]) >= buflen) + } + if (base + buffer[base] >= buflen) { + ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, + buffer[base], buflen); return -1; - + } return buffer[base] + offset + 1; } @@ -1054,60 +1063,98 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) * Return: 'true' @vb contains a Volume VBLK * 'false' @vb contents are not defined */ -static bool ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb) +static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) { - int r_objid, r_name, r_vtype, r_child, r_size, r_id1, r_id2, r_size2; - int r_drive, len; + int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; + int r_id1, r_id2, r_size2, r_drive, len; struct vblk_volu *volu; - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_vtype = ldm_relative (buffer, buflen, 0x18, r_name); - r_child = ldm_relative (buffer, buflen, 0x2E, r_vtype); - r_size = ldm_relative (buffer, buflen, 0x3E, r_child); - - if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) - r_id1 = ldm_relative (buffer, buflen, 0x53, r_size); - else + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); + if (r_vtype < 0) { + ldm_error("r_vtype %d < 0", r_vtype); + return false; + } + r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); + if (r_disable_drive_letter < 0) { + ldm_error("r_disable_drive_letter %d < 0", + r_disable_drive_letter); + return false; + } + r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); + if (r_child < 0) { + ldm_error("r_child %d < 0", r_child); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x3D, r_child); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { + r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); + if (r_id1 < 0) { + ldm_error("r_id1 %d < 0", r_id1); + return false; + } + } else r_id1 = r_size; - - if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) - r_id2 = ldm_relative (buffer, buflen, 0x53, r_id1); - else + if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { + r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); + if (r_id2 < 0) { + ldm_error("r_id2 %d < 0", r_id2); + return false; + } + } else r_id2 = r_id1; - - if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) - r_size2 = ldm_relative (buffer, buflen, 0x53, r_id2); - else + if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { + r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); + if (r_size2 < 0) { + ldm_error("r_size2 %d < 0", r_size2); + return false; + } + } else r_size2 = r_id2; - - if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) - r_drive = ldm_relative (buffer, buflen, 0x53, r_size2); - else + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); + if (r_drive < 0) { + ldm_error("r_drive %d < 0", r_drive); + return false; + } + } else r_drive = r_size2; - len = r_drive; - if (len < 0) + if (len < 0) { + ldm_error("len %d < 0", len); return false; - + } len += VBLK_SIZE_VOL5; - if (len != BE32 (buffer + 0x14)) + if (len > BE32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + BE32(buffer + 0x14)); return false; - + } volu = &vb->vblk.volu; - - ldm_get_vstr (buffer + 0x18 + r_name, volu->volume_type, - sizeof (volu->volume_type)); - memcpy (volu->volume_state, buffer + 0x19 + r_vtype, - sizeof (volu->volume_state)); - volu->size = ldm_get_vnum (buffer + 0x3E + r_child); - volu->partition_type = buffer[0x42 + r_size]; - memcpy (volu->guid, buffer + 0x43 + r_size, sizeof (volu->guid)); + ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, + sizeof(volu->volume_type)); + memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, + sizeof(volu->volume_state)); + volu->size = ldm_get_vnum(buffer + 0x3D + r_child); + volu->partition_type = buffer[0x41 + r_size]; + memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { - ldm_get_vstr (buffer + 0x53 + r_size, volu->drive_hint, - sizeof (volu->drive_hint)); + ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, + sizeof(volu->drive_hint)); } return true; } diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index d2e6a3046939..80f63b5fdd9f 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -68,7 +68,7 @@ struct parsed_partitions; #define VBLK_SIZE_DSK3 12 #define VBLK_SIZE_DSK4 45 #define VBLK_SIZE_PRT3 28 -#define VBLK_SIZE_VOL5 59 +#define VBLK_SIZE_VOL5 58 /* component types */ #define COMP_STRIPE 0x01 /* Stripe-set */ -- cgit v1.2.1 From 5b37696fda07b8acf37beba3853f83106397ccdf Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Tue, 17 Jul 2007 00:24:23 +0530 Subject: utime(s): Honour CAP_FOWNER when times==NULL do_utimes() does not honour CAP_FOWNER when times==NULL. Trivial and obvious one-line fix. Signed-off-by: Satyam Sharma Signed-off-by: Linus Torvalds --- fs/utimes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index b3c88952465f..83a7e69e706c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -106,7 +106,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (IS_IMMUTABLE(inode)) goto dput_and_out; - if (current->fsuid != inode->i_uid) { + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) { if (f) { if (!(f->f_mode & FMODE_WRITE)) goto dput_and_out; -- cgit v1.2.1 From 10fa16e75c70c8e7ec4675c5f985548a6565dca2 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 16 Jul 2007 16:02:49 -0500 Subject: 9p: fix debug compilation error s/9p/v9fs.c: In function 'v9fs_parse_options': fs/9p/v9fs.c:134: error: 'p9_debug_level' undeclared (first use in this function) Signed-off-by: Dave Jones Acked-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 45c35986d49f..0a7068e30ecb 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -131,7 +131,9 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) switch (token) { case Opt_debug: v9ses->debug = option; +#ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; +#endif break; case Opt_port: v9ses->port = option; -- cgit v1.2.1 From 769848c03895b63e5662eb7e4ec8c4866f7d0183 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 17 Jul 2007 04:03:05 -0700 Subject: Add __GFP_MOVABLE for callers to flag allocations from high memory that may be migrated It is often known at allocation time whether a page may be migrated or not. This patch adds a flag called __GFP_MOVABLE and a new mask called GFP_HIGH_MOVABLE. Allocations using the __GFP_MOVABLE can be either migrated using the page migration mechanism or reclaimed by syncing with backing storage and discarding. An API function very similar to alloc_zeroed_user_highpage() is added for __GFP_MOVABLE allocations called alloc_zeroed_user_highpage_movable(). The flags used by alloc_zeroed_user_highpage() are not changed because it would change the semantics of an existing API. After this patch is applied there are no in-kernel users of alloc_zeroed_user_highpage() so it probably should be marked deprecated if this patch is merged. Note that this patch includes a minor cleanup to the use of __GFP_ZERO in shmem.c to keep all flag modifications to inode->mapping in the shmem_dir_alloc() helper function. This clean-up suggestion is courtesy of Hugh Dickens. Additional credit goes to Christoph Lameter and Linus Torvalds for shaping the concept. Credit to Hugh Dickens for catching issues with shmem swap vector and ramfs allocations. [akpm@linux-foundation.org: build fix] [hugh@veritas.com: __GFP_ZERO cleanup] Signed-off-by: Mel Gorman Cc: Andy Whitcroft Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- fs/inode.c | 10 ++++++++-- fs/ramfs/inode.c | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 424165b569f8..94344b2e0b46 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -982,7 +982,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; page = find_or_create_page(inode->i_mapping, index, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); if (!page) return NULL; diff --git a/fs/inode.c b/fs/inode.c index 9a012cc5b6cd..47b87b071de3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -145,7 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; @@ -519,7 +519,13 @@ repeat: * new_inode - obtain an inode * @sb: superblock * - * Allocates a new inode for given superblock. + * Allocates a new inode for given superblock. The default gfp_mask + * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. + * If HIGHMEM pages are unsuitable or it is known that pages allocated + * for the page cache are not reclaimable or migratable, + * mapping_set_gfp_mask() must be called with suitable flags on the + * newly created inode's mapping + * */ struct inode *new_inode(struct super_block *sb) { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index d40d22b347b7..ef2b46d099ff 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -60,6 +60,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_blocks = 0; inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: -- cgit v1.2.1 From 5ad333eb66ff1e52a87639822ae088577669dcf9 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 17 Jul 2007 04:03:16 -0700 Subject: Lumpy Reclaim V4 When we are out of memory of a suitable size we enter reclaim. The current reclaim algorithm targets pages in LRU order, which is great for fairness at order-0 but highly unsuitable if you desire pages at higher orders. To get pages of higher order we must shoot down a very high proportion of memory; >95% in a lot of cases. This patch set adds a lumpy reclaim algorithm to the allocator. It targets groups of pages at the specified order anchored at the end of the active and inactive lists. This encourages groups of pages at the requested orders to move from active to inactive, and active to free lists. This behaviour is only triggered out of direct reclaim when higher order pages have been requested. This patch set is particularly effective when utilised with an anti-fragmentation scheme which groups pages of similar reclaimability together. This patch set is based on Peter Zijlstra's lumpy reclaim V2 patch which forms the foundation. Credit to Mel Gorman for sanitity checking. Mel said: The patches have an application with hugepage pool resizing. When lumpy-reclaim is used used with ZONE_MOVABLE, the hugepages pool can be resized with greater reliability. Testing on a desktop machine with 2GB of RAM showed that growing the hugepage pool with ZONE_MOVABLE on it's own was very slow as the success rate was quite low. Without lumpy-reclaim, each attempt to grow the pool by 100 pages would yield 1 or 2 hugepages. With lumpy-reclaim, getting 40 to 70 hugepages on each attempt was typical. [akpm@osdl.org: ia64 pfn_to_nid fixes and loop cleanup] [bunk@stusta.de: static declarations for internal functions] [a.p.zijlstra@chello.nl: initial lumpy V2 implementation] Signed-off-by: Andy Whitcroft Acked-by: Peter Zijlstra Acked-by: Mel Gorman Acked-by: Mel Gorman Cc: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 94344b2e0b46..d654a3b6209e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -356,7 +356,7 @@ static void free_more_memory(void) for_each_online_pgdat(pgdat) { zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS); + try_to_free_pages(zones, 0, GFP_NOFS); } } -- cgit v1.2.1 From 8e1f936b73150f5095448a0fee6d4f30a1f9001d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 17 Jul 2007 04:03:17 -0700 Subject: mm: clean up and kernelify shrinker registration I can never remember what the function to register to receive VM pressure is called. I have to trace down from __alloc_pages() to find it. It's called "set_shrinker()", and it needs Your Help. 1) Don't hide struct shrinker. It contains no magic. 2) Don't allocate "struct shrinker". It's not helpful. 3) Call them "register_shrinker" and "unregister_shrinker". 4) Call the function "shrink" not "shrinker". 5) Reduce the 17 lines of waffly comments to 13, but document it properly. Signed-off-by: Rusty Russell Cc: David Chinner Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 7 ++++++- fs/dquot.c | 7 ++++++- fs/inode.c | 7 ++++++- fs/mbcache.c | 9 ++++++--- fs/nfs/super.c | 10 ++++++---- fs/xfs/linux-2.6/xfs_buf.c | 14 ++++++-------- fs/xfs/quota/xfs_qm.c | 10 +++++++--- 7 files changed, 43 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 0e73aa0a0e8b..cb9d05056b54 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -883,6 +883,11 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } +static struct shrinker dcache_shrinker = { + .shrink = shrink_dcache_memory, + .seeks = DEFAULT_SEEKS, +}; + /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate @@ -2115,7 +2120,7 @@ static void __init dcache_init(unsigned long mempages) dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); - set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); + register_shrinker(&dcache_shrinker); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/dquot.c b/fs/dquot.c index 8819d281500c..7e273151f589 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -538,6 +538,11 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; } +static struct shrinker dqcache_shrinker = { + .shrink = shrink_dqcache_memory, + .seeks = DEFAULT_SEEKS, +}; + /* * Put reference to dquot * NOTE: If you change this function please check whether dqput_blocks() works right... @@ -1870,7 +1875,7 @@ static int __init dquot_init(void) printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory); + register_shrinker(&dqcache_shrinker); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 47b87b071de3..320e088d0b28 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -462,6 +462,11 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask) return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } +static struct shrinker icache_shrinker = { + .shrink = shrink_icache_memory, + .seeks = DEFAULT_SEEKS, +}; + static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. @@ -1385,7 +1390,7 @@ void __init inode_init(unsigned long mempages) SLAB_MEM_SPREAD), init_once, NULL); - set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); + register_shrinker(&icache_shrinker); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/mbcache.c b/fs/mbcache.c index deeb9dc062d9..fbb1d02f8791 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -100,7 +100,6 @@ struct mb_cache { static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); -static struct shrinker *mb_shrinker; static inline int mb_cache_indexes(struct mb_cache *cache) @@ -118,6 +117,10 @@ mb_cache_indexes(struct mb_cache *cache) static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static struct shrinker mb_cache_shrinker = { + .shrink = mb_cache_shrink_fn, + .seeks = DEFAULT_SEEKS, +}; static inline int __mb_cache_entry_is_hashed(struct mb_cache_entry *ce) @@ -662,13 +665,13 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, static int __init init_mbcache(void) { - mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn); + register_shrinker(&mb_cache_shrinker); return 0; } static void __exit exit_mbcache(void) { - remove_shrinker(mb_shrinker); + unregister_shrinker(&mb_cache_shrinker); } module_init(init_mbcache) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a2b1af89ca1a..adffe1615c51 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -300,7 +300,10 @@ static const struct super_operations nfs4_sops = { }; #endif -static struct shrinker *acl_shrinker; +static struct shrinker acl_shrinker = { + .shrink = nfs_access_cache_shrinker, + .seeks = DEFAULT_SEEKS, +}; /* * Register the NFS filesystems @@ -321,7 +324,7 @@ int __init register_nfs_fs(void) if (ret < 0) goto error_2; #endif - acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker); + register_shrinker(&acl_shrinker); return 0; #ifdef CONFIG_NFS_V4 @@ -339,8 +342,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { - if (acl_shrinker != NULL) - remove_shrinker(acl_shrinker); + unregister_shrinker(&acl_shrinker); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); nfs_unregister_sysctl(); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 2df63622354e..b0f0e58866de 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -35,10 +35,13 @@ #include static kmem_zone_t *xfs_buf_zone; -static struct shrinker *xfs_buf_shake; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); +static struct shrinker xfs_buf_shake = { + .shrink = xfsbufd_wakeup, + .seeks = DEFAULT_SEEKS, +}; static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -1832,14 +1835,9 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup); - if (!xfs_buf_shake) - goto out_destroy_xfsdatad_workqueue; - + register_shrinker(&xfs_buf_shake); return 0; - out_destroy_xfsdatad_workqueue: - destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: @@ -1854,7 +1852,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - remove_shrinker(xfs_buf_shake); + unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 7def4c699343..2d274b23ade5 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -62,7 +62,6 @@ uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -static struct shrinker *xfs_qm_shaker; static cred_t xfs_zerocr; @@ -78,6 +77,11 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, gfp_t); +static struct shrinker xfs_qm_shaker = { + .shrink = xfs_qm_shake, + .seeks = DEFAULT_SEEKS, +}; + #ifdef DEBUG extern mutex_t qcheck_lock; #endif @@ -149,7 +153,7 @@ xfs_Gqm_init(void) } else xqm->qm_dqzone = qm_dqzone; - xfs_qm_shaker = set_shrinker(DEFAULT_SEEKS, xfs_qm_shake); + register_shrinker(&xfs_qm_shaker); /* * The t_dqinfo portion of transactions. @@ -181,7 +185,7 @@ xfs_qm_destroy( ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); - remove_shrinker(xfs_qm_shaker); + unregister_shrinker(&xfs_qm_shaker); hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); -- cgit v1.2.1 From 787d2214c19bcc9b6ac48af0ce098277a801eded Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 17 Jul 2007 04:03:34 -0700 Subject: fs: introduce some page/buffer invariants It is a bug to set a page dirty if it is not uptodate unless it has buffers. If the page has buffers, then the page may be dirty (some buffers dirty) but not uptodate (some buffers not uptodate). The exception to this rule is if the set_page_dirty caller is racing with truncate or invalidate. A buffer can not be set dirty if it is not uptodate. If either of these situations occurs, it indicates there could be some data loss problem. Some of these warnings could be a harmless one where the page or buffer is set uptodate immediately after it is dirtied, however we should fix those up, and enforce this ordering. Bring the order of operations for truncate into line with those of invalidate. This will prevent a page from being able to go !uptodate while we're holding the tree_lock, which is probably a good thing anyway. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 54 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index d654a3b6209e..0f9006714230 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -675,6 +675,39 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) } EXPORT_SYMBOL(mark_buffer_dirty_inode); +/* + * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + */ +static int __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (TestSetPageDirty(page)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + return 1; +} + /* * Add a page to the dirty page list. * @@ -702,7 +735,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); */ int __set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page_mapping(page); + struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -719,21 +752,7 @@ int __set_page_dirty_buffers(struct page *page) } spin_unlock(&mapping->private_lock); - if (TestSetPageDirty(page)) - return 0; - - write_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - task_io_account_write(PAGE_CACHE_SIZE); - } - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - write_unlock_irq(&mapping->tree_lock); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return 1; + return __set_page_dirty(page, mapping, 1); } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1132,8 +1151,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) */ void fastcall mark_buffer_dirty(struct buffer_head *bh) { + WARN_ON_ONCE(!buffer_uptodate(bh)); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); + __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); } /* -- cgit v1.2.1 From 831441862956fffa17b9801db37e6ea1650b0f69 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Jul 2007 04:03:35 -0700 Subject: Freezer: make kernel threads nonfreezable by default Currently, the freezer treats all tasks as freezable, except for the kernel threads that explicitly set the PF_NOFREEZE flag for themselves. This approach is problematic, since it requires every kernel thread to either set PF_NOFREEZE explicitly, or call try_to_freeze(), even if it doesn't care for the freezing of tasks at all. It seems better to only require the kernel threads that want to or need to be frozen to use some freezer-related code and to remove any freezer-related code from the other (nonfreezable) kernel threads, which is done in this patch. The patch causes all kernel threads to be nonfreezable by default (ie. to have PF_NOFREEZE set by default) and introduces the set_freezable() function that should be called by the freezable kernel threads in order to unset PF_NOFREEZE. It also makes all of the currently freezable kernel threads call set_freezable(), so it shouldn't cause any (intentional) change of behaviour to appear. Additionally, it updates documentation to describe the freezing of tasks more accurately. [akpm@linux-foundation.org: build fixes] Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Cc: Pavel Machek Cc: Oleg Nesterov Cc: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/cifsfs.c | 1 + fs/cifs/connect.c | 1 + fs/jffs2/background.c | 1 + fs/lockd/svc.c | 2 ++ fs/nfs/callback.c | 2 ++ fs/nfsd/nfssvc.c | 2 ++ fs/xfs/linux-2.6/xfs_super.c | 1 + 7 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b0cbf4a4ad0..bd0f2f2353ce 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -849,6 +849,7 @@ static int cifs_oplock_thread(void * dummyarg) __u16 netfid; int rc; + set_freezable(); do { if (try_to_freeze()) continue; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f4e92661b223..0a1b8bd1dfcb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -363,6 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) GFP_KERNEL); } + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 0c82dfcfd246..143c5530caf3 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -81,6 +81,7 @@ static int jffs2_garbage_collect_thread(void *_c) set_user_nice(current, 10); + set_freezable(); for (;;) { allow_signal(SIGHUP); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 26809325469c..9fcdef50aad6 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -119,6 +120,7 @@ lockd(struct svc_rqst *rqstp) complete(&lockd_start_done); daemonize("lockd"); + set_freezable(); /* Process request with signals blocked, but allow SIGKILL. */ allow_signal(SIGKILL); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 75f309c8741a..a796be5051bf 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -67,6 +68,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) daemonize("nfsv4-svc"); /* Process request with signals blocked, but allow SIGKILL. */ allow_signal(SIGKILL); + set_freezable(); complete(&nfs_callback_info.started); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ff55950efb43..5c8192bcbced 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -432,6 +433,7 @@ nfsd(struct svc_rqst *rqstp) * dirty pages. */ current->flags |= PF_LESS_THROTTLE; + set_freezable(); /* * The main request loop diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 06894cf00b12..4528f9a3f304 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -562,6 +562,7 @@ xfssyncd( bhv_vfs_sync_work_t *work, *n; LIST_HEAD (tmp); + set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { timeleft = schedule_timeout_interruptible(timeleft); -- cgit v1.2.1 From 9281acea6a3687ff0f262e0be31eac34895b95d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2007 04:03:51 -0700 Subject: kallsyms: make KSYM_NAME_LEN include space for trailing '\0' KSYM_NAME_LEN is peculiar in that it does not include the space for the trailing '\0', forcing all users to use KSYM_NAME_LEN + 1 when allocating buffer. This is nonsense and error-prone. Moreover, when the caller forgets that it's very likely to subtly bite back by corrupting the stack because the last position of the buffer is always cleared to zero. This patch increments KSYM_NAME_LEN by one and updates code accordingly. * off-by-one bug in asm-powerpc/kprobes.h::kprobe_lookup_name() macro is fixed. * Where MODULE_NAME_LEN and KSYM_NAME_LEN were used together, MODULE_NAME_LEN was treated as if it didn't include space for the trailing '\0'. Fix it. Signed-off-by: Tejun Heo Acked-by: Paulo Marques Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index ae3627337a92..42cb4f5613b6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -283,7 +283,7 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) static int proc_pid_wchan(struct task_struct *task, char *buffer) { unsigned long wchan; - char symname[KSYM_NAME_LEN+1]; + char symname[KSYM_NAME_LEN]; wchan = get_wchan(task); -- cgit v1.2.1 From a569425512253992cc64ebf8b6d00a62f986db3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:28 -0700 Subject: knfsd: exportfs: add exportfs.h header currently the export_operation structure and helpers related to it are in fs.h. fs.h is already far too large and there are very few places needing the export bits, so split them off into a separate header. [akpm@linux-foundation.org: fix cifs build] Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/export.c | 1 + fs/efs/super.c | 1 + fs/exportfs/expfs.c | 1 + fs/ext2/super.c | 1 + fs/ext3/super.c | 1 + fs/ext4/super.c | 1 + fs/fat/inode.c | 1 + fs/gfs2/ops_export.c | 1 + fs/isofs/isofs.h | 1 + fs/jfs/super.c | 1 + fs/nfsd/export.c | 1 + fs/nfsd/nfsfh.c | 1 + fs/ntfs/namei.c | 1 + fs/ocfs2/export.h | 2 ++ fs/reiserfs/inode.c | 1 + fs/reiserfs/super.c | 1 + fs/xfs/linux-2.6/xfs_super.h | 2 ++ 17 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 1d716392c3aa..96df1d51fdc3 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -29,6 +29,7 @@ */ #include +#include #ifdef CONFIG_CIFS_EXPERIMENTAL diff --git a/fs/efs/super.c b/fs/efs/super.c index e0a6839e68ae..67338d622f56 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index e98f6cd7200c..16287af34859 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -1,4 +1,5 @@ +#include #include #include #include diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b2efd9083b9b..3eefa97fe204 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 51d1c456cdab..4f84dc86628a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d0d8c76c7edb..b806e689c4aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fat/inode.c b/fs/fat/inode.c index cfaf5877d98b..0a7ddb39a593 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 99ea5659bc2c..b8312edee0e4 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index efe2872cd4e3..a07e67b1ea7f 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 20e4ac1c79a3..49b652bd96ba 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 79bd03b8bbf8..c518421a9355 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 6ca2d24fc216..0108d3ec1c27 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index bff01a54675a..e93c6142b23c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -21,6 +21,7 @@ */ #include +#include #include #include "attrib.h" diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h index 5b77ee7866ef..e08bed9e45a0 100644 --- a/fs/ocfs2/export.h +++ b/fs/ocfs2/export.h @@ -26,6 +26,8 @@ #ifndef OCFS2_EXPORT_H #define OCFS2_EXPORT_H +#include + extern struct export_operations ocfs2_export_ops; #endif /* OCFS2_EXPORT_H */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 1272d11399fb..ddde489f1cb2 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4ac9119200e..5a93cfe1a032 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 33dd1ca13245..201cc3273c84 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -18,6 +18,8 @@ #ifndef __XFS_SUPER_H__ #define __XFS_SUPER_H__ +#include + #ifdef CONFIG_XFS_DMAPI # define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) # define vfs_initdmapi() dmapi_init() -- cgit v1.2.1 From 5ca29607331fe37980dc3b488793ef8b1409b722 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:29 -0700 Subject: knfsd: exportfs: remove iget abuse When the exportfs interface was added the expectation was that filesystems provide an operation to convert from a file handle to an inode/dentry, but it kept a backwards compat option that still calls into iget. Calling into iget from non-filesystem code is very bad, because it gives too little information to filesystem, and simply crashes if the filesystem doesn't implement the ->read_inode routine. Fortunately there are only two filesystems left using this fallback: efs and jfs. This patch moves a copy of export_iget to each of those to implement the get_dentry method. While this is a temporary increase of lines of code in the kernel it allows for a much cleaner interface and important code restructuring in later patches. [akpm@linux-foundation.org: add jfs_get_inode_flags() declaration] Signed-off-by: Dave Kleikamp Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/efs/namei.c | 32 ++++++++++++++++++++++++++++++ fs/efs/super.c | 1 + fs/exportfs/expfs.c | 56 +++-------------------------------------------------- fs/jfs/jfs_inode.h | 1 + fs/jfs/namei.c | 32 ++++++++++++++++++++++++++++++ fs/jfs/super.c | 1 + 6 files changed, 70 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/efs/namei.c b/fs/efs/namei.c index ed4a207fe22a..5276b19423c1 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -75,6 +75,38 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei return NULL; } +struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp) +{ + __u32 *objp = vobjp; + unsigned long ino = objp[0]; + __u32 generation = objp[1]; + struct inode *inode; + struct dentry *result; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = iget(sb, ino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (is_bad_inode(inode) || + (generation && inode->i_generation != generation)) { + result = ERR_PTR(-ESTALE); + goto out_iput; + } + + result = d_alloc_anon(inode); + if (!result) { + result = ERR_PTR(-ENOMEM); + goto out_iput; + } + return result; + + out_iput: + iput(inode); + return result; +} + struct dentry *efs_get_parent(struct dentry *child) { struct dentry *parent; diff --git a/fs/efs/super.c b/fs/efs/super.c index 67338d622f56..d360c81f3a72 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -114,6 +114,7 @@ static const struct super_operations efs_superblock_operations = { }; static struct export_operations efs_export_ops = { + .get_dentry = efs_get_dentry, .get_parent = efs_get_parent, }; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 16287af34859..dd132bb7b8f8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -391,61 +391,11 @@ out: return error; } - -static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u32 generation) +static struct dentry *get_dentry(struct super_block *sb, void *vobjp) { - - /* iget isn't really right if the inode is currently unallocated!! - * This should really all be done inside each filesystem - * - * ext2fs' read_inode has been strengthed to return a bad_inode if - * the inode had been deleted. - * - * Currently we don't know the generation for parent directory, so - * a generation of 0 means "accept any" - */ - struct inode *inode; - struct dentry *result; - if (ino == 0) - return ERR_PTR(-ESTALE); - inode = iget(sb, ino); - if (inode == NULL) - return ERR_PTR(-ENOMEM); - if (is_bad_inode(inode) - || (generation && inode->i_generation != generation) - ) { - /* we didn't find the right inode.. */ - dprintk("fh_verify: Inode %lu, Bad count: %d %d or version %u %u\n", - inode->i_ino, - inode->i_nlink, atomic_read(&inode->i_count), - inode->i_generation, - generation); - - iput(inode); - return ERR_PTR(-ESTALE); - } - /* now to find a dentry. - * If possible, get a well-connected one - */ - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + return ERR_PTR(-ESTALE); } - -static struct dentry *get_object(struct super_block *sb, void *vobjp) -{ - __u32 *objp = vobjp; - unsigned long ino = objp[0]; - __u32 generation = objp[1]; - - return export_iget(sb, ino, generation); -} - - /** * export_encode_fh - default export_operations->encode_fh function * @dentry: the dentry to encode @@ -524,7 +474,7 @@ struct export_operations export_op_default = { .get_name = get_name, .get_parent = get_parent, - .get_dentry = get_object, + .get_dentry = get_dentry, }; EXPORT_SYMBOL(export_op_default); diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 2374b595f2e1..f0ec72b263f1 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); extern void jfs_get_inode_flags(struct jfs_inode_info *); +extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 25161c4121e4..932797ba433b 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1477,6 +1477,38 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc return dentry; } +struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp) +{ + __u32 *objp = vobjp; + unsigned long ino = objp[0]; + __u32 generation = objp[1]; + struct inode *inode; + struct dentry *result; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = iget(sb, ino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (is_bad_inode(inode) || + (generation && inode->i_generation != generation)) { + result = ERR_PTR(-ESTALE); + goto out_iput; + } + + result = d_alloc_anon(inode); + if (!result) { + result = ERR_PTR(-ENOMEM); + goto out_iput; + } + return result; + + out_iput: + iput(inode); + return result; +} + struct dentry *jfs_get_parent(struct dentry *dentry) { struct super_block *sb = dentry->d_inode->i_sb; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 49b652bd96ba..929fceca7999 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -738,6 +738,7 @@ static const struct super_operations jfs_super_operations = { }; static struct export_operations jfs_export_operations = { + .get_dentry = jfs_get_dentry, .get_parent = jfs_get_parent, }; -- cgit v1.2.1 From d37065cd6d6bbe98fd4be14d6c9e64c0bfa124c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:30 -0700 Subject: knfsd: exportfs: add procedural interface for NFSD Currently NFSD calls directly into filesystems through the export_operations structure. I plan to change this interface in various ways in later patches, and want to avoid the export of the default operations to NFSD, so this patch adds two simple exportfs_encode_fh/exportfs_decode_fh helpers for NFSD to call instead of poking into exportfs guts. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 22 +++++++++++++++++++++- fs/nfsd/nfsfh.c | 18 +++++------------- 2 files changed, 26 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index dd132bb7b8f8..db86006956b0 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -3,6 +3,7 @@ #include #include #include +#include #include struct export_operations export_op_default; @@ -468,6 +469,26 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh acceptable, context); } +int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, + int connectable) +{ + struct export_operations *nop = dentry->d_sb->s_export_op; + + return CALL(nop, encode_fh)(dentry, fh, max_len, connectable); +} +EXPORT_SYMBOL_GPL(exportfs_encode_fh); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len, + int fileid_type, int (*acceptable)(void *, struct dentry *), + void *context) +{ + struct export_operations *nop = mnt->mnt_sb->s_export_op; + + return CALL(nop, decode_fh)(mnt->mnt_sb, fh, fh_len, fileid_type, + acceptable, context); +} +EXPORT_SYMBOL_GPL(exportfs_decode_fh); + struct export_operations export_op_default = { .decode_fh = export_decode_fh, .encode_fh = export_encode_fh, @@ -477,7 +498,6 @@ struct export_operations export_op_default = { .get_dentry = get_dentry, }; -EXPORT_SYMBOL(export_op_default); EXPORT_SYMBOL(find_exported_dentry); MODULE_LICENSE("GPL"); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 0108d3ec1c27..ecf9c361b3f5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -28,10 +28,6 @@ static int nfsd_nr_verified; static int nfsd_nr_put; -extern struct export_operations export_op_default; - -#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun) - /* * our acceptability function. * if NOSUBTREECHECK, accept anything @@ -212,11 +208,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (fileid_type == 0) dentry = dget(exp->ex_dentry); else { - struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; - dentry = CALL(nop,decode_fh)(exp->ex_mnt->mnt_sb, - datap, data_left, - fileid_type, - nfsd_acceptable, exp); + dentry = exportfs_decode_fh(exp->ex_mnt, datap, + data_left, fileid_type, + nfsd_acceptable, exp); } if (dentry == NULL) goto out; @@ -287,15 +281,13 @@ out: static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, __u32 *datap, int *maxsize) { - struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; - if (dentry == exp->ex_dentry) { *maxsize = 0; return 0; } - return CALL(nop,encode_fh)(dentry, datap, maxsize, - !(exp->ex_flags&NFSEXP_NOSUBTREECHECK)); + return exportfs_encode_fh(dentry, datap, maxsize, + !(exp->ex_flags & NFSEXP_NOSUBTREECHECK)); } /* -- cgit v1.2.1 From 10f11c341da8c0ec6b8b024e892108a6537ba8c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:31 -0700 Subject: knfsd: exportfs: remove CALL macro Currently exportfs uses a way to call methods very differently from the rest of the kernel. This patch changes it to the standard conventions for method calls. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 128 +++++++++++++++++++++++++++++----------------------- 1 file changed, 72 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index db86006956b0..1e6f556514d6 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -6,11 +6,36 @@ #include #include -struct export_operations export_op_default; +#define dprintk(fmt, args...) do{}while(0) -#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun) -#define dprintk(fmt, args...) do{}while(0) +static int get_name(struct dentry *dentry, char *name, + struct dentry *child); + + +static struct dentry *exportfs_get_dentry(struct super_block *sb, void *obj) +{ + struct dentry *result = ERR_PTR(-ESTALE); + + if (sb->s_export_op->get_dentry) { + result = sb->s_export_op->get_dentry(sb, obj); + if (!result) + result = ERR_PTR(-ESTALE); + } + + return result; +} + +static int exportfs_get_name(struct dentry *dir, char *name, + struct dentry *child) +{ + struct export_operations *nop = dir->d_sb->s_export_op; + + if (nop->get_name) + return nop->get_name(dir, name, child); + else + return get_name(dir, name, child); +} static struct dentry * find_acceptable_alias(struct dentry *result, @@ -78,7 +103,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, { struct dentry *result = NULL; struct dentry *target_dir; - int err; + int err = -ESTALE; struct export_operations *nops = sb->s_export_op; struct dentry *alias; int noprogress; @@ -87,14 +112,10 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, /* * Attempt to find the inode. */ - result = CALL(sb->s_export_op,get_dentry)(sb,obj); - err = -ESTALE; - if (result == NULL) - goto err_out; - if (IS_ERR(result)) { - err = PTR_ERR(result); - goto err_out; - } + result = exportfs_get_dentry(sb, obj); + if (IS_ERR(result)) + return result; + if (S_ISDIR(result->d_inode->i_mode) && (result->d_flags & DCACHE_DISCONNECTED)) { /* it is an unconnected directory, we must connect it */ @@ -122,11 +143,11 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, if (parent == NULL) goto err_result; - target_dir = CALL(sb->s_export_op,get_dentry)(sb,parent); - if (IS_ERR(target_dir)) + target_dir = exportfs_get_dentry(sb,parent); + if (IS_ERR(target_dir)) { err = PTR_ERR(target_dir); - if (target_dir == NULL || IS_ERR(target_dir)) goto err_result; + } } /* * Now we need to make sure that target_dir is properly connected. @@ -177,18 +198,27 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, spin_unlock(&pd->d_lock); noprogress = 0; } else { - /* we have hit the top of a disconnected path. Try - * to find parent and connect - * note: racing with some other process renaming a - * directory isn't much of a problem here. If someone - * renames the directory, it will end up properly - * connected, which is what we want + /* + * We have hit the top of a disconnected path, try to + * find parent and connect. + * + * Racing with some other process renaming a directory + * isn't much of a problem here. If someone renames + * the directory, it will end up properly connected, + * which is what we want + * + * Getting the parent can't be supported generically, + * the locking is too icky. + * + * Instead we just return EACCES. If server reboots + * or inodes get flushed, you lose */ - struct dentry *ppd; + struct dentry *ppd = ERR_PTR(-EACCES); struct dentry *npd; mutex_lock(&pd->d_inode->i_mutex); - ppd = CALL(nops,get_parent)(pd); + if (nops->get_parent) + ppd = nops->get_parent(pd); mutex_unlock(&pd->d_inode->i_mutex); if (IS_ERR(ppd)) { @@ -199,7 +229,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, break; } dprintk("find_exported_dentry: find name of %lu in %lu\n", pd->d_inode->i_ino, ppd->d_inode->i_ino); - err = CALL(nops,get_name)(ppd, nbuf, pd); + err = exportfs_get_name(ppd, nbuf, pd); if (err) { dput(ppd); dput(pd); @@ -250,7 +280,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, /* if we weren't after a directory, have one more step to go */ if (result != target_dir) { struct dentry *nresult; - err = CALL(nops,get_name)(target_dir, nbuf, result); + err = exportfs_get_name(target_dir, nbuf, result); if (!err) { mutex_lock(&target_dir->d_inode->i_mutex); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); @@ -286,23 +316,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, dput(target_dir); err_result: dput(result); - err_out: return ERR_PTR(err); } - - -static struct dentry *get_parent(struct dentry *child) -{ - /* get_parent cannot be supported generically, the locking - * is too icky. - * instead, we just return EACCES. If server reboots or inodes - * get flushed, you lose - */ - return ERR_PTR(-EACCES); -} - - struct getdents_callback { char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ @@ -392,11 +408,6 @@ out: return error; } -static struct dentry *get_dentry(struct super_block *sb, void *vobjp) -{ - return ERR_PTR(-ESTALE); -} - /** * export_encode_fh - default export_operations->encode_fh function * @dentry: the dentry to encode @@ -472,9 +483,15 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) { - struct export_operations *nop = dentry->d_sb->s_export_op; + struct export_operations *nop = dentry->d_sb->s_export_op; + int error; - return CALL(nop, encode_fh)(dentry, fh, max_len, connectable); + if (nop->encode_fh) + error = nop->encode_fh(dentry, fh, max_len, connectable); + else + error = export_encode_fh(dentry, fh, max_len, connectable); + + return error; } EXPORT_SYMBOL_GPL(exportfs_encode_fh); @@ -483,21 +500,20 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len, void *context) { struct export_operations *nop = mnt->mnt_sb->s_export_op; + struct dentry *result; - return CALL(nop, decode_fh)(mnt->mnt_sb, fh, fh_len, fileid_type, + if (nop->decode_fh) { + result = nop->decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type, acceptable, context); + } else { + result = export_decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type, + acceptable, context); + } + + return result; } EXPORT_SYMBOL_GPL(exportfs_decode_fh); -struct export_operations export_op_default = { - .decode_fh = export_decode_fh, - .encode_fh = export_encode_fh, - - .get_name = get_name, - .get_parent = get_parent, - .get_dentry = get_dentry, -}; - EXPORT_SYMBOL(find_exported_dentry); MODULE_LICENSE("GPL"); -- cgit v1.2.1 From d7dd618a5901ce0b44ef518208b35f728775db74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:31 -0700 Subject: knfsd: exportfs: untangle ISDIR logic in find_exported_dentry Rework some logic in find_exported_dentry so that we only have a single S_ISDIR check and logic that makes clear to the reader what we're really doing here. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 1e6f556514d6..abf0a316aa1a 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -116,30 +116,23 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, if (IS_ERR(result)) return result; - if (S_ISDIR(result->d_inode->i_mode) && - (result->d_flags & DCACHE_DISCONNECTED)) { - /* it is an unconnected directory, we must connect it */ - ; - } else { - if (acceptable(context, result)) - return result; - if (S_ISDIR(result->d_inode->i_mode)) { + if (S_ISDIR(result->d_inode->i_mode)) { + if (!(result->d_flags & DCACHE_DISCONNECTED)) { + if (acceptable(context, result)) + return result; err = -EACCES; goto err_result; } + target_dir = dget(result); + } else { + if (acceptable(context, result)) + return result; + alias = find_acceptable_alias(result, acceptable, context); if (alias) return alias; - } - /* It's a directory, or we are required to confirm the file's - * location in the tree based on the parent information - */ - dprintk("find_exported_dentry: need to look harder for %s/%d\n",sb->s_id,*(int*)obj); - if (S_ISDIR(result->d_inode->i_mode)) - target_dir = dget(result); - else { if (parent == NULL) goto err_result; @@ -149,6 +142,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, goto err_result; } } + /* * Now we need to make sure that target_dir is properly connected. * It may already be, as the flag isn't always updated when connection -- cgit v1.2.1 From fb66a1989c8abc3015aa334f617658b277e5fe98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:32 -0700 Subject: knfsd: exportfs: move acceptable check into find_acceptable_alias All callers of find_acceptable_alias check if the current dentry is acceptable before looking for other acceptable aliases using find_acceptable_alias. Move the check into find_acceptable_alias to make the code a little more dense and add a comment to find_acceptable_alias that documents its intent. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index abf0a316aa1a..085d6c547f44 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -37,6 +37,9 @@ static int exportfs_get_name(struct dentry *dir, char *name, return get_name(dir, name, child); } +/* + * Check if the dentry or any of it's aliases is acceptable. + */ static struct dentry * find_acceptable_alias(struct dentry *result, int (*acceptable)(void *context, struct dentry *dentry), @@ -44,6 +47,9 @@ find_acceptable_alias(struct dentry *result, { struct dentry *dentry, *toput = NULL; + if (acceptable(context, result)) + return result; + spin_lock(&dcache_lock); list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { dget_locked(dentry); @@ -126,9 +132,6 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, target_dir = dget(result); } else { - if (acceptable(context, result)) - return result; - alias = find_acceptable_alias(result, acceptable, context); if (alias) return alias; @@ -289,9 +292,6 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, } } dput(target_dir); - /* now result is properly connected, it is our best bet */ - if (acceptable(context, result)) - return result; alias = find_acceptable_alias(result, acceptable, context); if (alias) -- cgit v1.2.1 From dd90b50906db2c03e236e046f2fc7f7290efe4b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:32 -0700 Subject: knfsd: exportfs: add find_disconnected_root helper Break the loop that finds the root of a disconnected subtree into a helper of its own to make reading easier and document the intent. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 085d6c547f44..166d8ca9eaea 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -70,6 +70,27 @@ find_acceptable_alias(struct dentry *result, return NULL; } +/* + * Find root of a disconnected subtree and return a reference to it. + */ +static struct dentry * +find_disconnected_root(struct dentry *dentry) +{ + dget(dentry); + spin_lock(&dentry->d_lock); + while (!IS_ROOT(dentry) && + (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { + struct dentry *parent = dentry->d_parent; + dget(parent); + spin_unlock(&dentry->d_lock); + dput(dentry); + dentry = parent; + spin_lock(&dentry->d_lock); + } + spin_unlock(&dentry->d_lock); + return dentry; +} + /** * find_exported_dentry - helper routine to implement export_operations->decode_fh * @sb: The &super_block identifying the filesystem @@ -164,23 +185,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, * the noprogress counter. If we go through the loop 10 times (2 is * probably enough) without getting anywhere, we just give up */ - noprogress= 0; + noprogress = 0; while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { - struct dentry *pd = target_dir; - - dget(pd); - spin_lock(&pd->d_lock); - while (!IS_ROOT(pd) && - (pd->d_parent->d_flags&DCACHE_DISCONNECTED)) { - struct dentry *parent = pd->d_parent; - - dget(parent); - spin_unlock(&pd->d_lock); - dput(pd); - pd = parent; - spin_lock(&pd->d_lock); - } - spin_unlock(&pd->d_lock); + struct dentry *pd = find_disconnected_root(target_dir); if (!IS_ROOT(pd)) { /* must have found a connected parent - great */ -- cgit v1.2.1 From 019ab801cf32381b90cbe0144cc5695aed0e408c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Jul 2007 04:04:33 -0700 Subject: knfsd: exportfs: split out reconnecting a dentry from find_exported_dentry There's a clear subfunctionality of reconnecting a given dentry to the main dentry tree in find_exported_dentry, that can be called both for the dentry we're looking for or it's parent directory. This patch splits the subfunctionality out into a separate helper to make the code more readable and document it's intent. As a nice side-optimization we can avoid getting a superfluous dentry reference count in the case we need to reconnect a directory on it's own. Signed-off-by: Christoph Hellwig Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exportfs/expfs.c | 213 ++++++++++++++++++++++++++++------------------------ 1 file changed, 113 insertions(+), 100 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 166d8ca9eaea..8adb32a9387a 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -91,101 +91,27 @@ find_disconnected_root(struct dentry *dentry) return dentry; } -/** - * find_exported_dentry - helper routine to implement export_operations->decode_fh - * @sb: The &super_block identifying the filesystem - * @obj: An opaque identifier of the object to be found - passed to - * get_inode - * @parent: An optional opqaue identifier of the parent of the object. - * @acceptable: A function used to test possible &dentries to see if they are - * acceptable - * @context: A parameter to @acceptable so that it knows on what basis to - * judge. - * - * find_exported_dentry is the central helper routine to enable file systems - * to provide the decode_fh() export_operation. It's main task is to take - * an &inode, find or create an appropriate &dentry structure, and possibly - * splice this into the dcache in the correct place. - * - * The decode_fh() operation provided by the filesystem should call - * find_exported_dentry() with the same parameters that it received except - * that instead of the file handle fragment, pointers to opaque identifiers - * for the object and optionally its parent are passed. The default decode_fh - * routine passes one pointer to the start of the filehandle fragment, and - * one 8 bytes into the fragment. It is expected that most filesystems will - * take this approach, though the offset to the parent identifier may well be - * different. + +/* + * Make sure target_dir is fully connected to the dentry tree. * - * find_exported_dentry() will call get_dentry to get an dentry pointer from - * the file system. If any &dentry in the d_alias list is acceptable, it will - * be returned. Otherwise find_exported_dentry() will attempt to splice a new - * &dentry into the dcache using get_name() and get_parent() to find the - * appropriate place. + * It may already be, as the flag isn't always updated when connection happens. */ - -struct dentry * -find_exported_dentry(struct super_block *sb, void *obj, void *parent, - int (*acceptable)(void *context, struct dentry *de), - void *context) +static int +reconnect_path(struct super_block *sb, struct dentry *target_dir) { - struct dentry *result = NULL; - struct dentry *target_dir; - int err = -ESTALE; - struct export_operations *nops = sb->s_export_op; - struct dentry *alias; - int noprogress; char nbuf[NAME_MAX+1]; + int noprogress = 0; + int err = -ESTALE; /* - * Attempt to find the inode. - */ - result = exportfs_get_dentry(sb, obj); - if (IS_ERR(result)) - return result; - - if (S_ISDIR(result->d_inode->i_mode)) { - if (!(result->d_flags & DCACHE_DISCONNECTED)) { - if (acceptable(context, result)) - return result; - err = -EACCES; - goto err_result; - } - - target_dir = dget(result); - } else { - alias = find_acceptable_alias(result, acceptable, context); - if (alias) - return alias; - - if (parent == NULL) - goto err_result; - - target_dir = exportfs_get_dentry(sb,parent); - if (IS_ERR(target_dir)) { - err = PTR_ERR(target_dir); - goto err_result; - } - } - - /* - * Now we need to make sure that target_dir is properly connected. - * It may already be, as the flag isn't always updated when connection - * happens. - * So, we walk up parent links until we find a connected directory, - * or we run out of directories. Then we find the parent, find - * the name of the child in that parent, and do a lookup. - * This should connect the child into the parent - * We then repeat. - */ - - /* it is possible that a confused file system might not let us complete + * It is possible that a confused file system might not let us complete * the path to the root. For example, if get_parent returns a directory * in which we cannot find a name for the child. While this implies a * very sick filesystem we don't want it to cause knfsd to spin. Hence * the noprogress counter. If we go through the loop 10 times (2 is * probably enough) without getting anywhere, we just give up */ - noprogress = 0; while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { struct dentry *pd = find_disconnected_root(target_dir); @@ -221,18 +147,20 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct dentry *npd; mutex_lock(&pd->d_inode->i_mutex); - if (nops->get_parent) - ppd = nops->get_parent(pd); + if (sb->s_export_op->get_parent) + ppd = sb->s_export_op->get_parent(pd); mutex_unlock(&pd->d_inode->i_mutex); if (IS_ERR(ppd)) { err = PTR_ERR(ppd); - dprintk("find_exported_dentry: get_parent of %ld failed, err %d\n", - pd->d_inode->i_ino, err); + dprintk("%s: get_parent of %ld failed, err %d\n", + __FUNCTION__, pd->d_inode->i_ino, err); dput(pd); break; } - dprintk("find_exported_dentry: find name of %lu in %lu\n", pd->d_inode->i_ino, ppd->d_inode->i_ino); + + dprintk("%s: find name of %lu in %lu\n", __FUNCTION__, + pd->d_inode->i_ino, ppd->d_inode->i_ino); err = exportfs_get_name(ppd, nbuf, pd); if (err) { dput(ppd); @@ -244,13 +172,14 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, continue; break; } - dprintk("find_exported_dentry: found name: %s\n", nbuf); + dprintk("%s: found name: %s\n", __FUNCTION__, nbuf); mutex_lock(&ppd->d_inode->i_mutex); npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); mutex_unlock(&ppd->d_inode->i_mutex); if (IS_ERR(npd)) { err = PTR_ERR(npd); - dprintk("find_exported_dentry: lookup failed: %d\n", err); + dprintk("%s: lookup failed: %d\n", + __FUNCTION__, err); dput(ppd); dput(pd); break; @@ -263,7 +192,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, if (npd == pd) noprogress = 0; else - printk("find_exported_dentry: npd != pd\n"); + printk("%s: npd != pd\n", __FUNCTION__); dput(npd); dput(ppd); if (IS_ROOT(pd)) { @@ -279,15 +208,101 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, /* something went wrong - oh-well */ if (!err) err = -ESTALE; - goto err_target; + return err; } - /* if we weren't after a directory, have one more step to go */ - if (result != target_dir) { - struct dentry *nresult; + + return 0; +} + +/** + * find_exported_dentry - helper routine to implement export_operations->decode_fh + * @sb: The &super_block identifying the filesystem + * @obj: An opaque identifier of the object to be found - passed to + * get_inode + * @parent: An optional opqaue identifier of the parent of the object. + * @acceptable: A function used to test possible &dentries to see if they are + * acceptable + * @context: A parameter to @acceptable so that it knows on what basis to + * judge. + * + * find_exported_dentry is the central helper routine to enable file systems + * to provide the decode_fh() export_operation. It's main task is to take + * an &inode, find or create an appropriate &dentry structure, and possibly + * splice this into the dcache in the correct place. + * + * The decode_fh() operation provided by the filesystem should call + * find_exported_dentry() with the same parameters that it received except + * that instead of the file handle fragment, pointers to opaque identifiers + * for the object and optionally its parent are passed. The default decode_fh + * routine passes one pointer to the start of the filehandle fragment, and + * one 8 bytes into the fragment. It is expected that most filesystems will + * take this approach, though the offset to the parent identifier may well be + * different. + * + * find_exported_dentry() will call get_dentry to get an dentry pointer from + * the file system. If any &dentry in the d_alias list is acceptable, it will + * be returned. Otherwise find_exported_dentry() will attempt to splice a new + * &dentry into the dcache using get_name() and get_parent() to find the + * appropriate place. + */ + +struct dentry * +find_exported_dentry(struct super_block *sb, void *obj, void *parent, + int (*acceptable)(void *context, struct dentry *de), + void *context) +{ + struct dentry *result, *alias; + int err = -ESTALE; + + /* + * Attempt to find the inode. + */ + result = exportfs_get_dentry(sb, obj); + if (IS_ERR(result)) + return result; + + if (S_ISDIR(result->d_inode->i_mode)) { + if (!(result->d_flags & DCACHE_DISCONNECTED)) { + if (acceptable(context, result)) + return result; + err = -EACCES; + goto err_result; + } + + err = reconnect_path(sb, result); + if (err) + goto err_result; + } else { + struct dentry *target_dir, *nresult; + char nbuf[NAME_MAX+1]; + + alias = find_acceptable_alias(result, acceptable, context); + if (alias) + return alias; + + if (parent == NULL) + goto err_result; + + target_dir = exportfs_get_dentry(sb,parent); + if (IS_ERR(target_dir)) { + err = PTR_ERR(target_dir); + goto err_result; + } + + err = reconnect_path(sb, target_dir); + if (err) { + dput(target_dir); + goto err_result; + } + + /* + * As we weren't after a directory, have one more step to go. + */ err = exportfs_get_name(target_dir, nbuf, result); if (!err) { mutex_lock(&target_dir->d_inode->i_mutex); - nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + nresult = lookup_one_len(nbuf, target_dir, + strlen(nbuf)); mutex_unlock(&target_dir->d_inode->i_mutex); if (!IS_ERR(nresult)) { if (nresult->d_inode) { @@ -297,8 +312,8 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, dput(nresult); } } + dput(target_dir); } - dput(target_dir); alias = find_acceptable_alias(result, acceptable, context); if (alias) @@ -308,13 +323,11 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, dput(result); /* It might be justifiable to return ESTALE here, * but the filehandle at-least looks reasonable good - * and it just be a permission problem, so returning + * and it may just be a permission problem, so returning * -EACCESS is safer */ return ERR_PTR(-EACCES); - err_target: - dput(target_dir); err_result: dput(result); return ERR_PTR(err); -- cgit v1.2.1 From 12127498c8f5e479df15ee374a0932f5659df49e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Jul 2007 04:04:34 -0700 Subject: nfsd warning fix gcc-4.3: fs/nfsd/nfsctl.c: In function 'write_getfs': fs/nfsd/nfsctl.c:248: warning: cast from pointer to integer of different size Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 71c686dc7257..5ab80edc795b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -245,7 +245,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) } exp_readunlock(); if (err == 0) - err = res->fh_size + (int)&((struct knfsd_fh*)0)->fh_base; + err = res->fh_size + offsetof(struct knfsd_fh, fh_base); out: return err; } -- cgit v1.2.1 From 9a8db97e7756119689c93c431e8b8324080f5625 Mon Sep 17 00:00:00 2001 From: Marc Eshel Date: Tue, 17 Jul 2007 04:04:35 -0700 Subject: knfsd: lockd: nfsd4: use same grace period for lockd and nfsd4 Both lockd and (in the nfsv4 case) nfsd enforce a "grace period" after reboot, during which clients may reclaim locks from the previous server instance, but may not acquire new locks. Currently the lockd and nfsd enforce grace periods of different lengths. This may cause problems when we reboot a server with both v2/v3 and v4 clients. For example, if the lockd grace period is shorter (as is likely the case), then a v3 client might acquire a new lock that conflicts with a lock already held (but not yet reclaimed) by a v4 client. This patch calculates a lease time that lockd and nfsd can both use. Signed-off-by: Marc Eshel Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/svc.c | 27 ++++++++++++++++++++------- fs/nfsd/lockd.c | 1 + fs/nfsd/nfs4state.c | 16 ++++++++++++---- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9fcdef50aad6..82e2192a0d5c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -76,18 +76,31 @@ static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; -static unsigned long set_grace_period(void) +static unsigned long get_lockd_grace_period(void) { - unsigned long grace_period; - /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) - grace_period = ((nlm_grace_period + nlm_timeout - 1) - / nlm_timeout) * nlm_timeout * HZ; + return roundup(nlm_grace_period, nlm_timeout) * HZ; else - grace_period = nlm_timeout * 5 * HZ; + return nlm_timeout * 5 * HZ; +} + +unsigned long get_nfs_grace_period(void) +{ + unsigned long lockdgrace = get_lockd_grace_period(); + unsigned long nfsdgrace = 0; + + if (nlmsvc_ops) + nfsdgrace = nlmsvc_ops->get_grace_period(); + + return max(lockdgrace, nfsdgrace); +} +EXPORT_SYMBOL(get_nfs_grace_period); + +static unsigned long set_grace_period(void) +{ nlmsvc_grace_period = 1; - return grace_period + jiffies; + return get_nfs_grace_period() + jiffies; } static inline void clear_grace_period(void) diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 221acd1f11f6..9e4a568a5013 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -65,6 +65,7 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ + .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8c52913d7cb6..9cc31eaf3857 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -51,6 +51,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -3190,20 +3191,27 @@ nfsd4_load_reboot_recovery_data(void) printk("NFSD: Failure reading reboot recovery data\n"); } +unsigned long +get_nfs4_grace_period(void) +{ + return max(user_lease_time, lease_time) * HZ; +} + /* initialization to perform when the nfsd service is started: */ static void __nfs4_state_start(void) { - time_t grace_time; + unsigned long grace_time; boot_time = get_seconds(); - grace_time = max(user_lease_time, lease_time); + grace_time = get_nfs_grace_period(); lease_time = user_lease_time; in_grace = 1; - printk("NFSD: starting %ld-second grace period\n", grace_time); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", + grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); - queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ); + queue_delayed_work(laundry_wq, &laundromat_work, grace_time); } int -- cgit v1.2.1 From f7fede4b27bfc6c987d6da8e40384b1b098830bb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:36 -0700 Subject: knfsd: nfsd4: silence a compiler warning in ACL code Silence a compiler warning in the ACL code, and add a comment making clear the initialization serves no other purpose. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index cc3b7badd486..4adb5ee4759b 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -183,8 +183,13 @@ static void summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) { struct posix_acl_entry *pa, *pe; - pas->users = 0; - pas->groups = 0; + + /* + * Only pas.users and pas.groups need initialization; previous + * posix_acl_valid() calls ensure that the other fields will be + * initialized in the following loop. But, just to placate gcc: + */ + memset(pas, 0, sizeof(*pas)); pas->mask = 07; pe = acl->a_entries + acl->a_count; -- cgit v1.2.1 From 0ac68d17996eb421dde51452b89d5545ba07c6fe Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 17 Jul 2007 04:04:37 -0700 Subject: knfsd: nfsd4: fix enc_stateid_sz for nfsd callbacks enc_stateid_sz should be given in u32 words units, not bytes, so we were overestimating the buffer space needed here. Signed-off-by: Benny Halevy Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 5443c52b57aa..31d6633c7fe4 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -75,7 +75,7 @@ enum nfs_cb_opnum4 { #define op_enc_sz 1 #define op_dec_sz 2 #define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) -#define enc_stateid_sz 16 +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) #define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ 1 + enc_stateid_sz + \ enc_nfs4_fh_sz) -- cgit v1.2.1 From 4b2ca38ad6c44ed0442092a829e6e954bf3580af Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:37 -0700 Subject: knfsd: nfsd4: fix handling of acl errrors nfs4_acl_nfsv4_to_posix() returns an error and returns any posix acls calculated in two caller-provided pointers. It was setting these pointers to -errno in some error cases, resulting in nfsd4_set_nfs4_acl() calling posix_acl_release() with a -errno as an argument. Fix both the caller and the callee, by modifying nfsd4_set_nfs4_acl() to stop relying on the passed-in-pointers being left as NULL in the error case, and by modifying nfs4_acl_nfsv4_to_posix() to stop returning garbage in those pointers. Thanks to Alex Soule for reporting the bug. Signed-off-by: "J. Bruce Fields" Cc: Alexander Soule Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 3 +++ fs/nfsd/vfs.c | 22 +++++++--------------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 4adb5ee4759b..b6ed38380ab8 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -737,13 +737,16 @@ int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); + *pacl = NULL; goto out_dstate; } *dpacl = posix_state_to_acl(&default_acl_state, flags | NFS4_ACL_TYPE_DEFAULT); if (IS_ERR(*dpacl)) { ret = PTR_ERR(*dpacl); + *dpacl = NULL; posix_acl_release(*pacl); + *pacl = NULL; goto out_dstate; } sort_pacl(*pacl); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 945b1cedde2b..8d6b5c483ae1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -435,7 +435,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, /* Get inode */ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); if (error) - goto out; + return error; dentry = fhp->fh_dentry; inode = dentry->d_inode; @@ -444,33 +444,25 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); if (host_error == -EINVAL) { - error = nfserr_attrnotsupp; - goto out; + return nfserr_attrnotsupp; } else if (host_error < 0) goto out_nfserr; host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); if (host_error < 0) - goto out_nfserr; + goto out_release; - if (S_ISDIR(inode->i_mode)) { + if (S_ISDIR(inode->i_mode)) host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); - if (host_error < 0) - goto out_nfserr; - } - - error = nfs_ok; -out: +out_release: posix_acl_release(pacl); posix_acl_release(dpacl); - return (error); out_nfserr: if (host_error == -EOPNOTSUPP) - error = nfserr_attrnotsupp; + return nfserr_attrnotsupp; else - error = nfserrno(host_error); - goto out; + return nfserrno(host_error); } static struct posix_acl * -- cgit v1.2.1 From 1e5140279f31e47d58ed6036ee61ba7a65710e63 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:38 -0700 Subject: knfsd: nfsd: remove unused header interface.h It looks like Al Viro gutted this header file five years ago and it hasn't been touched since. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5ab80edc795b..baac89d917ca 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -35,7 +35,6 @@ #include #include #include -#include #include -- cgit v1.2.1 From c2f1a551dea8b37c2e0cb886885c250fb703e9d8 Mon Sep 17 00:00:00 2001 From: Meelap Shah Date: Tue, 17 Jul 2007 04:04:39 -0700 Subject: knfsd: nfsd4: vary maximum delegation limit based on RAM size Our original NFSv4 delegation policy was to give out a read delegation on any open when it was possible to. Since the lifetime of a delegation isn't limited to that of an open, a client may quite reasonably hang on to a delegation as long as it has the inode cached. This becomes an obvious problem the first time a client's inode cache approaches the size of the server's total memory. Our first quick solution was to add a hard-coded limit. This patch makes a mild incremental improvement by varying that limit according to the server's total memory size, allowing at most 4 delegations per megabyte of RAM. My quick back-of-the-envelope calculation finds that in the worst case (where every delegation is for a different inode), a delegation could take about 1.5K, which would make the worst case usage about 6% of memory. The new limit works out to be about the same as the old on a 1-gig server. [akpm@linux-foundation.org: Don't needlessly bloat vmlinux] [akpm@linux-foundation.org: Make it right for highmem machines] Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9cc31eaf3857..46249886ea86 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -150,6 +151,7 @@ get_nfs4_file(struct nfs4_file *fi) } static int num_delegations; +unsigned int max_delegations; /* * Open owner state (share locks) @@ -193,7 +195,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); - if (num_delegations > STATEID_HASH_SIZE * 4) + if (num_delegations > max_delegations) return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) @@ -3197,6 +3199,27 @@ get_nfs4_grace_period(void) return max(user_lease_time, lease_time) * HZ; } +/* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has + * the inode cached. This becomes an obvious problem the first time a + * client's inode cache approaches the size of the server's total memory. + * + * For now we avoid this problem by imposing a hard limit on the number + * of delegations, which varies according to the server's memory size. + */ +static void +set_max_delegations(void) +{ + /* + * Allow at most 4 delegations per megabyte of RAM. Quick + * estimates suggest that in the worst case (where every delegation + * is for a different inode), a delegation could take about 1.5K, + * giving a worst case usage of about 6% of memory. + */ + max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); +} + /* initialization to perform when the nfsd service is started: */ static void @@ -3212,6 +3235,7 @@ __nfs4_state_start(void) grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); queue_delayed_work(laundry_wq, &laundromat_work, grace_time); + set_max_delegations(); } int -- cgit v1.2.1 From 47f9940c55c0bdc65188749cae4e841601f513bb Mon Sep 17 00:00:00 2001 From: Meelap Shah Date: Tue, 17 Jul 2007 04:04:40 -0700 Subject: knfsd: nfsd4: don't delegate files that have had conflicts One more incremental delegation policy improvement: don't give out a delegation on a file if conflicting access has previously required that a delegation be revoked on that file. (In practice we'll forget about the conflict when the struct nfs4_file is removed on close, so this is of limited use for now, though it should at least solve a temporary problem with self-conflicts on write opens from the same client.) Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 46249886ea86..e4a4c87ec8c6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -195,6 +195,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (fp->fi_had_conflict) + return NULL; if (num_delegations > max_delegations) return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); @@ -1002,6 +1004,7 @@ alloc_init_file(struct inode *ino) list_add(&fp->fi_hash, &file_hashtbl[hashval]); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; return fp; } return NULL; @@ -1328,6 +1331,7 @@ do_recall(void *__dp) { struct nfs4_delegation *dp = __dp; + dp->dl_file->fi_had_conflict = true; nfsd4_cb_recall(dp); return 0; } -- cgit v1.2.1 From 2d3bb25209c1f9a27ea9535c7fd2f6729a5e7db1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:40 -0700 Subject: knfsd: nfsd: make all exp_finding functions return -errno's on err Currently exp_find(), exp_get_by_name(), and friends, return an export on success, and on failure return: errors -EAGAIN (drop this request pending an upcall) or -ETIMEDOUT (an upcall has timed out), or return NULL, which can mean either that there was a memory allocation failure, or that an export was not found, or that a passed-in export lacks an auth_domain. Many callers seem to assume that NULL means that an export was not found, which may lead to bugs in the case of a memory allocation failure. Modify these functions to distinguish between the two NULL cases by returning either -ENOENT or -ENOMEM. They now never return NULL. We get to simplify some code in the process. We return -ENOENT in the case of a missing auth_domain. This case should probably be removed (or converted to a bug) after confirming that it can never happen. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 58 ++++++++++++++++++++++---------------------------------- fs/nfsd/nfsfh.c | 11 +++++------ fs/nfsd/vfs.c | 9 ++++----- 3 files changed, 32 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c518421a9355..d4accdcb53a2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -739,16 +739,18 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) int err; if (!clp) - return NULL; + return ERR_PTR(-ENOENT); key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); ek = svc_expkey_lookup(&key); - if (ek != NULL) - if ((err = cache_check(&svc_expkey_cache, &ek->h, reqp))) - ek = ERR_PTR(err); + if (ek == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_expkey_cache, &ek->h, reqp); + if (err) + return ERR_PTR(err); return ek; } @@ -809,30 +811,21 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, struct cache_req *reqp) { struct svc_export *exp, key; + int err; if (!clp) - return NULL; + return ERR_PTR(-ENOENT); key.ex_client = clp; key.ex_mnt = mnt; key.ex_dentry = dentry; exp = svc_export_lookup(&key); - if (exp != NULL) { - int err; - - err = cache_check(&svc_export_cache, &exp->h, reqp); - switch (err) { - case 0: break; - case -EAGAIN: - case -ETIMEDOUT: - exp = ERR_PTR(err); - break; - default: - exp = NULL; - } - } - + if (exp == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_export_cache, &exp->h, reqp); + if (err) + return ERR_PTR(err); return exp; } @@ -848,7 +841,7 @@ exp_parent(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, dget(dentry); exp = exp_get_by_name(clp, mnt, dentry, reqp); - while (exp == NULL && !IS_ROOT(dentry)) { + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { struct dentry *parent; parent = dget_parent(dentry); @@ -901,7 +894,7 @@ static void exp_fsid_unhash(struct svc_export *exp) return; ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); - if (ek && !IS_ERR(ek)) { + if (!IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; cache_put(&ek->h, &svc_expkey_cache); } @@ -939,7 +932,7 @@ static void exp_unhash(struct svc_export *exp) struct inode *inode = exp->ex_dentry->d_inode; ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); - if (ek && !IS_ERR(ek)) { + if (!IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; cache_put(&ek->h, &svc_expkey_cache); } @@ -990,13 +983,12 @@ exp_export(struct nfsctl_export *nxp) /* must make sure there won't be an ex_fsid clash */ if ((nxp->ex_flags & NFSEXP_FSID) && - (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) && - !IS_ERR(fsid_key) && + (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && fsid_key->ek_mnt && (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) ) goto finish; - if (exp) { + if (!IS_ERR(exp)) { /* just a flags/id/fsid update */ exp_fsid_unhash(exp); @@ -1105,7 +1097,7 @@ exp_unexport(struct nfsctl_export *nxp) err = -EINVAL; exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL); path_release(&nd); - if (!exp) + if (IS_ERR(exp)) goto out_domain; exp_do_unexport(exp); @@ -1150,10 +1142,6 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize) err = PTR_ERR(exp); goto out; } - if (!exp) { - dprintk("nfsd: exp_rootfh export not found.\n"); - goto out; - } /* * fh must be initialized before calling fh_compose @@ -1177,13 +1165,13 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, { struct svc_export *exp; struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); - if (!ek || IS_ERR(ek)) + if (IS_ERR(ek)) return ERR_PTR(PTR_ERR(ek)); exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); cache_put(&ek->h, &svc_expkey_cache); - if (!exp || IS_ERR(exp)) + if (IS_ERR(exp)) return ERR_PTR(PTR_ERR(exp)); return exp; } @@ -1205,10 +1193,10 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); exp = exp_find(clp, FSID_NUM, fsidv, creq); + if (PTR_ERR(exp) == -ENOENT) + return nfserr_perm; if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); - if (exp == NULL) - return nfserr_perm; rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); exp_put(exp); return rv; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ecf9c361b3f5..89f9041a7782 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -160,15 +160,14 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) &rqstp->rq_chandle); } - if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN - || PTR_ERR(exp) == -ETIMEDOUT)) { - error = nfserrno(PTR_ERR(exp)); + error = nfserr_stale; + if (PTR_ERR(exp) == -ENOENT) goto out; - } - error = nfserr_stale; - if (!exp || IS_ERR(exp)) + if (IS_ERR(exp)) { + error = nfserrno(PTR_ERR(exp)); goto out; + } /* Check if the request originated from a secure port. */ error = nfserr_perm; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8d6b5c483ae1..0a18149ce963 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -192,15 +192,14 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, exp2 = exp_parent(exp->ex_client, mnt, dentry, &rqstp->rq_chandle); - if (IS_ERR(exp2)) { + if (PTR_ERR(exp2) == -ENOENT) { + dput(dentry); + dentry = dget(dparent); + } else if (IS_ERR(exp2)) { host_err = PTR_ERR(exp2); dput(dentry); mntput(mnt); goto out_nfserr; - } - if (!exp2) { - dput(dentry); - dentry = dget(dparent); } else { exp_put(exp); exp = exp2; -- cgit v1.2.1 From 42ed95c4e7415714aaab604ae7b1602b87b27b73 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:41 -0700 Subject: knfsd: nfsd4: build rpcsec_gss whenever nfsd4 is built Select rpcsec_gss support whenever asked for NFSv4 support. The rfc actually requires gss, and gss is also the main reason to migrate to v4. We already do this on the client side. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index ee11f8d94085..613df554728d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1675,6 +1675,7 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL + select RPCSEC_GSS_KRB5 help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and -- cgit v1.2.1 From e677bfe4d451f8271986a229270c6eecd1f62b3f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 17 Jul 2007 04:04:42 -0700 Subject: knfsd: nfsd4: parse secinfo information in exports downcall We add a list of pseudoflavors to each export downcall, which will be used both as a list of security flavors allowed on that export, and (in the order given) as the list of pseudoflavors to return on secinfo calls. This patch parses the new downcall information and adds it to the export structure, but doesn't use it for anything yet. Signed-off-by: J. Bruce Fields Signed-off-by: Andy Adamson Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index d4accdcb53a2..fbbbcc5a2fa3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -452,8 +454,48 @@ out_free_all: return err; } +static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) +{ + int listsize, err; + struct exp_flavor_info *f; + + err = get_int(mesg, &listsize); + if (err) + return err; + if (listsize < 0 || listsize > MAX_SECINFO_LIST) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { + err = get_int(mesg, &f->pseudoflavor); + if (err) + return err; + /* + * Just a quick sanity check; we could also try to check + * whether this pseudoflavor is supported, but at worst + * an unsupported pseudoflavor on the export would just + * be a pseudoflavor that won't match the flavor of any + * authenticated request. The administrator will + * probably discover the problem when someone fails to + * authenticate. + */ + if (f->pseudoflavor < 0) + return -EINVAL; + err = get_int(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ + if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) + return -EINVAL; + } + exp->ex_nflavors = listsize; + return 0; +} + #else /* CONFIG_NFSD_V4 */ -static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; } +static inline int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} +static inline int +secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) @@ -477,6 +519,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_uuid = NULL; + /* secinfo */ + exp.ex_nflavors = 0; + if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; @@ -554,7 +599,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (exp.ex_uuid == NULL) err = -ENOMEM; } - } else + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else /* quietly ignore unknown words and anything * following. Newer user-space can try to set * new values, then see what the result was. @@ -655,6 +702,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); + int i; new->ex_flags = item->ex_flags; new->ex_anon_uid = item->ex_anon_uid; @@ -670,6 +718,10 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; + new->ex_nflavors = item->ex_nflavors; + for (i = 0; i < MAX_SECINFO_LIST; i++) { + new->ex_flavors[i] = item->ex_flavors[i]; + } } static struct cache_head *svc_export_alloc(void) -- cgit v1.2.1 From df547efb03e3e8f9ea726e1d07fbbd6fd0706cd7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:43 -0700 Subject: knfsd: nfsd4: simplify exp_pseudoroot arguments We're passing three arguments to exp_pseudoroot, two of which are just fields of the svc_rqst. Soon we'll want to pass in a third field as well. So let's just give up and pass in the whole struct svc_rqst. Also sneak in some minor style cleanups while we're at it. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 5 ++--- fs/nfsd/nfs4proc.c | 7 +++---- fs/nfsd/nfs4xdr.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index fbbbcc5a2fa3..af6abb2529c9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1235,8 +1235,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, * export point with fsid==0 */ __be32 -exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, - struct cache_req *creq) +exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; @@ -1244,7 +1243,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - exp = exp_find(clp, FSID_NUM, fsidv, creq); + exp = exp_find(rqstp->rq_client, FSID_NUM, fsidv, rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) return nfserr_perm; if (IS_ERR(exp)) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8522729830db..a106e3be7c13 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -286,8 +286,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; fh_put(&cstate->current_fh); - status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh, - &rqstp->rq_chandle); + status = exp_pseudoroot(rqstp, &cstate->current_fh); return status; } @@ -474,8 +473,8 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 ret; fh_init(&tmp_fh, NFS4_FHSIZE); - if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh, - &rqstp->rq_chandle)) != 0) + ret = exp_pseudoroot(rqstp, &tmp_fh); + if (ret) return ret; if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { fh_put(&tmp_fh); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 15809dfd88a5..b0bfbda375e1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1296,7 +1296,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 * char *path, *rootpath; fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle); + *stat = exp_pseudoroot(rqstp, &tmp_fh); if (*stat) return NULL; rootpath = tmp_fh.fh_export->ex_path; -- cgit v1.2.1 From 87548c37c8bdbf98aea002c9c04e4dc8aa27fe1b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:44 -0700 Subject: knfsd: nfsd: remove superfluous assignment from nfsd_lookup The "err" variable will only be used in the final return, which always happens after either the preceding err = fh_compose(...); or after the following err = nfserrno(host_err); So the earlier assignment to err is ignored. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0a18149ce963..ec6aaf8b0e36 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,8 +168,6 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, exp = fhp->fh_export; exp_get(exp); - err = nfserr_acces; - /* Lookup the name, but don't follow links */ if (isdotent(name, len)) { if (len==1) -- cgit v1.2.1 From 0989a7889695831e49e2c53c1884f52645516a90 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:44 -0700 Subject: knfsd: nfsd: provide export lookup wrappers which take a svc_rqst Split the callers of exp_get_by_name(), exp_find(), and exp_parent() into those that are processing requests and those that are doing other stuff (like looking up filehandles for mountd). No change in behavior, just a (fairly pointless, on its own) cleanup. (Note this has the effect of making nfsd_cross_mnt() pass rqstp->rq_client instead of exp->ex_client into exp_find_by_name(). However, the two should have the same value at this point.) Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 28 +++++++++++++++++++++++++++- fs/nfsd/nfsfh.c | 5 ++--- fs/nfsd/vfs.c | 5 ++--- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index af6abb2529c9..9b569af695ab 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1228,6 +1228,32 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, return exp; } +/* + * Called from functions that handle requests; functions that do work on + * behalf of mountd are passed a single client name to use, and should + * use exp_get_by_name() or exp_find(). + */ +struct svc_export * +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) +{ + return exp_get_by_name(rqstp->rq_client, mnt, dentry, + &rqstp->rq_chandle); +} + +struct svc_export * +rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +{ + return exp_find(rqstp->rq_client, fsid_type, fsidv, + &rqstp->rq_chandle); +} + +struct svc_export * +rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, + struct dentry *dentry) +{ + return exp_parent(rqstp->rq_client, mnt, dentry, &rqstp->rq_chandle); +} /* * Called when we need the filehandle for the root of the pseudofs, @@ -1243,7 +1269,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - exp = exp_find(rqstp->rq_client, FSID_NUM, fsidv, rqstp->rq_chandle); + exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); if (PTR_ERR(exp) == -ENOENT) return nfserr_perm; if (IS_ERR(exp)) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 89f9041a7782..180e068ea064 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -145,7 +145,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) fh->fh_fsid[1] = fh->fh_fsid[2]; } if ((data_left -= len)<0) goto out; - exp = exp_find(rqstp->rq_client, fh->fh_fsid_type, datap, &rqstp->rq_chandle); + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap); datap += len; } else { dev_t xdev; @@ -156,8 +156,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) xdev = old_decode_dev(fh->ofh_xdev); xino = u32_to_ino_t(fh->ofh_xino); mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); - exp = exp_find(rqstp->rq_client, FSID_DEV, tfh, - &rqstp->rq_chandle); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); } error = nfserr_stale; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ec6aaf8b0e36..65043af232ee 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -113,7 +113,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); - exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); + exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); if (IS_ERR(exp2)) { err = PTR_ERR(exp2); dput(mounts); @@ -188,8 +188,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, dput(dentry); dentry = dp; - exp2 = exp_parent(exp->ex_client, mnt, dentry, - &rqstp->rq_chandle); + exp2 = rqst_exp_parent(rqstp, mnt, dentry); if (PTR_ERR(exp2) == -ENOENT) { dput(dentry); dentry = dget(dparent); -- cgit v1.2.1 From 3ab4d8b1215d61736e2a9a26bea7cc2e6b029e3d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:46 -0700 Subject: knfsd: nfsd: set rq_client to ip-address-determined-domain We want it to be possible for users to restrict exports both by IP address and by pseudoflavor. The pseudoflavor information has previously been passed using special auth_domains stored in the rq_client field. After the preceding patch that stored the pseudoflavor in rq_pflavor, that's now superfluous; so now we use rq_client for the ip information, as auth_null and auth_unix do. However, we keep around the special auth_domain in the rq_gssclient field for backwards compatibility purposes, so we can still do upcalls using the old "gss/pseudoflavor" auth_domain if upcalls using the unix domain to give us an appropriate export. This allows us to continue supporting old mountd. In fact, for this first patch, we always use the "gss/pseudoflavor" auth_domain (and only it) if it is available; thus rq_client is ignored in the auth_gss case, and this patch on its own makes no change in behavior; that will be left to later patches. Note on idmap: I'm almost tempted to just replace the auth_domain in the idmap upcall by a dummy value--no version of idmapd has ever used it, and it's unlikely anyone really wants to perform idmapping differently depending on the where the client is (they may want to perform *credential* mapping differently, but that's a different matter--the idmapper just handles id's used in getattr and setattr). But I'm updating the idmapd code anyway, just out of general backwards-compatibility paranoia. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 15 +++++++++++---- fs/nfsd/nfs4idmap.c | 13 +++++++++++-- fs/nfsd/nfsfh.c | 2 -- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 9b569af695ab..ac225a79376c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1237,21 +1237,28 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { - return exp_get_by_name(rqstp->rq_client, mnt, dentry, - &rqstp->rq_chandle); + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; + return exp_get_by_name(clp, mnt, dentry, &rqstp->rq_chandle); } struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { - return exp_find(rqstp->rq_client, fsid_type, fsidv, - &rqstp->rq_chandle); + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; + return exp_find(clp, fsid_type, fsidv, &rqstp->rq_chandle); } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; return exp_parent(rqstp->rq_client, mnt, dentry, &rqstp->rq_chandle); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 45aa21ce6784..2cf9a9a2d89c 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -587,6 +587,15 @@ idmap_lookup(struct svc_rqst *rqstp, return ret; } +static char * +rqst_authname(struct svc_rqst *rqstp) +{ + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; + return clp->name; +} + static int idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) @@ -600,7 +609,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return -EINVAL; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; - strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname)); + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); if (ret == -ENOENT) ret = -ESRCH; /* nfserr_badname */ @@ -620,7 +629,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) }; int ret; - strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname)); + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item); if (ret == -ENOENT) return sprintf(name, "%u", id); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 180e068ea064..22cb5be79ad0 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -120,8 +120,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) int data_left = fh->fh_size/4; error = nfserr_stale; - if (rqstp->rq_client == NULL) - goto out; if (rqstp->rq_vers > 2) error = nfserr_badhandle; if (rqstp->rq_vers == 4 && fh->fh_size == 0) -- cgit v1.2.1 From 2ea2209f073dc7049bd285b4f5dbc0aa273f9746 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:46 -0700 Subject: knfsd: nfsd: use ip-address-based domain in secinfo case With this patch, we fall back on using the gss/pseudoflavor only if we fail to find a matching auth_unix export that has a secinfo list. As long as sec= options aren't used, there's still no change in behavior here (except possibly for some additional auth_unix cache lookups, whose results will be ignored). The sec= option, however, is not actually enforced yet; later patches will add the necessary checks. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ac225a79376c..4537a8f5cb9a 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1229,6 +1229,10 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, } /* + * Uses rq_client and rq_gssclient to find an export; uses rq_client (an + * auth_unix client) if it's available and has secinfo information; + * otherwise, will try to use rq_gssclient. + * * Called from functions that handle requests; functions that do work on * behalf of mountd are passed a single client name to use, and should * use exp_get_by_name() or exp_find(). @@ -1237,29 +1241,83 @@ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { - struct auth_domain *clp; + struct svc_export *gssexp, *exp = NULL; + + if (rqstp->rq_client == NULL) + goto gss; - clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; - return exp_get_by_name(clp, mnt, dentry, &rqstp->rq_chandle); + /* First try the auth_unix client: */ + exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, + &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (exp && !IS_ERR(exp)) + exp_put(exp); + return gssexp; } struct svc_export * rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) { - struct auth_domain *clp; + struct svc_export *gssexp, *exp = NULL; + + if (rqstp->rq_client == NULL) + goto gss; - clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; - return exp_find(clp, fsid_type, fsidv, &rqstp->rq_chandle); + /* First try the auth_unix client: */ + exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (exp && !IS_ERR(exp)) + exp_put(exp); + return gssexp; } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { - struct auth_domain *clp; + struct svc_export *exp; - clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; - return exp_parent(rqstp->rq_client, mnt, dentry, &rqstp->rq_chandle); + dget(dentry); + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { + struct dentry *parent; + + parent = dget_parent(dentry); + dput(dentry); + dentry = parent; + exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + } + dput(dentry); + return exp; } /* -- cgit v1.2.1 From 6c0a654dceaa4342270306de77eadb0173dfb58a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:47 -0700 Subject: knfsd: nfsd: factor nfsd_lookup into 2 pieces Factor nfsd_lookup into nfsd_lookup_dentry, which finds the right dentry and export, and a second part which composes the filehandle (and which will later check the security flavor on the new export). No change in behavior. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 55 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 65043af232ee..627f460a4007 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -135,21 +135,10 @@ out: return err; } -/* - * Look up one component of a pathname. - * N.B. After this call _both_ fhp and resfh need an fh_put - * - * If the lookup would cross a mountpoint, and the mounted filesystem - * is exported to the client with NFSEXP_NOHIDE, then the lookup is - * accepted as it stands and the mounted directory is - * returned. Otherwise the covered directory is returned. - * NOTE: this mountpoint crossing is not supported properly by all - * clients and is explicitly disallowed for NFSv3 - * NeilBrown - */ __be32 -nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, - int len, struct svc_fh *resfh) +nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, + const char *name, int len, + struct svc_export **exp_ret, struct dentry **dentry_ret) { struct svc_export *exp; struct dentry *dparent; @@ -219,6 +208,38 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, } } } + *dentry_ret = dentry; + *exp_ret = exp; + return 0; + +out_nfserr: + exp_put(exp); + return nfserrno(host_err); +} + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_NOHIDE, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown + */ +__be32 +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (err) + return err; /* * Note: we compose the file handle now, but as the * dentry may be negative, it may need to be updated. @@ -227,15 +248,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, if (!err && !dentry->d_inode) err = nfserr_noent; dput(dentry); -out: exp_put(exp); return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } + /* * Set various file attributes. * N.B. After this call fhp needs an fh_put -- cgit v1.2.1 From 32c1eb0cd7ee00b5eb7b6f7059c635fbc1052966 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 17 Jul 2007 04:04:48 -0700 Subject: knfsd: nfsd4: return nfserr_wrongsec Make the first actual use of the secinfo information by using it to return nfserr_wrongsec when an export is found that doesn't allow the flavor used on this request. Signed-off-by: J. Bruce Fields Signed-off-by: Andy Adamson Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 26 ++++++++++++++++++++++++++ fs/nfsd/nfsfh.c | 6 ++++++ fs/nfsd/nfssvc.c | 10 ++++++++++ fs/nfsd/vfs.c | 4 ++++ 4 files changed, 46 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 4537a8f5cb9a..323cbdcc9bfd 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1228,6 +1228,28 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, return exp; } +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + /* legacy gss-only clients are always OK: */ + if (exp->ex_client == rqstp->rq_gssclient) + return 0; + /* ip-address based client; check sec= export option: */ + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return 0; + } + /* defaults in absence of sec= options: */ + if (exp->ex_nflavors == 0) { + if (rqstp->rq_flavor == RPC_AUTH_NULL || + rqstp->rq_flavor == RPC_AUTH_UNIX) + return 0; + } + return nfserr_wrongsec; +} + /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; @@ -1340,6 +1362,10 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); + if (rv) + goto out; + rv = check_nfsd_access(exp, rqstp); +out: exp_put(exp); return rv; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 22cb5be79ad0..d5fe392b14fb 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -20,6 +20,7 @@ #include #include +#include #include #define NFSDDBG_FACILITY NFSDDBG_FH @@ -248,6 +249,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; + /* Check security flavor */ + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + /* Finally, check access permissions. */ error = nfsd_permission(exp, dentry, access); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 5c8192bcbced..a8c89ae4c743 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -494,6 +494,15 @@ out: module_put_and_exit(0); } +static __be32 map_new_errors(u32 vers, __be32 nfserr) +{ + if (nfserr == nfserr_jukebox && vers == 2) + return nfserr_dropit; + if (nfserr == nfserr_wrongsec && vers < 4) + return nfserr_acces; + return nfserr; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -536,6 +545,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + nfserr = map_new_errors(rqstp->rq_vers, nfserr); if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) nfserr = nfserr_dropit; if (nfserr == nfserr_dropit) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 627f460a4007..8e109e586a74 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -240,6 +240,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; + err = check_nfsd_access(exp, rqstp); + if (err) + goto out; /* * Note: we compose the file handle now, but as the * dentry may be negative, it may need to be updated. @@ -247,6 +250,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, err = fh_compose(resfh, exp, dentry, fhp); if (!err && !dentry->d_inode) err = nfserr_noent; +out: dput(dentry); exp_put(exp); return err; -- cgit v1.2.1 From 0ec757df9743025f14190d6034d8bd2bf37c2dd1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:48 -0700 Subject: knfsd: nfsd4: make readonly access depend on pseudoflavor Allow readonly access to vary depending on the pseudoflavor, using the flag passed with each pseudoflavor in the export downcall. The rest of the flags are ignored for now, though some day we might also allow id squashing to vary based on the flavor. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/nfsproc.c | 3 ++- fs/nfsd/vfs.c | 13 +++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index d5fe392b14fb..8d2b49914843 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -255,7 +255,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) goto out; /* Finally, check access permissions. */ - error = nfsd_permission(exp, dentry, access); + error = nfsd_permission(rqstp, exp, dentry, access); if (error) { dprintk("fh_verify: %s/%s permission failure, " diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b2c7147aa921..977a71f64e19 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -278,7 +278,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, * echo thing > device-special-file-or-pipe * by doing a CREATE with type==0 */ - nfserr = nfsd_permission(newfhp->fh_export, + nfserr = nfsd_permission(rqstp, + newfhp->fh_export, newfhp->fh_dentry, MAY_WRITE|MAY_LOCAL_ACCESS); if (nfserr && nfserr != nfserr_rofs) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8e109e586a74..e90f4a8a1d01 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -328,7 +328,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* The size case is special. It changes the file as well as the attributes. */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { - err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); + err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); if (err) goto out; } @@ -616,7 +616,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor sresult |= map->access; - err2 = nfsd_permission(export, dentry, map->how); + err2 = nfsd_permission(rqstp, export, dentry, map->how); switch (err2) { case nfs_ok: result |= map->access; @@ -1043,7 +1043,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, __be32 err; if (file) { - err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, MAY_READ|MAY_OWNER_OVERRIDE); if (err) goto out; @@ -1072,7 +1072,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, __be32 err = 0; if (file) { - err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, MAY_WRITE|MAY_OWNER_OVERRIDE); if (err) goto out; @@ -1801,7 +1801,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) * Check for a user's access permissions to this inode. */ __be32 -nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) +nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, + struct dentry *dentry, int acc) { struct inode *inode = dentry->d_inode; int err; @@ -1832,7 +1833,7 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) */ if (!(acc & MAY_LOCAL_ACCESS)) if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { - if (EX_RDONLY(exp) || IS_RDONLY(inode)) + if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode)) return nfserr_rofs; if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; -- cgit v1.2.1 From ac34cdb03dfdb8cdc824f41f577434c5c2521155 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:49 -0700 Subject: knfsd: nfsd: factor out code from show_expflags Factor out some code to be shared by secinfo display code. Remove some unnecessary conditional printing of commas where we know the condition is true. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 323cbdcc9bfd..06d5cd4a52c4 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1453,28 +1453,35 @@ static struct flags { { 0, {"", ""}} }; -static void exp_flags(struct seq_file *m, int flag, int fsid, - uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) +static void show_expflags(struct seq_file *m, int flags, int mask) { - int first = 0; struct flags *flg; + int state, first = 0; for (flg = expflags; flg->flag; flg++) { - int state = (flg->flag & flag)?0:1; + if (flg->flag & ~mask) + continue; + state = (flg->flag & flags) ? 0 : 1; if (*flg->name[state]) seq_printf(m, "%s%s", first++?",":"", flg->name[state]); } +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) +{ + show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) - seq_printf(m, "%sfsid=%d", first++?",":"", fsid); + seq_printf(m, ",fsid=%d", fsid); if (anonu != (uid_t)-2 && anonu != (0x10000-2)) - seq_printf(m, "%sanonuid=%d", first++?",":"", anonu); + seq_printf(m, ",sanonuid=%d", anonu); if (anong != (gid_t)-2 && anong != (0x10000-2)) - seq_printf(m, "%sanongid=%d", first++?",":"", anong); + seq_printf(m, ",sanongid=%d", anong); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; - seq_printf(m, "%s%s=", first++?",":"", loctype); + seq_printf(m, ",%s=", loctype); seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); -- cgit v1.2.1 From 91fe39d35ebd6adaece4e090f6b1a3e4b6a59c97 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:49 -0700 Subject: knfsd: nfsd: display export secinfo information Add secinfo information to the display in proc/net/sunrpc/nfsd.export/content. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 06d5cd4a52c4..c7bbf460b009 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -641,6 +641,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); +static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int svc_export_show(struct seq_file *m, struct cache_detail *cd, @@ -670,6 +671,7 @@ static int svc_export_show(struct seq_file *m, seq_printf(m, "%02x", exp->ex_uuid[i]); } } + show_secinfo(m, exp); } seq_puts(m, ")\n"); return 0; @@ -1467,6 +1469,33 @@ static void show_expflags(struct seq_file *m, int flags, int mask) } } +static void show_secinfo_flags(struct seq_file *m, int flags) +{ + seq_printf(m, ","); + show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); +} + +static void show_secinfo(struct seq_file *m, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + int lastflags = 0, first = 0; + + if (exp->ex_nflavors == 0) + return; + for (f = exp->ex_flavors; f < end; f++) { + if (first || f->flags != lastflags) { + if (!first) + show_secinfo_flags(m, lastflags); + seq_printf(m, ",sec=%d", f->pseudoflavor); + lastflags = f->flags; + } else { + seq_printf(m, ":%d", f->pseudoflavor); + } + } + show_secinfo_flags(m, lastflags); +} + static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) { -- cgit v1.2.1 From dcb488a3b7ac3987e21148f44f641c9b2e734232 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 17 Jul 2007 04:04:51 -0700 Subject: knfsd: nfsd4: implement secinfo Implement the secinfo operation. (Thanks to Usha Ketineni wrote an earlier version of this support.) Cc: Usha Ketineni Signed-off-by: Andy Adamson Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 28 ++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a106e3be7c13..3c627128e205 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -47,6 +47,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -609,6 +610,30 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } +static __be32 +nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo *secinfo) +{ + struct svc_fh resfh; + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + fh_init(&resfh, NFS4_FHSIZE); + err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry); + if (err) + return err; + if (dentry->d_inode == NULL) { + exp_put(exp); + err = nfserr_noent; + } else + secinfo->si_exp = exp; + dput(dentry); + return err; +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) @@ -1008,6 +1033,9 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, }, + [OP_SECINFO] = { + .op_func = (nfsd4op_func)nfsd4_secinfo, + }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, }, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b0bfbda375e1..864498f8f2d9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -56,6 +56,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -818,6 +819,23 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) DECODE_TAIL; } +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo *secinfo) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(secinfo->si_namelen); + READ_BUF(secinfo->si_namelen); + SAVEMEM(secinfo->si_name, secinfo->si_namelen); + status = check_filename(secinfo->si_name, secinfo->si_namelen, + nfserr_noent); + if (status) + return status; + DECODE_TAIL; +} + static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { @@ -1131,6 +1149,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) case OP_SAVEFH: op->status = nfs_ok; break; + case OP_SECINFO: + op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo); + break; case OP_SETATTR: op->status = nfsd4_decode_setattr(argp, &op->u.setattr); break; @@ -1847,11 +1868,19 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, if (d_mountpoint(dentry)) { int err; + /* + * Why the heck aren't we just using nfsd_lookup?? + * Different "."/".." handling? Something else? + * At least, add a comment here to explain.... + */ err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); if (err) { nfserr = nfserrno(err); goto out_put; } + nfserr = check_nfsd_access(exp, cd->rd_rqstp); + if (nfserr) + goto out_put; } nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, @@ -2419,6 +2448,49 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } } +static void +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_secinfo *secinfo) +{ + int i = 0; + struct svc_export *exp = secinfo->si_exp; + ENCODE_HEAD; + + if (nfserr) + goto out; + RESERVE_SPACE(4); + WRITE32(exp->ex_nflavors); + ADJUST_ARGS(); + for (i = 0; i < exp->ex_nflavors; i++) { + u32 flav = exp->ex_flavors[i].pseudoflavor; + struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + + if (gm) { + RESERVE_SPACE(4); + WRITE32(RPC_AUTH_GSS); + ADJUST_ARGS(); + RESERVE_SPACE(4 + gm->gm_oid.len); + WRITE32(gm->gm_oid.len); + WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(0); /* qop */ + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(gss_pseudoflavor_to_service(gm, flav)); + ADJUST_ARGS(); + gss_mech_put(gm); + } else { + RESERVE_SPACE(4); + WRITE32(flav); + ADJUST_ARGS(); + } + } +out: + if (exp) + exp_put(exp); +} + /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. @@ -2559,6 +2631,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) break; case OP_SAVEFH: break; + case OP_SECINFO: + nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo); + break; case OP_SETATTR: nfsd4_encode_setattr(resp, op->status, &op->u.setattr); break; -- cgit v1.2.1 From 4796f45740bc6f2e3e6cc14e7ed481b38bd0bd39 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:51 -0700 Subject: knfsd: nfsd4: secinfo handling without secinfo= option We could return some sort of error in the case where someone asks for secinfo on an export without the secinfo= option set--that'd be no worse than what we've been doing. But it's not really correct. So, hack up an approximate secinfo response in that case--it may not be complete, but it'll tell the client at least one acceptable security flavor. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 864498f8f2d9..b3d55c6747fd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -57,6 +57,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -2454,15 +2455,38 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr, { int i = 0; struct svc_export *exp = secinfo->si_exp; + u32 nflavs; + struct exp_flavor_info *flavs; + struct exp_flavor_info def_flavs[2]; ENCODE_HEAD; if (nfserr) goto out; + if (exp->ex_nflavors) { + flavs = exp->ex_flavors; + nflavs = exp->ex_nflavors; + } else { /* Handling of some defaults in absence of real secinfo: */ + flavs = def_flavs; + if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) { + nflavs = 2; + flavs[0].pseudoflavor = RPC_AUTH_UNIX; + flavs[1].pseudoflavor = RPC_AUTH_NULL; + } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) { + nflavs = 1; + flavs[0].pseudoflavor + = svcauth_gss_flavor(exp->ex_client); + } else { + nflavs = 1; + flavs[0].pseudoflavor + = exp->ex_client->flavour->flavour; + } + } + RESERVE_SPACE(4); - WRITE32(exp->ex_nflavors); + WRITE32(nflavs); ADJUST_ARGS(); - for (i = 0; i < exp->ex_nflavors; i++) { - u32 flav = exp->ex_flavors[i].pseudoflavor; + for (i = 0; i < nflavs; i++) { + u32 flav = flavs[i].pseudoflavor; struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); if (gm) { -- cgit v1.2.1 From 9091224f3cff4721f295df29e8a99705a63bc4c7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:52 -0700 Subject: knfsd: nfsd: allow auth_sys nlm on rpcsec_gss exports Our clients (like other clients, as far as I know) use only auth_sys for nlm, even when using rpcsec_gss for the main nfs operations. Administrators that want to deny non-kerberos-authenticated locking requests will need to turn off NFS protocol versions less than 4.... Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfsfh.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8d2b49914843..0eb464a39aae 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -249,10 +249,16 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - /* Check security flavor */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; + if (!(access & MAY_LOCK)) { + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + } /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); -- cgit v1.2.1 From 1269bc69b6649282091bb7007372acf4ab8357fd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Jul 2007 04:04:52 -0700 Subject: knfsd: nfsd: enforce per-flavor id squashing Allow root squashing to vary per-pseudoflavor, so that you can (for example) allow root access only when sufficiently strong security is in use. Signed-off-by: "J. Bruce Fields" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 6e92b0fe5323..cf61dc8ae942 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -12,17 +12,31 @@ #define CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE)) +static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return f->flags; + } + return exp->ex_flags; + +} + int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { struct svc_cred cred = rqstp->rq_cred; int i; + int flags = nfsexp_flags(rqstp, exp); int ret; - if (exp->ex_flags & NFSEXP_ALLSQUASH) { + if (flags & NFSEXP_ALLSQUASH) { cred.cr_uid = exp->ex_anon_uid; cred.cr_gid = exp->ex_anon_gid; cred.cr_group_info = groups_alloc(0); - } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { + } else if (flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; if (!cred.cr_uid) cred.cr_uid = exp->ex_anon_uid; -- cgit v1.2.1 From c381bfcf0cd100a37cd969fa0d3aa758e13b5bcc Mon Sep 17 00:00:00 2001 From: Mika Kukkonen Date: Tue, 17 Jul 2007 04:04:53 -0700 Subject: Couple fixes to fs/ecryptfs/inode.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following was uncovered by compiling the kernel with '-W' flag: CC [M] fs/ecryptfs/inode.o fs/ecryptfs/inode.c: In function ‘ecryptfs_lookup’: fs/ecryptfs/inode.c:304: warning: comparison of unsigned expression < 0 is always false fs/ecryptfs/inode.c: In function ‘ecryptfs_symlink’: fs/ecryptfs/inode.c:486: warning: comparison of unsigned expression < 0 is always false Function ecryptfs_encode_filename() can return -ENOMEM, so change the variables to plain int, as in the first case the only real use actually expects int, and in latter case there is no use beoynd the error check. Signed-off-by: Mika Kukkonen Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 83e94fedd4e9..e77a2ec71aa5 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -282,7 +282,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, struct dentry *lower_dentry; struct vfsmount *lower_mnt; char *encoded_name; - unsigned int encoded_namelen; + int encoded_namelen; struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *page_virt = NULL; @@ -473,7 +473,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, struct dentry *lower_dir_dentry; umode_t mode; char *encoded_symname; - unsigned int encoded_symlen; + int encoded_symlen; struct ecryptfs_crypt_stat *crypt_stat = NULL; lower_dentry = ecryptfs_dentry_to_lower(dentry); -- cgit v1.2.1 From 3bd858ab1c451725c07a805dcb315215dc85b86e Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Tue, 17 Jul 2007 15:00:08 +0530 Subject: Introduce is_owner_or_cap() to wrap CAP_FOWNER use with fsuid check Introduce is_owner_or_cap() macro in fs.h, and convert over relevant users to it. This is done because we want to avoid bugs in the future where we check for only effective fsuid of the current task against a file's owning uid, without simultaneously checking for CAP_FOWNER as well, thus violating its semantics. [ XFS uses special macros and structures, and in general looked ... untouchable, so we leave it alone -- but it has been looked over. ] The (current->fsuid != inode->i_uid) check in generic_permission() and exec_permission_lite() is left alone, because those operations are covered by CAP_DAC_OVERRIDE and CAP_DAC_READ_SEARCH. Similarly operations falling under the purview of CAP_CHOWN and CAP_LEASE are also left alone. Signed-off-by: Satyam Sharma Cc: Al Viro Acked-by: Serge E. Hallyn Signed-off-by: Linus Torvalds --- fs/attr.c | 4 ++-- fs/ext2/acl.c | 2 +- fs/ext2/ioctl.c | 4 ++-- fs/ext3/acl.c | 2 +- fs/ext3/ioctl.c | 6 +++--- fs/ext4/acl.c | 2 +- fs/ext4/ioctl.c | 6 +++--- fs/fcntl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/ioctl.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/ioctl.c | 2 +- fs/jfs/xattr.c | 2 +- fs/namei.c | 2 +- fs/ocfs2/ioctl.c | 2 +- fs/reiserfs/ioctl.c | 5 ++--- fs/reiserfs/xattr_acl.c | 2 +- fs/utimes.c | 2 +- fs/xattr.c | 3 +-- 20 files changed, 27 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index a0a0c7b07ba3..f8dfc2269d85 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -42,7 +42,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr) /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) goto error; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : @@ -52,7 +52,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr) /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { - if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) goto error; } fine: diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 7c420b800c34..e58669e1b87c 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -464,7 +464,7 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value, if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e85c48218239..3bcd25422ee4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -36,7 +36,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -74,7 +74,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); case EXT2_IOC_SETVERSION: - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (IS_RDONLY(inode)) return -EROFS; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 1e5038d9a01b..d34e9967430a 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -489,7 +489,7 @@ ext3_xattr_set_acl(struct inode *inode, int type, const void *value, if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 965006dba6be..4a2a02c95bf9 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -41,7 +41,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -122,7 +122,7 @@ flags_err: __u32 generation; int err; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (IS_RDONLY(inode)) return -EROFS; @@ -181,7 +181,7 @@ flags_err: if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 9e882546d91a..a8bae8cd1d5d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -489,7 +489,7 @@ ext4_xattr_set_acl(struct inode *inode, int type, const void *value, if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 500567dd53b6..7b4aa4543c83 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -40,7 +40,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -121,7 +121,7 @@ flags_err: __u32 generation; int err; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (IS_RDONLY(inode)) return -EROFS; @@ -180,7 +180,7 @@ flags_err: if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/fcntl.c b/fs/fcntl.c index 8e382a5d51bd..3f22e9f4f691 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -215,7 +215,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 9ccb78947171..995d63b2e747 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -78,7 +78,7 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { acl = posix_acl_from_xattr(value, size); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 6e80844367ee..1047a8c7226a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -74,7 +74,7 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) { if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) return -EOPNOTSUPP; - if (current->fsuid != ip->i_inode.i_uid && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(&ip->i_inode)) return -EPERM; if (S_ISLNK(ip->i_inode.i_mode)) return -EOPNOTSUPP; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 79fd10402ea3..b60c0affbec5 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -38,7 +38,7 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *)arg)) diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index a46101ee867a..65b3a1b5b88d 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -435,7 +435,7 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, struct posix_acl *acl; int rc; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index fe063af6fd2f..3c8663bea98c 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -69,7 +69,7 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index b2375f0774b7..9b7f2cdaae0a 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -697,7 +697,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, struct posix_acl *acl; int rc; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; /* diff --git a/fs/namei.c b/fs/namei.c index 5e2d98d10c5d..defaa47c11d4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1576,7 +1576,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME) - if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; /* diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index bd68c3f2afbe..87dcece7e1b5 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -63,7 +63,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, goto bail_unlock; status = -EACCES; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) goto bail_unlock; if (!S_ISDIR(inode->i_mode)) diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index b484d2913c0d..11a0fcc2d402 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -51,8 +51,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, if (IS_RDONLY(inode)) return -EROFS; - if ((current->fsuid != inode->i_uid) - && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (get_user(flags, (int __user *)arg)) @@ -81,7 +80,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, case REISERFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); case REISERFS_IOC_SETVERSION: - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (IS_RDONLY(inode)) return -EROFS; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 5296a29cc5eb..b7e4fa4539de 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -21,7 +21,7 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) if (!reiserfs_posixacl(inode->i_sb)) return -EOPNOTSUPP; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EPERM; if (value) { diff --git a/fs/utimes.c b/fs/utimes.c index 83a7e69e706c..682eb63b20ad 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -106,7 +106,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (IS_IMMUTABLE(inode)) goto dput_and_out; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) { + if (!is_owner_or_cap(inode)) { if (f) { if (!(f->f_mode & FMODE_WRITE)) goto dput_and_out; diff --git a/fs/xattr.c b/fs/xattr.c index 4523aca79659..a44fd92caca3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -60,8 +60,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EPERM; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && (current->fsuid != inode->i_uid) && - !capable(CAP_FOWNER)) + (mask & MAY_WRITE) && !is_owner_or_cap(inode)) return -EPERM; } -- cgit v1.2.1 From 8e1c091cccd551557d24ce845715e8ceb6c49d36 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Tue, 17 Jul 2007 05:40:59 -0400 Subject: arch/i386/* fs/* ipc/*: mark variables with uninitialized_var() Mark variables with uninitialized_var() if such a warning appears, and analysis proves that the var is initialized properly on all paths it is used. Signed-off-by: Jeff Garzik --- fs/ocfs2/file.c | 3 ++- fs/udf/super.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f04c7aa834cb..004c2abbc732 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1867,7 +1867,8 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, loff_t pos; const struct iovec *cur_iov = iov; struct page *user_page, *page; - char *buf, *dst; + char * uninitialized_var(buf); + char *dst; void *fsdata; /* diff --git a/fs/udf/super.c b/fs/udf/super.c index 6658afb41cc7..d6a504f5d758 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1356,7 +1356,7 @@ udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset) case UDF_VIRTUAL_MAP15: case UDF_VIRTUAL_MAP20: { - kernel_lb_addr ino; + kernel_lb_addr uninitialized_var(ino); if (!UDF_SB_LASTBLOCK(sb)) { -- cgit v1.2.1 From 97ac73506c0ba93f30239bb57b4cfc5d73e68a62 Mon Sep 17 00:00:00 2001 From: Amit Arora Date: Tue, 17 Jul 2007 21:42:44 -0400 Subject: sys_fallocate() implementation on i386, x86_64 and powerpc fallocate() is a new system call being proposed here which will allow applications to preallocate space to any file(s) in a file system. Each file system implementation that wants to use this feature will need to support an inode operation called ->fallocate(). Applications can use this feature to avoid fragmentation to certain level and thus get faster access speed. With preallocation, applications also get a guarantee of space for particular file(s) - even if later the the system becomes full. Currently, glibc provides an interface called posix_fallocate() which can be used for similar cause. Though this has the advantage of working on all file systems, but it is quite slow (since it writes zeroes to each block that has to be preallocated). Without a doubt, file systems can do this more efficiently within the kernel, by implementing the proposed fallocate() system call. It is expected that posix_fallocate() will be modified to call this new system call first and incase the kernel/filesystem does not implement it, it should fall back to the current implementation of writing zeroes to the new blocks. ToDos: 1. Implementation on other architectures (other than i386, x86_64, and ppc). Patches for s390(x) and ia64 are already available from previous posts, but it was decided that they should be added later once fallocate is in the mainline. Hence not including those patches in this take. 2. Changes to glibc, a) to support fallocate() system call b) to make posix_fallocate() and posix_fallocate64() call fallocate() Signed-off-by: Amit Arora --- fs/open.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index be6a457f4226..a6b054edacba 100644 --- a/fs/open.c +++ b/fs/open.c @@ -26,6 +26,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -352,6 +353,64 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) } #endif +asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len) +{ + struct file *file; + struct inode *inode; + long ret = -EINVAL; + + if (offset < 0 || len <= 0) + goto out; + + /* Return error if mode is not supported */ + ret = -EOPNOTSUPP; + if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + goto out; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + if (!(file->f_mode & FMODE_WRITE)) + goto out_fput; + /* + * Revalidate the write permissions, in case security policy has + * changed since the files were opened. + */ + ret = security_file_permission(file, MAY_WRITE); + if (ret) + goto out_fput; + + inode = file->f_path.dentry->d_inode; + + ret = -ESPIPE; + if (S_ISFIFO(inode->i_mode)) + goto out_fput; + + ret = -ENODEV; + /* + * Let individual file system decide if it supports preallocation + * for directories or not. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EFBIG; + /* Check for wrap through zero too */ + if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + goto out_fput; + + if (inode->i_op && inode->i_op->fallocate) + ret = inode->i_op->fallocate(inode, mode, offset, len); + else + ret = -ENOSYS; + +out_fput: + fput(file); +out: + return ret; +} + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and -- cgit v1.2.1 From a2df2a63407803a833f82e1fa6693826c8c9d584 Mon Sep 17 00:00:00 2001 From: Amit Arora Date: Tue, 17 Jul 2007 21:42:41 -0400 Subject: fallocate support in ext4 This patch implements ->fallocate() inode operation in ext4. With this patch users of ext4 file systems will be able to use fallocate() system call for persistent preallocation. Current implementation only supports preallocation for regular files (directories not supported as of date) with extent maps. This patch does not support block-mapped files currently. Only FALLOC_ALLOCATE and FALLOC_RESV_SPACE modes are being supported as of now. Signed-off-by: Amit Arora --- fs/ext4/extents.c | 249 +++++++++++++++++++++++++++++++++++++++++++++--------- fs/ext4/file.c | 1 + 2 files changed, 209 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b9ce24129070..ba25832a756c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -282,7 +283,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - le16_to_cpu(path->p_ext->ee_len), + ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else ext_debug(" []"); @@ -305,7 +306,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); } @@ -425,7 +426,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) ext_debug(" -> %d:%llu:%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), - le16_to_cpu(path->p_ext->ee_len)); + ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { @@ -686,7 +687,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("move %d:%llu:%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), - le16_to_cpu(path[depth].p_ext->ee_len), + ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, sizeof(struct ext4_extent)); @@ -1106,7 +1107,19 @@ static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != + unsigned short ext1_ee_len, ext2_ee_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. + */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; @@ -1115,14 +1128,14 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) + if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) return 0; #endif - if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2)) + if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) return 1; return 0; } @@ -1144,7 +1157,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); - len1 = le16_to_cpu(newext->ee_len); + len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; @@ -1191,8 +1204,9 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err, next; + unsigned uninitialized = 0; - BUG_ON(newext->ee_len == 0); + BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); ex = path[depth].p_ext; BUG_ON(path[depth].p_hdr == NULL); @@ -1200,14 +1214,24 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append %d block to %d:%d (from %llu)\n", - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) - + le16_to_cpu(newext->ee_len)); + + /* + * ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check only one of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1263,7 +1287,7 @@ has_space: ext_debug("first extent in the leaf: %d:%llu:%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len)); + ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { @@ -1276,7 +1300,7 @@ has_space: "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); } @@ -1289,7 +1313,7 @@ has_space: "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; @@ -1308,8 +1332,13 @@ merge: if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) break; /* merge with next extent! */ - nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len) - + le16_to_cpu(nearex[1].ee_len)); + if (ext4_ext_is_uninitialized(nearex)) + uninitialized = 1; + nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) + + ext4_ext_get_actual_len(nearex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(nearex); + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - nearex - 1) * sizeof(struct ext4_extent); @@ -1379,8 +1408,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block, end = le32_to_cpu(ex->ee_block); if (block + num < end) end = block + num; - } else if (block >= - le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { /* need to allocate space after found extent */ start = block; end = block + num; @@ -1392,7 +1421,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block, * by found extent */ start = block; - end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); if (block + num < end) end = block + num; exists = 1; @@ -1408,7 +1438,7 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block, cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = le16_to_cpu(ex->ee_len); + cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext_pblock(ex); cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } @@ -1481,15 +1511,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext_debug("cache gap(before): %lu [%lu:%lu]", (unsigned long) block, (unsigned long) le32_to_cpu(ex->ee_block), - (unsigned long) le16_to_cpu(ex->ee_len)); + (unsigned long) ext4_ext_get_actual_len(ex)); } else if (block >= le32_to_cpu(ex->ee_block) - + le16_to_cpu(ex->ee_len)) { + + ext4_ext_get_actual_len(ex)) { lblock = le32_to_cpu(ex->ee_block) - + le16_to_cpu(ex->ee_len); + + ext4_ext_get_actual_len(ex); len = ext4_ext_next_allocated_block(path); ext_debug("cache gap(after): [%lu:%lu] %lu", (unsigned long) le32_to_cpu(ex->ee_block), - (unsigned long) le16_to_cpu(ex->ee_len), + (unsigned long) ext4_ext_get_actual_len(ex), (unsigned long) block); BUG_ON(len == lblock); len = len - lblock; @@ -1619,12 +1649,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, unsigned long from, unsigned long to) { struct buffer_head *bh; + unsigned short ee_len = ext4_ext_get_actual_len(ex); int i; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = le16_to_cpu(ex->ee_len); spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; @@ -1638,12 +1668,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, } #endif if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ unsigned long num; ext4_fsblk_t start; - num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; - start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num; + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext_pblock(ex) + ee_len - num; ext_debug("free last %lu blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); @@ -1651,12 +1681,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, } ext4_free_blocks(handle, inode, start, num); } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk("strange request: removal(2) %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } return 0; } @@ -1671,6 +1701,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned a, b, block, num; unsigned long ex_ee_block; unsigned short ex_ee_len; + unsigned uninitialized = 0; struct ext4_extent *ex; ext_debug("truncate since %lu in leaf\n", start); @@ -1685,7 +1716,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -1753,6 +1786,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -1762,7 +1797,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) @@ -2038,7 +2073,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (ex) { unsigned long ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext_pblock(ex); - unsigned short ee_len = le16_to_cpu(ex->ee_len); + unsigned short ee_len; /* * Allow future support for preallocated extents to be added @@ -2046,8 +2081,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * Uninitialized extents are treated as holes, except that * we avoid (fail) allocating new blocks during a write. */ - if (ee_len > EXT_MAX_LEN) + if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN) goto out2; + ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { newblock = iblock - ee_block + ee_start; @@ -2055,8 +2091,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ee_len - (iblock - ee_block); ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); - ext4_ext_put_in_cache(inode, ee_block, ee_len, - ee_start, EXT4_EXT_CACHE_EXTENT); + /* Do not put uninitialized extent in the cache */ + if (!ext4_ext_is_uninitialized(ex)) + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start, + EXT4_EXT_CACHE_EXTENT); goto out; } } @@ -2098,6 +2137,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(allocated); + if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { /* free data blocks we just allocated */ @@ -2113,8 +2154,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); __set_bit(BH_New, &bh_result->b_state); - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + /* Cache only when it is _not_ an uninitialized extent */ + if (create != EXT4_CREATE_UNINITIALIZED_EXT) + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); out: if (allocated > max_blocks) allocated = max_blocks; @@ -2217,3 +2260,127 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) return needed; } + +/* + * preallocate space for a file. This implements ext4's fallocate inode + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +{ + handle_t *handle; + ext4_fsblk_t block, max_blocks; + ext4_fsblk_t nblocks = 0; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -EOPNOTSUPP; + + /* preallocation to directories is currently not supported */ + if (S_ISDIR(inode->i_mode)) + return -ENODEV; + + block = offset >> blkbits; + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + + /* + * credits to insert 1 extent into extent tree + buffers to be able to + * modify 1 super block, 1 block bitmap and 1 group descriptor. + */ + credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; +retry: + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + + ret = ext4_ext_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0); + WARN_ON(!ret); + if (!ret) { + ext4_error(inode->i_sb, "ext4_fallocate", + "ext4_ext_get_blocks returned 0! inode#%lu" + ", block=%llu, max_blocks=%llu", + inode->i_ino, block, max_blocks); + ret = -EIO; + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + if (ret > 0) { + /* check wrap through sign-bit/zero here */ + if ((block + ret) < 0 || (block + ret) < block) { + ret = -EIO; + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + if (buffer_new(&map_bh) && ((block + ret) > + (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits) + >> blkbits))) + nblocks = nblocks + ret; + } + + /* Update ctime if new blocks get allocated */ + if (nblocks) { + struct timespec now; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + /* + * Time to update the file size. + * Update only when preallocation was requested beyond the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (offset + len) > i_size_read(inode)) { + if (ret > 0) { + /* + * if no error, we assume preallocation succeeded + * completely + */ + mutex_lock(&inode->i_mutex); + i_size_write(inode, offset + len); + EXT4_I(inode)->i_disksize = i_size_read(inode); + mutex_unlock(&inode->i_mutex); + } else if (ret < 0 && nblocks) { + /* Handle partial allocation scenario */ + loff_t newsize; + + mutex_lock(&inode->i_mutex); + newsize = (nblocks << blkbits) + i_size_read(inode); + i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); + EXT4_I(inode)->i_disksize = i_size_read(inode); + mutex_unlock(&inode->i_mutex); + } + } + + return ret > 0 ? ret2 : ret; +} diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d4c8186aed64..1a81cd66d63b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -134,5 +134,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .fallocate = ext4_fallocate, }; -- cgit v1.2.1 From 56055d3ae4cc7fa6d2b10885f20269de8a989ed7 Mon Sep 17 00:00:00 2001 From: Amit Arora Date: Tue, 17 Jul 2007 21:42:38 -0400 Subject: write support for preallocated blocks This patch adds write support to the uninitialized extents that get created when a preallocation is done using fallocate(). It takes care of splitting the extents into multiple (upto three) extents and merging the new split extents with neighbouring ones, if possible. Signed-off-by: Amit Arora --- fs/ext4/extents.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 222 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ba25832a756c..ded3d469f978 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1140,6 +1140,53 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, return 0; } +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0; + int uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + ext4_error(inode->i_sb, "ext4_ext_try_to_merge", + "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + } + + return merge_done; +} + /* * check if a portion of the "newext" extent overlaps with an * existing extent. @@ -1328,25 +1375,7 @@ has_space: merge: /* try to merge extents to the right */ - while (nearex < EXT_LAST_EXTENT(eh)) { - if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) - break; - /* merge with next extent! */ - if (ext4_ext_is_uninitialized(nearex)) - uninitialized = 1; - nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) - + ext4_ext_get_actual_len(nearex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(nearex); - - if (nearex + 1 < EXT_LAST_EXTENT(eh)) { - len = (EXT_LAST_EXTENT(eh) - nearex - 1) - * sizeof(struct ext4_extent); - memmove(nearex + 1, nearex + 2, len); - } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); - BUG_ON(eh->eh_entries == 0); - } + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -2012,15 +2041,158 @@ void ext4_ext_release(struct super_block *sb) #endif } +/* + * This function is called by ext4_ext_get_blocks() if someone tries to write + * to an uninitialized extent. It may result in splitting the uninitialized + * extent into multiple extents (upto three - one initialized and two + * uninitialized). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + */ +int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + ext4_fsblk_t iblock, + unsigned long max_blocks) +{ + struct ext4_extent *ex, newex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + unsigned int allocated, ee_block, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3); + if (err) + goto out; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + if (newdepth != depth) { + depth = newdepth; + path = ext4_ext_find_extent(inode, iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + } + allocated = max_blocks; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* ex2: iblock to iblock + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(iblock); + ex2->ee_start = cpu_to_le32(newblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + if (ex2 != ex) + goto insert; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* + * New (initialized) extent starts from the first block + * in the current extent. i.e., ex2 == ex + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex2 > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex2 - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex2--; + } + } + /* + * Try to Merge towards right. This might be required + * only when the whole extent is being written to. + * i.e. ex2 == ex and ex3 == NULL. + */ + if (!ex3) { + ret = ext4_ext_try_to_merge(inode, path, ex2); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + } + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex); +out: + return err ? err : allocated; +} + int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize) { struct ext4_ext_path *path = NULL; + struct ext4_extent_header *eh; struct ext4_extent newex, *ex; ext4_fsblk_t goal, newblock; - int err = 0, depth; + int err = 0, depth, ret; unsigned long allocated = 0; __clear_bit(BH_New, &bh_result->b_state); @@ -2033,8 +2205,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (goal) { if (goal == EXT4_EXT_CACHE_GAP) { if (!create) { - /* block isn't allocated yet and - * user doesn't want to allocate it */ + /* + * block isn't allocated yet and + * user doesn't want to allocate it + */ goto out2; } /* we should allocate requested block */ @@ -2068,6 +2242,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this is why assert can't be put in ext4_ext_find_extent() */ BUG_ON(path[depth].p_ext == NULL && depth != 0); + eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex) { @@ -2076,13 +2251,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, unsigned short ee_len; /* - * Allow future support for preallocated extents to be added - * as an RO_COMPAT feature: * Uninitialized extents are treated as holes, except that - * we avoid (fail) allocating new blocks during a write. + * we split out initialized portions during a write. */ - if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN) - goto out2; ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { @@ -2091,12 +2262,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ee_len - (iblock - ee_block); ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); + /* Do not put uninitialized extent in the cache */ - if (!ext4_ext_is_uninitialized(ex)) + if (!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, ee_len, ee_start, EXT4_EXT_CACHE_EXTENT); - goto out; + goto out; + } + if (create == EXT4_CREATE_UNINITIALIZED_EXT) + goto out; + if (!create) + goto out2; + + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); + if (ret <= 0) + goto out2; + else + allocated = ret; + goto outnew; } } @@ -2105,8 +2291,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if (!create) { - /* put just found gap into cache to speed up - * subsequent requests */ + /* + * put just found gap into cache to speed up + * subsequent requests + */ ext4_ext_put_gap_in_cache(inode, path, iblock); goto out2; } @@ -2152,6 +2340,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); +outnew: __set_bit(BH_New, &bh_result->b_state); /* Cache only when it is _not_ an uninitialized extent */ @@ -2221,7 +2410,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final - * transaction synchronous. */ + * transaction synchronous. + */ if (IS_SYNC(inode)) handle->h_sync = 1; -- cgit v1.2.1 From 749269facaf87f6e516c3af12763e03181b9c139 Mon Sep 17 00:00:00 2001 From: Amit Arora Date: Wed, 18 Jul 2007 09:02:56 -0400 Subject: Change on-disk format to support 2^15 uninitialized extents This change was suggested by Andreas Dilger. This patch changes the EXT_MAX_LEN value and extent code which marks/checks uninitialized extents. With this change it will be possible to have initialized extents with 2^15 blocks (earlier the max blocks we could have was 2^15 - 1). This way we can have better extent-to-block alignment. Now, maximum number of blocks we can have in an initialized extent is 2^15 and in an uninitialized extent is 2^15 - 1. Signed-off-by: Amit Arora --- fs/ext4/extents.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ded3d469f978..77146b826a13 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1107,7 +1107,7 @@ static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - unsigned short ext1_ee_len, ext2_ee_len; + unsigned short ext1_ee_len, ext2_ee_len, max_len; /* * Make sure that either both extents are uninitialized, or @@ -1116,6 +1116,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) return 0; + if (ext4_ext_is_uninitialized(ex1)) + max_len = EXT_UNINIT_MAX_LEN; + else + max_len = EXT_INIT_MAX_LEN; + ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1128,7 +1133,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) + if (ext1_ee_len + ext2_ee_len > max_len) return 0; #ifdef AGGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) @@ -1815,7 +1820,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); - if (uninitialized) + /* + * Do not mark uninitialized if all the blocks in the + * extent have been removed. + */ + if (uninitialized && num) ext4_ext_mark_uninitialized(ex); err = ext4_ext_dirty(handle, inode, path + depth); @@ -2308,6 +2317,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* allocate new block */ goal = ext4_ext_find_goal(inode, path, iblock); + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is + * EXT_UNINIT_MAX_LEN. + */ + if (max_blocks > EXT_INIT_MAX_LEN && + create != EXT4_CREATE_UNINITIALIZED_EXT) + max_blocks = EXT_INIT_MAX_LEN; + else if (max_blocks > EXT_UNINIT_MAX_LEN && + create == EXT4_CREATE_UNINITIALIZED_EXT) + max_blocks = EXT_UNINIT_MAX_LEN; + /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ newex.ee_block = cpu_to_le32(iblock); newex.ee_len = cpu_to_le16(max_blocks); -- cgit v1.2.1 From 1e2462f93e011f63fd0f1fedd2c05338ca6b31c2 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Wed, 18 Jul 2007 09:00:55 -0400 Subject: ext4: Enable extents by default Turn on extents feature by default in ext4 filesystem, to get wider testing of extents feature in ext4dev. This can be disabled using -o noextents. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b806e689c4aa..9c8baf460588 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -734,7 +734,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, + Opt_grpquota, Opt_extents, Opt_noextents, }; static match_table_t tokens = { @@ -785,6 +785,7 @@ static match_table_t tokens = { {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1120,6 +1121,9 @@ clear_qf_name: case Opt_extents: set_opt (sbi->s_mount_opt, EXTENTS); break; + case Opt_noextents: + clear_opt (sbi->s_mount_opt, EXTENTS); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1551,6 +1555,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); + /* + * turn on extents feature by default in ext4 filesystem + * User -o noextents to turn it off + */ + set_opt(sbi->s_mount_opt, EXTENTS); + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; -- cgit v1.2.1 From ff9ddf7e847c4dc533f119efb6c77a6e57ab6397 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 18 Jul 2007 09:24:20 -0400 Subject: ext4: copy i_flags to inode flags on write Propagate flags such as S_APPEND, S_IMMUTABLE, etc. from i_flags into ext4-specific i_flags. Quota code changes these flags on quota files (to make it harder for sysadmin to screw himself) and these changes were not correctly propagated into the filesystem. (This is a forward port patch from ext3) Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 20 ++++++++++++++++++++ fs/ext4/ioctl.c | 1 + 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8416fa28c422..49035c5a2c43 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2583,6 +2583,25 @@ void ext4_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } +/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +void ext4_get_inode_flags(struct ext4_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT4_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT4_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT4_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT4_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT4_DIRSYNC_FL; +} + void ext4_read_inode(struct inode * inode) { struct ext4_iloc iloc; @@ -2744,6 +2763,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ei->i_state & EXT4_STATE_NEW) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if(!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7b4aa4543c83..9737432f079d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -28,6 +28,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, switch (cmd) { case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { -- cgit v1.2.1 From c29c0ae7f282828da3695167ed870131798348d9 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Wed, 18 Jul 2007 09:19:09 -0400 Subject: ext4: Make extents code sanely handle on-disk corruption Add more run-time checking of extent header fields and remove BUG_ON checks so we don't panic the kernel just because the on-disk filesystem is corrupted. Signed-off-by: Alex Tomas Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 144 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 77146b826a13..96264b2ef0a3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -92,36 +92,6 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static int ext4_ext_check_header(const char *function, struct inode *inode, - struct ext4_extent_header *eh) -{ - const char *error_msg = NULL; - - if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { - error_msg = "invalid magic"; - goto corrupted; - } - if (unlikely(eh->eh_max == 0)) { - error_msg = "invalid eh_max"; - goto corrupted; - } - if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { - error_msg = "invalid eh_entries"; - goto corrupted; - } - return 0; - -corrupted: - ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " - "entries %u, max %u, depth %u", - inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - le16_to_cpu(eh->eh_depth)); - - return -EIO; -} - static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) { int err; @@ -270,6 +240,70 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode); + else + max = ext4_ext_space_root_idx(inode); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode); + else + max = ext4_ext_space_block_idx(inode); + } + + return max; +} + +static int __ext4_ext_check_header(const char *function, struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error(inode->i_sb, function, + "bad header in inode #%lu: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext4_ext_check_header(inode, eh, depth) \ + __ext4_ext_check_header(__FUNCTION__, inode, eh, depth) + #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { @@ -330,6 +364,7 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path) /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block + * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) @@ -337,9 +372,6 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; - BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); - BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); - BUG_ON(le16_to_cpu(eh->eh_entries) <= 0); ext_debug("binsearch for %d(idx): ", block); @@ -389,6 +421,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc /* * ext4_ext_binsearch: * binary search for closest extent of the given block + * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) @@ -396,9 +429,6 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; - BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); - BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); - if (eh->eh_entries == 0) { /* * this leaf is empty: @@ -469,11 +499,10 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) short int depth, i, ppos = 0, alloc = 0; eh = ext_inode_hdr(inode); - BUG_ON(eh == NULL); - if (ext4_ext_check_header(__FUNCTION__, inode, eh)) + depth = ext_depth(inode); + if (ext4_ext_check_header(inode, eh, depth)) return ERR_PTR(-EIO); - i = depth = ext_depth(inode); /* account possible depth increase */ if (!path) { @@ -485,10 +514,12 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) } path[0].p_hdr = eh; + i = depth; /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; @@ -505,7 +536,7 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_header(__FUNCTION__, inode, eh)) + if (ext4_ext_check_header(inode, eh, i)) goto err; } @@ -514,9 +545,6 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; - if (ext4_ext_check_header(__FUNCTION__, inode, eh)) - goto err; - /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); @@ -1738,13 +1766,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned uninitialized = 0; struct ext4_extent *ex; + /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %lu in leaf\n", start); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; BUG_ON(eh == NULL); - BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max)); - BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC); /* find where to start removing */ ex = EXT_LAST_EXTENT(eh); @@ -1898,7 +1925,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) { + if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -1919,17 +1946,8 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start) if (!path[i].p_hdr) { ext_debug("initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); - if (ext4_ext_check_header(__FUNCTION__, inode, - path[i].p_hdr)) { - err = -EIO; - goto out; - } } - BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries) - > le16_to_cpu(path[i].p_hdr->eh_max)); - BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC); - if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); @@ -1946,17 +1964,27 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start) i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; /* go to the next level */ ext_debug("move to level %d (block %llu)\n", i + 1, idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - path[i+1].p_bh = - sb_bread(sb, idx_pblock(path[i].p_idx)); - if (!path[i+1].p_bh) { + bh = sb_bread(sb, idx_pblock(path[i].p_idx)); + if (!bh) { /* should we reset i_size? */ err = -EIO; break; } + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + if (ext4_ext_check_header(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ -- cgit v1.2.1 From eb40a09c679d7f9709f7087add57f2e1c7122bb3 Mon Sep 17 00:00:00 2001 From: "Jose R. Santos" Date: Wed, 18 Jul 2007 08:37:25 -0400 Subject: ext4: Set the journal JBD2_FEATURE_INCOMPAT_64BIT on large devices Set the journals JBD2_FEATURE_INCOMPAT_64BIT on devices with more than 32bit block sizes during mount time. This ensure proper record lenth when writing to the journal. Signed-off-by: Jose R. Santos Signed-off-by: Andreas Dilger Signed-off-by: Mingming Cao Signed-off-by: Laurent Vivier Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9c8baf460588..af0835187e76 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1813,6 +1813,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } + if (ext4_blocks_count(es) > 0xffffffffULL && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); + goto failed_mount4; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { -- cgit v1.2.1 From e23291b9120c11aafb2ee76fb71a062eb3c1056c Mon Sep 17 00:00:00 2001 From: "Jose R. Santos" Date: Wed, 18 Jul 2007 08:57:06 -0400 Subject: jbd2: Fix CONFIG_JBD_DEBUG ifdef to be CONFIG_JBD2_DEBUG When the JBD code was forked to create the new JBD2 code base, the references to CONFIG_JBD_DEBUG where never changed to CONFIG_JBD2_DEBUG. This patch fixes that. Signed-off-by: Jose R. Santos Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/jbd2/journal.c | 14 +++++++------- fs/jbd2/recovery.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9de54ae48dee..e53b4af52f11 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -517,7 +517,7 @@ do_more: /* * An HJ special. This is expensive... */ -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG jbd_unlock_bh_state(bitmap_bh); { struct buffer_head *debug_bh; @@ -1597,7 +1597,7 @@ allocated: performed_allocation = 1; -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG { struct buffer_head *debug_bh; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9737432f079d..5b00775d5096 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -141,7 +141,7 @@ flags_err: ext4_journal_stop(handle); return err; } -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG case EXT4_IOC_WAIT_FOR_READONLY: /* * This is racy - by the time we're woken up and running, @@ -283,7 +283,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG case EXT4_IOC32_WAIT_FOR_READONLY: cmd = EXT4_IOC_WAIT_FOR_READONLY; break; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 78d63b818f0b..8f530cc66d34 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -528,7 +528,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG @@ -1709,7 +1709,7 @@ void jbd2_slab_free(void *ptr, size_t size) * Journal_head storage management */ static struct kmem_cache *jbd2_journal_head_cache; -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif @@ -1747,7 +1747,7 @@ static struct journal_head *journal_alloc_journal_head(void) struct journal_head *ret; static unsigned long last_warning; -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); #endif ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); @@ -1768,7 +1768,7 @@ static struct journal_head *journal_alloc_journal_head(void) static void journal_free_journal_head(struct journal_head *jh) { -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG atomic_dec(&nr_journal_heads); memset(jh, JBD_POISON_FREE, sizeof(*jh)); #endif @@ -1953,12 +1953,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) /* * /proc tunables */ -#if defined(CONFIG_JBD_DEBUG) +#if defined(CONFIG_JBD2_DEBUG) int jbd2_journal_enable_debug; EXPORT_SYMBOL(jbd2_journal_enable_debug); #endif -#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) +#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_PROC_FS) static struct proc_dir_entry *proc_jbd_debug; @@ -2073,7 +2073,7 @@ static int __init journal_init(void) static void __exit journal_exit(void) { -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 395c92a04ac9..e7730a045b93 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -295,7 +295,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) printk(KERN_ERR "JBD: error %d scanning journal\n", err); ++journal->j_transaction_sequence; } else { -#ifdef CONFIG_JBD_DEBUG +#ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); #endif jbd_debug(0, -- cgit v1.2.1 From 0f49d5d019afa4e94253bfc92f0daca3badb990b Mon Sep 17 00:00:00 2001 From: "Jose R. Santos" Date: Wed, 18 Jul 2007 08:50:18 -0400 Subject: jbd2: Move jbd2-debug file to debugfs The jbd2-debug file used to be located in /proc/sys/fs/jbd2-debug, but it incorrectly used create_proc_entry() instead of the sysctl routines, and no proc entry was ever created. Instead of fixing this we might as well move the jbd2-debug file to debugfs which would be the preferred location for this kind of tunable. The new location is now /sys/kernel/debug/jbd2/jbd2-debug. Signed-off-by: Jose R. Santos Signed-off-by: "Theodore Ts'o" --- fs/Kconfig | 10 ++++----- fs/jbd2/journal.c | 67 ++++++++++++++++++++++--------------------------------- 2 files changed, 32 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 613df554728d..6a649902c5ac 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -251,7 +251,7 @@ config JBD2 config JBD2_DEBUG bool "JBD2 (ext4dev/ext4) debugging support" - depends on JBD2 + depends on JBD2 && DEBUG_FS help If you are using the ext4dev/ext4 journaled file system (or potentially any other filesystem/device using JBD2), this option @@ -260,10 +260,10 @@ config JBD2_DEBUG By default, the debugging output will be turned off. If you select Y here, then you will be able to turn on debugging - with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between - 1 and 5. The higher the number, the more debugging output is - generated. To turn debugging off again, do - "echo 0 > /proc/sys/fs/jbd2-debug". + with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8f530cc66d34..f290cb7cb834 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1951,64 +1952,50 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) } /* - * /proc tunables + * debugfs tunables */ #if defined(CONFIG_JBD2_DEBUG) -int jbd2_journal_enable_debug; +u8 jbd2_journal_enable_debug; EXPORT_SYMBOL(jbd2_journal_enable_debug); #endif -#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_PROC_FS) +#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_DEBUG_FS) -static struct proc_dir_entry *proc_jbd_debug; +#define JBD2_DEBUG_NAME "jbd2-debug" -static int read_jbd_debug(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int ret; +struct dentry *jbd2_debugfs_dir, *jbd2_debug; - ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug); - *eof = 1; - return ret; +static void __init jbd2_create_debugfs_entry(void) +{ + jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); + if (jbd2_debugfs_dir) + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, + jbd2_debugfs_dir, + &jbd2_journal_enable_debug); } -static int write_jbd_debug(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static void __exit jbd2_remove_debugfs_entry(void) { - char buf[32]; - - if (count > ARRAY_SIZE(buf) - 1) - count = ARRAY_SIZE(buf) - 1; - if (copy_from_user(buf, buffer, count)) - return -EFAULT; - buf[ARRAY_SIZE(buf) - 1] = '\0'; - jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10); - return count; + if (jbd2_debug) + debugfs_remove(jbd2_debug); + if (jbd2_debugfs_dir) + debugfs_remove(jbd2_debugfs_dir); } -#define JBD_PROC_NAME "sys/fs/jbd2-debug" +#else -static void __init create_jbd_proc_entry(void) +static void __init jbd2_create_debugfs_entry(void) { - proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); - if (proc_jbd_debug) { - /* Why is this so hard? */ - proc_jbd_debug->read_proc = read_jbd_debug; - proc_jbd_debug->write_proc = write_jbd_debug; - } + do { + } while (0); } -static void __exit jbd2_remove_jbd_proc_entry(void) +static void __exit jbd2_remove_debugfs_entry(void) { - if (proc_jbd_debug) - remove_proc_entry(JBD_PROC_NAME, NULL); + do { + } while (0); } -#else - -#define create_jbd_proc_entry() do {} while (0) -#define jbd2_remove_jbd_proc_entry() do {} while (0) - #endif struct kmem_cache *jbd2_handle_cache; @@ -2067,7 +2054,7 @@ static int __init journal_init(void) ret = journal_init_caches(); if (ret != 0) jbd2_journal_destroy_caches(); - create_jbd_proc_entry(); + jbd2_create_debugfs_entry(); return ret; } @@ -2078,7 +2065,7 @@ static void __exit journal_exit(void) if (n) printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); #endif - jbd2_remove_jbd_proc_entry(); + jbd2_remove_debugfs_entry(); jbd2_journal_destroy_caches(); } -- cgit v1.2.1 From ef7f38359ea8b3e9c7f2cae9a4d4935f55ca9e80 Mon Sep 17 00:00:00 2001 From: Kalpak Shah Date: Wed, 18 Jul 2007 09:15:20 -0400 Subject: ext4: Add nanosecond timestamps This patch adds nanosecond timestamps for ext4. This involves adding *time_extra fields to the ext4_inode to extend the timestamps to 64-bits. Creation time is also added by this patch. These extended fields will fit into an inode if the filesystem was formatted with large inodes (-I 256 or larger) and there are currently no EAs consuming all of the available space. For new inodes we always reserve enough space for the kernel's known extended fields, but for inodes created with an old kernel this might not have been the case. So this patch also adds the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature flag(ro-compat so that older kernels can't create inodes with a smaller extra_isize). which indicates if the fields fitting inside s_min_extra_isize are available or not. If the expansion of inodes if unsuccessful then this feature will be disabled. This feature is only enabled if requested by the sysadmin. None of the extended inode fields is critical for correct filesystem operation. Signed-off-by: Andreas Dilger Signed-off-by: Kalpak Shah Signed-off-by: Eric Sandeen Signed-off-by: Dave Kleikamp Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 ++++---- fs/ext4/inode.c | 22 +++++++++++++--------- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 16 ++++++++-------- fs/ext4/super.c | 28 ++++++++++++++++++++++++++++ fs/ext4/xattr.c | 2 +- 6 files changed, 56 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c88b439ba5cd..427f83066a0d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -563,7 +563,8 @@ got: inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -595,9 +596,8 @@ got: spin_unlock(&sbi->s_next_gen_lock); ei->i_state = EXT4_STATE_NEW; - ei->i_extra_isize = - (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ? - sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0; + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; if(DQUOT_ALLOC_INODE(inode)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 49035c5a2c43..b83f91edebd1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -726,7 +726,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); /* had we spliced it onto indirect block? */ @@ -2375,7 +2375,7 @@ do_indirects: ext4_discard_reservation(inode); mutex_unlock(&ei->truncate_mutex); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); /* @@ -2629,10 +2629,6 @@ void ext4_read_inode(struct inode * inode) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; ei->i_state = 0; ei->i_dir_start_lookup = 0; @@ -2710,6 +2706,11 @@ void ext4_read_inode(struct inode * inode) } else ei->i_extra_isize = 0; + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -2791,9 +2792,12 @@ static int ext4_do_update_inode(handle_t *handle, } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(ei->i_disksize); - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5b00775d5096..c04c7ccba9e3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -97,7 +97,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, ei->i_flags = flags; ext4_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -134,7 +134,7 @@ flags_err: return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2de339dd7554..40106b7ea4b8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1295,7 +1295,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2056,7 +2056,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); drop_nlink(dir); ext4_update_dx_flag(dir); @@ -2106,13 +2106,13 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); retval = 0; @@ -2203,7 +2203,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); inc_nlink(inode); atomic_inc(&inode->i_count); @@ -2305,7 +2305,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = CURRENT_TIME_SEC; + old_inode->i_ctime = ext4_current_time(old_inode); ext4_mark_inode_dirty(handle, old_inode); /* @@ -2338,9 +2338,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = ext4_current_time(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index af0835187e76..b47259f6f39c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1651,6 +1651,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_inode_size); goto failed_mount; } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); } sbi->s_frag_size = EXT4_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size); @@ -1874,6 +1876,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) } ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + printk(KERN_INFO "EXT4-fs: required extra inode space not" + "available.\n"); + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e832e96095b3..fe16a569d06b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1013,7 +1013,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with -- cgit v1.2.1 From 6dd4ee7cab7e3a17c571aebd444f4344c8c4946e Mon Sep 17 00:00:00 2001 From: Kalpak Shah Date: Wed, 18 Jul 2007 09:19:57 -0400 Subject: ext4: Expand extra_inodes space per the s_{want,min}_extra_isize fields We need to make sure that existing ext3 filesystems can also avail the new fields that have been added to the ext4 inode. We use s_want_extra_isize and s_min_extra_isize to decide by how much we should expand the inode. If EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand the inode by max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question about whether users should be able to set s_*_extra_isize smaller than the known fields or not. This patch also adds the functionality to expand inodes to include the newly added fields. We start by trying to expand by s_want_extra_isize bytes and if its fails we try to expand by s_min_extra_isize bytes. This is done by changing the i_extra_isize if enough space is available in the inode and no EAs are present. If EAs are present and there is enough space in the inode then the EAs in the inode are shifted to make space. If enough space is not available in the inode due to the EAs then 1 or more EAs are shifted to the external EA block. In the worst case when even the external EA block does not have enough space we inform the user that some EA would need to be deleted or s_min_extra_isize would have to be reduced. Signed-off-by: Andreas Dilger Signed-off-by: Kalpak Shah Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 63 ++++++++++++- fs/ext4/xattr.c | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/xattr.h | 17 ++++ 3 files changed, 346 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b83f91edebd1..f6d8528c4f55 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3105,6 +3105,39 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, return err; } +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, + struct ext4_iloc iloc, handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* No extended attributes present */ + if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). @@ -3129,10 +3162,38 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { struct ext4_iloc iloc; - int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; might_sleep(); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + if (mnt_count != sbi->s_es->s_mnt_count) { + ext4_warning(inode->i_sb, __FUNCTION__, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = sbi->s_es->s_mnt_count; + } + } + } + } if (!err) err = ext4_mark_iloc_dirty(handle, inode, &iloc); return err; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fe16a569d06b..b10d68fffb55 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -66,13 +66,6 @@ #define BFIRST(bh) ENTRY(BHDR(bh)+1) #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -#define IHDR(inode, raw_inode) \ - ((struct ext4_xattr_ibody_header *) \ - ((void *)raw_inode + \ - EXT4_GOOD_OLD_INODE_SIZE + \ - EXT4_I(inode)->i_extra_isize)) -#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -508,6 +501,24 @@ out: return; } +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + struct ext4_xattr_info { int name_index; const char *name; @@ -1014,6 +1025,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); + if (!value) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1066,6 +1079,253 @@ retry: return error; } +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. + */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino, total_blk; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize; + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. + */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(bh)) { + ext4_error(inode->i_sb, __FUNCTION__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, + &total_blk); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + + entry = IFIRST(header); + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = cpu_to_le32(size); + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + /* * ext4_xattr_delete_inode() * diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 79432b35398f..d7f5d6a12651 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -56,6 +56,13 @@ struct ext4_xattr_entry { #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + # ifdef CONFIG_EXT4DEV_FS_XATTR extern struct xattr_handler ext4_xattr_user_handler; @@ -74,6 +81,9 @@ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, extern void ext4_xattr_delete_inode(handle_t *, struct inode *); extern void ext4_xattr_put_super(struct super_block *); +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + extern int init_ext4_xattr(void); extern void exit_ext4_xattr(void); @@ -129,6 +139,13 @@ exit_ext4_xattr(void) { } +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + #define ext4_xattr_handlers NULL # endif /* CONFIG_EXT4DEV_FS_XATTR */ -- cgit v1.2.1 From f8628a14a27eb4512a1ede43de1d9db4d9f92bc3 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 18 Jul 2007 08:38:01 -0400 Subject: ext4: Remove 65000 subdirectory limit This patch adds support to ext4 for allowing more than 65000 subdirectories. Currently the maximum number of subdirectories is capped at 32000. If we exceed 65000 subdirectories in an htree directory it sets the inode link count to 1 and no longer counts subdirectories. The directory link count is not actually used when determining if a directory is empty, as that only counts subdirectories and not regular files that might be in there. A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if the subdir count for any directory crosses 65000. A later fsck will clear EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any directory with >65000 subdirs. Signed-off-by: Andreas Dilger Signed-off-by: Kalpak Shah Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 60 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 40106b7ea4b8..da224974af78 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1629,6 +1629,35 @@ static int ext4_delete_entry (handle_t *handle, return -ENOENT; } +/* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. + */ +static void ext4_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); + if (is_dx(inode) && inode->i_nlink > 1) { + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { + inode->i_nlink = 1; + EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + } + } +} + +/* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +static void ext4_dec_count(handle_t *handle, struct inode *inode) +{ + drop_nlink(inode); + if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) + inc_nlink(inode); +} + + static int ext4_add_nondir(handle_t *handle, struct dentry *dentry, struct inode *inode) { @@ -1725,7 +1754,7 @@ static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) struct ext4_dir_entry_2 * de; int err, retries = 0; - if (dir->i_nlink >= EXT4_LINK_MAX) + if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; retry: @@ -1748,7 +1777,7 @@ retry: inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext4_bread (handle, inode, 0, 1, &err); if (!dir_block) { - drop_nlink(inode); /* is this nlink == 0? */ + ext4_dec_count(handle, inode); /* is this nlink == 0? */ ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -1780,7 +1809,7 @@ retry: iput (inode); goto out_stop; } - inc_nlink(dir); + ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); @@ -2045,9 +2074,9 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; - if (inode->i_nlink != 2) + if (!EXT4_DIR_LINK_EMPTY(inode)) ext4_warning (inode->i_sb, "ext4_rmdir", - "empty directory has nlink!=2 (%d)", + "empty directory has too many links (%d)", inode->i_nlink); inode->i_version++; clear_nlink(inode); @@ -2058,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) ext4_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); - drop_nlink(dir); + ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); @@ -2109,7 +2138,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) dir->i_ctime = dir->i_mtime = ext4_current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); + ext4_dec_count(handle, inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); @@ -2159,7 +2188,7 @@ retry: err = __page_symlink(inode, symname, l, mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { - drop_nlink(inode); + ext4_dec_count(handle, inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2185,8 +2214,9 @@ static int ext4_link (struct dentry * old_dentry, struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (inode->i_nlink >= EXT4_LINK_MAX) + if (EXT4_DIR_LINK_MAX(inode)) return -EMLINK; + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2204,7 +2234,7 @@ retry: handle->h_sync = 1; inode->i_ctime = ext4_current_time(inode); - inc_nlink(inode); + ext4_inc_count(handle, inode); atomic_inc(&inode->i_count); err = ext4_add_nondir(handle, dentry, inode); @@ -2337,7 +2367,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, } if (new_inode) { - drop_nlink(new_inode); + ext4_dec_count(handle, new_inode); new_inode->i_ctime = ext4_current_time(new_inode); } old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); @@ -2348,11 +2378,13 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); ext4_journal_dirty_metadata(handle, dir_bh); - drop_nlink(old_dir); + ext4_dec_count(handle, old_dir); if (new_inode) { - drop_nlink(new_inode); + /* checked empty_dir above, can't have another parent, + * ext3_dec_count() won't work for many-linked dirs */ + new_inode->i_nlink = 0; } else { - inc_nlink(new_dir); + ext4_inc_count(handle, new_dir); ext4_update_dx_flag(new_dir); ext4_mark_inode_dirty(handle, new_dir); } -- cgit v1.2.1 From fc0e15a667121e02686cc52679f6272959fb60cc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 18 Jul 2007 09:20:44 -0400 Subject: Use zero_user_page() in ext4 where possible Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f6d8528c4f55..125aca714685 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1766,7 +1766,6 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, struct inode *inode = mapping->host; struct buffer_head *bh; int err = 0; - void *kaddr; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); @@ -1778,10 +1777,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, */ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode) && PageUptodate(page)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, offset, length, KM_USER0); set_page_dirty(page); goto unlock; } @@ -1834,10 +1830,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_page(page, offset, length, KM_USER0); BUFFER_TRACE(bh, "zeroed end of block"); -- cgit v1.2.1 From 1330593eb2d055bdd6ed935886cf2c13c896b288 Mon Sep 17 00:00:00 2001 From: Vignesh Babu Date: Wed, 18 Jul 2007 09:11:02 -0400 Subject: ext4: Use is_power_of_2() Replace (n & (n-1)) in the context of power of 2 checks with is_power_of_2() Signed-off-by: Vignesh Babu Cc: Signed-off-by: Andrew Morton Signed-off-by: Dave Kleikamp Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b47259f6f39c..6dcbb28dc06d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -1644,7 +1645,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (sbi->s_inode_size & (sbi->s_inode_size - 1)) || + (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { printk (KERN_ERR "EXT4-fs: unsupported inode size: %d\n", -- cgit v1.2.1 From d699594dc151c664a2b307e680a3cd9b7145fd83 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 18 Jul 2007 08:33:51 -0400 Subject: ext4: remove extra IS_RDONLY() check ext4_change_inode_journal_flag() is only called from one location: ext4_ioctl(EXT3_IOC_SETFLAGS). That ioctl case already has a IS_RDONLY() call in it so this one is superfluous. Signed-off-by: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Dave Kleikamp Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 125aca714685..de26c25d6a18 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3275,7 +3275,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT4_JOURNAL(inode); - if (is_journal_aborted(journal) || IS_RDONLY(inode)) + if (is_journal_aborted(journal)) return -EROFS; jbd2_journal_lock_updates(journal); -- cgit v1.2.1 From 26d535ed24f74ce949d7b49e40574c45cd845cdd Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 18 Jul 2007 08:33:37 -0400 Subject: Fix compilation with EXT_DEBUG, also fix leXX_to_cpu conversions. Signed-off-by: Dmitry Monakhov Acked-by: Alex Tomas Signed-off-by: Dave Kleikamp Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 96264b2ef0a3..11ce15d91acc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -383,13 +383,14 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block, - m, m->ei_block, r, r->ei_block); + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); } path->p_idx = l - 1; ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), - idx_block(path->p_idx)); + idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { @@ -448,8 +449,9 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block, - m, m->ee_block, r, r->ee_block); + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); } path->p_ext = l - 1; @@ -582,7 +584,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { len = (len - 1) * sizeof(struct ext4_extent_idx); len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %d. " + ext_debug("insert new index %d after: %llu. " "move %d from 0x%p to 0x%p\n", logical, ptr, len, (curp->p_idx + 1), (curp->p_idx + 2)); @@ -593,7 +595,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, /* insert before */ len = len * sizeof(struct ext4_extent_idx); len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %d. " + ext_debug("insert new index %d before: %llu. " "move %d from 0x%p to 0x%p\n", logical, ptr, len, curp->p_idx, (curp->p_idx + 1)); @@ -793,7 +795,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr)); while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { - ext_debug("%d: move %d:%d in new index %llu\n", i, + ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), idx_pblock(path[i].p_idx), newblock); -- cgit v1.2.1 From e9f410b1c035b6e63f0b4c3d6cfe4298d6a04492 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 18 Jul 2007 09:09:15 -0400 Subject: ext4: extent macros cleanup Use the EXT_LAST_INDEX macro; that's what it's there for. Clean up ext4_ext_ext_grow_indepth() so the correct EXT_FIRST_INDEX or EXT_FIRST_MACRO is used as necessary. The two macros are equivalent, so the C will collapse the if statement out, but it makes the code much more readable. Signed-off-by: Dmitry Monakhov Acked-by: Alex Tomas Signed-off-by: Dave Kleikamp Singed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 11ce15d91acc..750c46f7d893 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -376,7 +376,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc ext_debug("binsearch for %d(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; - r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1; + r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; if (block < le32_to_cpu(m->ei_block)) @@ -441,7 +441,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) ext_debug("binsearch for %d: ", block); l = EXT_FIRST_EXTENT(eh) + 1; - r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1; + r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; @@ -924,8 +924,13 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); curp->p_hdr->eh_entries = cpu_to_le16(1); curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - /* FIXME: it works, but actually path[0] can be index */ - curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + + if (path[0].p_hdr->eh_depth) + curp->p_idx->ei_block = + EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; + else + curp->p_idx->ei_block = + EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ext4_idx_store_pblock(curp->p_idx, newblock); neh = ext_inode_hdr(inode); -- cgit v1.2.1 From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- fs/ocfs2/heartbeat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 352eb4a13f98..c4c36171240d 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - ret = call_usermodehelper(argv[0], argv, envp, 1); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) mlog_errno(ret); } -- cgit v1.2.1