diff options
Diffstat (limited to 'Documentation/filesystems')
39 files changed, 4057 insertions, 1258 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2c391338c675..efea228ccd8a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -21,8 +21,7 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + struct dentry *(*d_real)(struct dentry *, const struct inode *); locking rules: rename_lock ->d_lock may block rcu-walk @@ -64,7 +63,7 @@ prototypes: void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, - umode_t create_mode, int *opened); + umode_t create_mode); int (*tmpfile) (struct inode *, struct dentry *, umode_t); locking rules: @@ -441,8 +440,6 @@ prototypes: int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -473,7 +470,7 @@ prototypes: }; locking rules: - All except for ->poll_mask may block. + All may block. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -505,9 +502,6 @@ in sys_read() and friends. the lease within the individual filesystem to record the result of the operation -->poll_mask can be called with or without the waitqueue lock for the waitqueue -returned from ->get_poll_head. 
- --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); @@ -537,9 +531,9 @@ More details about quota locking can be found in fs/dquot.c. prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); - int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); - int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); + vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *); + vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); + vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS index 9f4f87e16240..75865da2ce14 100644 --- a/Documentation/filesystems/cifs/AUTHORS +++ b/Documentation/filesystems/cifs/AUTHORS @@ -42,9 +42,11 @@ Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) Scott Lovenberg Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features) Aurelien Aptel (for DFS SMB3 work and some key bug fixes) -Ronnie Sahlberg (for SMB3 xattr work and bug fixes) +Ronnie Sahlberg (for SMB3 xattr work, bug fixes, and lots of great work on compounding) Shirish Pargaonkar (for many ACL patches over the years) Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) +Paulo Alcantara +Long Li (some great work on RDMA, SMB Direct) Test case and Bug Report contributors @@ -58,5 +60,4 @@ mention to the Stanford Checker (SWAT) which pointed out many minor bugs in error paths. Valuable suggestions also have come from Al Viro and Dave Miller. -And thanks to the IBM LTC and Power test teams and SuSE testers for -finding multiple bugs during excellent stress test runs. 
+And thanks to the IBM LTC and Power test teams and SuSE and Citrix and RedHat testers for finding multiple bugs during excellent stress test runs. diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES index bc0025cdd1c9..1df7f4910eb2 100644 --- a/Documentation/filesystems/cifs/CHANGES +++ b/Documentation/filesystems/cifs/CHANGES @@ -1,1065 +1,4 @@ -Version 1.62 ------------- -Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened -to more strictly handle corrupt frames. - -Version 1.61 ------------- -Fix append problem to Samba servers (files opened with O_APPEND could -have duplicated data). Fix oops in cifs_lookup. Workaround problem -mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. -Disable use of server inode numbers when server only -partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). Fix oops with dfs in -cifs_put_smb_ses. Fix mmap to work on directio mounts (needed -for OpenOffice when on forcedirectio mount e.g.) - -Version 1.60 -------------- -Fix memory leak in reconnect. Fix oops in DFS mount error path. -Set s_maxbytes to smaller (the max that vfs can handle) so that -sendfile will now work over cifs mounts again. Add noforcegid -and noforceuid mount parameters. Fix small mem leak when using -ntlmv2. Fix 2nd mount to same server but with different port to -be allowed (rather than reusing the 1st port) - only when the -user explicitly overrides the port on the 2nd mount. - -Version 1.59 ------------- -Client uses server inode numbers (which are persistent) rather than -client generated ones by default (mount option "serverino" turned -on by default if server supports it). Add forceuid and forcegid -mount options (so that when negotiating unix extensions specifying -which uid mounted does not immediately force the server's reported -uids to be overridden). Add support for scope mount parm. 
Improve -hard link detection to use same inode for both. Do not set -read-only dos attribute on directories (for chmod) since Windows -explorer special cases this attribute bit for directories for -a different purpose. - -Version 1.58 ------------- -Guard against buffer overruns in various UCS-2 to UTF-8 string conversions -when the UTF-8 string is composed of unusually long (more than 4 byte) converted -characters. Add support for mounting root of a share which redirects immediately -to DFS target. Convert string conversion functions from Unicode to more -accurately mark string length before allocating memory (which may help the -rare cases where a UTF-8 string is much larger than the UCS2 string that -we converted from). Fix endianness of the vcnum field used during -session setup to distinguish multiple mounts to same server from different -userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental -flag to be set to 2, and mount must enable krb5 to turn on extended security). -Performance of file create to Samba improved (posix create on lookup -removes 1 of 2 network requests sent on file create) - -Version 1.57 ------------- -Improve support for multiple security contexts to the same server. We -used to use the same "vcnumber" for all connections which could cause -the server to treat subsequent connections, especially those that -are authenticated as guest, as reconnections, invalidating the earlier -user's smb session. This fix allows cifs to mount multiple times to the -same server with different userids without risking invalidating earlier -established security contexts. fsync now sends SMB Flush operation -to better ensure that we wait for server to write all of the data to -server disk (not just write it over the network). Add new mount -parameter to allow user to disable sending the (slow) SMB flush on -fsync if desired (fsync still flushes all cached write data to the server). 
-Posix file open support added (turned off after one attempt if server -fails to support it properly, as with Samba server versions prior to 3.3.2) -Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too -little memory for the "nativeFileSystem" field returned by the server -during mount). Endian convert inode numbers if necessary (makes it easier -to compare inode numbers on network files from big endian systems). - -Version 1.56 ------------- -Add "forcemandatorylock" mount option to allow user to use mandatory -rather than posix (advisory) byte range locks, even though server would -support posix byte range locks. Fix query of root inode when prefixpath -specified and user does not have access to query information about the -top of the share. Fix problem in 2.6.28 resolving DFS paths to -Samba servers (worked to Windows). Fix rmdir so that pending search -(readdir) requests do not get invalid results which include the now -removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable -when using DFS. Add better file create support to servers which support -the CIFS POSIX protocol extensions (this adds support for new flags -on create, and improves semantics for write of locked ranges). - -Version 1.55 ------------- -Various fixes to make delete of open files behavior more predictable -(when delete of an open file fails we mark the file as "delete-on-close" -in a way that more servers accept, but only if we can first rename the -file to a temporary name). Add experimental support for more safely -handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp -sends, and also let tcp autotune the socket send and receive buffers. -This reduces the number of EAGAIN errors returned by TCP/IP in -high stress workloads (and the number of retries on socket writes -when sending large SMBWriteX requests). Fix case in which a portion of -data can in some cases not get written to the file on the server before the -file is closed. 
Fix DFS parsing to properly handle path consumed field, -and to handle certain codepage conversions better. Fix mount and -umount race that can cause oops in mount or umount or reconnect. - -Version 1.54 ------------- -Fix premature write failure on congested networks (we would give up -on EAGAIN from the socket too quickly on large writes). -Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. -Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. Fix problems with preserving timestamps on copying open -files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit -on parent directory when server supports Unix Extensions but not POSIX -create. Update cifs.upcall version to handle new Kerberos sec flags -(this requires update of cifs.upcall program from Samba). Fix memory leak -on dns_upcall (resolving DFS referrals). Fix plain text password -authentication (requires setting SecurityFlags to 0x30030 to enable -lanman and plain text though). Fix writes to be at correct offset when -file is open with O_APPEND and file is on a directio (forcedirectio) mount. -Fix bug in rewinding readdir directory searches. Add nodfs mount option. - -Version 1.53 ------------- -DFS support added (Microsoft Distributed File System client support needed -for referrals which enable a hierarchical name space among servers). -Disable temporary caching of mode bits to servers which do not support -storing of mode (e.g. Windows servers, when client mounts without cifsacl -mount option) and add new "dynperm" mount option to enable temporary caching -of mode (enable old behavior). Fix hang on mount caused when server crashes -tcp session during negotiate protocol. - -Version 1.52 ------------- -Fix oops on second mount to server when null auth is used. -Enable experimental Kerberos support. Return writebehind errors on flush -and sync so that events like out of disk space get reported properly on -cached files.
Fix setxattr failure to certain Samba versions. Fix mount -of second share to disconnected server session (autoreconnect on this). -Add ability to modify cifs acls for handling chmod (when mounted with -cifsacl flag). Fix prefixpath path separator so we can handle mounts -with prefixpaths longer than one directory (one path component) when -mounted to Windows servers. Fix slow file open when cifsacl -enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. - - -Version 1.51 ------------- -Fix memory leak in statfs when mounted to very old servers (e.g. -Windows 9x). Add new feature "POSIX open" which allows servers -which support the current POSIX Extensions to provide better semantics -(e.g. delete for open files opened with posix open). Take into -account umask on posix mkdir not just older style mkdir. Add -ability to mount to IPC$ share (which allows CIFS named pipes to be -opened, read and written as if they were files). When 1st tree -connect fails (e.g. due to signing negotiation failure) fix -leak that causes cifsd not to stop and rmmod to fail to cleanup -cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on -bigendian architectures. Fix possible memory corruption when -EAGAIN returned on kern_recvmsg. Return better error if server -requires packet signing but client has disabled it. When mounted -with cifsacl mount option - mode bits are approximated based -on the contents of the ACL of the file or directory. When cifs -mount helper is missing convert make sure that UNC name -has backslash (not forward slash) between ip address of server -and the share name. - -Version 1.50 ------------- -Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is -done with "serverino" mount option). Add support for POSIX Unlink -(helps with certain sharing violation cases when server such as -Samba supports newer POSIX CIFS Protocol Extensions). 
Add "nounix" -mount option to allow disabling the CIFS Unix Extensions for just -that mount. Fix hang on spinlock in find_writable_file (race when -reopening file after session crash). Byte range unlock request to -windows server could unlock more bytes (on server copy of file) -than intended if start of unlock request is well before start of -a previous byte range lock that we issued. - -Version 1.49 ------------- -IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6 -address after the "ip=" mount option, at least until mount.cifs is fixed to -handle DNS host to ipv6 name translation). Accept override of uid or gid -on mount even when Unix Extensions are negotiated (it used to be ignored -when Unix Extensions were ignored). This allows users to override the -default uid and gid for files when they are certain that the uids or -gids on the server do not match those of the client. Make "sec=none" -mount override username (so that null user connection is attempted) -to match what documentation said. Support for very large reads, over 127K, -available to some newer servers (such as Samba 3.0.26 and later but -note that it also requires setting CIFSMaxBufSize at module install -time to a larger value which may hurt performance in some cases). -Make sign option force signing (or fail if server does not support it). - -Version 1.48 ------------- -Fix mtime bouncing around from local idea of last write times to remote time. -Fix hang (in i_size_read) when simultaneous size update of same remote file -on smp system corrupts sequence number. Do not reread unnecessarily partial page -(which we are about to overwrite anyway) when writing out file opened rw. -When DOS attribute of file on non-Unix server's file changes on the server side -from read-only back to read-write, reflect this change in default file mode -(we had been leaving a file's mode read-only until the inode were reloaded). 
-Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute -when archive dos attribute not set and we are changing mode back to writeable -on server which does not support the Unix Extensions). Remove read only dos -attribute on chmod when adding any write permission (ie on any of -user/group/other (not all of user/group/other ie 0222) when -mounted to windows. Add support for POSIX MkDir (slight performance -enhancement and eliminates the network race between the mkdir and set -path info of the mode). - - -Version 1.47 ------------- -Fix oops in list_del during mount caused by unaligned string. -Fix file corruption which could occur on some large file -copies caused by writepages page i/o completion bug. -Seek to SEEK_END forces check for update of file size for non-cached -files. Allow file size to be updated on remote extend of locally open, -non-cached file. Fix reconnect to newer Samba servers (or other servers -which support the CIFS Unix/POSIX extensions) so that we again tell the -server the Unix/POSIX cifs capabilities which we support (SetFSInfo). -Add experimental support for new POSIX Open/Mkdir (which returns -stat information on the open, and allows setting the mode). - -Version 1.46 ------------- -Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. -Allow null user to be specified on mount ("username="). Do not return -EINVAL on readdir when filldir fails due to overwritten blocksize -(fixes FC problem). Return error in rename 2nd attempt retry (ie report -if rename by handle also fails, after rename by path fails, we were -not reporting whether the retry worked or not). Fix NTLMv2 to -work to Windows servers (mount with option "sec=ntlmv2"). - -Version 1.45 ------------- -Do not time out lockw calls when using posix extensions. Do not -time out requests if server still responding reasonably fast -on requests on other threads. 
Improve POSIX locking emulation, -(lock cancel now works, and unlock of merged range works even -to Windows servers now). Fix oops on mount to lanman servers -(win9x, os/2 etc.) when null password. Do not send listxattr -(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux -problem (instantiate inodes/dentries in right order for readdir). - -Version 1.44 ------------- -Rewritten sessionsetup support, including support for legacy SMB -session setup needed for OS/2 and older servers such as Windows 95 and 98. -Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst -so we can do search (ls etc.) to OS/2. Do not send NTCreateX -or recent levels of FindFirst unless server says it supports NT SMBs -(instead use legacy equivalents from LANMAN dialect). Fix to allow -NTLMv2 authentication support (now can use stronger password hashing -on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). -Allow override of global cifs security flags on mount via "sec=" option(s). - -Version 1.43 ------------- -POSIX locking to servers which support CIFS POSIX Extensions -(disabled by default controlled by proc/fs/cifs/Experimental). -Handle conversion of long share names (especially Asian languages) -to Unicode during mount. Fix memory leak in sess struct on reconnect. -Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on -cifs open which helps rare case when setpathinfo fails or server does -not support it. - -Version 1.42 ------------- -Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. Fix read -looping when signing required by server (2.6.16 kernel only). Fix readdir -vs. rename race which could cause each to hang. Return . and .. even -if server does not. Allow searches to skip first three entries and -begin at any location. Fix oops in find_writeable_file. 
- -Version 1.41 ------------- -Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can -configure stronger authentication. Fix sfu symlinks so they can -be followed (not just recognized). Fix wraparound of bcc on -read responses when buffer size over 64K and also fix wrap of -max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in -cifs_user_read and cifs_readpages (when EAGAIN on send of smb -on socket is returned over and over). Add POSIX (advisory) byte range -locking support (requires server with newest CIFS UNIX Extensions -to the protocol implemented). Slow down negprot slightly in port 139 -RFC1001 case to give session_init time on buggy servers. - -Version 1.40 ------------- -Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance -of readpages by eliminating one extra memcpy. Allow update of file size -from remote server even if file is open for write as long as mount is -directio. Recognize share mode security and send NTLM encrypted password -on tree connect if share mode negotiated. - -Version 1.39 ------------- -Defer close of a file handle slightly if pending writes depend on that handle -(this reduces the EBADF bad file handle errors that can be logged under heavy -stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 -Fix SFU style symlinks and mknod needed for servers which do not support the -CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative -dentries so files that the client sees as deleted but that later get created -on the server will be recognized. Add client side permission check on setattr. -Timeout stuck requests better (where server has never responded or sent corrupt -responses) - -Version 1.38 ------------- -Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket) -to be smaller at first (but increasing) so large write performance performance -over GigE is better. 
Do not hang thread on illegal byte range lock response -from Windows (Windows can send an RFC1001 size which does not match smb size) by -allowing an SMBs TCP length to be up to a few bytes longer than it should be. -wsize and rsize can now be larger than negotiated buffer size if server -supports large readx/writex, even when directio mount flag not specified. -Write size will in many cases now be 16K instead of 4K which greatly helps -file copy performance on lightly loaded networks. Fix oops in dnotify -when experimental config flag enabled. Make cifsFYI more granular. - -Version 1.37 ------------- -Fix readdir caching when unlink removes file in current search buffer, -and this is followed by a rewind search to just before the deleted entry. -Do not attempt to set ctime unless atime and/or mtime change requested -(most servers throw it away anyway). Fix length check of received smbs -to be more accurate. Fix big endian problem with mapchars mount option, -and with a field returned by statfs. - -Version 1.36 ------------- -Add support for mounting to older pre-CIFS servers such as Windows9x and ME. -For these older servers, add option for passing netbios name of server in -on mount (servernetbiosname). Add suspend support for power management, to -avoid cifsd thread preventing software suspend from working. -Add mount option for disabling the default behavior of sending byte range lock -requests to the server (necessary for certain applications which break with -mandatory lock behavior such as Evolution), and also mount option for -requesting case insensitive matching for path based requests (requesting -case sensitive is the default). - -Version 1.35 ------------- -Add writepage performance improvements. Fix path name conversions -for long filenames on mounts which were done with "mapchars" mount option -specified. Ensure multiplex ids do not collide. Fix case in which -rmmod can oops if done soon after last unmount. 
Fix truncated -search (readdir) output when resume filename was a long filename. -Fix filename conversion when mapchars mount option was specified and -filename was a long filename. - -Version 1.34 ------------- -Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. -Do not oops if root user kills cifs oplock kernel thread or -kills the cifsd thread (NB: killing the cifs kernel threads is not -recommended, unmount and rmmod cifs will kill them when they are -no longer needed). Fix readdir to ASCII servers (ie older servers -which do not support Unicode) and also require asterisk. -Fix out of memory case in which data could be written one page -off in the page cache. - -Version 1.33 ------------- -Fix caching problem, in which readdir of directory containing a file -which was cached could cause the file's time stamp to be updated -without invalidating the readahead data (so we could get stale -file data on the client for that file even as the server copy changed). -Cleanup response processing so cifsd can not loop when abnormally -terminated. - - -Version 1.32 ------------- -Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one -transact response for an SMB request and search entry split across two frames. -Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server) -as new protocol extensions. Do not send Get/Set calls for POSIX ACLs -unless server explicitly claims to support them in CIFS Unix extensions -POSIX ACL capability bit. Fix packet signing when multiuser mounting with -different users from the same client to the same server. Fix oops in -cifs_close. Add mount option for remapping reserved characters in -filenames (also allow recognizing files with created by SFU which have any -of these seven reserved characters, except backslash, to be recognized). -Fix invalid transact2 message (we were sometimes trying to interpret -oplock breaks as SMB responses). 
Add ioctl for checking that the -current uid matches the uid of the mounter (needed by umount.cifs). -Reduce the number of large buffer allocations in cifs response processing -(significantly reduces memory pressure under heavy stress with multiple -processes accessing the same server at the same time). - -Version 1.31 ------------- -Fix updates of DOS attributes and time fields so that files on NT4 servers -do not get marked delete on close. Display sizes of cifs buffer pools in -cifs stats. Fix oops in unmount when cifsd thread being killed by -shutdown. Add generic readv/writev and aio support. Report inode numbers -consistently in readdir and lookup (when serverino mount option is -specified use the inode number that the server reports - for both lookup -and readdir, otherwise by default the locally generated inode number is used -for inodes created in either path since servers are not always able to -provide unique inode numbers when exporting multiple volumes from under one -sharename). - -Version 1.30 ------------- -Allow new nouser_xattr mount parm to disable xattr support for user namespace. -Do not flag user_xattr mount parm in dmesg. Retry failures setting file time -(mostly affects NT4 servers) by retry with handle based network operation. -Add new POSIX Query FS Info for returning statfs info more accurately. -Handle passwords with multiple commas in them. - -Version 1.29 ------------- -Fix default mode in sysfs of cifs module parms. Remove old readdir routine. -Fix capabilities flags for large readx so as to allow reads larger than 64K. - -Version 1.28 ------------- -Add module init parm for large SMB buffer size (to allow it to be changed -from its default of 16K) which is especially useful for large file copy -when mounting with the directio mount option. Fix oops after -returning from mount when experimental ExtendedSecurity enabled and -SpnegoNegotiated returning invalid error. 
Fix case to retry better when -peek returns from 1 to 3 bytes on socket which should have more data. -Fixed path based calls (such as cifs lookup) to handle path names -longer than 530 (now can handle PATH_MAX). Fix pass through authentication -from Samba server to DC (Samba required dummy LM password). - -Version 1.27 ------------- -Turn off DNOTIFY (directory change notification support) by default -(unless built with the experimental flag) to fix hang with KDE -file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event -waiting on an SMB response) in SendReceive when session dies but -reconnects quickly from another task. Add module init parms for -minimum number of large and small network buffers in the buffer pools, -and for the maximum number of simultaneous requests. - -Version 1.26 ------------- -Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later -and other POSIX CIFS compliant servers. Fix error mapping for getfacl -to EOPNOTSUPP when server does not support posix acls on the wire. Fix -improperly zeroed buffer in CIFS Unix extensions set times call. - -Version 1.25 ------------- -Fix internationalization problem in cifs readdir with filenames that map to -longer UTF-8 strings than the string on the wire was in Unicode. Add workaround -for readdir to netapp servers. Fix search rewind (seek into readdir to return -non-consecutive entries). Do not do readdir when server negotiates -buffer size too small to fit filename. Add support for reading POSIX ACLs from -the server (add also acl and noacl mount options). - -Version 1.24 ------------- -Optionally allow using server side inode numbers, rather than client generated -ones by specifying mount option "serverino" - this is required for some apps -to work which double check hardlinked files and have persistent inode numbers. - -Version 1.23 ------------- -Multiple bigendian fixes.
On little endian systems (for reconnect after -network failure) fix tcp session reconnect code so we do not try first -to reconnect on reverse of port 445. Treat reparse points (NTFS junctions) -as directories rather than symlinks because we can do follow link on them. - -Version 1.22 ------------- -Add config option to enable XATTR (extended attribute) support, mapping -xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of -minor fixes pointed out by the Stanford SWAT checker (mostly missing -or out of order NULL pointer checks in little used error paths). - -Version 1.21 ------------- -Add new mount parm to control whether mode check (generic_permission) is done -on the client. If Unix extensions are enabled and the uids on the client -and server do not match, client permission checks are meaningless on -server uids that do not exist on the client (this does not affect the -normal ACL check which occurs on the server). Fix default uid -on mknod to match create and mkdir. Add optional mount parm to allow -override of the default uid behavior (in which the server sets the uid -and gid of newly created files). Normally for network filesystem mounts -user want the server to set the uid/gid on newly created files (rather than -using uid of the client processes you would in a local filesystem). - -Version 1.20 ------------- -Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps -info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir -(in build_wildcard_path_from_dentry). Fix mknod to pass type field -(block/char/fifo) properly. Remove spurious mount warning log entry when -credentials passed as mount argument. Set major/minor device number in -inode for block and char devices when unix extensions enabled. - -Version 1.19 ------------- -Fix /proc/fs/cifs/Stats and DebugData display to handle larger -amounts of return data. 
Properly limit requests to MAX_REQ (50 -is the usual maximum active multiplex SMB/CIFS requests per server). -Do not kill cifsd (and thus hurt the other SMB session) when more than one -session to the same server (but with different userids) exists and one -of the two user's smb sessions is being removed while leaving the other. -Do not loop reconnecting in cifsd demultiplex thread when admin -kills the thread without going through unmount. - -Version 1.18 ------------- -Do not rename hardlinked files (since that should be a noop). Flush -cached write behind data when reopening a file after session abend, -except when already in write. Grab per socket sem during reconnect -to avoid oops in sendmsg if overlapping with reconnect. Do not -reset cached inode file size on readdir for files open for write on -client. - - -Version 1.17 ------------- -Update number of blocks in file so du command is happier (in Linux a fake -blocksize of 512 is required for calculating number of blocks in inode). -Fix prepare write of partial pages to read in data from server if possible. -Fix race on tcpStatus field between unmount and reconnection code, causing -cifsd process sometimes to hang around forever. Improve out of memory -checks in cifs_filldir - -Version 1.16 ------------- -Fix incorrect file size in file handle based setattr on big endian hardware. -Fix oops in build_path_from_dentry when out of memory. Add checks for invalid -and closing file structs in writepage/partialpagewrite. Add statistics -for each mounted share (new menuconfig option). Fix endianness problem in -volume information displayed in /proc/fs/cifs/DebugData (only affects -affects big endian architectures). Prevent renames while constructing -path names for open, mkdir and rmdir. - -Version 1.15 ------------- -Change to mempools for alloc smb request buffers and multiplex structs -to better handle low memory problems (and potential deadlocks). 
- -Version 1.14 ------------- -Fix incomplete listings of large directories on Samba servers when Unix -extensions enabled. Fix oops when smb_buffer can not be allocated. Fix -rename deadlock when writing out dirty pages at same time. - -Version 1.13 ------------- -Fix open of files in which O_CREATE can cause the mode to change in -some cases. Fix case in which retry of write overlaps file close. -Fix PPC64 build error. Reduce excessive stack usage in smb password -hashing. Fix overwrite of Linux user's view of file mode to Windows servers. - -Version 1.12 ------------- -Fixes for large file copy, signal handling, socket retry, buffer -allocation and low memory situations. - -Version 1.11 ------------- -Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize) -also now allowing support for specifying client netbiosname. NT4 support added. - -Version 1.10 ------------- -Fix reconnection (and certain failed mounts) to properly wake up the -blocked users thread so it does not seem hung (in some cases was blocked -until the cifs receive timeout expired). Fix spurious error logging -to kernel log when application with open network files killed. - -Version 1.09 ------------- -Fix /proc/fs module unload warning message (that could be logged -to the kernel log). Fix intermittent failure in connectathon -test7 (hardlink count not immediately refreshed in case in which -inode metadata can be incorrectly kept cached when time near zero) - -Version 1.08 ------------- -Allow file_mode and dir_mode (specified at mount time) to be enforced -locally (the server already enforced its own ACLs too) for servers -that do not report the correct mode (do not support the -CIFS Unix Extensions). - -Version 1.07 ------------- -Fix some small memory leaks in some unmount error paths. Fix major leak -of cache pages in readpages causing multiple read oriented stress -testcases (including fsx, and even large file copy) to fail over time. 
- -Version 1.06 ------------- -Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. -This allows files that differ only in case and improves performance of file -creation and file open to such servers. Fix semaphore conflict which causes -slow delete of open file to Samba (which unfortunately can cause an oplock -break to self while vfs_unlink held i_sem) which can hang for 20 seconds. - -Version 1.05 ------------- -fixes to cifs_readpages for fsx test case - -Version 1.04 ------------- -Fix caching data integrity bug when extending file size especially when no -oplock on file. Fix spurious logging of valid already parsed mount options -that are parsed outside of the cifs vfs such as nosuid. - - -Version 1.03 ------------- -Connect to server when port number override not specified, and tcp port -unitialized. Reset search to restart at correct file when kernel routine -filldir returns error during large directory searches (readdir). - -Version 1.02 ------------- -Fix caching problem when files opened by multiple clients in which -page cache could contain stale data, and write through did -not occur often enough while file was still open when read ahead -(read oplock) not allowed. Treat "sep=" when first mount option -as an override of comma as the default separator between mount -options. - -Version 1.01 ------------- -Allow passwords longer than 16 bytes. Allow null password string. - -Version 1.00 ------------- -Gracefully clean up failed mounts when attempting to mount to servers such as -Windows 98 that terminate tcp sessions during protocol negotiation. Handle -embedded commas in mount parsing of passwords. - -Version 0.99 ------------- -Invalidate local inode cached pages on oplock break and when last file -instance is closed so that the client does not continue using stale local -copy rather than later modified server copy of file. Do not reconnect -when server drops the tcp session prematurely before negotiate -protocol response. 
Fix oops in reopen_file when dentry freed. Allow -the support for CIFS Unix Extensions to be disabled via proc interface. - -Version 0.98 ------------- -Fix hang in commit_write during reconnection of open files under heavy load. -Fix unload_nls oops in a mount failure path. Serialize writes to same socket -which also fixes any possible races when cifs signatures are enabled in SMBs -being sent out of signature sequence number order. - -Version 0.97 ------------- -Fix byte range locking bug (endian problem) causing bad offset and -length. - -Version 0.96 ------------- -Fix oops (in send_sig) caused by CIFS unmount code trying to -wake up the demultiplex thread after it had exited. Do not log -error on harmless oplock release of closed handle. - -Version 0.95 ------------- -Fix unsafe global variable usage and password hash failure on gcc 3.3.1 -Fix problem reconnecting secondary mounts to same server after session -failure. Fix invalid dentry - race in mkdir when directory gets created -by another client between the lookup and mkdir. - -Version 0.94 ------------- -Fix to list processing in reopen_files. Fix reconnection when server hung -but tcpip session still alive. Set proper timeout on socket read. - -Version 0.93 ------------- -Add missing mount options including iocharset. SMP fixes in write and open. -Fix errors in reconnecting after TCP session failure. Fix module unloading -of default nls codepage - -Version 0.92 ------------- -Active smb transactions should never go negative (fix double FreeXid). Fix -list processing in file routines. Check return code on kmalloc in open. -Fix spinlock usage for SMP. - -Version 0.91 ------------- -Fix oops in reopen_files when invalid dentry. drop dentry on server rename -and on revalidate errors. Fix cases where pid is now tgid. Fix return code -on create hard link when server does not support them. - -Version 0.90 ------------- -Fix scheduling while atomic error in getting inode info on newly created file. 
-Fix truncate of existing files opened with O_CREAT but not O_TRUNC set. - -Version 0.89 ------------- -Fix oops on write to dead tcp session. Remove error log write for case when file open -O_CREAT but not O_EXCL - -Version 0.88 ------------- -Fix non-POSIX behavior on rename of open file and delete of open file by taking -advantage of trans2 SetFileInfo rename facility if available on target server. -Retry on ENOSPC and EAGAIN socket errors. - -Version 0.87 ------------- -Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix -allocation size miscalculation. After oplock token lost do not read through -cache. - -Version 0.86 ------------- -Fix oops on empty file readahead. Fix for file size handling for locally cached files. - -Version 0.85 ------------- -Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files -during auto reconnection to server after server recovered from failure. - -Version 0.84 ------------- -Finish support for Linux 2.5 open/create changes, which removes the -redundant NTCreate/QPathInfo/close that was sent during file create. -Enable oplock by default. Enable packet signing by default (needed to -access many recent Windows servers) - -Version 0.83 ------------- -Fix oops when mounting to long server names caused by inverted parms to kmalloc. -Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled -we will choose a cifs user session (smb uid) that better matches the local -uid if a) the mount uid does not match the current uid and b) we have another -session to the same server (ip address) for a different mount which -matches the current local uid. - -Version 0.82 ------------- -Add support for mknod of block or character devices. Fix oplock -code (distributed caching) to properly send response to oplock -break from server. - -Version 0.81 ------------- -Finish up CIFS packet digital signing for the default -NTLM security case. 
This should help Windows 2003 -network interoperability since it is common for -packet signing to be required now. Fix statfs (stat -f) -which recently started returning errors due to -invalid value (-1 instead of 0) being set in the -struct kstatfs f_ffiles field. - -Version 0.80 ------------ -Fix oops on stopping oplock thread when removing cifs when -built as module. - -Version 0.79 ------------- -Fix mount options for ro (readonly), uid, gid and file and directory mode. - -Version 0.78 ------------- -Fix errors displayed on failed mounts to be more understandable. -Fixed various incorrect or misleading smb to posix error code mappings. - -Version 0.77 ------------- -Fix display of NTFS DFS junctions to display as symlinks. -They are the network equivalent. Fix oops in -cifs_partialpagewrite caused by missing spinlock protection -of openfile linked list. Allow writebehind caching errors to -be returned to the application at file close. - -Version 0.76 ------------- -Clean up options displayed in /proc/mounts by show_options to -be more consistent with other filesystems. - -Version 0.75 ------------- -Fix delete of readonly file to Windows servers. Reflect -presence or absence of read only dos attribute in mode -bits for servers that do not support CIFS Unix extensions. -Fix shortened results on readdir of large directories to -servers supporting CIFS Unix extensions (caused by -incorrect resume key). - -Version 0.74 ------------- -Fix truncate bug (set file size) that could cause hangs e.g. running fsx - -Version 0.73 ------------- -unload nls if mount fails. - -Version 0.72 ------------- -Add resume key support to search (readdir) code to workaround -Windows bug. Add /proc/fs/cifs/LookupCacheEnable which -allows disabling caching of attribute information for -lookups. - -Version 0.71 ------------- -Add more oplock handling (distributed caching code). Remove -dead code. Remove excessive stack space utilization from -symlink routines. 
- -Version 0.70 ------------- -Fix oops in get dfs referral (triggered when null path sent in to -mount). Add support for overriding rsize at mount time. - -Version 0.69 ------------- -Fix buffer overrun in readdir which caused intermittent kernel oopses. -Fix writepage code to release kmap on write data. Allow "-ip=" new -mount option to be passed in on parameter distinct from the first part -(server name portion of) the UNC name. Allow override of the -tcp port of the target server via new mount option "-port=" - -Version 0.68 ------------- -Fix search handle leak on rewind. Fix setuid and gid so that they are -reflected in the local inode immediately. Cleanup of whitespace -to make 2.4 and 2.5 versions more consistent. - - -Version 0.67 ------------- -Fix signal sending so that captive thread (cifsd) exits on umount -(which was causing the warning in kmem_cache_free of the request buffers -at rmmod time). This had broken as a sideeffect of the recent global -kernel change to daemonize. Fix memory leak in readdir code which -showed up in "ls -R" (and applications that did search rewinding). - -Version 0.66 ------------- -Reconnect tids and fids after session reconnection (still do not -reconnect byte range locks though). Fix problem caching -lookup information for directory inodes, improving performance, -especially in deep directory trees. Fix various build warnings. - -Version 0.65 ------------- -Finish fixes to commit write for caching/readahead consistency. fsx -now works to Samba servers. Fix oops caused when readahead -was interrupted by a signal. - -Version 0.64 ------------- -Fix data corruption (in partial page after truncate) that caused fsx to -fail to Windows servers. Cleaned up some extraneous error logging in -common error paths. Add generic sendfile support. - -Version 0.63 ------------- -Fix memory leak in AllocMidQEntry. -Finish reconnection logic, so connection with server can be dropped -(or server rebooted) and the cifs client will reconnect. 
- -Version 0.62 ------------- -Fix temporary socket leak when bad userid or password specified -(or other SMBSessSetup failure). Increase maximum buffer size to slightly -over 16K to allow negotiation of up to Samba and Windows server default read -sizes. Add support for readpages - -Version 0.61 ------------- -Fix oops when username not passed in on mount. Extensive fixes and improvements -to error logging (strip redundant newlines, change debug macros to ensure newline -passed in and to be more consistent). Fix writepage wrong file handle problem, -a readonly file handle could be incorrectly used to attempt to write out -file updates through the page cache to multiply open files. This could cause -the iozone benchmark to fail on the fwrite test. Fix bug mounting two different -shares to the same Windows server when using different usernames -(doing this to Samba servers worked but Windows was rejecting it) - now it is -possible to use different userids when connecting to the same server from a -Linux client. Fix oops when treeDisconnect called during unmount on -previously freed socket. - -Version 0.60 ------------- -Fix oops in readpages caused by not setting address space operations in inode in -rare code path. - -Version 0.59 ------------- -Includes support for deleting of open files and renaming over existing files (per POSIX -requirement). Add readlink support for Windows junction points (directory symlinks). - -Version 0.58 ------------- -Changed read and write to go through pagecache. Added additional address space operations. -Memory mapped operations now working. - -Version 0.57 ------------- -Added writepage code for additional memory mapping support. Fixed leak in xids causing -the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on -every stat call. Additional formatting cleanup. - -Version 0.56 ------------- -Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup. 
- -Version 0.55 ------------- -Fixes from Zwane Mwaikambo for adding missing return code checking in a few places. -Also included a modified version of his fix to protect global list manipulation of -the smb session and tree connection and mid related global variables. - -Version 0.54 ------------- -Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre -changes to superblock layout. Remove wasteful allocation of smb buffers (now the send -buffer is reused for responses). Add more oplock handling. Additional minor cleanup. - -Version 0.53 ------------- -More stylistic updates to better match kernel style. Add additional statistics -for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2 -and CIFS Packet Signing enablement. - -Version 0.52 ------------- -Replace call to sleep_on with safer wait_on_event. -Make stylistic changes to better match kernel style recommendations. -Remove most typedef usage (except for the PDUs themselves). - -Version 0.51 ------------- -Update mount so the -unc mount option is no longer required (the ip address can be specified -in a UNC style device name. Implementation of readpage/writepage started. - -Version 0.50 ------------- -Fix intermittent problem with incorrect smb header checking on badly -fragmented tcp responses - -Version 0.49 ------------- -Fixes to setting of allocation size and file size. - -Version 0.48 ------------- -Various 2.5.38 fixes. Now works on 2.5.38 - -Version 0.47 ------------- -Prepare for 2.5 kernel merge. Remove ifdefs. - -Version 0.46 ------------- -Socket buffer management fixes. Fix dual free. - -Version 0.45 ------------- -Various big endian fixes for hardlinks and symlinks and also for dfs. - -Version 0.44 ------------- -Various big endian fixes for servers with Unix extensions such as Samba - -Version 0.43 ------------- -Various FindNext fixes for incorrect filenames on large directory searches on big endian -clients. 
basic posix file i/o tests now work on big endian machines, not just le - -Version 0.42 ------------- -SessionSetup and NegotiateProtocol now work from Big Endian machines. -Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older -versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7). - -Version 0.41 ------------- -Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked -files now return the correct number of links on fstat as they are repeatedly linked and unlinked. - -Version 0.40 ------------- -Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate -session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP. -Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs -(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos). - -Version 0.38 ------------- -Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable -it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU). - -Version 0.37 ------------- -Rewrote much of connection and mount/unmount logic to handle bugs with -multiple uses to same share, multiple users to same server etc. - -Version 0.36 ------------- -Fixed major problem with dentry corruption (missing call to dput) - -Version 0.35 ------------- -Rewrite of readdir code to fix bug. Various fixes for bigendian machines. -Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs -although corresponding function not fully implemented in the vfs yet - -Version 0.34 ------------- -Fixed dentry caching bug, misc. cleanup - -Version 0.33 ------------- -Fixed 2.5 support to handle build and configure changes as well as misc. 
2.5 changes. Now can build -on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels. -Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added. - -Version 0.32 ------------- -Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented -and tested against Samba 2.2.5 - - -Version 0.31 ------------- -1) Fixed lockrange to be correct (it was one byte too short) - -2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly -show range as locked when there is a conflict with an existing lock. - -3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories -in most cases. Eventually will offer optional ability to query server for the correct perms. - -3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded -but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb -session) - -4) Fixed error logging of valid mount options - -5) Removed logging of password field. - -6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c -and cleaned them up and made them more consistent with other cifs functions. - -7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways -(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed, -nor is the symlink support using the Unix extensions - -8) Started adding the readlink and follow_link code - -Version 0.3 ------------ -Initial drop - +See https://wiki.samba.org/index.php/LinuxCIFSKernel for summary +information (that may be easier to read than parsing the output of +"git log fs/cifs") about fixes/improvements to CIFS/SMB2/SMB3 support (changes +to cifs.ko module) by kernel version (and cifs internal module version). 
diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README index 99ce3d25003d..4a804619cff2 100644 --- a/Documentation/filesystems/cifs/README +++ b/Documentation/filesystems/cifs/README @@ -603,8 +603,7 @@ DebugData Displays information about active CIFS sessions and shares, features enabled as well as the cifs.ko version. Stats Lists summary resource usage information as well as per - share statistics, if CONFIG_CIFS_STATS in enabled - in the kernel configuration. + share statistics. Configuration pseudo-files: SecurityFlags Flags which control security negotiation and @@ -687,23 +686,22 @@ cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel logging of various informational messages. 2 enables logging of non-zero SMB return codes while 4 enables logging of requests that take longer than one second to complete (except for byte range lock requests). -Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the -source code (typically by setting it in the beginning of cifsglob.h), -and setting it to seven enables all three. Finally, tracing +Setting it to 4 requires CONFIG_CIFS_STATS2 to be set in kernel configuration +(.config). Setting it to seven enables all three. Finally, tracing the start of smb requests and responses can be enabled via: echo 1 > /proc/fs/cifs/traceSMB -Per share (per client mount) statistics are available in /proc/fs/cifs/Stats -if the kernel was configured with cifs statistics enabled. The statistics -represent the number of successful (ie non-zero return code from the server) -SMB responses to some of the more common commands (open, delete, mkdir etc.). +Per share (per client mount) statistics are available in /proc/fs/cifs/Stats. +Additional information is available if CONFIG_CIFS_STATS2 is enabled in the +kernel configuration (.config). 
The statistics returned include counters which +represent the number of attempted and failed (ie non-zero return code from the +server) SMB3 (or cifs) requests grouped by request type (read, write, close etc.). Also recorded is the total bytes read and bytes written to the server for that share. Note that due to client caching effects this can be less than the number of bytes read and written by the application running on the client. -The statistics for the number of total SMBs and oplock breaks are different in -that they represent all for that share, not just those for which the server -returned success. +Statistics can be reset to zero by "echo 0 > /proc/fs/cifs/Stats" which may be +useful if comparing performance of two different scenarios. Also note that "cat /proc/fs/cifs/DebugData" will display information about the active sessions and the shares that are mounted. diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index c5adf149b57f..852499aed64b 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -9,14 +9,14 @@ is a partial list of the known problems and missing features: a) SMB3 (and SMB3.02) missing optional features: - multichannel (started), integration with RDMA - - directory leases (improved metadata caching) - - T10 copy offload (copy chunk, and "Duplicate Extents" ioctl + - directory leases (improved metadata caching), started (root dir only) + - T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl currently the only two server side copy mechanisms supported) b) improved sparse file support c) Directory entry caching relies on a 1 second timer, rather than -using Directory Leases +using Directory Leases, currently only the root file handle is cached longer d) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) @@ -42,6 +42,8 @@ mount or a per server basis to client UIDs or nobody if no 
mapping exists. Also better integration with winbind for resolving SID owners k) Add tools to take advantage of more smb3 specific ioctls and features +(passthrough ioctl/fsctl for sending various SMB3 fsctls to the server +is in progress) l) encrypted file support @@ -71,9 +73,8 @@ t) split cifs and smb3 support into separate modules so legacy (and less secure) CIFS dialect can be disabled in environments that don't need it and simplify the code. -u) Finish up SMB3.1.1 dialect support - -v) POSIX Extensions for SMB3.1.1 +v) POSIX Extensions for SMB3.1.1 (started, create and mkdir support added +so far). KNOWN BUGS ==================================== @@ -92,8 +93,8 @@ Misc testing to do 1) check out max path names and max path name components against various server types. Try nested symlinks (8 deep). Return max path name in stat -f information -2) Improve xfstest's cifs enablement and adapt xfstests where needed to test -cifs better +2) Improve xfstest's cifs/smb3 enablement and adapt xfstests where needed to test +cifs/smb3 better 3) Additional performance testing and optimization using iozone and similar - there are some easy changes that can be done to parallelize sequential writes, diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4/ext4.rst index 7f628b9f7c4b..9d4368d591fa 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4/ext4.rst @@ -1,6 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 -Ext4 Filesystem -=============== +======================== +General Information +======================== Ext4 is an advanced level of the ext3 filesystem which incorporates scalability and reliability enhancements for supporting large filesystems @@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org Web site: http://ext4.wiki.kernel.org -1. 
Quick usage instructions: -=========================== +Quick usage instructions +======================== Note: More extensive information for getting started with ext4 can be - found at the ext4 wiki site at the URL: - http://ext4.wiki.kernel.org/index.php/Ext4_Howto +found at the ext4 wiki site at the URL: +http://ext4.wiki.kernel.org/index.php/Ext4_Howto - - Compile and install the latest version of e2fsprogs (as of this - writing version 1.41.3) from: + - The latest version of e2fsprogs can be found at: + + https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - http://sourceforge.net/project/showfiles.php?group_id=2406 - or - https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + http://sourceforge.net/project/showfiles.php?group_id=2406 or grab the latest git repository from: - git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - - - Note that it is highly important to install the mke2fs.conf file - that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If - you have edited the /etc/mke2fs.conf file installed on your system, - you will need to merge your changes with the version from e2fsprogs - 1.41.x. + https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - Create a new filesystem using the ext4 filesystem type: - # mke2fs -t ext4 /dev/hda1 + # mke2fs -t ext4 /dev/hda1 - Or to configure an existing ext3 filesystem to support extents: + Or to configure an existing ext3 filesystem to support extents: # tune2fs -O extents /dev/hda1 @@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be # tune2fs -I 256 /dev/hda1 - (Note: we currently do not have tools to convert an ext4 - filesystem back to ext3; so please do not do try this on production - filesystems.) - - Mounting: # mount -t ext4 /dev/hda1 /wherever @@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be the filesystem with a large journal can also be helpful for metadata-intensive workloads. -2. 
Features -=========== +Features +======== -2.1 Currently available +Currently Available +------------------- * ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) @@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be [1] Filesystems with a block size of 1k may see a limit imposed by the directory hash tree having a maximum depth of two. -2.2 Candidate features for future inclusion - -* online defrag (patches available but not well tested) -* reduced mke2fs time via lazy itable initialization in conjunction with - the uninit_bg feature (capability to do this is available in e2fsprogs - but a kernel thread to do lazy zeroing of unused inode table blocks - after filesystem is first mounted is required for safety) - -There are several others under discussion, whether they all make it in is -partly a function of how much time everyone has to work on them. Features like -metadata checksumming have been discussed and planned for a bit but no patches -exist yet so I'm not sure they're in the near-term roadmap. - -The big performance win will come with mballoc, delalloc and flex_bg -grouping of bitmaps and inode tables. Some test results available here: - - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html - -3. Options -========== +Options +======= When mounting an ext4 filesystem, the following options are accepted: (*) == default +======================= ======================================================= +Mount Option Description +======================= ======================================================= ro Mount filesystem read only. Note that ext4 will replay the journal (and thus write to the partition) even when mounted "read only". The @@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. 
This option is dax Use direct access (no page cache). See Documentation/filesystems/dax.txt. Note that this option is incompatible with data=journal. +======================= ======================================================= Data Mode ========= There are 3 different data modes: * writeback mode -In data=writeback mode, ext4 does not journal data at all. This mode provides -a similar level of journaling as that of XFS, JFS, and ReiserFS in its default -mode - metadata journaling. A crash+recovery can cause incorrect data to -appear in files which were written shortly before the crash. This mode will -typically provide the best ext4 performance. + + In data=writeback mode, ext4 does not journal data at all. This mode provides + a similar level of journaling as that of XFS, JFS, and ReiserFS in its default + mode - metadata journaling. A crash+recovery can cause incorrect data to + appear in files which were written shortly before the crash. This mode will + typically provide the best ext4 performance. * ordered mode -In data=ordered mode, ext4 only officially journals metadata, but it logically -groups metadata information related to data changes with the data blocks into a -single unit called a transaction. When it's time to write the new metadata -out to disk, the associated data blocks are written first. In general, -this mode performs slightly slower than writeback but significantly faster than journal mode. + + In data=ordered mode, ext4 only officially journals metadata, but it logically + groups metadata information related to data changes with the data blocks into + a single unit called a transaction. When it's time to write the new metadata + out to disk, the associated data blocks are written first. In general, this + mode performs slightly slower than writeback but significantly faster than + journal mode. * journal mode -data=journal mode provides full data and metadata journaling. 
All new data is -written to the journal first, and then to its final location. -In the event of a crash, the journal can be replayed, bringing both data and -metadata into a consistent state. This mode is the slowest except when data -needs to be read from and written to disk at the same time where it -outperforms all others modes. Enabling this mode will disable delayed -allocation and O_DIRECT support. + + data=journal mode provides full data and metadata journaling. All new data is + written to the journal first, and then to its final location. In the event of + a crash, the journal can be replayed, bringing both data and metadata into a + consistent state. This mode is the slowest except when data needs to be read + from and written to disk at the same time where it outperforms all other + modes. Enabling this mode will disable delayed allocation and O_DIRECT + support. /proc entries ============= @@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in in table below. Files in /proc/fs/ext4/<devname> -.............................................................................. + +================ ======= File Content +================ ======= mb_groups details of multiblock allocator buddy cache of free blocks -.............................................................................. +================ ======= /sys entries ============ @@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in /sys/fs/ext4/dm-0). The files in each per-device directory are shown in table below. -Files in /sys/fs/ext4/<devname> +Files in /sys/fs/ext4/<devname>: + (see also Documentation/ABI/testing/sysfs-fs-ext4) -.............................................................................. 
- File Content +============================= ================================================= +File Content +============================= ================================================= delayed_allocation_blocks This file is read-only and shows the number of blocks that are dirty in the page cache, but which do not have their location in the filesystem allocated yet. - inode_goal Tuning parameter which (if non-zero) controls +inode_goal Tuning parameter which (if non-zero) controls the goal inode used by the inode allocator in preference to all other allocation heuristics. This is intended for debugging use only, and should be 0 on production systems. - inode_readahead_blks Tuning parameter which controls the maximum +inode_readahead_blks Tuning parameter which controls the maximum number of inode table blocks that ext4's inode table readahead algorithm will pre-read into the buffer cache - lifetime_write_kbytes This file is read-only and shows the number of +lifetime_write_kbytes This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was created. @@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname> in the file system. If there is not enough space for the reserved space when mounting the file mount will _not_ fail. -.............................................................................. +============================= ================================================= Ioctls ====== @@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are shown in the table below. Table of Ext4 specific ioctls -.............................................................................. - Ioctl Description + +============================= ================================================= +Ioctl Description +============================= ================================================= EXT4_IOC_GETFLAGS Get additional attributes associated with inode. 
The ioctl argument is an integer bitfield, with bit values described in ext4.h. This ioctl is an @@ -610,8 +597,7 @@ Table of Ext4 specific ioctls normal user by accident. The data blocks of the previous boot loader will be associated with the given inode. - -.............................................................................. +============================= ================================================= References ========== diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst new file mode 100644 index 000000000000..71121605558c --- /dev/null +++ b/Documentation/filesystems/ext4/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +ext4 Filesystem +=============== + +General usage and on-disk artifacts written by ext4. More documentation may +be ported from the wiki as time permits. This should be considered the +canonical source of information as the details here have been reviewed by +the ext4 community. + +.. toctree:: + :maxdepth: 5 + :numbered: + + ext4 + ondisk/index diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst new file mode 100644 index 000000000000..0aadba052264 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/about.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +About this Book +=============== + +This document attempts to describe the on-disk format for ext4 +filesystems. The same general ideas should apply to ext2/3 filesystems +as well, though they do not support all the features that ext4 supports, +and the fields will be shorter. + +**NOTE**: This is a work in progress, based on notes that the author +(djwong) made while picking apart a filesystem by hand. The data +structure definitions should be current as of Linux 4.18 and +e2fsprogs-1.44.
All comments and corrections are welcome, since there is +undoubtedly plenty of lore that might not be reflected in freshly +created demonstration filesystems. + +License +------- +This book is licensed under the terms of the GNU Public License, v2. + +Terminology +----------- + +ext4 divides a storage device into an array of logical blocks both to +reduce bookkeeping overhead and to increase throughput by forcing larger +transfer sizes. Generally, the block size will be 4KiB (the same size as +pages on x86 and the block layer's default block size), though the +actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. +Throughout this document, disk locations are given in terms of these +logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of +convenience, the logical block size will be referred to as +``$block_size`` throughout the rest of the document. + +When referenced in ``preformatted text`` blocks, ``sb`` refers to fields +in the super block, and ``inode`` refers to fields in an inode table +entry. + +Other References +---------------- + +Also see http://www.nongnu.org/ext2-doc/ for quite a collection of +information about ext2/3. Here's another old reference: +http://wiki.osdev.org/Ext2 diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst new file mode 100644 index 000000000000..7aa85152ace3 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/allocators.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and Inode Allocation Policy +--------------------------------- + +ext4 recognizes (better than ext3, anyway) that data locality is +generally a desirable quality of a filesystem. On a spinning disk, +keeping related blocks near each other reduces the amount of movement +that the head actuator and disk must perform to access a data block, +thus speeding up disk IO.
On an SSD there of course are no moving parts, +but locality can increase the size of each transfer request while +reducing the total number of requests. This locality may also have the +effect of concentrating writes on a single erase block, which can speed +up file rewrites significantly. Therefore, it is useful to reduce +fragmentation whenever possible. + +The first tool that ext4 uses to combat fragmentation is the multi-block +allocator. When a file is first created, the block allocator +speculatively allocates 8KiB of disk space to the file on the assumption +that the space will get written soon. When the file is closed, the +unused speculative allocations are of course freed, but if the +speculation is correct (typically the case for full writes of small +files) then the file data gets written out in a single multi-block +extent. A second related trick that ext4 uses is delayed allocation. +Under this scheme, when a file needs more blocks to absorb file writes, +the filesystem defers deciding the exact placement on the disk until all +the dirty buffers are being written out to disk. By not committing to a +particular placement until it's absolutely necessary (the commit timeout +is hit, or sync() is called, or the kernel runs out of memory), the hope +is that the filesystem can make better location decisions. + +The third trick that ext4 (and ext3) uses is that it tries to keep a +file's data blocks in the same block group as its inode. This cuts down +on the seek penalty when the filesystem first has to read a file's inode +to learn where the file's data blocks live and then seek over to the +file's data blocks to begin I/O operations. + +The fourth trick is that all the inodes in a directory are placed in the +same block group as the directory, when feasible. The working assumption +here is that all the files in a directory might be related, therefore it +is useful to try to keep them all together. 
+ +The fifth trick is that the disk volume is cut up into 128MB block +groups; these mini-containers are used as outlined above to try to +maintain data locality. However, there is a deliberate quirk -- when a +directory is created in the root directory, the inode allocator scans +the block groups and puts that directory into the least heavily loaded +block group that it can find. This encourages directories to spread out +over a disk; as the top-level directory/file blobs fill up one block +group, the allocators simply move on to the next block group. Allegedly +this scheme evens out the loading on the block groups, though the author +suspects that the directories which are so unlucky as to land towards +the end of a spinning drive get a raw deal performance-wise. + +Of course if all of these mechanisms fail, one can always use e4defrag +to defragment files. diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst new file mode 100644 index 000000000000..0b01b67b81fe --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/attributes.rst @@ -0,0 +1,191 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Extended Attributes +------------------- + +Extended attributes (xattrs) are typically stored in a separate data +block on the disk and referenced from inodes via ``inode.i_file_acl*``. +The first use of extended attributes seems to have been for storing file +ACLs and other security data (selinux). With the ``user_xattr`` mount +option it is possible for users to store extended attributes so long as +all attribute names begin with “user”; this restriction seems to have +disappeared as of Linux 3.0. + +There are two places where extended attributes can be found. The first +place is between the end of each inode entry and the beginning of the +next inode entry. 
For example, if inode.i\_extra\_isize = 28 and +sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes +available for in-inode extended attribute storage. The second place +where extended attributes can be found is in the block pointed to by +``inode.i_file_acl``. As of Linux 3.11, it is not possible for this +block to contain a pointer to a second extended attribute block (or even +the remaining blocks of a cluster). In theory it is possible for each +attribute's value to be stored in a separate data block, though as of +Linux 3.11 the code does not permit this. + +Keys are generally assumed to be ASCIIZ strings, whereas values can be +strings or binary data. + +Extended attributes, when stored after the inode, have a header +``ext4_xattr_ibody_header`` that is 4 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. This value is set by the + Linux driver, though e2fsprogs doesn't seem to check it(?) + +The beginning of an extended attribute block is in +``struct ext4_xattr_header``, which is 32 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. + * - 0x4 + - \_\_le32 + - h\_refcount + - Reference count. + * - 0x8 + - \_\_le32 + - h\_blocks + - Number of disk blocks used. + * - 0xC + - \_\_le32 + - h\_hash + - Hash value of all attributes. + * - 0x10 + - \_\_le32 + - h\_checksum + - Checksum of the extended attribute block. + * - 0x14 + - \_\_u32 + - h\_reserved[3] + - Zero. + +The checksum is calculated against the FS UUID, the 64-bit block number +of the extended attribute block, and the entire block (header + +entries).
+ +Following the ``struct ext4_xattr_header`` or +``struct ext4_xattr_ibody_header`` is an array of +``struct ext4_xattr_entry``; each of these entries is at least 16 bytes +long. When stored in an external block, the ``struct ext4_xattr_entry`` +entries must be stored in sorted order. The sort order is +``e_name_index``, then ``e_name_len``, and finally ``e_name``. +Attributes stored inside an inode do not need to be stored in sorted order. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_u8 + - e\_name\_len + - Length of name. + * - 0x1 + - \_\_u8 + - e\_name\_index + - Attribute name index. There is a discussion of this below. + * - 0x2 + - \_\_le16 + - e\_value\_offs + - Location of this attribute's value on the disk block where it is stored. + Multiple attributes can share the same value. For an inode attribute + this value is relative to the start of the first entry; for a block this + value is relative to the start of the block (i.e. the header). + * - 0x4 + - \_\_le32 + - e\_value\_inum + - The inode where the value is stored. Zero indicates the value is in the + same block as this entry. This field is only used if the + INCOMPAT\_EA\_INODE feature is enabled. + * - 0x8 + - \_\_le32 + - e\_value\_size + - Length of attribute value. + * - 0xC + - \_\_le32 + - e\_hash + - Hash value of attribute name and attribute value. The kernel doesn't + update the hash for in-inode attributes, so for that case this value + must be zero, because e2fsck validates any non-zero hash regardless of + where the xattr lives. + * - 0x10 + - char + - e\_name[e\_name\_len] + - Attribute name. Does not include trailing NULL. + +Attribute values can follow the end of the entry table. There appears to +be a requirement that they be aligned to 4-byte boundaries. The values +are stored starting at the end of the block and grow towards the +xattr\_header/xattr\_entry table.
When the two collide, the overflow is +put into a separate disk block. If the disk block fills up, the +filesystem returns -ENOSPC. + +The first four fields of the ``ext4_xattr_entry`` are set to zero to +mark the end of the key list. + +Attribute Name Indices +~~~~~~~~~~~~~~~~~~~~~~ + +Logically speaking, extended attributes are a series of key=value pairs. +The keys are assumed to be NULL-terminated strings. To reduce the amount +of on-disk space that the keys consume, the beginning of the key string +is matched against the attribute name index. If a match is found, the +attribute name index field is set, and matching string is removed from +the key name. Here is a map of name index values to key prefixes: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Name Index + - Key Prefix + * - 0 + - (no prefix) + * - 1 + - “user.” + * - 2 + - “system.posix\_acl\_access” + * - 3 + - “system.posix\_acl\_default” + * - 4 + - “trusted.” + * - 6 + - “security.” + * - 7 + - “system.” (inline\_data only?) + * - 8 + - “system.richacl” (SuSE kernels only?) + +For example, if the attribute key is “user.fubar”, the attribute name +index is set to 1 and the “fubar” name is recorded on disk. + +POSIX ACLs +~~~~~~~~~~ + +POSIX ACLs are stored in a reduced version of the Linux kernel (and +libacl's) internal ACL format. The key difference is that the version +number is different (1) and the ``e_id`` field is only stored for named +user and group ACLs. diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst new file mode 100644 index 000000000000..c6d88557553c --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bigalloc.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Bigalloc +-------- + +At the moment, the default size of a block is 4KiB, which is a commonly +supported page size on most MMU-capable hardware. 
This is fortunate, as +ext4 code is not prepared to handle the case where the block size +exceeds the page size. However, for a filesystem of mostly huge files, +it is desirable to be able to allocate disk blocks in units of multiple +blocks to reduce both fragmentation and metadata overhead. The +`bigalloc <Bigalloc>`__ feature provides exactly this ability. The +administrator can set a block cluster size at mkfs time (which is stored +in the s\_log\_cluster\_size field in the superblock); from then on, the +block bitmaps track clusters, not individual blocks. This means that +block groups can be several gigabytes in size (instead of just 128MiB); +however, the minimum allocation unit becomes a cluster, not a block, +even for directories. TaoBao had a patchset to extend the “use units of +clusters instead of blocks” to the extent tree, though it is not clear +where those patches went-- they eventually morphed into “extent tree v2” +but that code has not landed as of May 2015. + diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst new file mode 100644 index 000000000000..c7546dbc197a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bitmaps.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and inode Bitmaps +----------------------- + +The data block bitmap tracks the usage of data blocks within the block +group. + +The inode bitmap records which entries in the inode table are in use. + +As with most bitmaps, one bit represents the usage status of one data +block or inode table entry. This implies a block group size of 8 \* +number\_of\_bytes\_in\_a\_logical\_block. + +NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts +of the kernel and e2fsprogs code pretend that the block bitmap contains +zeros (i.e. all blocks in the group are free).
However, it is not +necessarily the case that no blocks are in use -- if ``meta_bg`` is set, +the bitmaps and group descriptor live inside the group. Unfortunately, +ext2fs\_test\_block\_bitmap2() will return '0' for those locations, +which produces confusing debugfs output. + +Inode Table +----------- +Inode tables are statically allocated at mkfs time. Each block group +descriptor points to the start of the table, and the superblock records +the number of inodes per group. See the section on inodes for more +information. diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst new file mode 100644 index 000000000000..baf888e4c06a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockgroup.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Layout +------ + +The layout of a standard block group is approximately as follows (each +of these fields is discussed in a separate section below): + +.. list-table:: + :widths: 1 1 1 1 1 1 1 1 + :header-rows: 1 + + * - Group 0 Padding + - ext4 Super Block + - Group Descriptors + - Reserved GDT Blocks + - Data Block Bitmap + - inode Bitmap + - inode Table + - Data Blocks + * - 1024 bytes + - 1 block + - many blocks + - many blocks + - 1 block + - 1 block + - many blocks + - many more blocks + +For the special case of block group 0, the first 1024 bytes are unused, +to allow for the installation of x86 boot sectors and other oddities. +The superblock will start at offset 1024 bytes, whichever block that +happens to be (usually 0). However, if for some reason the block size = +1024, then block 0 is marked in use and the superblock goes in block 1. +For all other block groups, there is no padding. + +The ext4 driver primarily works with the superblock and the group +descriptors that are found in block group 0. 
Redundant copies of the +superblock and group descriptors are written to some of the block groups +across the disk in case the beginning of the disk gets trashed, though +not all block groups necessarily host a redundant copy (see following +paragraph for more details). If the group does not have a redundant +copy, the block group begins with the data block bitmap. Note also that +when the filesystem is freshly formatted, mkfs will allocate “reserve +GDT block” space after the block group descriptors and before the start +of the block bitmaps to allow for future expansion of the filesystem. By +default, a filesystem is allowed to increase in size by a factor of +1024x over the original filesystem size. + +The location of the inode table is given by ``grp.bg_inode_table_*``. It +is a contiguous range of blocks large enough to contain +``sb.s_inodes_per_group * sb.s_inode_size`` bytes. + +As for the ordering of items in a block group, it is generally +established that the super block and the group descriptor table, if +present, will be at the beginning of the block group. The bitmaps and +the inode table can be anywhere, and it is quite possible for the +bitmaps to come after the inode table, or for both to be in different +groups (flex\_bg). Leftover space is used for file data blocks, indirect +block maps, extent tree blocks, and extended attributes. + +Flexible Block Groups +--------------------- + +Starting in ext4, there is a new feature called flexible block groups +(flex\_bg). In a flex\_bg, several block groups are tied together as one +logical block group; the bitmap spaces and the inode table space in the +first block group of the flex\_bg are expanded to include the bitmaps +and inode tables of all other block groups in the flex\_bg.
For example, +if the flex\_bg size is 4, then group 0 will contain (in order) the +superblock, group descriptors, data block bitmaps for groups 0-3, inode +bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining +space in group 0 is for file data. The effect of this is to group the +block metadata close together for faster loading, and to enable large +files to be contiguous on disk. Backup copies of the superblock and +group descriptors are always at the beginning of block groups, even if +flex\_bg is enabled. The number of block groups that make up a flex\_bg +is given by 2 ^ ``sb.s_log_groups_per_flex``. + +Meta Block Groups +----------------- + +Without the option META\_BG, for safety concerns, all block group +descriptor copies are kept in the first block group. Given the default +128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4 +can have at most 2^27/64 = 2^21 block groups. This limits the entire +filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB. + +The solution to this problem is to use the metablock group feature +(META\_BG), which is already in ext3 for all 2.6 releases. With the +META\_BG feature, ext4 filesystems are partitioned into many metablock +groups. Each metablock group is a cluster of block groups whose group +descriptor structures can be stored in a single disk block. For ext4 +filesystems with 4 KB block size, a single metablock group partition +includes 64 block groups, or 8 GiB of disk space. The metablock group +feature moves the location of the group descriptors from the congested +first block group of the whole filesystem into the first group of each +metablock group itself. The backups are in the second and last group of +each metablock group. This increases the 2^21 maximum block groups limit +to the hard limit 2^32, allowing support for a 512PiB filesystem.
+ +The change in the filesystem format replaces the current scheme where +the superblock is followed by a variable-length set of block group +descriptors. Instead, the superblock and a single block group descriptor +block are placed at the beginning of the first, second, and last block +groups in a meta-block group. A meta-block group is a collection of +block groups which can be described by a single block group descriptor +block. Since the size of the block group descriptor structure is 32 +bytes, a meta-block group contains 32 block groups for filesystems with +a 1KB block size, and 128 block groups for filesystems with a 4KB +blocksize. Filesystems can either be created using this new block group +descriptor layout, or existing filesystems can be resized on-line, and +the field s\_first\_meta\_bg in the superblock will indicate the first +block group using this new layout. + +Please see an important note about ``BLOCK_UNINIT`` in the section about +block and inode bitmaps. + +Lazy Block Group Initialization +------------------------------- + +New in ext4 are three block group descriptor flags that +enable mkfs to skip initializing other parts of the block group +metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean +that the inode and block bitmaps for that group can be calculated and +therefore the on-disk bitmap blocks are not initialized. This is +generally the case for an empty block group or a block group containing +only fixed-location block group metadata. The INODE\_ZEROED flag means +that the inode table has been initialized; mkfs will unset this flag and +rely on the kernel to initialize the inode tables in the background. + +By not writing zeroes to the bitmaps and inode table, mkfs time is +reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, +but the dumpe2fs output prints this as “uninit\_bg”. They are the same +thing.
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst new file mode 100644 index 000000000000..30e25750d88a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockmap.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 + ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| i.i\_block Offset | Where It Points | ++=====================+==============================================================================================================================================================================================================================+ +| 0 to 11 | Direct map to file blocks 0 to 11. | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | +| | | +| | +------------------------------+--------------------------------------------------------------------+ | +| | | Indirect Block Offset | Where It Points | | +| | +==============================+====================================================================+ | +| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | +| | +------------------------------+--------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 13 | Double-indirect 
block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | +| | | +| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | +| | | Double Indirect Block Offset | Where It Points | | +| | +================================+=========================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | | | | Indirect Block Offset | Where It Points | | | +| | | | +==============================+====================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) | +| | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | +| | | Triple Indirect Block Offset | Where It 
Points | | +| | +================================+================================================================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | | | | Double Indirect Block Offset | Where It Points | | | +| | | | +================================+=========================================================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | +| | | | | | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | | | | Indirect Block Offset | Where It Points | | | | +| | | | | | +==============================+====================================================================+ | | | +| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git 
a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst new file mode 100644 index 000000000000..73d4dc0f7bda --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blocks.rst @@ -0,0 +1,142 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Blocks +------ + +ext4 allocates storage space in units of “blocks”. A block is a group of +sectors between 1KiB and 64KiB, and the number of sectors must be an +integral power of 2. Blocks are in turn grouped into larger units called +block groups. Block size is specified at mkfs time and typically is +4KiB. You may experience mounting problems if block size is greater than +page size (i.e. 64KiB blocks on an i386 which only has 4KiB memory +pages). By default a filesystem can contain 2^32 blocks; if the '64bit' +feature is enabled, then a filesystem can have 2^64 blocks. + +For 32-bit filesystems, limits are as follows: + +.. list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +For 64-bit filesystems, limits are as follows: + +..
list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^64 + - 2^64 + - 2^64 + - 2^64 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 16ZiB + - 32ZiB + - 64ZiB + - 1YiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +Note: Files not using extents (i.e. files using block maps) must be +placed within the first 2^32 blocks of a filesystem. Files with extents +must be placed within the first 2^48 blocks of a filesystem. It's not +clear what happens with larger filesystems. diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst new file mode 100644 index 000000000000..9d6a793b2e03 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/checksums.rst @@ -0,0 +1,73 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Checksums +--------- + +Starting in early 2012, metadata checksums were added to all major ext4 +and jbd2 data structures. The associated feature flag is metadata\_csum. +The desired checksum algorithm is indicated in the superblock, though as +of October 2012 the only supported algorithm is crc32c. Some data +structures did not have space to fit a full 32-bit checksum, so only the +lower 16 bits are stored. Enabling the 64bit feature increases the data +structure size so that full 32-bit checksums can be stored for many data +structures. 
However, existing 32-bit filesystems cannot be extended to +enable 64bit mode, at least not without the experimental resize2fs +patches to do so. + +Existing filesystems can have checksumming added by running +``tune2fs -O metadata_csum`` against the underlying device. If tune2fs +encounters directory blocks that lack sufficient empty space to add a +checksum, it will request that you run ``e2fsck -D`` to have the +directories rebuilt with checksums. This has the added benefit of +removing slack space from the directory files and rebalancing the htree +indexes. If you \_ignore\_ this step, your directories will not be +protected by a checksum! + +The following table describes the data elements that go into each type +of checksum. The checksum function is whatever the superblock describes +(crc32c as of October 2013) unless noted otherwise. + +.. list-table:: + :widths: 1 1 4 + :header-rows: 1 + + * - Metadata + - Length + - Ingredients + * - Superblock + - \_\_le32 + - The entire superblock up to the checksum field. The UUID lives inside + the superblock. + * - MMP + - \_\_le32 + - UUID + the entire MMP block up to the checksum field. + * - Extended Attributes + - \_\_le32 + - UUID + the entire extended attribute block. The checksum field is set to + zero. + * - Directory Entries + - \_\_le32 + - UUID + inode number + inode generation + the directory block up to the + fake entry enclosing the checksum field. + * - HTREE Nodes + - \_\_le32 + - UUID + inode number + inode generation + all valid extents + HTREE tail. + The checksum field is set to zero. + * - Extents + - \_\_le32 + - UUID + inode number + inode generation + the entire extent block up to + the checksum field. + * - Bitmaps + - \_\_le32 or \_\_le16 + - UUID + the entire bitmap. Checksums are stored in the group descriptor, + and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) + * - Inodes + - \_\_le32 + - UUID + inode number + inode generation + the entire inode. 
The checksum + field is set to zero. Each inode has its own checksum. + * - Group Descriptors + - \_\_le16 + - If metadata\_csum, then UUID + group number + the entire descriptor; + else if gdt\_csum, then crc16(UUID + group number + the entire + descriptor). In all cases, only the lower 16 bits are stored. + diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst new file mode 100644 index 000000000000..8fcba68c2884 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/directory.rst @@ -0,0 +1,426 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Directory Entries +----------------- + +In an ext4 filesystem, a directory is more or less a flat file that maps +an arbitrary byte string (usually ASCII) to an inode number on the +filesystem. There can be many directory entries across the filesystem +that reference the same inode number--these are known as hard links, and +that is why hard links cannot reference files on other filesystems. As +such, directory entries are found by reading the data block(s) +associated with a directory file for the particular directory entry that +is desired. + +Linear (Classic) Directories +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, each directory lists its entries in an “almost-linear” +array. I write “almost” because it's not a linear array in the memory +sense because directory entries are not split across filesystem blocks. +Therefore, it is more accurate to say that a directory is a series of +data blocks and that each block contains a linear array of directory +entries. The end of each per-block array is signified by reaching the +end of the block; the last entry in the block has a record length that +takes it all the way to the end of the block. The end of the entire +directory is of course signified by reaching the end of the file. Unused +directory entries are signified by inode = 0. 
By default the filesystem +uses ``struct ext4_dir_entry_2`` for directory entries unless the +“filetype” feature flag is not set, in which case it uses +``struct ext4_dir_entry``. + +The original directory entry format is ``struct ext4_dir_entry``, which +is at most 263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. Must be a multiple of 4. + * - 0x6 + - \_\_le16 + - name\_len + - Length of the file name. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +Since file names cannot be longer than 255 bytes, the new directory +entry format shortens the name\_len field and uses the space for a file +type flag, probably to avoid having to load every inode during directory +tree traversal. This format is ``ext4_dir_entry_2``, which is at most +263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. + * - 0x6 + - \_\_u8 + - name\_len + - Length of the file name. + * - 0x7 + - \_\_u8 + - file\_type + - File type code, see ftype_ table below. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +.. _ftype: + +The directory file type is one of the following values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Unknown. + * - 0x1 + - Regular file. + * - 0x2 + - Directory. + * - 0x3 + - Character device file. + * - 0x4 + - Block device file. + * - 0x5 + - FIFO. + * - 0x6 + - Socket.
+ * - 0x7 + - Symbolic link. + +In order to add checksums to these classic directory blocks, a phony +``struct ext4_dir_entry`` is placed at the end of each leaf block to +hold the checksum. The directory entry is 12 bytes long. The inode +number and name\_len fields are set to zero to fool old software into +ignoring an apparently empty directory entry, and the checksum is stored +in the place where the name normally goes. The structure is +``struct ext4_dir_entry_tail``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - det\_reserved\_zero1 + - Inode number, which must be zero. + * - 0x4 + - \_\_le16 + - det\_rec\_len + - Length of this directory entry, which must be 12. + * - 0x6 + - \_\_u8 + - det\_reserved\_zero2 + - Length of the file name, which must be zero. + * - 0x7 + - \_\_u8 + - det\_reserved\_ft + - File type, which must be 0xDE. + * - 0x8 + - \_\_le32 + - det\_checksum + - Directory leaf block checksum. + +The leaf directory block checksum is calculated against the FS UUID, the +directory's inode number, the directory's inode generation number, and +the entire directory entry block up to (but not including) the fake +directory entry. + +Hash Tree Directories +~~~~~~~~~~~~~~~~~~~~~ + +A linear array of directory entries isn't great for performance, so a +new feature was added to ext3 to provide a faster (but peculiar) +balanced tree keyed off a hash of the directory entry name. If the +EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a +hashed btree (htree) to organize and find directory entries. For +backwards read-only compatibility with ext2, this tree is actually +hidden inside the directory file, masquerading as “empty” directory data +blocks! 
It was stated previously that the end of the linear directory +entry table was signified with an entry pointing to inode 0; this is +(ab)used to fool the old linear-scan algorithm into thinking that the +rest of the directory block is empty so that it moves on. + +The root of the tree always lives in the first data block of the +directory. By ext2 custom, the '.' and '..' entries must appear at the +beginning of this first block, so they are put here as two +``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of +the root node contains metadata about the tree and finally a hash->block +map to find nodes that are lower in the htree. If +``dx_root.info.indirect_levels`` is non-zero then the htree has two +levels; the data block pointed to by the root node's map is an interior +node, which is indexed by a minor hash. Interior nodes in this tree +contain a zeroed out ``struct ext4_dir_entry_2`` followed by a +minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear +array of all ``struct ext4_dir_entry_2``; all of these entries +(presumably) hash to the same value. If there is an overflow, the +entries simply overflow into the next leaf node, and the +least-significant bit of the hash (in the interior node map) that gets +us to this next leaf node is set. + +To traverse the directory as an htree, the code calculates the hash of +the desired file name and uses it to find the corresponding block +number. If the tree is flat, the block is a linear array of directory +entries that can be searched; otherwise, the minor hash of the file name +is computed and used against this second block to find the corresponding +third block number. That third block number will be a linear array of +directory entries. + +To traverse the directory as a linear array (such as the old code does), +the code simply reads every data block in the directory. The blocks used +for the htree will appear to have no entries (aside from '.'
and '..') +and so only the leaf nodes will appear to have any interesting content. + +The root of the htree is in ``struct dx_root``, which is the full length +of a data block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - dot.inode + - inode number of this directory. + * - 0x4 + - \_\_le16 + - dot.rec\_len + - Length of this record, 12. + * - 0x6 + - u8 + - dot.name\_len + - Length of the name, 1. + * - 0x7 + - u8 + - dot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x8 + - char + - dot.name[4] + - “.\\0\\0\\0” + * - 0xC + - \_\_le32 + - dotdot.inode + - inode number of parent directory. + * - 0x10 + - \_\_le16 + - dotdot.rec\_len + - block\_size - 12. The record length is long enough to cover all htree + data. + * - 0x12 + - u8 + - dotdot.name\_len + - Length of the name, 2. + * - 0x13 + - u8 + - dotdot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x14 + - char + - dotdot\_name[4] + - “..\\0\\0” + * - 0x18 + - \_\_le32 + - struct dx\_root\_info.reserved\_zero + - Zero. + * - 0x1C + - u8 + - struct dx\_root\_info.hash\_version + - Hash type, see dirhash_ table below. + * - 0x1D + - u8 + - struct dx\_root\_info.info\_length + - Length of the tree information, 0x8. + * - 0x1E + - u8 + - struct dx\_root\_info.indirect\_levels + - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR + feature is set; cannot be larger than 2 otherwise. + * - 0x1F + - u8 + - struct dx\_root\_info.unused\_flags + - + * - 0x20 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0x22 + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0x24 + - \_\_le32 + - block + - The block number (within the directory file) that goes with hash=0. 
+ * - 0x28 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. + +.. _dirhash: + +The directory hash is one of the following values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. + * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +Interior nodes of an htree are recorded as ``struct dx_node``, which is +also the full length of a data block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - fake.inode + - Zero, to make it look like this entry is not in use. + * - 0x4 + - \_\_le16 + - fake.rec\_len + - The size of the block, in order to hide all of the dx\_node data. + * - 0x6 + - u8 + - name\_len + - Zero. There is no name for this “unused” directory entry. + * - 0x7 + - u8 + - file\_type + - Zero. There is no file type for this “unused” directory entry. + * - 0x8 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0xA + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0xC + - \_\_le32 + - block + - The block number (within the directory file) that goes with the lowest + hash value of this block. This value is stored in the parent block. + * - 0x10 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. + +The hash maps that exist in both ``struct dx_root`` and +``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes +long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - hash + - Hash code.
+ * - 0x4 + - \_\_le32 + - block + - Block number (within the directory file, not filesystem blocks) of the + next node in the htree. + +(If you think this is all quite clever and peculiar, so does the +author.) + +If metadata checksums are enabled, the last 8 bytes of the directory +block (precisely the length of one dx\_entry) are used to store a +``struct dx_tail``, which contains the checksum. The ``limit`` and +``count`` entries in the dx\_root/dx\_node structures are adjusted as +necessary to fit the dx\_tail into the block. If there is no space for +the dx\_tail, the user is notified to run e2fsck -D to rebuild the +directory index (which will ensure that there's space for the checksum). +The dx\_tail structure is 8 bytes long and looks like this: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - u32 + - dt\_reserved + - Zero. + * - 0x4 + - \_\_le32 + - dt\_checksum + - Checksum of the htree directory block. + +The checksum is calculated against the FS UUID, the htree index header +(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in +use, and the tail block (dx\_tail). diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst new file mode 100644 index 000000000000..bb0c84333341 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/dynamic.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Dynamic Structures +================== + +Dynamic metadata are created on the fly when files and blocks are +allocated to files. + +.. include:: inodes.rst +.. include:: ifork.rst +.. include:: directory.rst +.. include:: attributes.rst diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst new file mode 100644 index 000000000000..ecc0d01a0a72 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/eainode.rst @@ -0,0 +1,18 @@ +..
SPDX-License-Identifier: GPL-2.0 + +Large Extended Attribute Values +------------------------------- + +To enable ext4 to store extended attribute values that do not fit in the +inode or in the single extended attribute block attached to an inode, +the EA\_INODE feature allows us to store the value in the data blocks of +a regular file inode. This “EA inode” is linked only from the extended +attribute name index and must not appear in a directory entry. The +inode's i\_atime field is used to store a checksum of the xattr value; +and i\_ctime/i\_version store a 64-bit reference count, which enables +sharing of large xattr values between multiple owning inodes. For +backward compatibility with older versions of this feature, the +i\_mtime/i\_generation *may* store a back-reference to the inode number +and i\_generation of the **one** owning inode (in cases where the EA +inode is not referenced by multiple inodes) to verify that the EA inode +is the correct one being accessed. diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst new file mode 100644 index 000000000000..368bf7662b96 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/globals.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Global Structures +================= + +The filesystem is sharded into a number of block groups, each of which +have static metadata at fixed locations. + +.. include:: super.rst +.. include:: group_descr.rst +.. include:: bitmaps.rst +.. include:: mmp.rst +.. include:: journal.rst diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst new file mode 100644 index 000000000000..759827e5d2cf --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst @@ -0,0 +1,170 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Block Group Descriptors +----------------------- + +Each block group on the filesystem has one of these descriptors +associated with it. As noted in the Layout section above, the group +descriptors (if present) are the second item in the block group. The +standard configuration is for each block group to contain a full copy of +the block group descriptor table unless the sparse\_super feature flag +is set. + +Notice how the group descriptor records the location of both bitmaps and +the inode table (i.e. they can float). This means that within a block +group, the only data structures with fixed locations are the superblock +and the group descriptor table. The flex\_bg mechanism uses this +property to group several block groups into a flex group and lay out all +of the groups' bitmaps and inode tables into one long run in the first +group of the flex group. + +If the meta\_bg feature flag is set, then several block groups are +grouped together into a meta group. Note that in the meta\_bg case, +however, the first and last two block groups within the larger meta +group contain only group descriptors for the groups inside the meta +group. + +flex\_bg and meta\_bg do not appear to be mutually exclusive features. + +In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the +block group descriptor was only 32 bytes long and therefore ends at +bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the +block group descriptor expands to at least the 64 bytes described below; +the size is stored in the superblock. + +If gdt\_csum is set and metadata\_csum is not set, the block group +checksum is the crc16 of the FS UUID, the group number, and the group +descriptor structure. If metadata\_csum is set, then the block group +checksum is the lower 16 bits of the checksum of the FS UUID, the group +number, and the group descriptor structure. 
Both block and inode bitmap +checksums are calculated against the FS UUID, the group number, and the +entire bitmap. + +The block group descriptor is laid out in ``struct ext4_group_desc``. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - bg\_block\_bitmap\_lo + - Lower 32-bits of location of block bitmap. + * - 0x4 + - \_\_le32 + - bg\_inode\_bitmap\_lo + - Lower 32-bits of location of inode bitmap. + * - 0x8 + - \_\_le32 + - bg\_inode\_table\_lo + - Lower 32-bits of location of inode table. + * - 0xC + - \_\_le16 + - bg\_free\_blocks\_count\_lo + - Lower 16-bits of free block count. + * - 0xE + - \_\_le16 + - bg\_free\_inodes\_count\_lo + - Lower 16-bits of free inode count. + * - 0x10 + - \_\_le16 + - bg\_used\_dirs\_count\_lo + - Lower 16-bits of directory count. + * - 0x12 + - \_\_le16 + - bg\_flags + - Block group flags. See the bgflags_ table below. + * - 0x14 + - \_\_le32 + - bg\_exclude\_bitmap\_lo + - Lower 32-bits of location of snapshot exclusion bitmap. + * - 0x18 + - \_\_le16 + - bg\_block\_bitmap\_csum\_lo + - Lower 16-bits of the block bitmap checksum. + * - 0x1A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_lo + - Lower 16-bits of the inode bitmap checksum. + * - 0x1C + - \_\_le16 + - bg\_itable\_unused\_lo + - Lower 16-bits of unused inode count. If set, we needn't scan past the + ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the + inode table for this group. + * - 0x1E + - \_\_le16 + - bg\_checksum + - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the + RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & + 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. + * - + - + - + - These fields only exist if the 64bit feature is enabled and s_desc_size + > 32. + * - 0x20 + - \_\_le32 + - bg\_block\_bitmap\_hi + - Upper 32-bits of location of block bitmap. 
+ * - 0x24 + - \_\_le32 + - bg\_inode\_bitmap\_hi + - Upper 32-bits of location of inodes bitmap. + * - 0x28 + - \_\_le32 + - bg\_inode\_table\_hi + - Upper 32-bits of location of inodes table. + * - 0x2C + - \_\_le16 + - bg\_free\_blocks\_count\_hi + - Upper 16-bits of free block count. + * - 0x2E + - \_\_le16 + - bg\_free\_inodes\_count\_hi + - Upper 16-bits of free inode count. + * - 0x30 + - \_\_le16 + - bg\_used\_dirs\_count\_hi + - Upper 16-bits of directory count. + * - 0x32 + - \_\_le16 + - bg\_itable\_unused\_hi + - Upper 16-bits of unused inode count. + * - 0x34 + - \_\_le32 + - bg\_exclude\_bitmap\_hi + - Upper 32-bits of location of snapshot exclusion bitmap. + * - 0x38 + - \_\_le16 + - bg\_block\_bitmap\_csum\_hi + - Upper 16-bits of the block bitmap checksum. + * - 0x3A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_hi + - Upper 16-bits of the inode bitmap checksum. + * - 0x3C + - \_\_u32 + - bg\_reserved + - Padding to 64 bytes. + +.. _bgflags: + +Block group flags can be any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). + * - 0x2 + - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). + * - 0x4 + - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst new file mode 100644 index 000000000000..5dbe3b2b121a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/ifork.rst @@ -0,0 +1,194 @@ +.. SPDX-License-Identifier: GPL-2.0 + +The Contents of inode.i\_block +------------------------------ + +Depending on the type of file an inode describes, the 60 bytes of +storage in ``inode.i_block`` can be used in different ways. In general, +regular files and directories will use it for file block indexing +information, and special files will use it for special purposes. 
+ +Symbolic Links +~~~~~~~~~~~~~~ + +The target of a symbolic link will be stored in this field if the target +string is less than 60 bytes long. Otherwise, either extents or block +maps will be used to allocate data blocks to store the link target. + +Direct/Indirect Block Addressing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In ext2/3, file block numbers were mapped to logical block numbers by +means of an (up to) three level 1-1 block map. To find the logical block +that stores a particular file block, the code would navigate through +this increasingly complicated structure. Notice that there is neither a +magic number nor a checksum to provide any level of confidence that the +block isn't full of garbage. + +.. ifconfig:: builder != 'latex' + + .. include:: blockmap.rst + +.. ifconfig:: builder == 'latex' + + [Table omitted because LaTeX doesn't support nested tables.] + +Note that with this block mapping scheme, it is necessary to fill out a +lot of mapping data even for a large contiguous file! This inefficiency +led to the creation of the extent mapping scheme, discussed below. + +Notice also that a file using this mapping scheme cannot be placed +higher than 2^32 blocks. + +Extent Tree +~~~~~~~~~~~ + +In ext4, the file to logical block map has been replaced with an extent +tree. Under the old scheme, allocating a contiguous run of 1,000 blocks +requires an indirect block to map all 1,000 entries; with extents, the +mapping is reduced to a single ``struct ext4_extent`` with +``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate +very large files with a single extent, at a considerable reduction in +metadata block use, and some improvement in disk efficiency. The inode +must have the extents flag (0x80000) set for this feature to be in +use. + +Extents are arranged as a tree. Each node of the tree begins with a +``struct ext4_extent_header``.
If the node is an interior node +(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` +instances of ``struct ext4_extent_idx``; each of these index entries +points to a block containing more nodes in the extent tree. If the node +is a leaf node (``eh.eh_depth == 0``), then the header is followed by +``eh.eh_entries`` instances of ``struct ext4_extent``; these instances +point to the file's data blocks. The root node of the extent tree is +stored in ``inode.i_block``, which allows for the first four extents to +be recorded without the use of extra metadata blocks. + +The extent tree header is recorded in ``struct ext4_extent_header``, +which is 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - eh\_magic + - Magic number, 0xF30A. + * - 0x2 + - \_\_le16 + - eh\_entries + - Number of valid entries following the header. + * - 0x4 + - \_\_le16 + - eh\_max + - Maximum number of entries that could follow the header. + * - 0x6 + - \_\_le16 + - eh\_depth + - Depth of this extent node in the extent tree. 0 = this extent node + points to data blocks; otherwise, this extent node points to other + extent nodes. The extent tree can be at most 5 levels deep: a logical + block number can be at most ``2^32``, and the smallest ``n`` that + satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. + * - 0x8 + - \_\_le32 + - eh\_generation + - Generation of the tree. (Used by Lustre, but not standard ext4). + +Internal nodes of the extent tree, also known as index nodes, are +recorded as ``struct ext4_extent_idx``, and are 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ei\_block + - This index node covers file blocks from 'block' onward. + * - 0x4 + - \_\_le32 + - ei\_leaf\_lo + - Lower 32-bits of the block number of the extent node that is the next + level lower in the tree. 
The tree node pointed to can be either another + internal node or a leaf node, described below. + * - 0x8 + - \_\_le16 + - ei\_leaf\_hi + - Upper 16-bits of the previous field. + * - 0xA + - \_\_u16 + - ei\_unused + - + +Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, +and are also 12 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ee\_block + - First file block number that this extent covers. + * - 0x4 + - \_\_le16 + - ee\_len + - Number of blocks covered by extent. If the value of this field is <= + 32768, the extent is initialized. If the value of the field is > 32768, + the extent is uninitialized and the actual extent length is ``ee_len`` - + 32768. Therefore, the maximum length of an initialized extent is 32768 + blocks, and the maximum length of an uninitialized extent is 32767. + * - 0x6 + - \_\_le16 + - ee\_start\_hi + - Upper 16-bits of the block number to which this extent points. + * - 0x8 + - \_\_le32 + - ee\_start\_lo + - Lower 32-bits of the block number to which this extent points. + +Prior to the introduction of metadata checksums, the extent header + +extent entries always left at least 4 bytes of unallocated space at the +end of each extent tree data block (because (2^x % 12) >= 4). Therefore, +the 32-bit checksum is inserted into this space. The 4 extents in the +inode do not need checksumming, since the inode is already checksummed. +The checksum is calculated against the FS UUID, the inode number, the +inode generation, and the entire extent block leading up to (but not +including) the checksum itself. + +``struct ext4_extent_tail`` is 4 bytes long: + +..
list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - eb\_checksum + - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) + +Inline Data +~~~~~~~~~~~ + +If the inline data feature is enabled for the filesystem and the flag is +set for the inode, it is possible that the first 60 bytes of the file +data are stored here. diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst new file mode 100644 index 000000000000..f7d082c3a435 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Data Structures and Algorithms +============================== +.. include:: about.rst +.. include:: overview.rst +.. include:: globals.rst +.. include:: dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst new file mode 100644 index 000000000000..d1075178ce0b --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inlinedata.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Inline Data +----------- + +The inline data feature was designed to handle the case that a file's +data is so tiny that it readily fits inside the inode, which +(theoretically) reduces disk block consumption and reduces seeks. If the +file is smaller than 60 bytes, then the data are stored inline in +``inode.i_block``. If the rest of the file would fit inside the extended +attribute space, then it might be found as an extended attribute +“system.data” within the inode body (“ibody EA”). This of course +constrains the amount of extended attributes one can attach to an inode. +If the data size increases beyond i\_block + ibody EA, a regular block +is allocated and the contents moved to that block. 
+ +Pending a change to compact the extended attribute key used to store +inline data, one ought to be able to store 160 bytes of data in a +256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to +that, the limit was 156 bytes due to inefficient use of inode space. + +The inline data feature requires the presence of an extended attribute +for “system.data”, even if the attribute value is zero length. + +Inline Directories +~~~~~~~~~~~~~~~~~~ + +The first four bytes of i\_block are the inode number of the parent +directory. Following that is a 56-byte space for an array of directory +entries; see ``struct ext4_dir_entry``. If there is a “system.data” +attribute in the inode body, the EA value is an array of +``struct ext4_dir_entry`` as well. Note that for inline directories, the +i\_block and EA space are treated as separate dirent blocks; directory +entries cannot span the two. + +Inline directory entries are not checksummed, as the inode checksum +should protect all inline data contents. diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst new file mode 100644 index 000000000000..655ce898f3f5 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inodes.rst @@ -0,0 +1,575 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Index Nodes +----------- + +In a regular UNIX filesystem, the inode stores all the metadata +pertaining to the file (time stamps, block maps, extended attributes, +etc), not the directory entry. To find the information associated with a +file, one must traverse the directory files to find the directory entry +associated with a file, then load the inode to find the metadata for +that file. ext4 appears to cheat (for performance reasons) a little bit +by storing a copy of the file type (normally stored in the inode) in the +directory entry. 
(Compare all this to FAT, which stores all the file +information directly in the directory entry, but does not support hard +links and is in general more seek-happy than ext4 due to its simpler +block allocator and extensive use of linked lists.) + +The inode table is a linear array of ``struct ext4_inode``. The table is +sized to have enough blocks to store at least +``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the +block group containing an inode can be calculated as +``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the +group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There +is no inode 0. + +The inode checksum is calculated against the FS UUID, the inode number, +and the inode structure itself. + +The inode table entry is laid out in ``struct ext4_inode``. + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - i\_mode + - File mode. See the table i_mode_ below. + * - 0x2 + - \_\_le16 + - i\_uid + - Lower 16-bits of Owner UID. + * - 0x4 + - \_\_le32 + - i\_size\_lo + - Lower 32-bits of size in bytes. + * - 0x8 + - \_\_le32 + - i\_atime + - Last access time, in seconds since the epoch. However, if the EA\_INODE + inode flag is set, this inode stores an extended attribute value and + this field contains the checksum of the value. + * - 0xC + - \_\_le32 + - i\_ctime + - Last inode change time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the lower 32 bits of the attribute value's + reference count. + * - 0x10 + - \_\_le32 + - i\_mtime + - Last data modification time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the number of the inode that owns the + extended attribute. 
+ * - 0x14 + - \_\_le32 + - i\_dtime + - Deletion Time, in seconds since the epoch. + * - 0x18 + - \_\_le16 + - i\_gid + - Lower 16-bits of GID. + * - 0x1A + - \_\_le16 + - i\_links\_count + - Hard link count. Normally, ext4 does not permit an inode to have more + than 65,000 hard links. This applies to files as well as directories, + which means that there cannot be more than 64,998 subdirectories in a + directory (each subdirectory's '..' entry counts as a hard link, as does + the '.' entry in the directory itself). With the DIR\_NLINK feature + enabled, ext4 supports more than 64,998 subdirectories by setting this + field to 1 to indicate that the number of hard links is not known. + * - 0x1C + - \_\_le32 + - i\_blocks\_lo + - Lower 32-bits of “block” count. If the huge\_file feature flag is not + set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks + on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in + ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi + << 32)`` 512-byte blocks on disk. If huge\_file is set and + EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file + consumes ``i_blocks_lo + (i_blocks_hi << 32)`` filesystem blocks on + disk. + * - 0x20 + - \_\_le32 + - i\_flags + - Inode flags. See the table i_flags_ below. + * - 0x24 + - 4 bytes + - i\_osd1 + - See the table i_osd1_ for more details. + * - 0x28 + - 60 bytes + - i\_block[EXT4\_N\_BLOCKS=15] + - Block map or extent tree. See the section “The Contents of inode.i\_block”. + * - 0x64 + - \_\_le32 + - i\_generation + - File version (for NFS). + * - 0x68 + - \_\_le32 + - i\_file\_acl\_lo + - Lower 32-bits of extended attribute block. ACLs are of course one of + many possible extended attributes; I think the name of this field is a + result of the first use of extended attributes being for ACLs. + * - 0x6C + - \_\_le32 + - i\_size\_high / i\_dir\_acl + - Upper 32-bits of file/directory size.
In ext2/3 this field was named + i\_dir\_acl, though it was usually set to zero and never used. + * - 0x70 + - \_\_le32 + - i\_obso\_faddr + - (Obsolete) fragment address. + * - 0x74 + - 12 bytes + - i\_osd2 + - See the table i_osd2_ for more details. + * - 0x80 + - \_\_le16 + - i\_extra\_isize + - Size of this inode - 128. Alternately, the size of the extended inode + fields beyond the original ext2 inode, including this field. + * - 0x82 + - \_\_le16 + - i\_checksum\_hi + - Upper 16-bits of the inode checksum. + * - 0x84 + - \_\_le32 + - i\_ctime\_extra + - Extra change time bits. This provides sub-second precision. See Inode + Timestamps section. + * - 0x88 + - \_\_le32 + - i\_mtime\_extra + - Extra modification time bits. This provides sub-second precision. + * - 0x8C + - \_\_le32 + - i\_atime\_extra + - Extra access time bits. This provides sub-second precision. + * - 0x90 + - \_\_le32 + - i\_crtime + - File creation time, in seconds since the epoch. + * - 0x94 + - \_\_le32 + - i\_crtime\_extra + - Extra file creation time bits. This provides sub-second precision. + * - 0x98 + - \_\_le32 + - i\_version\_hi + - Upper 32-bits for version number. + * - 0x9C + - \_\_le32 + - i\_projid + - Project ID. + +.. _i_mode: + +The ``i_mode`` value is a combination of the following flags: + +.. 
list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - S\_IXOTH (Others may execute) + * - 0x2 + - S\_IWOTH (Others may write) + * - 0x4 + - S\_IROTH (Others may read) + * - 0x8 + - S\_IXGRP (Group members may execute) + * - 0x10 + - S\_IWGRP (Group members may write) + * - 0x20 + - S\_IRGRP (Group members may read) + * - 0x40 + - S\_IXUSR (Owner may execute) + * - 0x80 + - S\_IWUSR (Owner may write) + * - 0x100 + - S\_IRUSR (Owner may read) + * - 0x200 + - S\_ISVTX (Sticky bit) + * - 0x400 + - S\_ISGID (Set GID) + * - 0x800 + - S\_ISUID (Set UID) + * - + - These are mutually-exclusive file types: + * - 0x1000 + - S\_IFIFO (FIFO) + * - 0x2000 + - S\_IFCHR (Character device) + * - 0x4000 + - S\_IFDIR (Directory) + * - 0x6000 + - S\_IFBLK (Block device) + * - 0x8000 + - S\_IFREG (Regular file) + * - 0xA000 + - S\_IFLNK (Symbolic link) + * - 0xC000 + - S\_IFSOCK (Socket) + +.. _i_flags: + +The ``i_flags`` field is a combination of these values: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) + * - 0x2 + - This file should be preserved, should undeletion be desired + (EXT4\_UNRM\_FL). (not implemented) + * - 0x4 + - File is compressed (EXT4\_COMPR\_FL). (not really implemented) + * - 0x8 + - All writes to the file must be synchronous (EXT4\_SYNC\_FL). + * - 0x10 + - File is immutable (EXT4\_IMMUTABLE\_FL). + * - 0x20 + - File can only be appended (EXT4\_APPEND\_FL). + * - 0x40 + - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). + * - 0x80 + - Do not update access time (EXT4\_NOATIME\_FL). + * - 0x100 + - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) + * - 0x200 + - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) + * - 0x400 + - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) + * - 0x800 + - Encrypted inode (EXT4\_ENCRYPT\_FL). 
This bit value previously was + EXT4\_ECOMPR\_FL (compression error), which was never used. + * - 0x1000 + - Directory has hashed indexes (EXT4\_INDEX\_FL). + * - 0x2000 + - AFS magic directory (EXT4\_IMAGIC\_FL). + * - 0x4000 + - File data must always be written through the journal + (EXT4\_JOURNAL\_DATA\_FL). + * - 0x8000 + - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) + * - 0x10000 + - All directory entry data should be written synchronously (see + ``dirsync``) (EXT4\_DIRSYNC\_FL). + * - 0x20000 + - Top of directory hierarchy (EXT4\_TOPDIR\_FL). + * - 0x40000 + - This is a huge file (EXT4\_HUGE\_FILE\_FL). + * - 0x80000 + - Inode uses extents (EXT4\_EXTENTS\_FL). + * - 0x200000 + - Inode stores a large extended attribute value in its data blocks + (EXT4\_EA\_INODE\_FL). + * - 0x400000 + - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). + (deprecated) + * - 0x01000000 + - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) + * - 0x04000000 + - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in + mainline) + * - 0x08000000 + - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in + mainline) + * - 0x10000000 + - Inode has inline data (EXT4\_INLINE\_DATA\_FL). + * - 0x20000000 + - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). + * - 0x80000000 + - Reserved for ext4 library (EXT4\_RESERVED\_FL). + * - + - Aggregate flags: + * - 0x4BDFFF + - User-visible flags. + * - 0x4B80FF + - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and + EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's + EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of + these flags in a special manner and they are masked out of the set of + flags that are saved directly to i\_flags. + +.. _i_osd1: + +The ``osd1`` field has multiple meanings depending on the creator: + +Linux: + +.. 
list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - l\_i\_version + - Inode version. However, if the EA\_INODE inode flag is set, this inode + stores an extended attribute value and this field contains the upper 32 + bits of the attribute value's reference count. + +Hurd: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_i\_translator + - ?? + +Masix: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - m\_i\_reserved + - ?? + +.. _i_osd2: + +The ``osd2`` field has multiple meanings depending on the filesystem creator: + +Linux: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - l\_i\_blocks\_high + - Upper 16-bits of the block count. Please see the note attached to + i\_blocks\_lo. + * - 0x2 + - \_\_le16 + - l\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). See the Extended Attributes section below. + * - 0x4 + - \_\_le16 + - l\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - l\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_le16 + - l\_i\_checksum\_lo + - Lower 16-bits of the inode checksum. + * - 0xA + - \_\_le16 + - l\_i\_reserved + - Unused. + +Hurd: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - h\_i\_mode\_high + - Upper 16-bits of the file mode. + * - 0x4 + - \_\_le16 + - h\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - h\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_u32 + - h\_i\_author + - Author code? + +Masix: + +.. 
list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - m\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). + * - 0x4 + - \_\_u32 + - m\_i\_reserved2[2] + - ?? + +Inode Size +~~~~~~~~~~ + +In ext2 and ext3, the inode structure size was fixed at 128 bytes +(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of +128 bytes. Starting with ext4, it is possible to allocate a larger +on-disk inode at format time for all inodes in the filesystem to provide +space beyond the end of the original ext2 inode. The on-disk inode +record size is recorded in the superblock as ``s_inode_size``. The +number of bytes actually used by struct ext4\_inode beyond the original +128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each +inode, which allows struct ext4\_inode to grow for a new kernel without +having to upgrade all of the on-disk inodes. Access to fields beyond +EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within +``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as +of October 2013) the inode structure is 156 bytes +(``i_extra_isize = 28``). The extra space between the end of the inode +structure and the end of the inode record can be used to store extended +attributes. Each inode record can be as large as the filesystem block +size, though this is not terribly efficient. + +Finding an Inode +~~~~~~~~~~~~~~~~ + +Each block group contains ``sb->s_inodes_per_group`` inodes. Because +inode 0 is defined not to exist, this formula can be used to find the +block group that an inode lives in: +``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode +can be found within the block group's inode table at +``index = (inode_num - 1) % sb->s_inodes_per_group``. 
To get the byte +address within the inode table, use +``offset = index * sb->s_inode_size``. + +Inode Timestamps +~~~~~~~~~~~~~~~~ + +Four timestamps are recorded in the lower 128 bytes of the inode +structure -- inode change time (ctime), access time (atime), data +modification time (mtime), and deletion time (dtime). The four fields +are 32-bit signed integers that represent seconds since the Unix epoch +(1970-01-01 00:00:00 GMT), which means that the fields will overflow in +January 2038. For inodes that are not linked from any directory but are +still open (orphan inodes), the dtime field is overloaded for use with +the orphan list. The superblock field ``s_last_orphan`` points to the +first inode in the orphan list; dtime is then the number of the next +orphaned inode, or zero if there are no more orphans. + +If the inode structure size ``sb->s_inode_size`` is larger than 128 +bytes and the ``i_extra_isize`` field is large enough to encompass the +respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime +inode fields are widened to 64 bits. Within this “extra” 32-bit field, +the lower two bits are used to extend the 32-bit seconds field to be 34 +bits wide; the upper 30 bits are used to provide nanosecond timestamp +accuracy. Therefore, timestamps should not overflow until May 2446. +dtime was not widened. There is also a fifth timestamp to record inode +creation time (crtime); this field is 64-bits wide and decoded in the +same manner as 64-bit [cma]time. Neither crtime nor dtime is accessible +through the regular stat() interface, though debugfs will report them. + +We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). +In other words: + +..
list-table:: + :widths: 20 20 20 20 20 + :header-rows: 1 + + * - Extra epoch bits + - MSB of 32-bit time + - Adjustment for signed 32-bit to 64-bit tv\_sec + - Decoded 64-bit tv\_sec + - valid time range + * - 0 0 + - 1 + - 0 + - ``-0x80000000 - -0x00000001`` + - 1901-12-13 to 1969-12-31 + * - 0 0 + - 0 + - 0 + - ``0x000000000 - 0x07fffffff`` + - 1970-01-01 to 2038-01-19 + * - 0 1 + - 1 + - 0x100000000 + - ``0x080000000 - 0x0ffffffff`` + - 2038-01-19 to 2106-02-07 + * - 0 1 + - 0 + - 0x100000000 + - ``0x100000000 - 0x17fffffff`` + - 2106-02-07 to 2174-02-25 + * - 1 0 + - 1 + - 0x200000000 + - ``0x180000000 - 0x1ffffffff`` + - 2174-02-25 to 2242-03-16 + * - 1 0 + - 0 + - 0x200000000 + - ``0x200000000 - 0x27fffffff`` + - 2242-03-16 to 2310-04-04 + * - 1 1 + - 1 + - 0x300000000 + - ``0x280000000 - 0x2ffffffff`` + - 2310-04-04 to 2378-04-22 + * - 1 1 + - 0 + - 0x300000000 + - ``0x300000000 - 0x37fffffff`` + - 2378-04-22 to 2446-05-10 + +This is a somewhat odd encoding since there are effectively seven times +as many positive values as negative values. There have also been +long-standing bugs decoding and encoding dates beyond 2038, which don't +seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels +incorrectly use the extra epoch bits 1,1 for dates between 1901 and +1970. At some point the kernel will be fixed and e2fsck will fix this +situation, assuming that it is run before 2310. diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst new file mode 100644 index 000000000000..e7031af86876 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/journal.rst @@ -0,0 +1,611 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Journal (jbd2) +-------------- + +Introduced in ext3, the ext4 filesystem employs a journal to protect the +filesystem against corruption in the case of a system crash. 
A small +contiguous region of disk (default 128MiB) is reserved inside the +filesystem as a place to land “important” data writes on-disk as quickly +as possible. Once the important data transaction is fully written to the +disk and flushed from the disk write cache, a record of the data being +committed is also written to the journal. At some later point in time, +the journal code writes the transactions to their final locations on +disk (this could involve a lot of seeking or a lot of small +read-write-erases) before erasing the commit record. Should the system +crash during the second slow write, the journal can be replayed all the +way to the latest commit record, guaranteeing the atomicity of whatever +gets written through the journal to the disk. The effect of this is to +guarantee that the filesystem does not become stuck midway through a +metadata update. + +For performance reasons, ext4 by default only writes filesystem metadata +through the journal. This means that file data blocks are /not/ +guaranteed to be in any consistent state after a crash. If this default +guarantee level (``data=ordered``) is not satisfactory, there is a mount +option to control journal behavior. If ``data=journal``, all data and +metadata are written to disk through the journal. This is slower but +safest. If ``data=writeback``, dirty data blocks are not flushed to the +disk before the metadata are written to disk through the journal. + +The journal inode is typically inode 8. The first 68 bytes of the +journal inode are replicated in the ext4 superblock. The journal itself +is a normal (but hidden) file within the filesystem. The file usually +consumes an entire block group, though mke2fs tries to put it in the +middle of the disk. + +All fields in jbd2 are written to disk in big-endian order. This is the +opposite of ext4. + +NOTE: Both ext4 and ocfs2 use jbd2. + +The maximum size of a journal embedded in an ext4 filesystem is 2^32 +blocks. jbd2 itself does not seem to care.
+ +Layout +~~~~~~ + +Generally speaking, the journal has this format: + +.. list-table:: + :widths: 1 1 78 + :header-rows: 1 + + * - Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commit\_block + - [more transactions...] + * - + - One transaction + - + +Notice that a transaction begins with either a descriptor and some data, +or a block revocation list. A finished transaction always ends with a +commit. If there is no commit record (or the checksums don't match), the +transaction will be discarded during replay. + +External Journal +~~~~~~~~~~~~~~~~ + +Optionally, an ext4 filesystem can be created with an external journal +device (as opposed to an internal journal, which uses a reserved inode). +In this case, on the filesystem device, ``s_journal_inum`` should be +zero and ``s_journal_uuid`` should be set. On the journal device there +will be an ext4 super block in the usual place, with a matching UUID. +The journal superblock will be in the next full block after the +superblock. + +.. list-table:: + :widths: 1 1 1 1 76 + :header-rows: 1 + + * - 1024 bytes of padding + - ext4 Superblock + - Journal Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commit\_block + - [more transactions...] + * - + - + - + - One transaction + - + +Block Header +~~~~~~~~~~~~ + +Every block in the journal starts with a common 12-byte header +``struct journal_header_s``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - h\_magic + - jbd2 magic number, 0xC03B3998. + * - 0x4 + - \_\_be32 + - h\_blocktype + - Description of what this block contains. See the jbd2_blocktype_ table + below. + * - 0x8 + - \_\_be32 + - h\_sequence + - The transaction ID that goes with this block. + +.. _jbd2_blocktype: + +The journal block type can be any one of: + +..
list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Descriptor. This block precedes a series of data blocks that were + written through the journal during a transaction. + * - 2 + - Block commit record. This block signifies the completion of a + transaction. + * - 3 + - Journal superblock, v1. + * - 4 + - Journal superblock, v2. + * - 5 + - Block revocation records. This speeds up recovery by enabling the + journal to skip writing blocks that were subsequently rewritten. + +Super Block +~~~~~~~~~~~ + +The super block for the journal is much simpler than ext4's. +The key data kept within are the size of the journal, and where to find the +start of the log of transactions. + +The journal superblock is recorded as ``struct journal_superblock_s``, +which is 1024 bytes long: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - + - + - + - Static information describing the journal. + * - 0x0 + - journal\_header\_t (12 bytes) + - s\_header + - Common header identifying this as a superblock. + * - 0xC + - \_\_be32 + - s\_blocksize + - Journal device block size. + * - 0x10 + - \_\_be32 + - s\_maxlen + - Total number of blocks in this journal. + * - 0x14 + - \_\_be32 + - s\_first + - First block of log information. + * - + - + - + - Dynamic information describing the current state of the log. + * - 0x18 + - \_\_be32 + - s\_sequence + - First commit ID expected in log. + * - 0x1C + - \_\_be32 + - s\_start + - Block number of the start of log. Contrary to the comments, this field + being zero does not imply that the journal is clean! + * - 0x20 + - \_\_be32 + - s\_errno + - Error value, as set by jbd2\_journal\_abort(). + * - + - + - + - The remaining fields are only valid in a v2 superblock. + * - 0x24 + - \_\_be32 + - s\_feature\_compat + - Compatible feature set. See the table jbd2_compat_ below.
+ * - 0x28 + - \_\_be32 + - s\_feature\_incompat + - Incompatible feature set. See the table jbd2_incompat_ below. + * - 0x2C + - \_\_be32 + - s\_feature\_ro\_compat + - Read-only compatible feature set. There aren't any of these currently. + * - 0x30 + - \_\_u8 + - s\_uuid[16] + - 128-bit uuid for journal. This is compared against the copy in the ext4 + super block at mount time. + * - 0x40 + - \_\_be32 + - s\_nr\_users + - Number of file systems sharing this journal. + * - 0x44 + - \_\_be32 + - s\_dynsuper + - Location of dynamic super block copy. (Not used?) + * - 0x48 + - \_\_be32 + - s\_max\_transaction + - Limit of journal blocks per transaction. (Not used?) + * - 0x4C + - \_\_be32 + - s\_max\_trans\_data + - Limit of data blocks per transaction. (Not used?) + * - 0x50 + - \_\_u8 + - s\_checksum\_type + - Checksum algorithm used for the journal. See jbd2_checksum_type_ for + more info. + * - 0x51 + - \_\_u8[3] + - s\_padding2 + - + * - 0x54 + - \_\_u32 + - s\_padding[42] + - + * - 0xFC + - \_\_be32 + - s\_checksum + - Checksum of the entire superblock, with this field set to zero. + * - 0x100 + - \_\_u8 + - s\_users[16\*48] + - ids of all file systems sharing the log. e2fsprogs/Linux don't allow + shared external journals, but I imagine Lustre (or ocfs2?), which use + the jbd2 code, might. + +.. _jbd2_compat: + +The journal compat features are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal maintains checksums on the data blocks. + (JBD2\_FEATURE\_COMPAT\_CHECKSUM) + +.. _jbd2_incompat: + +The journal incompat features are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) + * - 0x2 + - Journal can deal with 64-bit block numbers. + (JBD2\_FEATURE\_INCOMPAT\_64BIT) + * - 0x4 + - Journal commits asynchronously. 
(JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) + * - 0x8 + - This journal uses v2 of the checksum on-disk format. Each journal + metadata block gets its own checksum, and the block tags in the + descriptor table contain checksums for each of the data blocks in the + journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) + * - 0x10 + - This journal uses v3 of the checksum on-disk format. This is the same as + v2, but the journal block tag size is fixed regardless of the size of + block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + +.. _jbd2_checksum_type: + +Journal checksum type codes are one of the following. crc32 or crc32c are the +most likely choices. + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - CRC32 + * - 2 + - MD5 + * - 3 + - SHA1 + * - 4 + - CRC32C + +Descriptor Block +~~~~~~~~~~~~~~~~ + +The descriptor block contains an array of journal block tags that +describe the final locations of the data blocks that follow in the +journal. Descriptor blocks are open-coded instead of being completely +described by a data structure, but here is the block structure anyway. +Descriptor blocks consume at least 36 bytes, but use a full block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_t + - (open coded) + - Common block header. + * - 0xC + - struct journal\_block\_tag\_s + - open coded array[] + - Enough tags either to fill up the block or to describe all the data + blocks that follow this descriptor block. + +Journal block tags have any of the following formats, depending on which +journal feature and block tag flags are set. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is +defined as ``struct journal_block_tag3_s``, which looks like the +following. The size is 16 or 32 bytes. + +.. 
list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. + * - 0x4 + - \_\_be32 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is + not enabled. + * - 0xC + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_checksum. This field is not present if the "same UUID" flag + is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +.. _jbd2_tag_flags: + +The journal tag flags are any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - On-disk block is escaped. The first four bytes of the data block just + happened to match the jbd2 magic number. + * - 0x2 + - This block has the same UUID as previous, therefore the UUID field is + omitted. + * - 0x4 + - The data block was deleted by the transaction. (Not used?) + * - 0x8 + - This is the last tag in this descriptor block. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag +is defined as ``struct journal_block_tag_s``, which looks like the +following. The size is 8, 12, 24, or 28 bytes: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. 
+ * - 0x4 + - \_\_be16 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + Note that only the lower 16 bits are stored. + * - 0x6 + - \_\_be16 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - + - + - + - This next field is only present if the super block indicates support for + 64-bit block numbers. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_flags or t_blocknr_high. This field is not present if the + "same UUID" flag is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a +``struct jbd2_journal_block_tail``, which looks like this: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID + the descriptor block, with this field set + to zero. + +Data Block +~~~~~~~~~~ + +In general, the data blocks being written to disk through the journal +are written verbatim into the journal file after the descriptor block. +However, if the first four bytes of the block match the jbd2 magic +number then those four bytes are replaced with zeroes and the “escaped” +flag is set in the descriptor block tag. + +Revocation Block +~~~~~~~~~~~~~~~~ + +A revocation block is used to prevent replay of a block in an earlier +transaction. This is used to mark blocks that were journalled at one +time but are no longer journalled. 
Typically this happens if a metadata +block is freed and re-allocated as a file data block; in this case, a +journal replay after the file block was written to disk will cause +corruption. + +**NOTE**: This mechanism is NOT used to express “this journal block is +superseded by this other journal block”, as the author (djwong) +mistakenly thought. Any block being added to a transaction will cause +the removal of all existing revocation records for that block. + +Revocation blocks are described in +``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in +length, but use a full block: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - journal\_header\_t + - r\_header + - Common block header. + * - 0xC + - \_\_be32 + - r\_count + - Number of bytes used in this block. + * - 0x10 + - \_\_be32 or \_\_be64 + - blocks[0] + - Blocks to revoke. + +After r\_count is a linear array of block numbers that are effectively +revoked by this transaction. The size of each block number is 8 bytes if +the superblock advertises 64-bit block number support, or 4 bytes +otherwise. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation +block is a ``struct jbd2_journal_revoke_tail``, which has this format: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - r\_checksum + - Checksum of the journal UUID + revocation block + +Commit Block +~~~~~~~~~~~~ + +The commit block is a sentry that indicates that a transaction has been +completely written to the journal. Once this commit block reaches the +journal, the data stored with this transaction can be written to their +final locations on disk. + +The commit block is described by ``struct commit_header``, which is 60 +bytes long (but uses a full block): + +..
list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_s + - (open coded) + - Common block header. + * - 0xC + - unsigned char + - h\_chksum\_type + - The type of checksum to use to verify the integrity of the data blocks + in the transaction. See jbd2_checksum_type_ for more info. + * - 0xD + - unsigned char + - h\_chksum\_size + - The number of bytes used by the checksum. Most likely 4. + * - 0xE + - unsigned char + - h\_padding[2] + - + * - 0x10 + - \_\_be32 + - h\_chksum[JBD2\_CHECKSUM\_BYTES] + - 32 bytes of space to store checksums. If + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 + are set, the first ``__be32`` is the checksum of the journal UUID and + the entire commit block, with this field zeroed. If + JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the + crc32 of all the blocks already written to the transaction. + * - 0x30 + - \_\_be64 + - h\_commit\_sec + - The time that the transaction was committed, in seconds since the epoch. + * - 0x38 + - \_\_be32 + - h\_commit\_nsec + - Nanoseconds component of the above timestamp. + diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst new file mode 100644 index 000000000000..b7d7a3137f80 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/mmp.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Multiple Mount Protection +------------------------- + +Multiple mount protection (MMP) is a feature that protects the +filesystem against multiple hosts trying to use the filesystem +simultaneously. When a filesystem is opened (for mounting, or fsck, +etc.), the MMP code running on the node (call it node A) checks a +sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the +open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then +fsck is (hopefully) running, and open fails immediately. 
Otherwise, the +open code will wait for twice the specified MMP check interval and check +the sequence number again. If the sequence number has changed, then the +filesystem is active on another machine and the open fails. If the MMP +code passes all of those checks, a new MMP sequence number is generated +and written to the MMP block, and the mount proceeds. + +While the filesystem is live, the kernel sets up a timer to re-check the +MMP block at the specified MMP check interval. To perform the re-check, +the MMP sequence number is re-read; if it does not match the in-memory +MMP sequence number, then another node (node B) has mounted the +filesystem, and node A remounts the filesystem read-only. If the +sequence numbers match, the sequence number is incremented both in +memory and on disk, and the re-check is complete. + +The hostname and device filename are written into the MMP block whenever +an open operation succeeds. The MMP code does not use these values; they +are provided purely for informational purposes. + +The checksum is calculated against the FS UUID and the MMP structure. +The MMP structure (``struct mmp_struct``) is as follows: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - mmp\_magic + - Magic number for MMP, 0x004D4D50 (“MMP”). + * - 0x4 + - \_\_le32 + - mmp\_seq + - Sequence number, updated periodically. + * - 0x8 + - \_\_le64 + - mmp\_time + - Time that the MMP block was last updated. + * - 0x10 + - char[64] + - mmp\_nodename + - Hostname of the node that opened the filesystem. + * - 0x50 + - char[32] + - mmp\_bdevname + - Block device name of the filesystem. + * - 0x70 + - \_\_le16 + - mmp\_check\_interval + - The MMP re-check interval, in seconds. + * - 0x72 + - \_\_le16 + - mmp\_pad1 + - Zero. + * - 0x74 + - \_\_le32[226] + - mmp\_pad2 + - Zero. + * - 0x3FC + - \_\_le32 + - mmp\_checksum + - Checksum of the MMP block. 
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst new file mode 100644 index 000000000000..cbab18baba12 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/overview.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +High Level Design +================= + +An ext4 file system is split into a series of block groups. To reduce +performance difficulties due to fragmentation, the block allocator tries +very hard to keep each file's blocks within the same group, thereby +reducing seek times. The size of a block group is specified in +``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \* +``block_size_in_bytes``. With the default block size of 4KiB, each group +will contain 32,768 blocks, for a length of 128MiB. The number of block +groups is the size of the device divided by the size of a block group. + +All fields in ext4 are written to disk in little-endian order. HOWEVER, +all fields in jbd2 (the journal) are written to disk in big-endian +order. + +.. include:: blocks.rst +.. include:: blockgroup.rst +.. include:: special_inodes.rst +.. include:: allocators.rst +.. include:: checksums.rst +.. include:: bigalloc.rst +.. include:: inlinedata.rst +.. include:: eainode.rst diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst new file mode 100644 index 000000000000..a82f70c9baeb --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Special inodes +-------------- + +ext4 reserves some inodes for special features, as follows: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - inode Number + - Purpose + * - 0 + - Doesn't exist; there is no inode 0. + * - 1 + - List of defective blocks. + * - 2 + - Root directory. + * - 3 + - User quota. + * - 4 + - Group quota. + * - 5 + - Boot loader.
+ * - 6 + - Undelete directory. + * - 7 + - Reserved group descriptors inode. (“resize inode”) + * - 8 + - Journal inode. + * - 9 + - The “exclude” inode, for snapshots(?) + * - 10 + - Replica inode, used for some non-upstream feature? + * - 11 + - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. + diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst new file mode 100644 index 000000000000..5f81dd87e0b9 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/super.rst @@ -0,0 +1,801 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Super Block +----------- + +The superblock records various information about the enclosing +filesystem, such as block counts, inode counts, supported features, +maintenance information, and more. + +If the sparse\_super feature flag is set, redundant copies of the +superblock and group descriptors are kept only in the groups whose group +number is either 0 or a power of 3, 5, or 7. If the flag is not set, +redundant copies are kept in all groups. + +The superblock checksum is calculated against the superblock structure, +which includes the FS UUID. + +The ext4 superblock is laid out as follows in +``struct ext4_super_block``: + +.. list-table:: + :widths: 1 1 1 77 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - s\_inodes\_count + - Total inode count. + * - 0x4 + - \_\_le32 + - s\_blocks\_count\_lo + - Total block count. + * - 0x8 + - \_\_le32 + - s\_r\_blocks\_count\_lo + - This number of blocks can only be allocated by the super-user. + * - 0xC + - \_\_le32 + - s\_free\_blocks\_count\_lo + - Free block count. + * - 0x10 + - \_\_le32 + - s\_free\_inodes\_count + - Free inode count. + * - 0x14 + - \_\_le32 + - s\_first\_data\_block + - First data block. This must be at least 1 for 1k-block filesystems and + is typically 0 for all other block sizes. 
+ * - 0x18 + - \_\_le32 + - s\_log\_block\_size + - Block size is 2 ^ (10 + s\_log\_block\_size). + * - 0x1C + - \_\_le32 + - s\_log\_cluster\_size + - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is + enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. + * - 0x20 + - \_\_le32 + - s\_blocks\_per\_group + - Blocks per group. + * - 0x24 + - \_\_le32 + - s\_clusters\_per\_group + - Clusters per group, if bigalloc is enabled. Otherwise + s\_clusters\_per\_group must equal s\_blocks\_per\_group. + * - 0x28 + - \_\_le32 + - s\_inodes\_per\_group + - Inodes per group. + * - 0x2C + - \_\_le32 + - s\_mtime + - Mount time, in seconds since the epoch. + * - 0x30 + - \_\_le32 + - s\_wtime + - Write time, in seconds since the epoch. + * - 0x34 + - \_\_le16 + - s\_mnt\_count + - Number of mounts since the last fsck. + * - 0x36 + - \_\_le16 + - s\_max\_mnt\_count + - Number of mounts beyond which a fsck is needed. + * - 0x38 + - \_\_le16 + - s\_magic + - Magic signature, 0xEF53 + * - 0x3A + - \_\_le16 + - s\_state + - File system state. See super_state_ for more info. + * - 0x3C + - \_\_le16 + - s\_errors + - Behaviour when detecting errors. See super_errors_ for more info. + * - 0x3E + - \_\_le16 + - s\_minor\_rev\_level + - Minor revision level. + * - 0x40 + - \_\_le32 + - s\_lastcheck + - Time of last check, in seconds since the epoch. + * - 0x44 + - \_\_le32 + - s\_checkinterval + - Maximum time between checks, in seconds. + * - 0x48 + - \_\_le32 + - s\_creator\_os + - Creator OS. See the table super_creator_ for more info. + * - 0x4C + - \_\_le32 + - s\_rev\_level + - Revision level. See the table super_revision_ for more info. + * - 0x50 + - \_\_le16 + - s\_def\_resuid + - Default uid for reserved blocks. + * - 0x52 + - \_\_le16 + - s\_def\_resgid + - Default gid for reserved blocks. + * - + - + - + - These fields are for EXT4_DYNAMIC_REV superblocks only. 
+ + Note: the difference between the compatible feature set and the + incompatible feature set is that if there is a bit set in the + incompatible feature set that the kernel doesn't know about, it should + refuse to mount the filesystem. + + e2fsck's requirements are more strict; if it doesn't know + about a feature in either the compatible or incompatible feature set, it + must abort and not try to meddle with things it doesn't understand... + * - 0x54 + - \_\_le32 + - s\_first\_ino + - First non-reserved inode. + * - 0x58 + - \_\_le16 + - s\_inode\_size + - Size of inode structure, in bytes. + * - 0x5A + - \_\_le16 + - s\_block\_group\_nr + - Block group # of this superblock. + * - 0x5C + - \_\_le32 + - s\_feature\_compat + - Compatible feature set flags. Kernel can still read/write this fs even + if it doesn't understand a flag; fsck should not do that. See the + super_compat_ table for more info. + * - 0x60 + - \_\_le32 + - s\_feature\_incompat + - Incompatible feature set. If the kernel or fsck doesn't understand one + of these bits, it should stop. See the super_incompat_ table for more + info. + * - 0x64 + - \_\_le32 + - s\_feature\_ro\_compat + - Readonly-compatible feature set. If the kernel doesn't understand one of + these bits, it can still mount read-only. See the super_rocompat_ table + for more info. + * - 0x68 + - \_\_u8 + - s\_uuid[16] + - 128-bit UUID for volume. + * - 0x78 + - char + - s\_volume\_name[16] + - Volume label. + * - 0x88 + - char + - s\_last\_mounted[64] + - Directory where filesystem was last mounted. + * - 0xC8 + - \_\_le32 + - s\_algorithm\_usage\_bitmap + - For compression (Not used in e2fsprogs/Linux) + * - + - + - + - Performance hints. Directory preallocation should only happen if the + EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + * - 0xCC + - \_\_u8 + - s\_prealloc\_blocks + - #. of blocks to try to preallocate for ... files? (Not used in + e2fsprogs/Linux) + * - 0xCD + - \_\_u8 + - s\_prealloc\_dir\_blocks + - #. 
of blocks to preallocate for directories. (Not used in + e2fsprogs/Linux) + * - 0xCE + - \_\_le16 + - s\_reserved\_gdt\_blocks + - Number of reserved GDT entries for future filesystem expansion. + * - + - + - + - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is + set. + * - 0xD0 + - \_\_u8 + - s\_journal\_uuid[16] + - UUID of journal superblock + * - 0xE0 + - \_\_le32 + - s\_journal\_inum + - inode number of journal file. + * - 0xE4 + - \_\_le32 + - s\_journal\_dev + - Device number of journal file, if the external journal feature flag is + set. + * - 0xE8 + - \_\_le32 + - s\_last\_orphan + - Start of list of orphaned inodes to delete. + * - 0xEC + - \_\_le32 + - s\_hash\_seed[4] + - HTREE hash seed. + * - 0xFC + - \_\_u8 + - s\_def\_hash\_version + - Default hash algorithm to use for directory hashes. See super_def_hash_ + for more info. + * - 0xFD + - \_\_u8 + - s\_jnl\_backup\_type + - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the + ``s_jnl_blocks`` field contains a duplicate copy of the inode's + ``i_block[]`` array and ``i_size``. + * - 0xFE + - \_\_le16 + - s\_desc\_size + - Size of group descriptors, in bytes, if the 64bit incompat feature flag + is set. + * - 0x100 + - \_\_le32 + - s\_default\_mount\_opts + - Default mount options. See the super_mountopts_ table for more info. + * - 0x104 + - \_\_le32 + - s\_first\_meta\_bg + - First metablock block group, if the meta\_bg feature is enabled. + * - 0x108 + - \_\_le32 + - s\_mkfs\_time + - When the filesystem was created, in seconds since the epoch. + * - 0x10C + - \_\_le32 + - s\_jnl\_blocks[17] + - Backup copy of the journal inode's ``i_block[]`` array in the first 15 + elements and i\_size\_high and i\_size in the 16th and 17th elements, + respectively. + * - + - + - + - 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set. + * - 0x150 + - \_\_le32 + - s\_blocks\_count\_hi + - High 32-bits of the block count.
+ * - 0x154 + - \_\_le32 + - s\_r\_blocks\_count\_hi + - High 32-bits of the reserved block count. + * - 0x158 + - \_\_le32 + - s\_free\_blocks\_count\_hi + - High 32-bits of the free block count. + * - 0x15C + - \_\_le16 + - s\_min\_extra\_isize + - All inodes have at least # bytes. + * - 0x15E + - \_\_le16 + - s\_want\_extra\_isize + - New inodes should reserve # bytes. + * - 0x160 + - \_\_le32 + - s\_flags + - Miscellaneous flags. See the super_flags_ table for more info. + * - 0x164 + - \_\_le16 + - s\_raid\_stride + - RAID stride. This is the number of logical blocks read from or written + to the disk before moving to the next disk. This affects the placement + of filesystem metadata, which will hopefully make RAID storage faster. + * - 0x166 + - \_\_le16 + - s\_mmp\_interval + - Number of seconds to wait in multi-mount prevention (MMP) checking. + MMP is a mechanism that records, in the MMP block located by + ``s_mmp_block``, which host and device have opened the filesystem, in + order to prevent multiple mounts. + * - 0x168 + - \_\_le64 + - s\_mmp\_block + - Block # for multi-mount protection data. + * - 0x170 + - \_\_le32 + - s\_raid\_stripe\_width + - RAID stripe width. This is the number of logical blocks read from or + written to the disk before coming back to the current disk. This is used + by the block allocator to try to reduce the number of read-modify-write + operations in a RAID5/6. + * - 0x174 + - \_\_u8 + - s\_log\_groups\_per\_flex + - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. + * - 0x175 + - \_\_u8 + - s\_checksum\_type + - Metadata checksum algorithm type. The only valid value is 1 (crc32c). + * - 0x176 + - \_\_le16 + - s\_reserved\_pad + - + * - 0x178 + - \_\_le64 + - s\_kbytes\_written + - Number of KiB written to this filesystem over its lifetime. + * - 0x180 + - \_\_le32 + - s\_snapshot\_inum + - inode number of active snapshot. (Not used in e2fsprogs/Linux.)
+ * - 0x184 + - \_\_le32 + - s\_snapshot\_id + - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) + * - 0x188 + - \_\_le64 + - s\_snapshot\_r\_blocks\_count + - Number of blocks reserved for active snapshot's future use. (Not used in + e2fsprogs/Linux.) + * - 0x190 + - \_\_le32 + - s\_snapshot\_list + - inode number of the head of the on-disk snapshot list. (Not used in + e2fsprogs/Linux.) + * - 0x194 + - \_\_le32 + - s\_error\_count + - Number of errors seen. + * - 0x198 + - \_\_le32 + - s\_first\_error\_time + - First time an error happened, in seconds since the epoch. + * - 0x19C + - \_\_le32 + - s\_first\_error\_ino + - inode involved in first error. + * - 0x1A0 + - \_\_le64 + - s\_first\_error\_block + - Number of block involved in first error. + * - 0x1A8 + - \_\_u8 + - s\_first\_error\_func[32] + - Name of function where the first error happened. + * - 0x1C8 + - \_\_le32 + - s\_first\_error\_line + - Line number where the first error happened. + * - 0x1CC + - \_\_le32 + - s\_last\_error\_time + - Time of most recent error, in seconds since the epoch. + * - 0x1D0 + - \_\_le32 + - s\_last\_error\_ino + - inode involved in most recent error. + * - 0x1D4 + - \_\_le32 + - s\_last\_error\_line + - Line number where most recent error happened. + * - 0x1D8 + - \_\_le64 + - s\_last\_error\_block + - Number of block involved in most recent error. + * - 0x1E0 + - \_\_u8 + - s\_last\_error\_func[32] + - Name of function where the most recent error happened. + * - 0x200 + - \_\_u8 + - s\_mount\_opts[64] + - ASCIIZ string of mount options. + * - 0x240 + - \_\_le32 + - s\_usr\_quota\_inum + - Inode number of user `quota <quota>`__ file. + * - 0x244 + - \_\_le32 + - s\_grp\_quota\_inum + - Inode number of group `quota <quota>`__ file. + * - 0x248 + - \_\_le32 + - s\_overhead\_blocks + - Overhead blocks/clusters in fs. (Huh? This field is always zero, which + means that the kernel calculates it dynamically.)
+ * - 0x24C + - \_\_le32 + - s\_backup\_bgs[2] + - Block groups containing superblock backups (if sparse\_super2) + * - 0x254 + - \_\_u8 + - s\_encrypt\_algos[4] + - Encryption algorithms in use. There can be up to four algorithms in use + at any time; valid algorithm codes are given in the super_encrypt_ table + below. + * - 0x258 + - \_\_u8 + - s\_encrypt\_pw\_salt[16] + - Salt for the string2key algorithm for encryption. + * - 0x268 + - \_\_le32 + - s\_lpf\_ino + - Inode number of lost+found + * - 0x26C + - \_\_le32 + - s\_prj\_quota\_inum + - Inode that tracks project quotas. + * - 0x270 + - \_\_le32 + - s\_checksum\_seed + - Checksum seed used for metadata\_csum calculations. This value is + crc32c(~0, $orig\_fs\_uuid). + * - 0x274 + - \_\_u8 + - s\_wtime_hi + - Upper 8 bits of the s_wtime field. + * - 0x275 + - \_\_u8 + - s\_mtime_hi + - Upper 8 bits of the s_mtime field. + * - 0x276 + - \_\_u8 + - s\_mkfs_time_hi + - Upper 8 bits of the s_mkfs_time field. + * - 0x277 + - \_\_u8 + - s\_lastcheck_hi + - Upper 8 bits of the s_lastcheck field. + * - 0x278 + - \_\_u8 + - s\_first_error_time_hi + - Upper 8 bits of the s_first_error_time field. + * - 0x279 + - \_\_u8 + - s\_last_error_time_hi + - Upper 8 bits of the s_last_error_time field. + * - 0x27A + - \_\_u8[2] + - s\_pad + - Zero padding. + * - 0x27C + - \_\_le32 + - s\_reserved[96] + - Padding to the end of the block. + * - 0x3FC + - \_\_le32 + - s\_checksum + - Superblock checksum. + +.. _super_state: + +The superblock state is some combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Cleanly umounted + * - 0x0002 + - Errors detected + * - 0x0004 + - Orphans being recovered + +.. _super_errors: + +The superblock error policy is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Continue + * - 2 + - Remount read-only + * - 3 + - Panic + +..
_super_creator: + +The filesystem creator is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Linux + * - 1 + - Hurd + * - 2 + - Masix + * - 3 + - FreeBSD + * - 4 + - Lites + +.. _super_revision: + +The superblock revision is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Original format + * - 1 + - v2 format w/ dynamic inode sizes + +Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. + +.. _super_compat: + +The superblock compatible features field is a combination of any of the +following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Directory preallocation (COMPAT\_DIR\_PREALLOC). + * - 0x2 + - “imagic inodes”. Not clear from the code what this does + (COMPAT\_IMAGIC\_INODES). + * - 0x4 + - Has a journal (COMPAT\_HAS\_JOURNAL). + * - 0x8 + - Supports extended attributes (COMPAT\_EXT\_ATTR). + * - 0x10 + - Has reserved GDT blocks for filesystem expansion + (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. + * - 0x20 + - Has directory indices (COMPAT\_DIR\_INDEX). + * - 0x40 + - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized + block groups? (COMPAT\_LAZY\_BG) + * - 0x80 + - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). + * - 0x100 + - “Exclude bitmap”. Seems to be used to indicate the presence of + snapshot-related exclude bitmaps? Not defined in kernel or used in + e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). + * - 0x200 + - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs + points to the two block groups that contain backup superblocks + (COMPAT\_SPARSE\_SUPER2). + +.. _super_incompat: + +The superblock incompatible features field is a combination of any of the +following: + +.. 
list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Compression (INCOMPAT\_COMPRESSION). + * - 0x2 + - Directory entries record the file type. See ext4\_dir\_entry\_2 below + (INCOMPAT\_FILETYPE). + * - 0x4 + - Filesystem needs recovery (INCOMPAT\_RECOVER). + * - 0x8 + - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). + * - 0x10 + - Meta block groups. See the earlier discussion of this feature + (INCOMPAT\_META\_BG). + * - 0x40 + - Files in this filesystem use extents (INCOMPAT\_EXTENTS). + * - 0x80 + - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). + * - 0x100 + - Multiple mount protection (INCOMPAT\_MMP). + * - 0x200 + - Flexible block groups. See the earlier discussion of this feature + (INCOMPAT\_FLEX\_BG). + * - 0x400 + - Inodes can be used to store large extended attribute values + (INCOMPAT\_EA\_INODE). + * - 0x1000 + - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) + * - 0x2000 + - Metadata checksum seed is stored in the superblock. This feature enables + the administrator to change the UUID of a metadata\_csum filesystem + while the filesystem is mounted; without it, the checksum definition + requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). + * - 0x4000 + - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to + this feature, directories could not be larger than 4GiB and could not + have an htree more than 2 levels deep. If this feature is enabled, + directories can be larger than 4GiB and have a maximum htree depth of 3. + * - 0x8000 + - Data in inode (INCOMPAT\_INLINE\_DATA). + * - 0x10000 + - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). + +.. _super_rocompat: + +The superblock read-only compatible features field is a combination of any of +the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Sparse superblocks.
See the earlier discussion of this feature + (RO\_COMPAT\_SPARSE\_SUPER). + * - 0x2 + - This filesystem has been used to store a file greater than 2GiB + (RO\_COMPAT\_LARGE\_FILE). + * - 0x4 + - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). + * - 0x8 + - This filesystem has files whose sizes are represented in units of + logical blocks, not 512-byte sectors. This implies a very large file + indeed! (RO\_COMPAT\_HUGE\_FILE) + * - 0x10 + - Group descriptors have checksums. In addition to detecting corruption, + this is useful for lazy formatting with uninitialized groups + (RO\_COMPAT\_GDT\_CSUM). + * - 0x20 + - Indicates that the old ext3 32,000 subdirectory limit no longer applies + (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 + if it is incremented past 64,999. + * - 0x40 + - Indicates that large inodes exist on this filesystem + (RO\_COMPAT\_EXTRA\_ISIZE). + * - 0x80 + - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). + * - 0x100 + - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA). + * - 0x200 + - This filesystem supports “bigalloc”, which means that file extents are + tracked in units of clusters (of blocks) instead of blocks + (RO\_COMPAT\_BIGALLOC). + * - 0x400 + - This filesystem supports metadata checksumming. + (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though + GDT\_CSUM must not be set) + * - 0x800 + - Filesystem supports replicas. This feature is neither in the kernel nor + e2fsprogs. (RO\_COMPAT\_REPLICA) + * - 0x1000 + - Read-only filesystem image; the kernel will not mount this image + read-write and most tools will refuse to write to the image. + (RO\_COMPAT\_READONLY) + * - 0x2000 + - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) + +.. _super_def_hash: + +The ``s_def_hash_version`` field is one of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. 
+ * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +.. _super_mountopts: + +The ``s_default_mount_opts`` field is any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) + * - 0x0002 + - New files take the gid of the containing directory (instead of the fsgid + of the current process). (EXT4\_DEFM\_BSDGROUPS) + * - 0x0004 + - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) + * - 0x0008 + - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) + * - 0x0010 + - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) + * - 0x0020 + - All data and metadata are committed to the journal. + (EXT4\_DEFM\_JMODE\_DATA) + * - 0x0040 + - All data are flushed to the disk before metadata are committed to the + journal. (EXT4\_DEFM\_JMODE\_ORDERED) + * - 0x0060 + - Data ordering is not preserved; data may be written after the metadata + has been written. (EXT4\_DEFM\_JMODE\_WBACK) + * - 0x0100 + - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) + * - 0x0200 + - Track which blocks in a filesystem are metadata and therefore should not + be used as data blocks. This option will be enabled by default on 3.18, + hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) + * - 0x0400 + - Enable DISCARD support, where the storage device is told about blocks + becoming unused. (EXT4\_DEFM\_DISCARD) + * - 0x0800 + - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) + +.. _super_flags: + +The ``s_flags`` field is any combination of the following: + +.. list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Signed directory hash in use. + * - 0x0002 + - Unsigned directory hash in use. + * - 0x0004 + - To test development code. + +.. _super_encrypt: + +The ``s_encrypt_algos`` list can contain any of the following: + +..
list-table:: + :widths: 1 79 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). + * - 1 + - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). + * - 2 + - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). + * - 3 + - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). + +Total size of the superblock is 1024 bytes. diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 69f8de995739..e5edd29687b5 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,24 @@ data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. fault_injection=%d Enable fault injection in all supported types with specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + Type_Name Type_Value + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 53b89d0edc15..46d1b1be3a51 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -71,6 +71,39 @@ Other Functions .. kernel-doc:: fs/block_dev.c :export: +.. kernel-doc:: fs/anon_inodes.c + :export: + +.. kernel-doc:: fs/attr.c + :export: + +.. kernel-doc:: fs/d_path.c + :export: + +.. 
kernel-doc:: fs/dax.c + :export: + +.. kernel-doc:: fs/direct-io.c + :export: + +.. kernel-doc:: fs/file_table.c + :export: + +.. kernel-doc:: fs/libfs.c + :export: + +.. kernel-doc:: fs/posix_acl.c + :export: + +.. kernel-doc:: fs/stat.c + :export: + +.. kernel-doc:: fs/sync.c + :export: + +.. kernel-doc:: fs/xattr.c + :export: + The proc filesystem =================== diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 72615a2c0752..51c136c821bf 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -10,10 +10,6 @@ union-filesystems). An overlay-filesystem tries to present a filesystem which is the result over overlaying one filesystem on top of the other. -The result will inevitably fail to look exactly like a normal -filesystem for various technical reasons. The expectation is that -many use cases will be able to ignore these differences. - Overlay objects --------------- @@ -266,6 +262,30 @@ rightmost one and going left. In the above example lower1 will be the top, lower2 the middle and lower3 the bottom layer. +Metadata only copy up +--------------------- + +When the metadata only copy up feature is enabled, overlayfs will only copy +up metadata (as opposed to the whole file) when a metadata specific operation +like chown/chmod is performed. The full file will be copied up later when +the file is opened for a WRITE operation. + +In other words, this is a delayed data copy up operation and data is copied +up when there is a need to actually modify data. + +There are multiple ways to enable/disable this feature. A config option +CONFIG_OVERLAY_FS_METACOPY can be set/unset to enable/disable this feature +by default. Or one can enable/disable it at module load time with module +parameter metacopy=on/off. Lastly, there is also a per mount option +metacopy=on/off to enable/disable this feature per mount. + +Do not use metacopy=on with untrusted upper/lower directories.
Otherwise +it is possible that an attacker can create a handcrafted file with +appropriate REDIRECT and METACOPY xattrs, and gain access to the file on the lower layer +pointed to by REDIRECT. This should not be possible on a local system as setting +"trusted." xattrs will require CAP_SYS_ADMIN. But it should be possible +for untrusted layers, such as those from a pen drive. + Sharing and copying layers -------------------------- @@ -284,7 +304,7 @@ though it will not result in a crash or deadlock. Mounting an overlay using an upper layer path, where the upper layer path was previously used by another mounted overlay in combination with a different lower layer path, is allowed, unless the "inodes index" feature -is enabled. +or "metadata only copy up" feature is enabled. With the "inodes index" feature, on the first time mount, an NFS file handle of the lower layer root directory, along with the UUID of the lower @@ -297,6 +317,10 @@ lower root origin, mount will fail with ESTALE. An overlayfs mount with does not support NFS export, lower filesystem does not have a valid UUID or if the upper filesystem does not support extended attributes. +For the "metadata only copy up" feature there is no verification mechanism at +mount time. So if the same upper is mounted with a different set of lower layers, the mount +will probably succeed but expect the unexpected later on. So don't do it. + It is quite a common practice to copy overlay layers to a different directory tree on the same or different underlying filesystem, and even to a different machine. With the "inodes index" feature, trying to mount @@ -306,27 +330,40 @@ the copied layers will fail the verification of the lower root file handle. Non-standard behavior --------------------- -The copy_up operation essentially creates a new, identical file and -moves it over to the old name. Any open files referring to this inode -will access the old data.
+Overlayfs can now act as a POSIX compliant filesystem with the following +features turned on: + +1) "redirect_dir" + +Enabled with the mount option or module option: "redirect_dir=on" or with +the kernel config option CONFIG_OVERLAY_FS_REDIRECT_DIR=y. + +If this feature is disabled, then rename(2) on a lower or merged directory +will fail with EXDEV ("Invalid cross-device link"). + +2) "inode index" + +Enabled with the mount option or module option "index=on" or with the +kernel config option CONFIG_OVERLAY_FS_INDEX=y. -The new file may be on a different filesystem, so both st_dev and st_ino -of the real file may change. The values of st_dev and st_ino returned by -stat(2) on an overlay object are often not the same as the real file -stat(2) values to prevent the values from changing on copy_up. +If this feature is disabled and a file with multiple hard links is copied +up, then this will "break" the link. Changes will not be propagated to +other names referring to the same inode. -Unless "xino" feature is enabled, when overlay layers are not all on the -same underlying filesystem, the value of st_dev may be different for two -non-directory objects in the same overlay filesystem and the value of -st_ino for directory objects may be non persistent and could change even -while the overlay filesystem is still mounted. +3) "xino" -Unless "inode index" feature is enabled, if a file with multiple hard -links is copied up, then this will "break" the link. Changes will not be -propagated to other names referring to the same inode. +Enabled with the mount option "xino=auto" or "xino=on", with the module +option "xino_auto=on" or with the kernel config option +CONFIG_OVERLAY_FS_XINO_AUTO=y. Also implicitly enabled by using the same +underlying filesystem for all layers making up the overlay. -Unless "redirect_dir" feature is enabled, rename(2) on a lower or merged -directory will fail with EXDEV. 
+If this feature is disabled or the underlying filesystem doesn't have +enough free bits in the inode number, then overlayfs will not be able to +guarantee that the values of st_ino and st_dev returned by stat(2) and the +value of d_ino returned by readdir(3) will behave as they do on a normal filesystem. +E.g. the value of st_dev may be different for two objects in the same +overlay filesystem and the value of st_ino for directory objects may not be +persistent and could change even while the overlay filesystem is mounted. Changes to underlying filesystems diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 17bb4dc28fae..7b7b845c490a 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -602,3 +602,23 @@ in your dentry operations instead. dentry separately, and it now has request_mask and query_flags arguments to specify the fields and sync type requested by statx. Filesystems not supporting any statx-specific features may ignore the new arguments. +-- +[mandatory] + ->atomic_open() calling conventions have changed. Gone is int *opened, + along with FILE_OPENED/FILE_CREATED. In place of those we have + FMODE_OPENED/FMODE_CREATED, set in file->f_mode. Additionally, the return + value for the 'called finish_no_open(), open it yourself' case has become + 0, not 1. Since finish_no_open() itself is returning 0 now, that part + does not need any changes in ->atomic_open() instances. +-- +[mandatory] + alloc_file() has become static now; two wrappers are to be used instead. + alloc_file_pseudo(inode, vfsmount, name, flags, ops) is for the cases + when dentry needs to be created; that's the majority of old alloc_file() + users. Calling conventions: on success a reference to the new struct file + is returned and the caller's reference to the inode is subsumed by that. On + failure, ERR_PTR() is returned and no caller's references are affected, + so the caller needs to drop the inode reference it held.
+ alloc_file_clone(file, flags, ops) does not affect any caller's references. + On success you get a new struct file sharing the mount/dentry with the + original, on failure - ERR_PTR(). diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 520f6a84cf50..22b4b00dee31 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -870,6 +870,8 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +Percpu: 62080 kB +HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB ShmemPmdMapped: 0 kB @@ -915,6 +917,8 @@ MemAvailable: An estimate of how much memory is available for starting new Dirty: Memory which is waiting to get written back to the disk Writeback: Memory which is actively being written back to the disk AnonPages: Non-file backed pages mapped into userspace page tables +HardwareCorrupted: The amount of RAM/memory in KB, the kernel identifies as + corrupted. AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Shmem: Total memory used by shared memory (shmem) and tmpfs @@ -959,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free + Percpu: Memory allocated to the percpu allocator used to back percpu + allocations. This stat excludes the cost of metadata. .............................................................................. 
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt index 33e2f3694733..cd709a94d054 100644 --- a/Documentation/filesystems/relay.txt +++ b/Documentation/filesystems/relay.txt @@ -222,7 +222,7 @@ using debugfs: */ static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { @@ -375,7 +375,7 @@ would be very similar: static int subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, - unsigned int prev_padding) + size_t prev_padding) { if (prev_subbuf) *((unsigned *)prev_subbuf) = prev_padding; diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt index 9de4303201e1..d412b236a9d6 100644 --- a/Documentation/filesystems/seq_file.txt +++ b/Documentation/filesystems/seq_file.txt @@ -66,23 +66,39 @@ kernel 3.10. Current versions require the following update The iterator interface -Modules implementing a virtual file with seq_file must implement a simple -iterator object that allows stepping through the data of interest. -Iterators must be able to move to a specific position - like the file they -implement - but the interpretation of that position is up to the iterator -itself. A seq_file implementation that is formatting firewall rules, for -example, could interpret position N as the Nth rule in the chain. -Positioning can thus be done in whatever way makes the most sense for the -generator of the data, which need not be aware of how a position translates -to an offset in the virtual file. The one obvious exception is that a -position of zero should indicate the beginning of the file. +Modules implementing a virtual file with seq_file must implement an +iterator object that allows stepping through the data of interest +during a "session" (roughly one read() system call). 
If the iterator +is able to move to a specific position - like the file it implements, +though with freedom to map the position number to a sequence location +in whatever way is convenient - the iterator need only exist +transiently during a session. If the iterator cannot easily find a +numerical position but works well with a first/next interface, the +iterator can be stored in the private data area and continue from one +session to the next. + +A seq_file implementation that is formatting firewall rules from a +table, for example, could provide a simple iterator that interprets +position N as the Nth rule in the chain. A seq_file implementation +that presents the content of a potentially volatile linked list +might record a pointer into that list, provided that this can be done +without risk of the current location being removed. + +Positioning can thus be done in whatever way makes the most sense for +the generator of the data, which need not be aware of how a position +translates to an offset in the virtual file. The one obvious exception +is that a position of zero should indicate the beginning of the file. The /proc/sequence iterator just uses the count of the next number it will output as its position. -Four functions must be implemented to make the iterator work. The first, -called start() takes a position as an argument and returns an iterator -which will start reading at that position. For our simple sequence example, +Four functions must be implemented to make the iterator work. The +first, called start(), starts a session and takes a position as an +argument, returning an iterator which will start reading at that +position. The pos passed to start() will always be either zero, or +the most recent pos used in the previous session.
+ +For our simple sequence example, the start() function looks like: static void *ct_seq_start(struct seq_file *s, loff_t *pos) @@ -101,11 +117,12 @@ implementations; in most cases the start() function should check for a "past end of file" condition and return NULL if need be. For more complicated applications, the private field of the seq_file -structure can be used. There is also a special value which can be returned -by the start() function called SEQ_START_TOKEN; it can be used if you wish -to instruct your show() function (described below) to print a header at the -top of the output. SEQ_START_TOKEN should only be used if the offset is -zero, however. +structure can be used to hold state from session to session. There is +also a special value which can be returned by the start() function +called SEQ_START_TOKEN; it can be used if you wish to instruct your +show() function (described below) to print a header at the top of the +output. SEQ_START_TOKEN should only be used if the offset is zero, +however. The next function to implement is called, amazingly, next(); its job is to move the iterator forward to the next position in the sequence. The @@ -121,9 +138,13 @@ complete. Here's the example version: return spos; } -The stop() function is called when iteration is complete; its job, of -course, is to clean up. If dynamic memory is allocated for the iterator, -stop() is the place to free it. +The stop() function closes a session; its job, of course, is to clean +up. If dynamic memory is allocated for the iterator, stop() is the +place to free it; if a lock was taken by start(), stop() must release +that lock. The value that *pos was set to by the last next() call +before stop() is remembered, and used for the first start() call of +the next session unless lseek() has been called on the file; in that +case next start() will be asked to start at position zero. 
static void ct_seq_stop(struct seq_file *s, void *v) { diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 829a7b7857a4..4b2084d0f1fb 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -386,7 +386,7 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, - unsigned open_flag, umode_t create_mode, int *opened); + unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct inode *, struct dentry *, umode_t); }; @@ -496,13 +496,15 @@ otherwise noted. atomic_open: called on the last component of an open. Using this optional method the filesystem can look up, possibly create and open the file in - one atomic operation. If it cannot perform this (e.g. the file type - turned out to be wrong) it may signal this by returning 1 instead of - usual 0 or -ve . This method is only called if the last component is - negative or needs lookup. Cached positive dentries are still handled by - f_op->open(). If the file was created, the FILE_CREATED flag should be - set in "opened". In case of O_EXCL the method must only succeed if the - file didn't exist and hence FILE_CREATED shall always be set on success. + one atomic operation. If it wants to leave actual opening to the + caller (e.g. if the file turned out to be a symlink, device, or just + something filesystem won't do atomic open for), it may signal this by + returning finish_no_open(file, dentry). This method is only called if + the last component is negative or needs lookup. Cached positive dentries + are still handled by f_op->open(). If the file was created, + FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL + the method must only succeed if the file didn't exist and hence FMODE_CREATED + shall always be set on success. tmpfile: called in the end of O_TMPFILE open(). 
Optional, equivalent to atomically creating, opening and unlinking a file in given directory. @@ -857,8 +859,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -903,17 +903,6 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls - get_poll_head: Returns the struct wait_queue_head that callers can - wait on. Callers need to check the returned events using ->poll_mask - once woken. Can return NULL to indicate polling is not supported, - or any error code using the ERR_PTR convention to indicate that a - grave error occured and ->poll_mask shall not be called. - - poll_mask: return the mask of EPOLL* values describing the file descriptor - state. Called either before going to sleep on the waitqueue returned by - get_poll_head, or after it has been woken. If ->get_poll_head and - ->poll_mask are implemented ->poll does not need to be implement. - unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls @@ -1000,8 +989,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + struct dentry *(*d_real)(struct dentry *, const struct inode *); }; d_revalidate: called when the VFS needs to revalidate a dentry. 
This @@ -1135,22 +1123,15 @@ struct dentry_operations { dentry being transited from. d_real: overlay/union type filesystems implement this method to return one of - the underlying dentries hidden by the overlay. It is used in three + the underlying dentries hidden by the overlay. It is used in two different modes: - Called from open it may need to copy-up the file depending on the - supplied open flags. This mode is selected with a non-zero flags - argument. In this mode the d_real method can return an error. - Called from file_dentry() it returns the real dentry matching the inode argument. The real dentry may be from a lower layer already copied up, but still referenced from the file. This mode is selected with a - non-NULL inode argument. This will always succeed. - - With NULL inode and zero flags the topmost real underlying dentry is - returned. This will always succeed. + non-NULL inode argument. - This method is never called with both non-NULL inode and non-zero flags. + With NULL inode the topmost real underlying dentry is returned. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 4d9ff0a7f8e1..a9ae82fb9d13 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -223,8 +223,6 @@ Deprecated Mount Options Name Removal Schedule ---- ---------------- - barrier no earlier than v4.15 - nobarrier no earlier than v4.15 Removed Mount Options @@ -236,6 +234,8 @@ Removed Mount Options ihashsize v4.0 irixsgid v4.0 osyncisdsync/osyncisosync v4.0 + barrier v4.19 + nobarrier v4.19 sysctls |