diff options
Diffstat (limited to 'Documentation')
18 files changed, 850 insertions, 11 deletions
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 344e85cc7323..d7273a5f6456 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -17,7 +17,7 @@ User addresses have bits 63:48 set to 0 while the kernel addresses have the same bits set to 1. TTBRx selection is given by bit 63 of the virtual address. The swapper_pg_dir contains only kernel (global) mappings while the user pgd contains only user (non-global) mappings. -The swapper_pgd_dir address is written to TTBR1 and never written to +The swapper_pg_dir address is written to TTBR1 and never written to TTBR0. diff --git a/Documentation/devicetree/bindings/mailbox/mailbox.txt b/Documentation/devicetree/bindings/mailbox/mailbox.txt new file mode 100644 index 000000000000..1a2cd3d266db --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/mailbox.txt @@ -0,0 +1,38 @@ +* Generic Mailbox Controller and client driver bindings + +Generic binding to provide a way for Mailbox controller drivers to +assign appropriate mailbox channel to client drivers. + +* Mailbox Controller + +Required property: +- #mbox-cells: Must be at least 1. Number of cells in a mailbox + specifier. + +Example: + mailbox: mailbox { + ... + #mbox-cells = <1>; + }; + + +* Mailbox Client + +Required property: +- mboxes: List of phandle and mailbox channel specifiers. + +Optional property: +- mbox-names: List of identifier strings for each mailbox channel + required by the client. The use of this property + is discouraged in favor of using index in list of + 'mboxes' while requesting a mailbox. Instead the + platforms may define channel indices, in DT headers, + to something legible. + +Example: + pwr_cntrl: power { + ... + mbox-names = "pwr-ctrl", "rpc"; + mboxes = <&mailbox 0 + &mailbox 1>; + }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt index 0bda229a6171..3899d6a557c1 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt @@ -1,5 +1,20 @@ Freescale FlexTimer Module (FTM) PWM controller +The same FTM PWM device can have a different endianness on different SoCs. The +device tree provides a property to describing this so that an operating system +device driver can handle all variants of the device. Refer to the table below +for the endianness of the FTM PWM block as integrated into the existing SoCs: + + SoC | FTM-PWM endianness + --------+------------------- + Vybrid | LE + LS1 | BE + LS2 | LE + +Please see ../regmap/regmap.txt for more detail about how to specify endian +modes in device tree. + + Required properties: - compatible: Should be "fsl,vf610-ftm-pwm". - reg: Physical base address and length of the controller's registers @@ -16,7 +31,8 @@ Required properties: - pinctrl-names: Must contain a "default" entry. - pinctrl-NNN: One property must exist for each entry in pinctrl-names. See pinctrl/pinctrl-bindings.txt for details of the property values. - +- big-endian: Boolean property, required if the FTM PWM registers use a big- + endian rather than little-endian layout. Example: @@ -32,4 +48,5 @@ pwm0: pwm@40038000 { <&clks VF610_CLK_FTM0_EXT_FIX_EN>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_pwm0_1>; + big-endian; }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt index d47d15a6a298..b8be3d09ee26 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt @@ -7,8 +7,8 @@ Required properties: "rockchip,vop-pwm": found integrated in VOP on RK3288 SoC - reg: physical base address and length of the controller's registers - clocks: phandle and clock specifier of the PWM reference clock - - #pwm-cells: should be 2. See pwm.txt in this directory for a - description of the cell format. + - #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.txt in this directory + for a description of the cell format. Example: diff --git a/Documentation/devicetree/bindings/thermal/imx-thermal.txt b/Documentation/devicetree/bindings/thermal/imx-thermal.txt index 1f0f67234a91..3c67bd50aa10 100644 --- a/Documentation/devicetree/bindings/thermal/imx-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/imx-thermal.txt @@ -1,7 +1,10 @@ * Temperature Monitor (TEMPMON) on Freescale i.MX SoCs Required properties: -- compatible : "fsl,imx6q-thermal" +- compatible : "fsl,imx6q-tempmon" for i.MX6Q, "fsl,imx6sx-tempmon" for i.MX6SX. + i.MX6SX has two more IRQs than i.MX6Q, one is IRQ_LOW and the other is IRQ_PANIC, + when temperature is below than low threshold, IRQ_LOW will be triggered, when temperature + is higher than panic threshold, system will auto reboot by SRC module. - fsl,tempmon : phandle pointer to system controller that contains TEMPMON control registers, e.g. ANATOP on imx6q. - fsl,tempmon-data : phandle pointer to fuse controller that contains TEMPMON diff --git a/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt new file mode 100644 index 000000000000..c3a36ee45552 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt @@ -0,0 +1,24 @@ +Zynq Watchdog Device Tree Bindings +------------------------------------------- + +Required properties: +- compatible : Should be "cdns,wdt-r1p2". +- clocks : This is pclk (APB clock). +- interrupts : This is wd_irq - watchdog timeout interrupt. +- interrupt-parent : Must be core interrupt controller. + +Optional properties +- reset-on-timeout : If this property exists, then a reset is done + when watchdog times out. +- timeout-sec : Watchdog timeout value (in seconds). + +Example: + watchdog@f8005000 { + compatible = "cdns,wdt-r1p2"; + clocks = <&clkc 45>; + interrupt-parent = <&intc>; + interrupts = <0 9 1>; + reg = <0xf8005000 0x1000>; + reset-on-timeout; + timeout-sec = <10>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt index e52ba2da868c..8dab6fd024aa 100644 --- a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt @@ -7,7 +7,8 @@ Required properties: Optional property: - big-endian: If present the watchdog device's registers are implemented - in big endian mode, otherwise in little mode. + in big endian mode, otherwise in native mode(same with CPU), for more + detail please see: Documentation/devicetree/bindings/regmap/regmap.txt. Examples: diff --git a/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt new file mode 100644 index 000000000000..9200fc2d508c --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt @@ -0,0 +1,13 @@ +Meson SoCs Watchdog timer + +Required properties: + +- compatible : should be "amlogic,meson6-wdt" +- reg : Specifies base physical address and size of the registers. + +Example: + +wdt: watchdog@c1109900 { + compatible = "amlogic,meson6-wdt"; + reg = <0xc1109900 0x8>; +}; diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt new file mode 100644 index 000000000000..4726924d034e --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt @@ -0,0 +1,24 @@ +Qualcomm Krait Processor Sub-system (KPSS) Watchdog +--------------------------------------------------- + +Required properties : +- compatible : shall contain only one of the following: + + "qcom,kpss-wdt-msm8960" + "qcom,kpss-wdt-apq8064" + "qcom,kpss-wdt-ipq8064" + +- reg : shall contain base register location and length +- clocks : shall contain the input clock + +Optional properties : +- timeout-sec : shall contain the default watchdog timeout in seconds, + if unset, the default timeout is 30 seconds + +Example: + watchdog@208a038 { + compatible = "qcom,kpss-wdt-ipq8064"; + reg = <0x0208a038 0x40>; + clocks = <&sleep_clk>; + timeout-sec = <10>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt index cfff37511aac..8f3d96af81d7 100644 --- a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt @@ -9,6 +9,7 @@ Required properties: (a) "samsung,s3c2410-wdt" for Exynos4 and previous SoCs (b) "samsung,exynos5250-wdt" for Exynos5250 (c) "samsung,exynos5420-wdt" for Exynos5420 + (c) "samsung,exynos7-wdt" for Exynos7 - reg : base physical address of the controller and length of memory mapped region. diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 94d93b1f8b53..b30753cbf431 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt new file mode 100644 index 000000000000..530850a72735 --- /dev/null +++ b/Documentation/filesystems/overlayfs.txt @@ -0,0 +1,198 @@ +Written by: Neil Brown <neilb@suse.de> + +Overlay Filesystem +================== + +This document describes a prototype for a new approach to providing +overlay-filesystem functionality in Linux (sometimes referred to as +union-filesystems). An overlay-filesystem tries to present a +filesystem which is the result over overlaying one filesystem on top +of the other. + +The result will inevitably fail to look exactly like a normal +filesystem for various technical reasons. The expectation is that +many use cases will be able to ignore these differences. + +This approach is 'hybrid' because the objects that appear in the +filesystem do not all appear to belong to that filesystem. In many +cases an object accessed in the union will be indistinguishable +from accessing the corresponding object from the original filesystem. +This is most obvious from the 'st_dev' field returned by stat(2). + +While directories will report an st_dev from the overlay-filesystem, +all non-directory objects will report an st_dev from the lower or +upper filesystem that is providing the object. Similarly st_ino will +only be unique when combined with st_dev, and both of these can change +over the lifetime of a non-directory object. Many applications and +tools ignore these values and will not be affected. + +Upper and Lower +--------------- + +An overlay filesystem combines two filesystems - an 'upper' filesystem +and a 'lower' filesystem. When a name exists in both filesystems, the +object in the 'upper' filesystem is visible while the object in the +'lower' filesystem is either hidden or, in the case of directories, +merged with the 'upper' object. + +It would be more correct to refer to an upper and lower 'directory +tree' rather than 'filesystem' as it is quite possible for both +directory trees to be in the same filesystem and there is no +requirement that the root of a filesystem be given for either upper or +lower. + +The lower filesystem can be any filesystem supported by Linux and does +not need to be writable. The lower filesystem can even be another +overlayfs. The upper filesystem will normally be writable and if it +is it must support the creation of trusted.* extended attributes, and +must provide valid d_type in readdir responses, so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any +filesystem type. + +Directories +----------- + +Overlaying mainly involves directories. If a given name appears in both +upper and lower filesystems and refers to a non-directory in either, +then the lower object is hidden - the name refers only to the upper +object. + +Where both upper and lower objects are directories, a merged directory +is formed. + +At mount time, the two directories given as mount options "lowerdir" and +"upperdir" are combined into a merged directory: + + mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\ +workdir=/work /merged + +The "workdir" needs to be an empty directory on the same filesystem +as upperdir. + +Then whenever a lookup is requested in such a merged directory, the +lookup is performed in each actual directory and the combined result +is cached in the dentry belonging to the overlay filesystem. If both +actual lookups find directories, both are stored and a merged +directory is created, otherwise only one is stored: the upper if it +exists, else the lower. + +Only the lists of names from directories are merged. Other content +such as metadata and extended attributes are reported for the upper +directory only. These attributes of the lower directory are hidden. + +whiteouts and opaque directories +-------------------------------- + +In order to support rm and rmdir without changing the lower +filesystem, an overlay filesystem needs to record in the upper filesystem +that files have been removed. This is done using whiteouts and opaque +directories (non-directories are always opaque). + +A whiteout is created as a character device with 0/0 device number. +When a whiteout is found in the upper level of a merged directory, any +matching name in the lower level is ignored, and the whiteout itself +is also hidden. + +A directory is made opaque by setting the xattr "trusted.overlay.opaque" +to "y". Where the upper filesystem contains an opaque directory, any +directory in the lower filesystem with the same name is ignored. + +readdir +------- + +When a 'readdir' request is made on a merged directory, the upper and +lower directories are each read and the name lists merged in the +obvious way (upper is read first, then lower - entries that already +exist are not re-added). This merged name list is cached in the +'struct file' and so remains as long as the file is kept open. If the +directory is opened and read by two processes at the same time, they +will each have separate caches. A seekdir to the start of the +directory (offset 0) followed by a readdir will cause the cache to be +discarded and rebuilt. + +This means that changes to the merged directory do not appear while a +directory is being read. This is unlikely to be noticed by many +programs. + +seek offsets are assigned sequentially when the directories are read. +Thus if + - read part of a directory + - remember an offset, and close the directory + - re-open the directory some time later + - seek to the remembered offset + +there may be little correlation between the old and new locations in +the list of filenames, particularly if anything has changed in the +directory. + +Readdir on directories that are not merged is simply handled by the +underlying directory (upper or lower). + + +Non-directories +--------------- + +Objects that are not directories (files, symlinks, device-special +files etc.) are presented either from the upper or lower filesystem as +appropriate. When a file in the lower filesystem is accessed in a way +the requires write-access, such as opening for write access, changing +some metadata etc., the file is first copied from the lower filesystem +to the upper filesystem (copy_up). Note that creating a hard-link +also requires copy_up, though of course creation of a symlink does +not. + +The copy_up may turn out to be unnecessary, for example if the file is +opened for read-write but the data is not modified. + +The copy_up process first makes sure that the containing directory +exists in the upper filesystem - creating it and any parents as +necessary. It then creates the object with the same metadata (owner, +mode, mtime, symlink-target etc.) and then if the object is a file, the +data is copied from the lower to the upper filesystem. Finally any +extended attributes are copied up. + +Once the copy_up is complete, the overlay filesystem simply +provides direct access to the newly created file in the upper +filesystem - future operations on the file are barely noticed by the +overlay filesystem (though an operation on the name of the file such as +rename or unlink will of course be noticed and handled). + + +Non-standard behavior +--------------------- + +The copy_up operation essentially creates a new, identical file and +moves it over to the old name. The new file may be on a different +filesystem, so both st_dev and st_ino of the file may change. + +Any open files referring to this inode will access the old data and +metadata. Similarly any file locks obtained before copy_up will not +apply to the copied up file. + +On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and +fsetxattr(2) will fail with EROFS. + +If a file with multiple hard links is copied up, then this will +"break" the link. Changes will not be propagated to other names +referring to the same inode. + +Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory +object in overlayfs will not contain valid absolute paths, only +relative paths leading up to the filesystem's root. This will be +fixed in the future. + +Some operations are not atomic, for example a crash during copy_up or +rename will leave the filesystem in an inconsistent state. This will +be addressed in the future. + +Changes to underlying filesystems +--------------------------------- + +Offline changes, when the overlay is not mounted, are allowed to either +the upper or the lower trees. + +Changes to the underlying filesystems while part of a mounted overlay +filesystem are not allowed. If the underlying filesystem is changed, +the behavior of the overlay is undefined, though it will not result in +a crash or deadlock. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fceff7c00a3c..20bf204426ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -696,6 +697,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7dbe5ec9d9cd..74339c57b914 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1015,10 +1015,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map" } + Format: { "old_map", "nochunk", "noruntime" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. + nochunk: disable reading files in "chunks" in the EFI + boot stub, as chunking can cause problems with some + firmware implementations. + noruntime : disable EFI runtime services support efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of @@ -2232,7 +2236,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nodsp [SH] Disable hardware DSP at boot time. - noefi [X86] Disable EFI runtime services support. + noefi Disable EFI runtime services support. noexec [IA-64] @@ -3465,6 +3469,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. e.g. base its process migration decisions on it. Default is on. + topology_updates= [KNL, PPC, NUMA] + Format: {off} + Specify if the kernel should ignore (off) + topology updates sent by the hypervisor to this + LPAR. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/Documentation/mailbox.txt b/Documentation/mailbox.txt new file mode 100644 index 000000000000..60f43ff629aa --- /dev/null +++ b/Documentation/mailbox.txt @@ -0,0 +1,122 @@ + The Common Mailbox Framework + Jassi Brar <jaswinder.singh@linaro.org> + + This document aims to help developers write client and controller +drivers for the API. But before we start, let us note that the +client (especially) and controller drivers are likely going to be +very platform specific because the remote firmware is likely to be +proprietary and implement non-standard protocol. So even if two +platforms employ, say, PL320 controller, the client drivers can't +be shared across them. Even the PL320 driver might need to accommodate +some platform specific quirks. So the API is meant mainly to avoid +similar copies of code written for each platform. Having said that, +nothing prevents the remote f/w to also be Linux based and use the +same api there. However none of that helps us locally because we only +ever deal at client's protocol level. + Some of the choices made during implementation are the result of this +peculiarity of this "common" framework. + + + + Part 1 - Controller Driver (See include/linux/mailbox_controller.h) + + Allocate mbox_controller and the array of mbox_chan. +Populate mbox_chan_ops, except peek_data() all are mandatory. +The controller driver might know a message has been consumed +by the remote by getting an IRQ or polling some hardware flag +or it can never know (the client knows by way of the protocol). +The method in order of preference is IRQ -> Poll -> None, which +the controller driver should set via 'txdone_irq' or 'txdone_poll' +or neither. + + + Part 2 - Client Driver (See include/linux/mailbox_client.h) + + The client might want to operate in blocking mode (synchronously +send a message through before returning) or non-blocking/async mode (submit +a message and a callback function to the API and return immediately). + + +struct demo_client { + struct mbox_client cl; + struct mbox_chan *mbox; + struct completion c; + bool async; + /* ... */ +}; + +/* + * This is the handler for data received from remote. The behaviour is purely + * dependent upon the protocol. This is just an example. + */ +static void message_from_remote(struct mbox_client *cl, void *mssg) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + if (dc->aysnc) { + if (is_an_ack(mssg)) { + /* An ACK to our last sample sent */ + return; /* Or do something else here */ + } else { /* A new message from remote */ + queue_req(mssg); + } + } else { + /* Remote f/w sends only ACK packets on this channel */ + return; + } +} + +static void sample_sent(struct mbox_client *cl, void *mssg, int r) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + complete(&dc->c); +} + +static void client_demo(struct platform_device *pdev) +{ + struct demo_client *dc_sync, *dc_async; + /* The controller already knows async_pkt and sync_pkt */ + struct async_pkt ap; + struct sync_pkt sp; + + dc_sync = kzalloc(sizeof(*dc_sync), GFP_KERNEL); + dc_async = kzalloc(sizeof(*dc_async), GFP_KERNEL); + + /* Populate non-blocking mode client */ + dc_async->cl.dev = &pdev->dev; + dc_async->cl.rx_callback = message_from_remote; + dc_async->cl.tx_done = sample_sent; + dc_async->cl.tx_block = false; + dc_async->cl.tx_tout = 0; /* doesn't matter here */ + dc_async->cl.knows_txdone = false; /* depending upon protocol */ + dc_async->async = true; + init_completion(&dc_async->c); + + /* Populate blocking mode client */ + dc_sync->cl.dev = &pdev->dev; + dc_sync->cl.rx_callback = message_from_remote; + dc_sync->cl.tx_done = NULL; /* operate in blocking mode */ + dc_sync->cl.tx_block = true; + dc_sync->cl.tx_tout = 500; /* by half a second */ + dc_sync->cl.knows_txdone = false; /* depending upon protocol */ + dc_sync->async = false; + + /* ASync mailbox is listed second in 'mboxes' property */ + dc_async->mbox = mbox_request_channel(&dc_async->cl, 1); + /* Populate data packet */ + /* ap.xxx = 123; etc */ + /* Send async message to remote */ + mbox_send_message(dc_async->mbox, &ap); + + /* Sync mailbox is listed first in 'mboxes' property */ + dc_sync->mbox = mbox_request_channel(&dc_sync->cl, 0); + /* Populate data packet */ + /* sp.abc = 123; etc */ + /* Send message to remote in blocking mode */ + mbox_send_message(dc_sync->mbox, &sp); + /* At this point 'sp' has been sent */ + + /* Now wait for async chan to be done */ + wait_for_completion(&dc_async->c); +} diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/Documentation/scsi/osd.txt b/Documentation/scsi/osd.txt index da162f7fd5f5..5a9879bad073 100644 --- a/Documentation/scsi/osd.txt +++ b/Documentation/scsi/osd.txt @@ -184,8 +184,7 @@ Any problems, questions, bug reports, lonely OSD nights, please email: More up-to-date information can be found on: http://open-osd.org -Boaz Harrosh <bharrosh@panasas.com> -Benny Halevy <bhalevy@panasas.com> +Boaz Harrosh <ooo@electrozaur.com> References ========== diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt new file mode 100644 index 000000000000..5518465290bf --- /dev/null +++ b/Documentation/target/tcmu-design.txt @@ -0,0 +1,378 @@ +Contents: + +1) TCM Userspace Design + a) Background + b) Benefits + c) Design constraints + d) Implementation overview + i. Mailbox + ii. Command ring + iii. Data Area + e) Device discovery + f) Device events + g) Other contingencies +2) Writing a user pass-through handler + a) Discovering and configuring TCMU uio devices + b) Waiting for events on the device(s) + c) Managing the command ring +3) Command filtering and pass_level +4) A final note + + +TCM Userspace Design +-------------------- + +TCM is another name for LIO, an in-kernel iSCSI target (server). +Existing TCM targets run in the kernel. TCMU (TCM in Userspace) +allows userspace programs to be written which act as iSCSI targets. +This document describes the design. + +The existing kernel provides modules for different SCSI transport +protocols. TCM also modularizes the data storage. There are existing +modules for file, block device, RAM or using another SCSI device as +storage. These are called "backstores" or "storage engines". These +built-in modules are implemented entirely as kernel code. + +Background: + +In addition to modularizing the transport protocol used for carrying +SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes +the actual data storage as well. These are referred to as "backstores" +or "storage engines". The target comes with backstores that allow a +file, a block device, RAM, or another SCSI device to be used for the +local storage needed for the exported SCSI LUN. Like the rest of LIO, +these are implemented entirely as kernel code. + +These backstores cover the most common use cases, but not all. One new +use case that other non-kernel target solutions, such as tgt, are able +to support is using Gluster's GLFS or Ceph's RBD as a backstore. The +target then serves as a translator, allowing initiators to store data +in these non-traditional networked storage systems, while still only +using standard protocols themselves. + +If the target is a userspace process, supporting these is easy. tgt, +for example, needs only a small adapter module for each, because the +modules just use the available userspace libraries for RBD and GLFS. + +Adding support for these backstores in LIO is considerably more +difficult, because LIO is entirely kernel code. Instead of undertaking +the significant work to port the GLFS or RBD APIs and protocols to the +kernel, another approach is to create a userspace pass-through +backstore for LIO, "TCMU". + + +Benefits: + +In addition to allowing relatively easy support for RBD and GLFS, TCMU +will also allow easier development of new backstores. TCMU combines +with the LIO loopback fabric to become something similar to FUSE +(Filesystem in Userspace), but at the SCSI layer instead of the +filesystem layer. A SUSE, if you will. + +The disadvantage is there are more distinct components to configure, and +potentially to malfunction. This is unavoidable, but hopefully not +fatal if we're careful to keep things as simple as possible. + +Design constraints: + +- Good performance: high throughput, low latency +- Cleanly handle if userspace: + 1) never attaches + 2) hangs + 3) dies + 4) misbehaves +- Allow future flexibility in user & kernel implementations +- Be reasonably memory-efficient +- Simple to configure & run +- Simple to write a userspace backend + + +Implementation overview: + +The core of the TCMU interface is a memory region that is shared +between kernel and userspace. Within this region is: a control area +(mailbox); a lockless producer/consumer circular buffer for commands +to be passed up, and status returned; and an in/out data buffer area. + +TCMU uses the pre-existing UIO subsystem. UIO allows device driver +development in userspace, and this is conceptually very close to the +TCMU use case, except instead of a physical device, TCMU implements a +memory-mapped layout designed for SCSI commands. Using UIO also +benefits TCMU by handling device introspection (e.g. a way for +userspace to determine how large the shared region is) and signaling +mechanisms in both directions. + +There are no embedded pointers in the memory region. Everything is +expressed as an offset from the region's starting address. This allows +the ring to still work if the user process dies and is restarted with +the region mapped at a different virtual address. + +See target_core_user.h for the struct definitions. + +The Mailbox: + +The mailbox is always at the start of the shared memory region, and +contains a version, details about the starting offset and size of the +command ring, and head and tail pointers to be used by the kernel and +userspace (respectively) to put commands on the ring, and indicate +when the commands are completed. + +version - 1 (userspace should abort if otherwise) +flags - none yet defined. +cmdr_off - The offset of the start of the command ring from the start +of the memory region, to account for the mailbox size. +cmdr_size - The size of the command ring. This does *not* need to be a +power of two. +cmd_head - Modified by the kernel to indicate when a command has been +placed on the ring. +cmd_tail - Modified by userspace to indicate when it has completed +processing of a command. + +The Command Ring: + +Commands are placed on the ring by the kernel incrementing +mailbox.cmd_head by the size of the command, modulo cmdr_size, and +then signaling userspace via uio_event_notify(). Once the command is +completed, userspace updates mailbox.cmd_tail in the same way and +signals the kernel via a 4-byte write(). When cmd_head equals +cmd_tail, the ring is empty -- no commands are currently waiting to be +processed by userspace. + +TCMU commands start with a common header containing "len_op", a 32-bit +value that stores the length, as well as the opcode in the lowest +unused bits. Currently only two opcodes are defined, TCMU_OP_PAD and +TCMU_OP_CMD. When userspace encounters a command with PAD opcode, it +should skip ahead by the bytes in "length". (The kernel inserts PAD +entries to ensure each CMD entry fits contigously into the circular +buffer.) + +When userspace handles a CMD, it finds the SCSI CDB (Command Data +Block) via tcmu_cmd_entry.req.cdb_off. This is an offset from the +start of the overall shared memory region, not the entry. The data +in/out buffers are accessible via tht req.iov[] array. Note that +each iov.iov_base is also an offset from the start of the region. + +TCMU currently does not support BIDI operations. + +When completing a command, userspace sets rsp.scsi_status, and +rsp.sense_buffer if necessary. Userspace then increments +mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the +kernel via the UIO method, a 4-byte write to the file descriptor. + +The Data Area: + +This is shared-memory space after the command ring. The organization +of this area is not defined in the TCMU interface, and userspace +should access only the parts referenced by pending iovs. + + +Device Discovery: + +Other devices may be using UIO besides TCMU. Unrelated user processes +may also be handling different sets of TCMU devices. TCMU userspace +processes must find their devices by scanning sysfs +class/uio/uio*/name. For TCMU devices, these names will be of the +format: + +tcm-user/<hba_num>/<device_name>/<subtype>/<path> + +where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num> +and <device_name> allow userspace to find the device's path in the +kernel target's configfs tree. Assuming the usual mount point, it is +found at: + +/sys/kernel/config/target/core/user_<hba_num>/<device_name> + +This location contains attributes such as "hw_block_size", that +userspace needs to know for correct operation. + +<subtype> will be a userspace-process-unique string to identify the +TCMU device as expecting to be backed by a certain handler, and <path> +will be an additional handler-specific string for the user process to +configure the device, if needed. The name cannot contain ':', due to +LIO limitations. + +For all devices so discovered, the user handler opens /dev/uioX and +calls mmap(): + +mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) + +where size must be equal to the value read from +/sys/class/uio/uioX/maps/map0/size. + + +Device Events: + +If a new device is added or removed, a notification will be broadcast +over netlink, using a generic netlink family name of "TCM-USER" and a +multicast group named "config". This will include the UIO name as +described in the previous section, as well as the UIO minor +number. This should allow userspace to identify both the UIO device and +the LIO device, so that after determining the device is supported +(based on subtype) it can take the appropriate action. + + +Other contingencies: + +Userspace handler process never attaches: + +- TCMU will post commands, and then abort them after a timeout period + (30 seconds.) + +Userspace handler process is killed: + +- It is still possible to restart and re-connect to TCMU + devices. Command ring is preserved. However, after the timeout period, + the kernel will abort pending tasks. + +Userspace handler process hangs: + +- The kernel will abort pending tasks after a timeout period. + +Userspace handler process is malicious: + +- The process can trivially break the handling of devices it controls, + but should not be able to access kernel memory outside its shared + memory areas. + + +Writing a user pass-through handler (with example code) +------------------------------------------------------- + +A user process handing a TCMU device must support the following: + +a) Discovering and configuring TCMU uio devices +b) Waiting for events on the device(s) +c) Managing the command ring: Parsing operations and commands, + performing work as needed, setting response fields (scsi_status and + possibly sense_buffer), updating cmd_tail, and notifying the kernel + that work has been finished + +First, consider instead writing a plugin for tcmu-runner. tcmu-runner +implements all of this, and provides a higher-level API for plugin +authors. + +TCMU is designed so that multiple unrelated processes can manage TCMU +devices separately. All handlers should make sure to only open their +devices, based opon a known subtype string. + +a) Discovering and configuring TCMU UIO devices: + +(error checking omitted for brevity) + +int fd, dev_fd; +char buf[256]; +unsigned long long map_len; +void *map; + +fd = open("/sys/class/uio/uio0/name", O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +/* we only want uio devices whose name is a format we expect */ +if (strncmp(buf, "tcm-user", 8)) + exit(-1); + +/* Further checking for subtype also needed here */ + +fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +map_len = strtoull(buf, NULL, 0); + +dev_fd = open("/dev/uio0", O_RDWR); +map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0); + + +b) Waiting for events on the device(s) + +while (1) { + char buf[4]; + + int ret = read(dev_fd, buf, 4); /* will block */ + + handle_device_events(dev_fd, map); +} + + +c) Managing the command ring + +#include <linux/target_core_user.h> + +int handle_device_events(int fd, void *map) +{ + struct tcmu_mailbox *mb = map; + struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + int did_some_work = 0; + + /* Process events from cmd ring until we catch up with cmd_head */ + while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) { + + if (tcmu_hdr_get_op(&ent->hdr) == TCMU_OP_CMD) { + uint8_t *cdb = (void *)mb + ent->req.cdb_off; + bool success = true; + + /* Handle command here. */ + printf("SCSI opcode: 0x%x\n", cdb[0]); + + /* Set response fields */ + if (success) + ent->rsp.scsi_status = SCSI_NO_SENSE; + else { + /* Also fill in rsp->sense_buffer here */ + ent->rsp.scsi_status = SCSI_CHECK_CONDITION; + } + } + else { + /* Do nothing for PAD entries */ + } + + /* update cmd_tail */ + mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size; + ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + did_some_work = 1; + } + + /* Notify the kernel that work has been finished */ + if (did_some_work) { + uint32_t buf = 0; + + write(fd, &buf, 4); + } + + return 0; +} + + +Command filtering and pass_level +-------------------------------- + +TCMU supports a "pass_level" option with valid values of 0 or 1. When +the value is 0 (the default), nearly all SCSI commands received for +the device are passed through to the handler. This allows maximum +flexibility but increases the amount of code required by the handler, +to support all mandatory SCSI commands. If pass_level is set to 1, +then only IO-related commands are presented, and the rest are handled +by LIO's in-kernel command emulation. The commands presented at level +1 include all versions of: + +READ +WRITE +WRITE_VERIFY +XDWRITEREAD +WRITE_SAME +COMPARE_AND_WRITE +SYNCHRONIZE_CACHE +UNMAP + + +A final note +------------ + +Please be careful to return codes as defined by the SCSI +specifications. These are different than some values defined in the +scsi/scsi.h include file. For example, CHECK CONDITION's status code +is 2, not 1. |