diff options
450 files changed, 14965 insertions, 4847 deletions
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 344e85cc7323..d7273a5f6456 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -17,7 +17,7 @@ User addresses have bits 63:48 set to 0 while the kernel addresses have the same bits set to 1. TTBRx selection is given by bit 63 of the virtual address. The swapper_pg_dir contains only kernel (global) mappings while the user pgd contains only user (non-global) mappings. -The swapper_pgd_dir address is written to TTBR1 and never written to +The swapper_pg_dir address is written to TTBR1 and never written to TTBR0. diff --git a/Documentation/devicetree/bindings/mailbox/mailbox.txt b/Documentation/devicetree/bindings/mailbox/mailbox.txt new file mode 100644 index 000000000000..1a2cd3d266db --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/mailbox.txt @@ -0,0 +1,38 @@ +* Generic Mailbox Controller and client driver bindings + +Generic binding to provide a way for Mailbox controller drivers to +assign appropriate mailbox channel to client drivers. + +* Mailbox Controller + +Required property: +- #mbox-cells: Must be at least 1. Number of cells in a mailbox + specifier. + +Example: + mailbox: mailbox { + ... + #mbox-cells = <1>; + }; + + +* Mailbox Client + +Required property: +- mboxes: List of phandle and mailbox channel specifiers. + +Optional property: +- mbox-names: List of identifier strings for each mailbox channel + required by the client. The use of this property + is discouraged in favor of using index in list of + 'mboxes' while requesting a mailbox. Instead the + platforms may define channel indices, in DT headers, + to something legible. + +Example: + pwr_cntrl: power { + ... + mbox-names = "pwr-ctrl", "rpc"; + mboxes = <&mailbox 0 + &mailbox 1>; + }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt index 0bda229a6171..3899d6a557c1 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt @@ -1,5 +1,20 @@ Freescale FlexTimer Module (FTM) PWM controller +The same FTM PWM device can have a different endianness on different SoCs. The +device tree provides a property to describing this so that an operating system +device driver can handle all variants of the device. Refer to the table below +for the endianness of the FTM PWM block as integrated into the existing SoCs: + + SoC | FTM-PWM endianness + --------+------------------- + Vybrid | LE + LS1 | BE + LS2 | LE + +Please see ../regmap/regmap.txt for more detail about how to specify endian +modes in device tree. + + Required properties: - compatible: Should be "fsl,vf610-ftm-pwm". - reg: Physical base address and length of the controller's registers @@ -16,7 +31,8 @@ Required properties: - pinctrl-names: Must contain a "default" entry. - pinctrl-NNN: One property must exist for each entry in pinctrl-names. See pinctrl/pinctrl-bindings.txt for details of the property values. - +- big-endian: Boolean property, required if the FTM PWM registers use a big- + endian rather than little-endian layout. Example: @@ -32,4 +48,5 @@ pwm0: pwm@40038000 { <&clks VF610_CLK_FTM0_EXT_FIX_EN>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_pwm0_1>; + big-endian; }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt index d47d15a6a298..b8be3d09ee26 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt @@ -7,8 +7,8 @@ Required properties: "rockchip,vop-pwm": found integrated in VOP on RK3288 SoC - reg: physical base address and length of the controller's registers - clocks: phandle and clock specifier of the PWM reference clock - - #pwm-cells: should be 2. See pwm.txt in this directory for a - description of the cell format. + - #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.txt in this directory + for a description of the cell format. Example: diff --git a/Documentation/devicetree/bindings/thermal/imx-thermal.txt b/Documentation/devicetree/bindings/thermal/imx-thermal.txt index 1f0f67234a91..3c67bd50aa10 100644 --- a/Documentation/devicetree/bindings/thermal/imx-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/imx-thermal.txt @@ -1,7 +1,10 @@ * Temperature Monitor (TEMPMON) on Freescale i.MX SoCs Required properties: -- compatible : "fsl,imx6q-thermal" +- compatible : "fsl,imx6q-tempmon" for i.MX6Q, "fsl,imx6sx-tempmon" for i.MX6SX. + i.MX6SX has two more IRQs than i.MX6Q, one is IRQ_LOW and the other is IRQ_PANIC, + when temperature is below than low threshold, IRQ_LOW will be triggered, when temperature + is higher than panic threshold, system will auto reboot by SRC module. - fsl,tempmon : phandle pointer to system controller that contains TEMPMON control registers, e.g. ANATOP on imx6q. - fsl,tempmon-data : phandle pointer to fuse controller that contains TEMPMON diff --git a/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt new file mode 100644 index 000000000000..c3a36ee45552 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt @@ -0,0 +1,24 @@ +Zynq Watchdog Device Tree Bindings +------------------------------------------- + +Required properties: +- compatible : Should be "cdns,wdt-r1p2". +- clocks : This is pclk (APB clock). +- interrupts : This is wd_irq - watchdog timeout interrupt. +- interrupt-parent : Must be core interrupt controller. + +Optional properties +- reset-on-timeout : If this property exists, then a reset is done + when watchdog times out. +- timeout-sec : Watchdog timeout value (in seconds). + +Example: + watchdog@f8005000 { + compatible = "cdns,wdt-r1p2"; + clocks = <&clkc 45>; + interrupt-parent = <&intc>; + interrupts = <0 9 1>; + reg = <0xf8005000 0x1000>; + reset-on-timeout; + timeout-sec = <10>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt index e52ba2da868c..8dab6fd024aa 100644 --- a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt @@ -7,7 +7,8 @@ Required properties: Optional property: - big-endian: If present the watchdog device's registers are implemented - in big endian mode, otherwise in little mode. + in big endian mode, otherwise in native mode(same with CPU), for more + detail please see: Documentation/devicetree/bindings/regmap/regmap.txt. Examples: diff --git a/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt new file mode 100644 index 000000000000..9200fc2d508c --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt @@ -0,0 +1,13 @@ +Meson SoCs Watchdog timer + +Required properties: + +- compatible : should be "amlogic,meson6-wdt" +- reg : Specifies base physical address and size of the registers. + +Example: + +wdt: watchdog@c1109900 { + compatible = "amlogic,meson6-wdt"; + reg = <0xc1109900 0x8>; +}; diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt new file mode 100644 index 000000000000..4726924d034e --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt @@ -0,0 +1,24 @@ +Qualcomm Krait Processor Sub-system (KPSS) Watchdog +--------------------------------------------------- + +Required properties : +- compatible : shall contain only one of the following: + + "qcom,kpss-wdt-msm8960" + "qcom,kpss-wdt-apq8064" + "qcom,kpss-wdt-ipq8064" + +- reg : shall contain base register location and length +- clocks : shall contain the input clock + +Optional properties : +- timeout-sec : shall contain the default watchdog timeout in seconds, + if unset, the default timeout is 30 seconds + +Example: + watchdog@208a038 { + compatible = "qcom,kpss-wdt-ipq8064"; + reg = <0x0208a038 0x40>; + clocks = <&sleep_clk>; + timeout-sec = <10>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt index cfff37511aac..8f3d96af81d7 100644 --- a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt @@ -9,6 +9,7 @@ Required properties: (a) "samsung,s3c2410-wdt" for Exynos4 and previous SoCs (b) "samsung,exynos5250-wdt" for Exynos5250 (c) "samsung,exynos5420-wdt" for Exynos5420 + (c) "samsung,exynos7-wdt" for Exynos7 - reg : base physical address of the controller and length of memory mapped region. diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 94d93b1f8b53..b30753cbf431 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt new file mode 100644 index 000000000000..530850a72735 --- /dev/null +++ b/Documentation/filesystems/overlayfs.txt @@ -0,0 +1,198 @@ +Written by: Neil Brown <neilb@suse.de> + +Overlay Filesystem +================== + +This document describes a prototype for a new approach to providing +overlay-filesystem functionality in Linux (sometimes referred to as +union-filesystems). An overlay-filesystem tries to present a +filesystem which is the result over overlaying one filesystem on top +of the other. + +The result will inevitably fail to look exactly like a normal +filesystem for various technical reasons. The expectation is that +many use cases will be able to ignore these differences. + +This approach is 'hybrid' because the objects that appear in the +filesystem do not all appear to belong to that filesystem. In many +cases an object accessed in the union will be indistinguishable +from accessing the corresponding object from the original filesystem. +This is most obvious from the 'st_dev' field returned by stat(2). + +While directories will report an st_dev from the overlay-filesystem, +all non-directory objects will report an st_dev from the lower or +upper filesystem that is providing the object. Similarly st_ino will +only be unique when combined with st_dev, and both of these can change +over the lifetime of a non-directory object. Many applications and +tools ignore these values and will not be affected. + +Upper and Lower +--------------- + +An overlay filesystem combines two filesystems - an 'upper' filesystem +and a 'lower' filesystem. When a name exists in both filesystems, the +object in the 'upper' filesystem is visible while the object in the +'lower' filesystem is either hidden or, in the case of directories, +merged with the 'upper' object. + +It would be more correct to refer to an upper and lower 'directory +tree' rather than 'filesystem' as it is quite possible for both +directory trees to be in the same filesystem and there is no +requirement that the root of a filesystem be given for either upper or +lower. + +The lower filesystem can be any filesystem supported by Linux and does +not need to be writable. The lower filesystem can even be another +overlayfs. The upper filesystem will normally be writable and if it +is it must support the creation of trusted.* extended attributes, and +must provide valid d_type in readdir responses, so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any +filesystem type. + +Directories +----------- + +Overlaying mainly involves directories. If a given name appears in both +upper and lower filesystems and refers to a non-directory in either, +then the lower object is hidden - the name refers only to the upper +object. + +Where both upper and lower objects are directories, a merged directory +is formed. + +At mount time, the two directories given as mount options "lowerdir" and +"upperdir" are combined into a merged directory: + + mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\ +workdir=/work /merged + +The "workdir" needs to be an empty directory on the same filesystem +as upperdir. + +Then whenever a lookup is requested in such a merged directory, the +lookup is performed in each actual directory and the combined result +is cached in the dentry belonging to the overlay filesystem. If both +actual lookups find directories, both are stored and a merged +directory is created, otherwise only one is stored: the upper if it +exists, else the lower. + +Only the lists of names from directories are merged. Other content +such as metadata and extended attributes are reported for the upper +directory only. These attributes of the lower directory are hidden. + +whiteouts and opaque directories +-------------------------------- + +In order to support rm and rmdir without changing the lower +filesystem, an overlay filesystem needs to record in the upper filesystem +that files have been removed. This is done using whiteouts and opaque +directories (non-directories are always opaque). + +A whiteout is created as a character device with 0/0 device number. +When a whiteout is found in the upper level of a merged directory, any +matching name in the lower level is ignored, and the whiteout itself +is also hidden. + +A directory is made opaque by setting the xattr "trusted.overlay.opaque" +to "y". Where the upper filesystem contains an opaque directory, any +directory in the lower filesystem with the same name is ignored. + +readdir +------- + +When a 'readdir' request is made on a merged directory, the upper and +lower directories are each read and the name lists merged in the +obvious way (upper is read first, then lower - entries that already +exist are not re-added). This merged name list is cached in the +'struct file' and so remains as long as the file is kept open. If the +directory is opened and read by two processes at the same time, they +will each have separate caches. A seekdir to the start of the +directory (offset 0) followed by a readdir will cause the cache to be +discarded and rebuilt. + +This means that changes to the merged directory do not appear while a +directory is being read. This is unlikely to be noticed by many +programs. + +seek offsets are assigned sequentially when the directories are read. +Thus if + - read part of a directory + - remember an offset, and close the directory + - re-open the directory some time later + - seek to the remembered offset + +there may be little correlation between the old and new locations in +the list of filenames, particularly if anything has changed in the +directory. + +Readdir on directories that are not merged is simply handled by the +underlying directory (upper or lower). + + +Non-directories +--------------- + +Objects that are not directories (files, symlinks, device-special +files etc.) are presented either from the upper or lower filesystem as +appropriate. When a file in the lower filesystem is accessed in a way +the requires write-access, such as opening for write access, changing +some metadata etc., the file is first copied from the lower filesystem +to the upper filesystem (copy_up). Note that creating a hard-link +also requires copy_up, though of course creation of a symlink does +not. + +The copy_up may turn out to be unnecessary, for example if the file is +opened for read-write but the data is not modified. + +The copy_up process first makes sure that the containing directory +exists in the upper filesystem - creating it and any parents as +necessary. It then creates the object with the same metadata (owner, +mode, mtime, symlink-target etc.) and then if the object is a file, the +data is copied from the lower to the upper filesystem. Finally any +extended attributes are copied up. + +Once the copy_up is complete, the overlay filesystem simply +provides direct access to the newly created file in the upper +filesystem - future operations on the file are barely noticed by the +overlay filesystem (though an operation on the name of the file such as +rename or unlink will of course be noticed and handled). + + +Non-standard behavior +--------------------- + +The copy_up operation essentially creates a new, identical file and +moves it over to the old name. The new file may be on a different +filesystem, so both st_dev and st_ino of the file may change. + +Any open files referring to this inode will access the old data and +metadata. Similarly any file locks obtained before copy_up will not +apply to the copied up file. + +On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and +fsetxattr(2) will fail with EROFS. + +If a file with multiple hard links is copied up, then this will +"break" the link. Changes will not be propagated to other names +referring to the same inode. + +Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory +object in overlayfs will not contain valid absolute paths, only +relative paths leading up to the filesystem's root. This will be +fixed in the future. + +Some operations are not atomic, for example a crash during copy_up or +rename will leave the filesystem in an inconsistent state. This will +be addressed in the future. + +Changes to underlying filesystems +--------------------------------- + +Offline changes, when the overlay is not mounted, are allowed to either +the upper or the lower trees. + +Changes to the underlying filesystems while part of a mounted overlay +filesystem are not allowed. If the underlying filesystem is changed, +the behavior of the overlay is undefined, though it will not result in +a crash or deadlock. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fceff7c00a3c..20bf204426ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -696,6 +697,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7dbe5ec9d9cd..74339c57b914 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1015,10 +1015,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map" } + Format: { "old_map", "nochunk", "noruntime" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. + nochunk: disable reading files in "chunks" in the EFI + boot stub, as chunking can cause problems with some + firmware implementations. + noruntime : disable EFI runtime services support efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of @@ -2232,7 +2236,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nodsp [SH] Disable hardware DSP at boot time. - noefi [X86] Disable EFI runtime services support. + noefi Disable EFI runtime services support. noexec [IA-64] @@ -3465,6 +3469,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. e.g. base its process migration decisions on it. Default is on. + topology_updates= [KNL, PPC, NUMA] + Format: {off} + Specify if the kernel should ignore (off) + topology updates sent by the hypervisor to this + LPAR. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/Documentation/mailbox.txt b/Documentation/mailbox.txt new file mode 100644 index 000000000000..60f43ff629aa --- /dev/null +++ b/Documentation/mailbox.txt @@ -0,0 +1,122 @@ + The Common Mailbox Framework + Jassi Brar <jaswinder.singh@linaro.org> + + This document aims to help developers write client and controller +drivers for the API. But before we start, let us note that the +client (especially) and controller drivers are likely going to be +very platform specific because the remote firmware is likely to be +proprietary and implement non-standard protocol. So even if two +platforms employ, say, PL320 controller, the client drivers can't +be shared across them. Even the PL320 driver might need to accommodate +some platform specific quirks. So the API is meant mainly to avoid +similar copies of code written for each platform. Having said that, +nothing prevents the remote f/w to also be Linux based and use the +same api there. However none of that helps us locally because we only +ever deal at client's protocol level. + Some of the choices made during implementation are the result of this +peculiarity of this "common" framework. + + + + Part 1 - Controller Driver (See include/linux/mailbox_controller.h) + + Allocate mbox_controller and the array of mbox_chan. +Populate mbox_chan_ops, except peek_data() all are mandatory. +The controller driver might know a message has been consumed +by the remote by getting an IRQ or polling some hardware flag +or it can never know (the client knows by way of the protocol). +The method in order of preference is IRQ -> Poll -> None, which +the controller driver should set via 'txdone_irq' or 'txdone_poll' +or neither. + + + Part 2 - Client Driver (See include/linux/mailbox_client.h) + + The client might want to operate in blocking mode (synchronously +send a message through before returning) or non-blocking/async mode (submit +a message and a callback function to the API and return immediately). + + +struct demo_client { + struct mbox_client cl; + struct mbox_chan *mbox; + struct completion c; + bool async; + /* ... */ +}; + +/* + * This is the handler for data received from remote. The behaviour is purely + * dependent upon the protocol. This is just an example. + */ +static void message_from_remote(struct mbox_client *cl, void *mssg) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + if (dc->aysnc) { + if (is_an_ack(mssg)) { + /* An ACK to our last sample sent */ + return; /* Or do something else here */ + } else { /* A new message from remote */ + queue_req(mssg); + } + } else { + /* Remote f/w sends only ACK packets on this channel */ + return; + } +} + +static void sample_sent(struct mbox_client *cl, void *mssg, int r) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + complete(&dc->c); +} + +static void client_demo(struct platform_device *pdev) +{ + struct demo_client *dc_sync, *dc_async; + /* The controller already knows async_pkt and sync_pkt */ + struct async_pkt ap; + struct sync_pkt sp; + + dc_sync = kzalloc(sizeof(*dc_sync), GFP_KERNEL); + dc_async = kzalloc(sizeof(*dc_async), GFP_KERNEL); + + /* Populate non-blocking mode client */ + dc_async->cl.dev = &pdev->dev; + dc_async->cl.rx_callback = message_from_remote; + dc_async->cl.tx_done = sample_sent; + dc_async->cl.tx_block = false; + dc_async->cl.tx_tout = 0; /* doesn't matter here */ + dc_async->cl.knows_txdone = false; /* depending upon protocol */ + dc_async->async = true; + init_completion(&dc_async->c); + + /* Populate blocking mode client */ + dc_sync->cl.dev = &pdev->dev; + dc_sync->cl.rx_callback = message_from_remote; + dc_sync->cl.tx_done = NULL; /* operate in blocking mode */ + dc_sync->cl.tx_block = true; + dc_sync->cl.tx_tout = 500; /* by half a second */ + dc_sync->cl.knows_txdone = false; /* depending upon protocol */ + dc_sync->async = false; + + /* ASync mailbox is listed second in 'mboxes' property */ + dc_async->mbox = mbox_request_channel(&dc_async->cl, 1); + /* Populate data packet */ + /* ap.xxx = 123; etc */ + /* Send async message to remote */ + mbox_send_message(dc_async->mbox, &ap); + + /* Sync mailbox is listed first in 'mboxes' property */ + dc_sync->mbox = mbox_request_channel(&dc_sync->cl, 0); + /* Populate data packet */ + /* sp.abc = 123; etc */ + /* Send message to remote in blocking mode */ + mbox_send_message(dc_sync->mbox, &sp); + /* At this point 'sp' has been sent */ + + /* Now wait for async chan to be done */ + wait_for_completion(&dc_async->c); +} diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/Documentation/scsi/osd.txt b/Documentation/scsi/osd.txt index da162f7fd5f5..5a9879bad073 100644 --- a/Documentation/scsi/osd.txt +++ b/Documentation/scsi/osd.txt @@ -184,8 +184,7 @@ Any problems, questions, bug reports, lonely OSD nights, please email: More up-to-date information can be found on: http://open-osd.org -Boaz Harrosh <bharrosh@panasas.com> -Benny Halevy <bhalevy@panasas.com> +Boaz Harrosh <ooo@electrozaur.com> References ========== diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt new file mode 100644 index 000000000000..5518465290bf --- /dev/null +++ b/Documentation/target/tcmu-design.txt @@ -0,0 +1,378 @@ +Contents: + +1) TCM Userspace Design + a) Background + b) Benefits + c) Design constraints + d) Implementation overview + i. Mailbox + ii. Command ring + iii. Data Area + e) Device discovery + f) Device events + g) Other contingencies +2) Writing a user pass-through handler + a) Discovering and configuring TCMU uio devices + b) Waiting for events on the device(s) + c) Managing the command ring +3) Command filtering and pass_level +4) A final note + + +TCM Userspace Design +-------------------- + +TCM is another name for LIO, an in-kernel iSCSI target (server). +Existing TCM targets run in the kernel. TCMU (TCM in Userspace) +allows userspace programs to be written which act as iSCSI targets. +This document describes the design. + +The existing kernel provides modules for different SCSI transport +protocols. TCM also modularizes the data storage. There are existing +modules for file, block device, RAM or using another SCSI device as +storage. These are called "backstores" or "storage engines". These +built-in modules are implemented entirely as kernel code. + +Background: + +In addition to modularizing the transport protocol used for carrying +SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes +the actual data storage as well. These are referred to as "backstores" +or "storage engines". The target comes with backstores that allow a +file, a block device, RAM, or another SCSI device to be used for the +local storage needed for the exported SCSI LUN. Like the rest of LIO, +these are implemented entirely as kernel code. + +These backstores cover the most common use cases, but not all. One new +use case that other non-kernel target solutions, such as tgt, are able +to support is using Gluster's GLFS or Ceph's RBD as a backstore. The +target then serves as a translator, allowing initiators to store data +in these non-traditional networked storage systems, while still only +using standard protocols themselves. + +If the target is a userspace process, supporting these is easy. tgt, +for example, needs only a small adapter module for each, because the +modules just use the available userspace libraries for RBD and GLFS. + +Adding support for these backstores in LIO is considerably more +difficult, because LIO is entirely kernel code. Instead of undertaking +the significant work to port the GLFS or RBD APIs and protocols to the +kernel, another approach is to create a userspace pass-through +backstore for LIO, "TCMU". + + +Benefits: + +In addition to allowing relatively easy support for RBD and GLFS, TCMU +will also allow easier development of new backstores. TCMU combines +with the LIO loopback fabric to become something similar to FUSE +(Filesystem in Userspace), but at the SCSI layer instead of the +filesystem layer. A SUSE, if you will. + +The disadvantage is there are more distinct components to configure, and +potentially to malfunction. This is unavoidable, but hopefully not +fatal if we're careful to keep things as simple as possible. + +Design constraints: + +- Good performance: high throughput, low latency +- Cleanly handle if userspace: + 1) never attaches + 2) hangs + 3) dies + 4) misbehaves +- Allow future flexibility in user & kernel implementations +- Be reasonably memory-efficient +- Simple to configure & run +- Simple to write a userspace backend + + +Implementation overview: + +The core of the TCMU interface is a memory region that is shared +between kernel and userspace. Within this region is: a control area +(mailbox); a lockless producer/consumer circular buffer for commands +to be passed up, and status returned; and an in/out data buffer area. + +TCMU uses the pre-existing UIO subsystem. UIO allows device driver +development in userspace, and this is conceptually very close to the +TCMU use case, except instead of a physical device, TCMU implements a +memory-mapped layout designed for SCSI commands. Using UIO also +benefits TCMU by handling device introspection (e.g. a way for +userspace to determine how large the shared region is) and signaling +mechanisms in both directions. + +There are no embedded pointers in the memory region. Everything is +expressed as an offset from the region's starting address. This allows +the ring to still work if the user process dies and is restarted with +the region mapped at a different virtual address. + +See target_core_user.h for the struct definitions. + +The Mailbox: + +The mailbox is always at the start of the shared memory region, and +contains a version, details about the starting offset and size of the +command ring, and head and tail pointers to be used by the kernel and +userspace (respectively) to put commands on the ring, and indicate +when the commands are completed. + +version - 1 (userspace should abort if otherwise) +flags - none yet defined. +cmdr_off - The offset of the start of the command ring from the start +of the memory region, to account for the mailbox size. +cmdr_size - The size of the command ring. This does *not* need to be a +power of two. +cmd_head - Modified by the kernel to indicate when a command has been +placed on the ring. +cmd_tail - Modified by userspace to indicate when it has completed +processing of a command. + +The Command Ring: + +Commands are placed on the ring by the kernel incrementing +mailbox.cmd_head by the size of the command, modulo cmdr_size, and +then signaling userspace via uio_event_notify(). Once the command is +completed, userspace updates mailbox.cmd_tail in the same way and +signals the kernel via a 4-byte write(). When cmd_head equals +cmd_tail, the ring is empty -- no commands are currently waiting to be +processed by userspace. + +TCMU commands start with a common header containing "len_op", a 32-bit +value that stores the length, as well as the opcode in the lowest +unused bits. Currently only two opcodes are defined, TCMU_OP_PAD and +TCMU_OP_CMD. When userspace encounters a command with PAD opcode, it +should skip ahead by the bytes in "length". (The kernel inserts PAD +entries to ensure each CMD entry fits contigously into the circular +buffer.) + +When userspace handles a CMD, it finds the SCSI CDB (Command Data +Block) via tcmu_cmd_entry.req.cdb_off. This is an offset from the +start of the overall shared memory region, not the entry. The data +in/out buffers are accessible via tht req.iov[] array. Note that +each iov.iov_base is also an offset from the start of the region. + +TCMU currently does not support BIDI operations. + +When completing a command, userspace sets rsp.scsi_status, and +rsp.sense_buffer if necessary. Userspace then increments +mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the +kernel via the UIO method, a 4-byte write to the file descriptor. + +The Data Area: + +This is shared-memory space after the command ring. The organization +of this area is not defined in the TCMU interface, and userspace +should access only the parts referenced by pending iovs. + + +Device Discovery: + +Other devices may be using UIO besides TCMU. Unrelated user processes +may also be handling different sets of TCMU devices. TCMU userspace +processes must find their devices by scanning sysfs +class/uio/uio*/name. For TCMU devices, these names will be of the +format: + +tcm-user/<hba_num>/<device_name>/<subtype>/<path> + +where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num> +and <device_name> allow userspace to find the device's path in the +kernel target's configfs tree. Assuming the usual mount point, it is +found at: + +/sys/kernel/config/target/core/user_<hba_num>/<device_name> + +This location contains attributes such as "hw_block_size", that +userspace needs to know for correct operation. + +<subtype> will be a userspace-process-unique string to identify the +TCMU device as expecting to be backed by a certain handler, and <path> +will be an additional handler-specific string for the user process to +configure the device, if needed. The name cannot contain ':', due to +LIO limitations. + +For all devices so discovered, the user handler opens /dev/uioX and +calls mmap(): + +mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) + +where size must be equal to the value read from +/sys/class/uio/uioX/maps/map0/size. + + +Device Events: + +If a new device is added or removed, a notification will be broadcast +over netlink, using a generic netlink family name of "TCM-USER" and a +multicast group named "config". This will include the UIO name as +described in the previous section, as well as the UIO minor +number. This should allow userspace to identify both the UIO device and +the LIO device, so that after determining the device is supported +(based on subtype) it can take the appropriate action. + + +Other contingencies: + +Userspace handler process never attaches: + +- TCMU will post commands, and then abort them after a timeout period + (30 seconds.) + +Userspace handler process is killed: + +- It is still possible to restart and re-connect to TCMU + devices. Command ring is preserved. However, after the timeout period, + the kernel will abort pending tasks. + +Userspace handler process hangs: + +- The kernel will abort pending tasks after a timeout period. + +Userspace handler process is malicious: + +- The process can trivially break the handling of devices it controls, + but should not be able to access kernel memory outside its shared + memory areas. + + +Writing a user pass-through handler (with example code) +------------------------------------------------------- + +A user process handing a TCMU device must support the following: + +a) Discovering and configuring TCMU uio devices +b) Waiting for events on the device(s) +c) Managing the command ring: Parsing operations and commands, + performing work as needed, setting response fields (scsi_status and + possibly sense_buffer), updating cmd_tail, and notifying the kernel + that work has been finished + +First, consider instead writing a plugin for tcmu-runner. tcmu-runner +implements all of this, and provides a higher-level API for plugin +authors. + +TCMU is designed so that multiple unrelated processes can manage TCMU +devices separately. All handlers should make sure to only open their +devices, based opon a known subtype string. + +a) Discovering and configuring TCMU UIO devices: + +(error checking omitted for brevity) + +int fd, dev_fd; +char buf[256]; +unsigned long long map_len; +void *map; + +fd = open("/sys/class/uio/uio0/name", O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +/* we only want uio devices whose name is a format we expect */ +if (strncmp(buf, "tcm-user", 8)) + exit(-1); + +/* Further checking for subtype also needed here */ + +fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +map_len = strtoull(buf, NULL, 0); + +dev_fd = open("/dev/uio0", O_RDWR); +map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0); + + +b) Waiting for events on the device(s) + +while (1) { + char buf[4]; + + int ret = read(dev_fd, buf, 4); /* will block */ + + handle_device_events(dev_fd, map); +} + + +c) Managing the command ring + +#include <linux/target_core_user.h> + +int handle_device_events(int fd, void *map) +{ + struct tcmu_mailbox *mb = map; + struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + int did_some_work = 0; + + /* Process events from cmd ring until we catch up with cmd_head */ + while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) { + + if (tcmu_hdr_get_op(&ent->hdr) == TCMU_OP_CMD) { + uint8_t *cdb = (void *)mb + ent->req.cdb_off; + bool success = true; + + /* Handle command here. */ + printf("SCSI opcode: 0x%x\n", cdb[0]); + + /* Set response fields */ + if (success) + ent->rsp.scsi_status = SCSI_NO_SENSE; + else { + /* Also fill in rsp->sense_buffer here */ + ent->rsp.scsi_status = SCSI_CHECK_CONDITION; + } + } + else { + /* Do nothing for PAD entries */ + } + + /* update cmd_tail */ + mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size; + ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + did_some_work = 1; + } + + /* Notify the kernel that work has been finished */ + if (did_some_work) { + uint32_t buf = 0; + + write(fd, &buf, 4); + } + + return 0; +} + + +Command filtering and pass_level +-------------------------------- + +TCMU supports a "pass_level" option with valid values of 0 or 1. When +the value is 0 (the default), nearly all SCSI commands received for +the device are passed through to the handler. This allows maximum +flexibility but increases the amount of code required by the handler, +to support all mandatory SCSI commands. If pass_level is set to 1, +then only IO-related commands are presented, and the rest are handled +by LIO's in-kernel command emulation. The commands presented at level +1 include all versions of: + +READ +WRITE +WRITE_VERIFY +XDWRITEREAD +WRITE_SAME +COMPARE_AND_WRITE +SYNCHRONIZE_CACHE +UNMAP + + +A final note +------------ + +Please be careful to return codes as defined by the SCSI +specifications. These are different than some values defined in the +scsi/scsi.h include file. For example, CHECK CONDITION's status code +is 2, not 1. diff --git a/MAINTAINERS b/MAINTAINERS index 71fdbd4f4ac6..dab92a78d1d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5841,6 +5841,14 @@ S: Maintained F: drivers/net/macvlan.c F: include/linux/if_macvlan.h +MAILBOX API +M: Jassi Brar <jassisinghbrar@gmail.com> +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/ +F: include/linux/mailbox_client.h +F: include/linux/mailbox_controller.h + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk <mtk.manpages@gmail.com> W: http://www.kernel.org/doc/man-pages @@ -6829,7 +6837,7 @@ S: Orphan F: drivers/net/wireless/orinoco/ OSD LIBRARY and FILESYSTEM -M: Boaz Harrosh <bharrosh@panasas.com> +M: Boaz Harrosh <ooo@electrozaur.com> M: Benny Halevy <bhalevy@primarydata.com> L: osd-dev@open-osd.org W: http://open-osd.org @@ -6839,6 +6847,13 @@ F: drivers/scsi/osd/ F: include/scsi/osd_* F: fs/exofs/ +OVERLAYFS FILESYSTEM +M: Miklos Szeredi <miklos@szeredi.hu> +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/overlayfs/* +F: Documentation/filesystems/overlayfs.txt + P54 WIRELESS DRIVER M: Christian Lamparter <chunkeey@googlemail.com> L: linux-wireless@vger.kernel.org diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 9596b0ab108d..fe44b2494609 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -9,6 +9,7 @@ config ARC def_bool y select BUILDTIME_EXTABLE_SORT + select COMMON_CLK select CLONE_BACKWARDS # ARC Busybox based initramfs absolutely relies on DEVTMPFS for /dev select DEVTMPFS if !INITRAMFS_SOURCE="" @@ -73,9 +74,6 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y -config NO_DMA - def_bool n - source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -354,7 +352,7 @@ config ARC_CURR_IN_REG kernel mode. This saves memory access for each such access -config ARC_MISALIGN_ACCESS +config ARC_EMUL_UNALIGNED bool "Emulate unaligned memory access (userspace only)" select SYSCTL_ARCH_UNALIGN_NO_WARN select SYSCTL_ARCH_UNALIGN_ALLOW diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 8c0b1aa56f7e..10bc3d4e8a44 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -25,7 +25,6 @@ ifdef CONFIG_ARC_CURR_IN_REG LINUXINCLUDE += -include ${src}/arch/arc/include/asm/current.h endif -upto_gcc42 := $(call cc-ifversion, -le, 0402, y) upto_gcc44 := $(call cc-ifversion, -le, 0404, y) atleast_gcc44 := $(call cc-ifversion, -ge, 0404, y) atleast_gcc48 := $(call cc-ifversion, -ge, 0408, y) @@ -60,25 +59,11 @@ ldflags-$(CONFIG_CPU_BIG_ENDIAN) += -EB # --build-id w/o "-marclinux". Default arc-elf32-ld is OK ldflags-$(upto_gcc44) += -marclinux -ARC_LIBGCC := -mA7 -cflags-$(CONFIG_ARC_HAS_HW_MPY) += -multcost=16 - ifndef CONFIG_ARC_HAS_HW_MPY cflags-y += -mno-mpy - -# newlib for ARC700 assumes MPY to be always present, which is generally true -# However, if someone really doesn't want MPY, we need to use the 600 ver -# which coupled with -mno-mpy will use mpy emulation -# With gcc 4.4.7, -mno-mpy is enough to make any other related adjustments, -# e.g. increased cost of MPY. With gcc 4.2.1 this had to be explicitly hinted - - ifeq ($(upto_gcc42),y) - ARC_LIBGCC := -marc600 - cflags-y += -multcost=30 - endif endif -LIBGCC := $(shell $(CC) $(ARC_LIBGCC) $(cflags-y) --print-libgcc-file-name) +LIBGCC := $(shell $(CC) $(cflags-y) --print-libgcc-file-name) # Modules with short calls might break for calls into builtin-kernel KBUILD_CFLAGS_MODULE += -mlong-calls diff --git a/arch/arc/boot/dts/angel4.dts b/arch/arc/boot/dts/angel4.dts index 6b57475967a6..757e0c62c4f9 100644 --- a/arch/arc/boot/dts/angel4.dts +++ b/arch/arc/boot/dts/angel4.dts @@ -24,11 +24,6 @@ serial0 = &arcuart0; }; - memory { - device_type = "memory"; - reg = <0x00000000 0x10000000>; /* 256M */ - }; - fpga { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arc/boot/dts/nsimosci.dts b/arch/arc/boot/dts/nsimosci.dts index 4f31b2eb5cdf..cfaedd9c61c9 100644 --- a/arch/arc/boot/dts/nsimosci.dts +++ b/arch/arc/boot/dts/nsimosci.dts @@ -20,18 +20,13 @@ /* this is for console on PGU */ /* bootargs = "console=tty0 consoleblank=0"; */ /* this is for console on serial */ - bootargs = "earlycon=uart8250,mmio32,0xc0000000,115200n8 console=ttyS0,115200n8 consoleblank=0 debug"; + bootargs = "earlycon=uart8250,mmio32,0xc0000000,115200n8 console=tty0 console=ttyS0,115200n8 consoleblank=0 debug"; }; aliases { serial0 = &uart0; }; - memory { - device_type = "memory"; - reg = <0x80000000 0x10000000>; /* 256M */ - }; - fpga { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arc/configs/fpga_defconfig b/arch/arc/configs/fpga_defconfig index e283aa586934..ef4d3bc7b6c0 100644 --- a/arch/arc/configs/fpga_defconfig +++ b/arch/arc/configs/fpga_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_HAS_RTSC is not set CONFIG_ARC_BUILTIN_DTB_NAME="angel4" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/fpga_noramfs_defconfig b/arch/arc/configs/fpga_noramfs_defconfig index 5276a52f6a2f..49c93011ab96 100644 --- a/arch/arc/configs/fpga_noramfs_defconfig +++ b/arch/arc/configs/fpga_noramfs_defconfig @@ -20,7 +20,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_HAS_RTSC is not set CONFIG_ARC_BUILTIN_DTB_NAME="angel4" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index c01ba35a4eff..278dacf2a3f9 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -21,7 +21,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_IDE is not set # CONFIG_ARCTANGENT_EMAC is not set # CONFIG_ARC_HAS_RTSC is not set diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 372466b371bf..be33db8a2ee3 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -9,19 +9,16 @@ #ifndef _ASM_ARC_ARCREGS_H #define _ASM_ARC_ARCREGS_H -#ifdef __KERNEL__ - /* Build Configuration Registers */ #define ARC_REG_DCCMBASE_BCR 0x61 /* DCCM Base Addr */ #define ARC_REG_CRC_BCR 0x62 -#define ARC_REG_DVFB_BCR 0x64 -#define ARC_REG_EXTARITH_BCR 0x65 #define ARC_REG_VECBASE_BCR 0x68 #define ARC_REG_PERIBASE_BCR 0x69 -#define ARC_REG_FP_BCR 0x6B /* Single-Precision FPU */ -#define ARC_REG_DPFP_BCR 0x6C /* Dbl Precision FPU */ +#define ARC_REG_FP_BCR 0x6B /* ARCompact: Single-Precision FPU */ +#define ARC_REG_DPFP_BCR 0x6C /* ARCompact: Dbl Precision FPU */ #define ARC_REG_DCCM_BCR 0x74 /* DCCM Present + SZ */ #define ARC_REG_TIMERS_BCR 0x75 +#define ARC_REG_AP_BCR 0x76 #define ARC_REG_ICCM_BCR 0x78 #define ARC_REG_XY_MEM_BCR 0x79 #define ARC_REG_MAC_BCR 0x7a @@ -31,6 +28,9 @@ #define ARC_REG_MIXMAX_BCR 0x7e #define ARC_REG_BARREL_BCR 0x7f #define ARC_REG_D_UNCACH_BCR 0x6A +#define ARC_REG_BPU_BCR 0xc0 +#define ARC_REG_ISA_CFG_BCR 0xc1 +#define ARC_REG_SMART_BCR 0xFF /* status32 Bits Positions */ #define STATUS_AE_BIT 5 /* Exception active */ @@ -191,14 +191,6 @@ #define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) #define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10) -#ifdef CONFIG_ARC_FPU_SAVE_RESTORE -/* These DPFP regs need to be saved/restored across ctx-sw */ -struct arc_fpu { - struct { - unsigned int l, h; - } aux_dpfp[2]; -}; -#endif /* *************************************************************** @@ -212,27 +204,19 @@ struct bcr_identity { #endif }; -#define EXTN_SWAP_VALID 0x1 -#define EXTN_NORM_VALID 0x2 -#define EXTN_MINMAX_VALID 0x2 -#define EXTN_BARREL_VALID 0x2 - -struct bcr_extn { +struct bcr_isa { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:20, crc:1, ext_arith:2, mul:2, barrel:2, minmax:2, - norm:2, swap:1; + unsigned int pad1:23, atomic1:1, ver:8; #else - unsigned int swap:1, norm:2, minmax:2, barrel:2, mul:2, ext_arith:2, - crc:1, pad:20; + unsigned int ver:8, atomic1:1, pad1:23; #endif }; -/* DSP Options Ref Manual */ -struct bcr_extn_mac_mul { +struct bcr_mpy { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:16, type:8, ver:8; + unsigned int pad:8, x1616:8, dsp:4, cycles:2, type:2, ver:8; #else - unsigned int ver:8, type:8, pad:16; + unsigned int ver:8, type:2, cycles:2, dsp:4, x1616:8, pad:8; #endif }; @@ -251,6 +235,7 @@ struct bcr_perip { unsigned int pad:8, sz:8, pad2:8, start:8; #endif }; + struct bcr_iccm { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int base:16, pad:5, sz:3, ver:8; @@ -277,8 +262,8 @@ struct bcr_dccm { #endif }; -/* Both SP and DP FPU BCRs have same format */ -struct bcr_fp { +/* ARCompact: Both SP and DP FPU BCRs have same format */ +struct bcr_fp_arcompact { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int fast:1, ver:8; #else @@ -286,6 +271,30 @@ struct bcr_fp { #endif }; +struct bcr_timer { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad2:15, rtsc:1, pad1:6, t1:1, t0:1, ver:8; +#else + unsigned int ver:8, t0:1, t1:1, pad1:6, rtsc:1, pad2:15; +#endif +}; + +struct bcr_bpu_arcompact { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad2:19, fam:1, pad:2, ent:2, ver:8; +#else + unsigned int ver:8, ent:2, pad:2, fam:1, pad2:19; +#endif +}; + +struct bcr_generic { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:24, ver:8; +#else + unsigned int ver:8, pad:24; +#endif +}; + /* ******************************************************************* * Generic structures to hold build configuration used at runtime @@ -299,6 +308,10 @@ struct cpuinfo_arc_cache { unsigned int sz_k:8, line_len:8, assoc:4, ver:4, alias:1, vipt:1, pad:6; }; +struct cpuinfo_arc_bpu { + unsigned int ver, full, num_cache, num_pred; +}; + struct cpuinfo_arc_ccm { unsigned int base_addr, sz; }; @@ -306,21 +319,25 @@ struct cpuinfo_arc_ccm { struct cpuinfo_arc { struct cpuinfo_arc_cache icache, dcache; struct cpuinfo_arc_mmu mmu; + struct cpuinfo_arc_bpu bpu; struct bcr_identity core; - unsigned int timers; + struct bcr_isa isa; + struct bcr_timer timers; unsigned int vec_base; unsigned int uncached_base; struct cpuinfo_arc_ccm iccm, dccm; - struct bcr_extn extn; + struct { + unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, pad1:3, + fpu_sp:1, fpu_dp:1, pad2:6, + debug:1, ap:1, smart:1, rtt:1, pad3:4, + pad4:8; + } extn; + struct bcr_mpy extn_mpy; struct bcr_extn_xymem extn_xymem; - struct bcr_extn_mac_mul extn_mac_mul; - struct bcr_fp fp, dpfp; }; extern struct cpuinfo_arc cpuinfo_arc700[]; #endif /* __ASEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_ARC_ARCREGS_H */ diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 173f303a868f..067551b6920a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -9,8 +9,6 @@ #ifndef _ASM_ARC_ATOMIC_H #define _ASM_ARC_ATOMIC_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -170,5 +168,3 @@ ATOMIC_OP(and, &=, and) #endif #endif - -#endif diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index ebc0cf3164dc..1a5bf07eefe2 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -13,8 +13,6 @@ #error only <linux/bitops.h> can be included directly #endif -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -508,6 +506,4 @@ static inline __attribute__ ((const)) int __ffs(unsigned long word) #endif /* !__ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 5b18e94c6678..ea022d47896c 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -21,10 +21,9 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); -#define BUG() do { \ - dump_stack(); \ - pr_warn("Kernel BUG in %s: %s: %d!\n", \ - __FILE__, __func__, __LINE__); \ +#define BUG() do { \ + pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \ + dump_stack(); \ } while (0) #define HAVE_ARCH_BUG diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index b3c750979aa1..7861255da32d 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -20,7 +20,7 @@ #define CACHE_LINE_MASK (~(L1_CACHE_BYTES - 1)) /* - * ARC700 doesn't cache any access in top 256M. + * ARC700 doesn't cache any access in top 1G (0xc000_0000 to 0xFFFF_FFFF) * Ideal for wiring memory mapped peripherals as we don't need to do * explicit uncached accesses (LD.di/ST.di) hence more portable drivers */ diff --git a/arch/arc/include/asm/current.h b/arch/arc/include/asm/current.h index 87b918585c4a..c2453ee62801 100644 --- a/arch/arc/include/asm/current.h +++ b/arch/arc/include/asm/current.h @@ -12,8 +12,6 @@ #ifndef _ASM_ARC_CURRENT_H #define _ASM_ARC_CURRENT_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #ifdef CONFIG_ARC_CURR_IN_REG @@ -27,6 +25,4 @@ register struct task_struct *curr_arc asm("r25"); #endif /* ! __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_ARC_CURRENT_H */ diff --git a/arch/arc/include/asm/irqflags.h b/arch/arc/include/asm/irqflags.h index 587df8236e8b..742816f1b210 100644 --- a/arch/arc/include/asm/irqflags.h +++ b/arch/arc/include/asm/irqflags.h @@ -15,8 +15,6 @@ * -Conditionally disable interrupts (if they are not enabled, don't disable) */ -#ifdef __KERNEL__ - #include <asm/arcregs.h> /* status32 Reg bits related to Interrupt Handling */ @@ -169,6 +167,4 @@ static inline int arch_irqs_disabled(void) #endif /* __ASSEMBLY__ */ -#endif /* KERNEL */ - #endif diff --git a/arch/arc/include/asm/kgdb.h b/arch/arc/include/asm/kgdb.h index b65fca7ffeb5..fea931634136 100644 --- a/arch/arc/include/asm/kgdb.h +++ b/arch/arc/include/asm/kgdb.h @@ -19,7 +19,7 @@ * register API yet */ #undef DBG_MAX_REG_NUM -#define GDB_MAX_REGS 39 +#define GDB_MAX_REGS 87 #define BREAK_INSTR_SIZE 2 #define CACHE_FLUSH_IS_SAFE 1 @@ -33,23 +33,27 @@ static inline void arch_kgdb_breakpoint(void) extern void kgdb_trap(struct pt_regs *regs); -enum arc700_linux_regnums { +/* This is the numbering of registers according to the GDB. See GDB's + * arc-tdep.h for details. + * + * Registers are ordered for GDB 7.5. It is incompatible with GDB 6.8. */ +enum arc_linux_regnums { _R0 = 0, _R1, _R2, _R3, _R4, _R5, _R6, _R7, _R8, _R9, _R10, _R11, _R12, _R13, _R14, _R15, _R16, _R17, _R18, _R19, _R20, _R21, _R22, _R23, _R24, _R25, _R26, - _BTA = 27, - _LP_START = 28, - _LP_END = 29, - _LP_COUNT = 30, - _STATUS32 = 31, - _BLINK = 32, - _FP = 33, - __SP = 34, - _EFA = 35, - _RET = 36, - _ORIG_R8 = 37, - _STOP_PC = 38 + _FP = 27, + __SP = 28, + _R30 = 30, + _BLINK = 31, + _LP_COUNT = 60, + _STOP_PC = 64, + _RET = 64, + _LP_START = 65, + _LP_END = 66, + _STATUS32 = 67, + _ECR = 76, + _BTA = 82, }; #else diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 82588f3ba77f..210fe97464c3 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -14,12 +14,19 @@ #ifndef __ASM_ARC_PROCESSOR_H #define __ASM_ARC_PROCESSOR_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include <asm/ptrace.h> +#ifdef CONFIG_ARC_FPU_SAVE_RESTORE +/* These DPFP regs need to be saved/restored across ctx-sw */ +struct arc_fpu { + struct { + unsigned int l, h; + } aux_dpfp[2]; +}; +#endif + /* Arch specific stuff which needs to be saved per task. * However these items are not so important so as to earn a place in * struct thread_info @@ -128,6 +135,4 @@ extern unsigned int get_wchan(struct task_struct *p); */ #define TASK_UNMAPPED_BASE (TASK_SIZE / 3) -#endif /* __KERNEL__ */ - #endif /* __ASM_ARC_PROCESSOR_H */ diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h index e10f8cef56a8..6e3ef5ba4f74 100644 --- a/arch/arc/include/asm/setup.h +++ b/arch/arc/include/asm/setup.h @@ -29,7 +29,6 @@ struct cpuinfo_data { }; extern int root_mountflags, end_mem; -extern int running_on_hw; void setup_processor(void); void __init setup_arch_memory(void); diff --git a/arch/arc/include/asm/smp.h b/arch/arc/include/asm/smp.h index 5d06eee43ea9..3845b9e94f69 100644 --- a/arch/arc/include/asm/smp.h +++ b/arch/arc/include/asm/smp.h @@ -59,7 +59,15 @@ struct plat_smp_ops { /* TBD: stop exporting it for direct population by platform */ extern struct plat_smp_ops plat_smp_ops; -#endif /* CONFIG_SMP */ +#else /* CONFIG_SMP */ + +static inline void smp_init_cpus(void) {} +static inline const char *arc_platform_smp_cpuinfo(void) +{ + return ""; +} + +#endif /* !CONFIG_SMP */ /* * ARC700 doesn't support atomic Read-Modify-Write ops. diff --git a/arch/arc/include/asm/string.h b/arch/arc/include/asm/string.h index 87676c8f1412..95822b550a18 100644 --- a/arch/arc/include/asm/string.h +++ b/arch/arc/include/asm/string.h @@ -17,8 +17,6 @@ #include <linux/types.h> -#ifdef __KERNEL__ - #define __HAVE_ARCH_MEMSET #define __HAVE_ARCH_MEMCPY #define __HAVE_ARCH_MEMCMP @@ -36,5 +34,4 @@ extern char *strcpy(char *dest, const char *src); extern int strcmp(const char *cs, const char *ct); extern __kernel_size_t strlen(const char *); -#endif /* __KERNEL__ */ #endif /* _ASM_ARC_STRING_H */ diff --git a/arch/arc/include/asm/syscalls.h b/arch/arc/include/asm/syscalls.h index dd785befe7fd..e56f9fcc5581 100644 --- a/arch/arc/include/asm/syscalls.h +++ b/arch/arc/include/asm/syscalls.h @@ -9,8 +9,6 @@ #ifndef _ASM_ARC_SYSCALLS_H #define _ASM_ARC_SYSCALLS_H 1 -#ifdef __KERNEL__ - #include <linux/compiler.h> #include <linux/linkage.h> #include <linux/types.h> @@ -22,6 +20,4 @@ int sys_arc_gettls(void); #include <asm-generic/syscalls.h> -#endif /* __KERNEL__ */ - #endif diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 45be21672011..02bc5ec0fb2e 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -16,8 +16,6 @@ #ifndef _ASM_THREAD_INFO_H #define _ASM_THREAD_INFO_H -#ifdef __KERNEL__ - #include <asm/page.h> #ifdef CONFIG_16KSTACKS @@ -114,6 +112,4 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) * syscall, so all that reamins to be tested is _TIF_WORK_MASK */ -#endif /* __KERNEL__ */ - #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/arc/include/asm/unaligned.h b/arch/arc/include/asm/unaligned.h index 3e5f071bc00c..6da6b4edaeda 100644 --- a/arch/arc/include/asm/unaligned.h +++ b/arch/arc/include/asm/unaligned.h @@ -14,7 +14,7 @@ #include <asm-generic/unaligned.h> #include <asm/ptrace.h> -#ifdef CONFIG_ARC_MISALIGN_ACCESS +#ifdef CONFIG_ARC_EMUL_UNALIGNED int misaligned_fixup(unsigned long address, struct pt_regs *regs, struct callee_regs *cregs); #else diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile index 8004b4fa6461..113f2033da9f 100644 --- a/arch/arc/kernel/Makefile +++ b/arch/arc/kernel/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_MODULES) += arcksyms.o module.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_ARC_DW2_UNWIND) += unwind.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_ARC_MISALIGN_ACCESS) += unaligned.o +obj-$(CONFIG_ARC_EMUL_UNALIGNED) += unaligned.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_ARC_METAWARE_HLINK) += arc_hostlink.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o diff --git a/arch/arc/kernel/disasm.c b/arch/arc/kernel/disasm.c index b8a549c4f540..3b7cd4864ba2 100644 --- a/arch/arc/kernel/disasm.c +++ b/arch/arc/kernel/disasm.c @@ -15,7 +15,7 @@ #include <linux/uaccess.h> #include <asm/disasm.h> -#if defined(CONFIG_KGDB) || defined(CONFIG_ARC_MISALIGN_ACCESS) || \ +#if defined(CONFIG_KGDB) || defined(CONFIG_ARC_EMUL_UNALIGNED) || \ defined(CONFIG_KPROBES) /* disasm_instr: Analyses instruction at addr, stores @@ -535,4 +535,4 @@ int __kprobes disasm_next_pc(unsigned long pc, struct pt_regs *regs, return instr.is_branch; } -#endif /* CONFIG_KGDB || CONFIG_ARC_MISALIGN_ACCESS || CONFIG_KPROBES */ +#endif /* CONFIG_KGDB || CONFIG_ARC_EMUL_UNALIGNED || CONFIG_KPROBES */ diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 4d2481bd8b98..b0e8666fdccc 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -91,16 +91,6 @@ stext: st r0, [@uboot_tag] st r2, [@uboot_arg] - ; Identify if running on ISS vs Silicon - ; IDENTITY Reg [ 3 2 1 0 ] - ; (chip-id) ^^^^^ ==> 0xffff for ISS - lr r0, [identity] - lsr r3, r0, 16 - cmp r3, 0xffff - mov.z r4, 0 - mov.nz r4, 1 - st r4, [@running_on_hw] - ; setup "current" tsk and optionally cache it in dedicated r25 mov r9, @init_task SET_CURR_TASK_ON_CPU r9, r0 ; r9 = tsk, r0 = scratch diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index a2ff5c5d1450..ecf6a7869375 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -158,11 +158,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, return -1; } -unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - int kgdb_arch_init(void) { single_step_data.armed = 0; diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index b9a5685a990e..ae1c485cbc68 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -244,25 +244,23 @@ static int arc_pmu_device_probe(struct platform_device *pdev) pr_err("This core does not have performance counters!\n"); return -ENODEV; } + BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); - arc_pmu = devm_kzalloc(&pdev->dev, sizeof(struct arc_pmu), - GFP_KERNEL); + READ_BCR(ARC_REG_CC_BUILD, cc_bcr); + if (!cc_bcr.v) { + pr_err("Performance counters exist, but no countable conditions?\n"); + return -ENODEV; + } + + arc_pmu = devm_kzalloc(&pdev->dev, sizeof(struct arc_pmu), GFP_KERNEL); if (!arc_pmu) return -ENOMEM; arc_pmu->n_counters = pct_bcr.c; - BUG_ON(arc_pmu->n_counters > ARC_PMU_MAX_HWEVENTS); - arc_pmu->counter_size = 32 + (pct_bcr.s << 4); - pr_info("ARC PMU found with %d counters of size %d bits\n", - arc_pmu->n_counters, arc_pmu->counter_size); - - READ_BCR(ARC_REG_CC_BUILD, cc_bcr); - - if (!cc_bcr.v) - pr_err("Strange! Performance counters exist, but no countable conditions?\n"); - pr_info("ARC PMU has %d countable conditions\n", cc_bcr.c); + pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", + arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_HW_MAX; i++) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 119dddb752b2..252bf603db9c 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -13,7 +13,9 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/cpu.h> +#include <linux/clk-provider.h> #include <linux/of_fdt.h> +#include <linux/of_platform.h> #include <linux/cache.h> #include <asm/sections.h> #include <asm/arcregs.h> @@ -24,11 +26,10 @@ #include <asm/unwind.h> #include <asm/clk.h> #include <asm/mach_desc.h> +#include <asm/smp.h> #define FIX_PTR(x) __asm__ __volatile__(";" : "+r"(x)) -int running_on_hw = 1; /* vs. on ISS */ - /* Part of U-boot ABI: see head.S */ int __initdata uboot_tag; char __initdata *uboot_arg; @@ -42,26 +43,26 @@ struct cpuinfo_arc cpuinfo_arc700[NR_CPUS]; static void read_arc_build_cfg_regs(void) { struct bcr_perip uncached_space; + struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); + READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa); - cpu->timers = read_aux_reg(ARC_REG_TIMERS_BCR); + READ_BCR(ARC_REG_TIMERS_BCR, cpu->timers); cpu->vec_base = read_aux_reg(AUX_INTR_VEC_BASE); READ_BCR(ARC_REG_D_UNCACH_BCR, uncached_space); cpu->uncached_base = uncached_space.start << 24; - cpu->extn.mul = read_aux_reg(ARC_REG_MUL_BCR); - cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR); - cpu->extn.norm = read_aux_reg(ARC_REG_NORM_BCR); - cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR); - cpu->extn.barrel = read_aux_reg(ARC_REG_BARREL_BCR); - READ_BCR(ARC_REG_MAC_BCR, cpu->extn_mac_mul); + READ_BCR(ARC_REG_MUL_BCR, cpu->extn_mpy); - cpu->extn.ext_arith = read_aux_reg(ARC_REG_EXTARITH_BCR); - cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR); + cpu->extn.norm = read_aux_reg(ARC_REG_NORM_BCR) > 1 ? 1 : 0; /* 2,3 */ + cpu->extn.barrel = read_aux_reg(ARC_REG_BARREL_BCR) > 1 ? 1 : 0; /* 2,3 */ + cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR) ? 1 : 0; /* 1,3 */ + cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR) ? 1 : 0; + cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR) > 1 ? 1 : 0; /* 2 */ /* Note that we read the CCM BCRs independent of kernel config * This is to catch the cases where user doesn't know that @@ -95,43 +96,76 @@ static void read_arc_build_cfg_regs(void) read_decode_mmu_bcr(); read_decode_cache_bcr(); - READ_BCR(ARC_REG_FP_BCR, cpu->fp); - READ_BCR(ARC_REG_DPFP_BCR, cpu->dpfp); + { + struct bcr_fp_arcompact sp, dp; + struct bcr_bpu_arcompact bpu; + + READ_BCR(ARC_REG_FP_BCR, sp); + READ_BCR(ARC_REG_DPFP_BCR, dp); + cpu->extn.fpu_sp = sp.ver ? 1 : 0; + cpu->extn.fpu_dp = dp.ver ? 1 : 0; + + READ_BCR(ARC_REG_BPU_BCR, bpu); + cpu->bpu.ver = bpu.ver; + cpu->bpu.full = bpu.fam ? 1 : 0; + if (bpu.ent) { + cpu->bpu.num_cache = 256 << (bpu.ent - 1); + cpu->bpu.num_pred = 256 << (bpu.ent - 1); + } + } + + READ_BCR(ARC_REG_AP_BCR, bcr); + cpu->extn.ap = bcr.ver ? 1 : 0; + + READ_BCR(ARC_REG_SMART_BCR, bcr); + cpu->extn.smart = bcr.ver ? 1 : 0; + + cpu->extn.debug = cpu->extn.ap | cpu->extn.smart; } static const struct cpuinfo_data arc_cpu_tbl[] = { - { {0x10, "ARCTangent A5"}, 0x1F}, { {0x20, "ARC 600" }, 0x2F}, { {0x30, "ARC 700" }, 0x33}, { {0x34, "ARC 700 R4.10"}, 0x34}, + { {0x35, "ARC 700 R4.11"}, 0x35}, { {0x00, NULL } } }; +#define IS_AVAIL1(v, str) ((v) ? str : "") +#define IS_USED(cfg) (IS_ENABLED(cfg) ? "" : "(not used) ") +#define IS_AVAIL2(v, str, cfg) IS_AVAIL1(v, str), IS_AVAIL1(v, IS_USED(cfg)) + static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) { - int n = 0; struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; struct bcr_identity *core = &cpu->core; const struct cpuinfo_data *tbl; - int be = 0; -#ifdef CONFIG_CPU_BIG_ENDIAN - be = 1; -#endif + char *isa_nm; + int i, be, atomic; + int n = 0; + FIX_PTR(cpu); + { + isa_nm = "ARCompact"; + be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + + atomic = cpu->isa.atomic1; + if (!cpu->isa.ver) /* ISA BCR absent, use Kconfig info */ + atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC); + } + n += scnprintf(buf + n, len - n, - "\nARC IDENTITY\t: Family [%#02x]" - " Cpu-id [%#02x] Chip-id [%#4x]\n", - core->family, core->cpu_id, - core->chip_id); + "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n", + core->family, core->cpu_id, core->chip_id); for (tbl = &arc_cpu_tbl[0]; tbl->info.id != 0; tbl++) { if ((core->family >= tbl->info.id) && (core->family <= tbl->up_range)) { n += scnprintf(buf + n, len - n, - "processor\t: %s %s\n", - tbl->info.str, - be ? "[Big Endian]" : ""); + "processor [%d]\t: %s (%s ISA) %s\n", + cpu_id, tbl->info.str, isa_nm, + IS_AVAIL1(be, "[Big-Endian]")); break; } } @@ -143,34 +177,35 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) (unsigned int)(arc_get_core_freq() / 1000000), (unsigned int)(arc_get_core_freq() / 10000) % 100); - n += scnprintf(buf + n, len - n, "Timers\t\t: %s %s\n", - (cpu->timers & 0x200) ? "TIMER1" : "", - (cpu->timers & 0x100) ? "TIMER0" : ""); + n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s\nISA Extn\t: ", + IS_AVAIL1(cpu->timers.t0, "Timer0 "), + IS_AVAIL1(cpu->timers.t1, "Timer1 "), + IS_AVAIL2(cpu->timers.rtsc, "64-bit RTSC ", CONFIG_ARC_HAS_RTSC)); - n += scnprintf(buf + n, len - n, "Vect Tbl Base\t: %#x\n", - cpu->vec_base); + n += i = scnprintf(buf + n, len - n, "%s%s", + IS_AVAIL2(atomic, "atomic ", CONFIG_ARC_HAS_LLSC)); - n += scnprintf(buf + n, len - n, "UNCACHED Base\t: %#x\n", - cpu->uncached_base); + if (i) + n += scnprintf(buf + n, len - n, "\n\t\t: "); - return buf; -} + n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n", + IS_AVAIL1(cpu->extn_mpy.ver, "mpy "), + IS_AVAIL1(cpu->extn.norm, "norm "), + IS_AVAIL1(cpu->extn.barrel, "barrel-shift "), + IS_AVAIL1(cpu->extn.swap, "swap "), + IS_AVAIL1(cpu->extn.minmax, "minmax "), + IS_AVAIL1(cpu->extn.crc, "crc "), + IS_AVAIL2(1, "swape", CONFIG_ARC_HAS_SWAPE)); -static const struct id_to_str mul_type_nm[] = { - { 0x0, "N/A"}, - { 0x1, "32x32 (spl Result Reg)" }, - { 0x2, "32x32 (ANY Result Reg)" } -}; + if (cpu->bpu.ver) + n += scnprintf(buf + n, len - n, + "BPU\t\t: %s%s match, cache:%d, Predict Table:%d\n", + IS_AVAIL1(cpu->bpu.full, "full"), + IS_AVAIL1(!cpu->bpu.full, "partial"), + cpu->bpu.num_cache, cpu->bpu.num_pred); -static const struct id_to_str mac_mul_nm[] = { - {0x0, "N/A"}, - {0x1, "N/A"}, - {0x2, "Dual 16 x 16"}, - {0x3, "N/A"}, - {0x4, "32x16"}, - {0x5, "N/A"}, - {0x6, "Dual 16x16 and 32x16"} -}; + return buf; +} static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) { @@ -178,67 +213,46 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; FIX_PTR(cpu); -#define IS_AVAIL1(var, str) ((var) ? str : "") -#define IS_AVAIL2(var, str) ((var == 0x2) ? str : "") -#define IS_USED(cfg) (IS_ENABLED(cfg) ? "(in-use)" : "(not used)") n += scnprintf(buf + n, len - n, - "Extn [700-Base]\t: %s %s %s %s %s %s\n", - IS_AVAIL2(cpu->extn.norm, "norm,"), - IS_AVAIL2(cpu->extn.barrel, "barrel-shift,"), - IS_AVAIL1(cpu->extn.swap, "swap,"), - IS_AVAIL2(cpu->extn.minmax, "minmax,"), - IS_AVAIL1(cpu->extn.crc, "crc,"), - IS_AVAIL2(cpu->extn.ext_arith, "ext-arith")); - - n += scnprintf(buf + n, len - n, "Extn [700-MPY]\t: %s", - mul_type_nm[cpu->extn.mul].str); - - n += scnprintf(buf + n, len - n, " MAC MPY: %s\n", - mac_mul_nm[cpu->extn_mac_mul.type].str); - - if (cpu->core.family == 0x34) { - n += scnprintf(buf + n, len - n, - "Extn [700-4.10]\t: LLOCK/SCOND %s, SWAPE %s, RTSC %s\n", - IS_USED(CONFIG_ARC_HAS_LLSC), - IS_USED(CONFIG_ARC_HAS_SWAPE), - IS_USED(CONFIG_ARC_HAS_RTSC)); - } - - n += scnprintf(buf + n, len - n, "Extn [CCM]\t: %s", - !(cpu->dccm.sz || cpu->iccm.sz) ? "N/A" : ""); - - if (cpu->dccm.sz) - n += scnprintf(buf + n, len - n, "DCCM: @ %x, %d KB ", - cpu->dccm.base_addr, TO_KB(cpu->dccm.sz)); - - if (cpu->iccm.sz) - n += scnprintf(buf + n, len - n, "ICCM: @ %x, %d KB", + "Vector Table\t: %#x\nUncached Base\t: %#x\n", + cpu->vec_base, cpu->uncached_base); + + if (cpu->extn.fpu_sp || cpu->extn.fpu_dp) + n += scnprintf(buf + n, len - n, "FPU\t\t: %s%s\n", + IS_AVAIL1(cpu->extn.fpu_sp, "SP "), + IS_AVAIL1(cpu->extn.fpu_dp, "DP ")); + + if (cpu->extn.debug) + n += scnprintf(buf + n, len - n, "DEBUG\t\t: %s%s%s\n", + IS_AVAIL1(cpu->extn.ap, "ActionPoint "), + IS_AVAIL1(cpu->extn.smart, "smaRT "), + IS_AVAIL1(cpu->extn.rtt, "RTT ")); + + if (cpu->dccm.sz || cpu->iccm.sz) + n += scnprintf(buf + n, len - n, "Extn [CCM]\t: DCCM @ %x, %d KB / ICCM: @ %x, %d KB\n", + cpu->dccm.base_addr, TO_KB(cpu->dccm.sz), cpu->iccm.base_addr, TO_KB(cpu->iccm.sz)); - n += scnprintf(buf + n, len - n, "\nExtn [FPU]\t: %s", - !(cpu->fp.ver || cpu->dpfp.ver) ? "N/A" : ""); - - if (cpu->fp.ver) - n += scnprintf(buf + n, len - n, "SP [v%d] %s", - cpu->fp.ver, cpu->fp.fast ? "(fast)" : ""); - - if (cpu->dpfp.ver) - n += scnprintf(buf + n, len - n, "DP [v%d] %s", - cpu->dpfp.ver, cpu->dpfp.fast ? "(fast)" : ""); - - n += scnprintf(buf + n, len - n, "\n"); - n += scnprintf(buf + n, len - n, "OS ABI [v3]\t: no-legacy-syscalls\n"); return buf; } -static void arc_chk_ccms(void) +static void arc_chk_core_config(void) { -#if defined(CONFIG_ARC_HAS_DCCM) || defined(CONFIG_ARC_HAS_ICCM) struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; + int fpu_enabled; + + if (!cpu->timers.t0) + panic("Timer0 is not present!\n"); + + if (!cpu->timers.t1) + panic("Timer1 is not present!\n"); + + if (IS_ENABLED(CONFIG_ARC_HAS_RTSC) && !cpu->timers.rtsc) + panic("RTSC is not present\n"); #ifdef CONFIG_ARC_HAS_DCCM /* @@ -256,33 +270,20 @@ static void arc_chk_ccms(void) if (CONFIG_ARC_ICCM_SZ != cpu->iccm.sz) panic("Linux built with incorrect ICCM Size\n"); #endif -#endif -} -/* - * Ensure that FP hardware and kernel config match - * -If hardware contains DPFP, kernel needs to save/restore FPU state - * across context switches - * -If hardware lacks DPFP, but kernel configured to save FPU state then - * kernel trying to access non-existant DPFP regs will crash - * - * We only check for Dbl precision Floating Point, because only DPFP - * hardware has dedicated regs which need to be saved/restored on ctx-sw - * (Single Precision uses core regs), thus kernel is kind of oblivious to it - */ -static void arc_chk_fpu(void) -{ - struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; + /* + * FP hardware/software config sanity + * -If hardware contains DPFP, kernel needs to save/restore FPU state + * -If not, it will crash trying to save/restore the non-existant regs + * + * (only DPDP checked since SP has no arch visible regs) + */ + fpu_enabled = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE); - if (cpu->dpfp.ver) { -#ifndef CONFIG_ARC_FPU_SAVE_RESTORE - pr_warn("DPFP support broken in this kernel...\n"); -#endif - } else { -#ifdef CONFIG_ARC_FPU_SAVE_RESTORE - panic("H/w lacks DPFP support, apps won't work\n"); -#endif - } + if (cpu->extn.fpu_dp && !fpu_enabled) + pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n"); + else if (!cpu->extn.fpu_dp && fpu_enabled) + panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n"); } /* @@ -303,15 +304,11 @@ void setup_processor(void) arc_mmu_init(); arc_cache_init(); - arc_chk_ccms(); printk(arc_extn_mumbojumbo(cpu_id, str, sizeof(str))); - -#ifdef CONFIG_SMP printk(arc_platform_smp_cpuinfo()); -#endif - arc_chk_fpu(); + arc_chk_core_config(); } static inline int is_kernel(unsigned long addr) @@ -360,11 +357,7 @@ void __init setup_arch(char **cmdline_p) machine_desc->init_early(); setup_processor(); - -#ifdef CONFIG_SMP smp_init_cpus(); -#endif - setup_arch_memory(); /* copy flat DT out of .init and then unflatten it */ @@ -385,7 +378,13 @@ void __init setup_arch(char **cmdline_p) static int __init customize_machine(void) { - /* Add platform devices */ + of_clk_init(NULL); + /* + * Traverses flattened DeviceTree - registering platform devices + * (if any) complete with their resources + */ + of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); + if (machine_desc->init_machine) machine_desc->init_machine(); @@ -419,19 +418,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, arc_cpu_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, "Bogo MIPS : \t%lu.%02lu\n", + seq_printf(m, "Bogo MIPS\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ), (loops_per_jiffy / (5000 / HZ)) % 100); seq_printf(m, arc_mmu_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, arc_cache_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, arc_extn_mumbojumbo(cpu_id, str, PAGE_SIZE)); - -#ifdef CONFIG_SMP seq_printf(m, arc_platform_smp_cpuinfo()); -#endif free_page((unsigned long)str); done: diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index dcd317c47d09..d01df0c517a2 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -101,7 +101,7 @@ void __weak arc_platform_smp_wait_to_boot(int cpu) const char *arc_platform_smp_cpuinfo(void) { - return plat_smp_ops.info; + return plat_smp_ops.info ? : ""; } /* diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c index 9e1142729fd1..8c3a3e02ba92 100644 --- a/arch/arc/mm/cache_arc700.c +++ b/arch/arc/mm/cache_arc700.c @@ -530,16 +530,9 @@ EXPORT_SYMBOL(dma_cache_wback); */ void flush_icache_range(unsigned long kstart, unsigned long kend) { - unsigned int tot_sz, off, sz; - unsigned long phy, pfn; + unsigned int tot_sz; - /* printk("Kernel Cache Cohenercy: %lx to %lx\n",kstart, kend); */ - - /* This is not the right API for user virtual address */ - if (kstart < TASK_SIZE) { - BUG_ON("Flush icache range for user virtual addr space"); - return; - } + WARN(kstart < TASK_SIZE, "%s() can't handle user vaddr", __func__); /* Shortcut for bigger flush ranges. * Here we don't care if this was kernel virtual or phy addr @@ -572,6 +565,9 @@ void flush_icache_range(unsigned long kstart, unsigned long kend) * straddles across 2 virtual pages and hence need for loop */ while (tot_sz > 0) { + unsigned int off, sz; + unsigned long phy, pfn; + off = kstart % PAGE_SIZE; pfn = vmalloc_to_pfn((void *)kstart); phy = (pfn << PAGE_SHIFT) + off; diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index e1acf0ce5647..7f47d2a56f44 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -609,14 +609,12 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len) int n = 0; struct cpuinfo_arc_mmu *p_mmu = &cpuinfo_arc700[cpu_id].mmu; - n += scnprintf(buf + n, len - n, "ARC700 MMU [v%x]\t: %dk PAGE, ", - p_mmu->ver, TO_KB(p_mmu->pg_sz)); - n += scnprintf(buf + n, len - n, - "J-TLB %d (%dx%d), uDTLB %d, uITLB %d, %s\n", + "MMU [v%x]\t: %dk PAGE, JTLB %d (%dx%d), uDTLB %d, uITLB %d %s\n", + p_mmu->ver, TO_KB(p_mmu->pg_sz), p_mmu->num_tlb, p_mmu->sets, p_mmu->ways, p_mmu->u_dtlb, p_mmu->u_itlb, - IS_ENABLED(CONFIG_ARC_MMU_SASID) ? "SASID" : ""); + IS_ENABLED(CONFIG_ARC_MMU_SASID) ? ",SASID" : ""); return buf; } diff --git a/arch/arc/plat-arcfpga/Kconfig b/arch/arc/plat-arcfpga/Kconfig index b9f34cf55acf..217593a70751 100644 --- a/arch/arc/plat-arcfpga/Kconfig +++ b/arch/arc/plat-arcfpga/Kconfig @@ -8,7 +8,7 @@ menuconfig ARC_PLAT_FPGA_LEGACY bool "\"Legacy\" ARC FPGA dev Boards" - select ISS_SMP_EXTN if SMP + select ARC_HAS_COH_CACHES if SMP help Support for ARC development boards, provided by Synopsys. These are based on FPGA or ISS. e.g. @@ -18,17 +18,6 @@ menuconfig ARC_PLAT_FPGA_LEGACY if ARC_PLAT_FPGA_LEGACY -config ARC_BOARD_ANGEL4 - bool "ARC Angel4" - default y - help - ARC Angel4 FPGA Ref Platform (Xilinx Virtex Based) - -config ARC_BOARD_ML509 - bool "ML509" - help - ARC ML509 FPGA Ref Platform (Xilinx Virtex-5 Based) - config ISS_SMP_EXTN bool "ARC SMP Extensions (ISS Models only)" default n diff --git a/arch/arc/plat-arcfpga/include/plat/irq.h b/arch/arc/plat-arcfpga/include/plat/irq.h deleted file mode 100644 index 2c9dea690ac4..000000000000 --- a/arch/arc/plat-arcfpga/include/plat/irq.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * vineetg: Feb 2009 - * -For AA4 board, IRQ assignments to peripherals - */ - -#ifndef __PLAT_IRQ_H -#define __PLAT_IRQ_H - -#define UART0_IRQ 5 -#define UART1_IRQ 10 -#define UART2_IRQ 11 - -#define IDE_IRQ 13 -#define PCI_IRQ 14 -#define PS2_IRQ 15 - -#ifdef CONFIG_SMP -#define IDU_INTERRUPT_0 16 -#endif - -#endif diff --git a/arch/arc/plat-arcfpga/include/plat/memmap.h b/arch/arc/plat-arcfpga/include/plat/memmap.h deleted file mode 100644 index 5c78e6135a1f..000000000000 --- a/arch/arc/plat-arcfpga/include/plat/memmap.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * vineetg: Feb 2009 - * -For AA4 board, System Memory Map for Peripherals etc - */ - -#ifndef __PLAT_MEMMAP_H -#define __PLAT_MEMMAP_H - -#define UART0_BASE 0xC0FC1000 -#define UART1_BASE 0xC0FC1100 - -#define IDE_CONTROLLER_BASE 0xC0FC9000 - -#define AHB_PCI_HOST_BRG_BASE 0xC0FD0000 - -#define PGU_BASEADDR 0xC0FC8000 -#define VLCK_ADDR 0xC0FCF028 - -#define BVCI_LAT_UNIT_BASE 0xC0FED000 - -#define PS2_BASE_ADDR 0xC0FCC000 - -#endif diff --git a/arch/arc/plat-arcfpga/platform.c b/arch/arc/plat-arcfpga/platform.c index 1038949a99a1..afc88254acc1 100644 --- a/arch/arc/plat-arcfpga/platform.c +++ b/arch/arc/plat-arcfpga/platform.c @@ -8,37 +8,9 @@ * published by the Free Software Foundation. */ -#include <linux/types.h> #include <linux/init.h> -#include <linux/device.h> -#include <linux/platform_device.h> -#include <linux/io.h> -#include <linux/console.h> -#include <linux/of_platform.h> -#include <asm/setup.h> -#include <asm/clk.h> #include <asm/mach_desc.h> -#include <plat/memmap.h> #include <plat/smp.h> -#include <plat/irq.h> - -static void __init plat_fpga_early_init(void) -{ - pr_info("[plat-arcfpga]: registering early dev resources\n"); - -#ifdef CONFIG_ISS_SMP_EXTN - iss_model_init_early_smp(); -#endif -} - -static void __init plat_fpga_populate_dev(void) -{ - /* - * Traverses flattened DeviceTree - registering platform devices - * (if any) complete with their resources - */ - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} /*----------------------- Machine Descriptions ------------------------------ * @@ -48,41 +20,26 @@ static void __init plat_fpga_populate_dev(void) * callback set, by matching the DT compatible name. */ -static const char *aa4_compat[] __initconst = { +static const char *legacy_fpga_compat[] __initconst = { "snps,arc-angel4", - NULL, -}; - -MACHINE_START(ANGEL4, "angel4") - .dt_compat = aa4_compat, - .init_early = plat_fpga_early_init, - .init_machine = plat_fpga_populate_dev, -#ifdef CONFIG_ISS_SMP_EXTN - .init_smp = iss_model_init_smp, -#endif -MACHINE_END - -static const char *ml509_compat[] __initconst = { "snps,arc-ml509", NULL, }; -MACHINE_START(ML509, "ml509") - .dt_compat = ml509_compat, - .init_early = plat_fpga_early_init, - .init_machine = plat_fpga_populate_dev, -#ifdef CONFIG_SMP +MACHINE_START(LEGACY_FPGA, "legacy_fpga") + .dt_compat = legacy_fpga_compat, +#ifdef CONFIG_ISS_SMP_EXTN + .init_early = iss_model_init_early_smp, .init_smp = iss_model_init_smp, #endif MACHINE_END -static const char *nsimosci_compat[] __initconst = { +static const char *simulation_compat[] __initconst = { + "snps,nsim", "snps,nsimosci", NULL, }; -MACHINE_START(NSIMOSCI, "nsimosci") - .dt_compat = nsimosci_compat, - .init_early = NULL, - .init_machine = plat_fpga_populate_dev, +MACHINE_START(SIMULATION, "simulation") + .dt_compat = simulation_compat, MACHINE_END diff --git a/arch/arc/plat-arcfpga/smp.c b/arch/arc/plat-arcfpga/smp.c index 92bad9122077..64797ba3bbe3 100644 --- a/arch/arc/plat-arcfpga/smp.c +++ b/arch/arc/plat-arcfpga/smp.c @@ -13,9 +13,10 @@ #include <linux/smp.h> #include <linux/irq.h> -#include <plat/irq.h> #include <plat/smp.h> +#define IDU_INTERRUPT_0 16 + static char smp_cpuinfo_buf[128]; /* diff --git a/arch/arc/plat-tb10x/Kconfig b/arch/arc/plat-tb10x/Kconfig index 6994c188dc88..d14b3d3c5dfd 100644 --- a/arch/arc/plat-tb10x/Kconfig +++ b/arch/arc/plat-tb10x/Kconfig @@ -18,7 +18,6 @@ menuconfig ARC_PLAT_TB10X bool "Abilis TB10x" - select COMMON_CLK select PINCTRL select PINCTRL_TB10X select PINMUX diff --git a/arch/arc/plat-tb10x/tb10x.c b/arch/arc/plat-tb10x/tb10x.c index 06cb30929460..da0ac0960a4b 100644 --- a/arch/arc/plat-tb10x/tb10x.c +++ b/arch/arc/plat-tb10x/tb10x.c @@ -19,21 +19,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - #include <linux/init.h> -#include <linux/of_platform.h> -#include <linux/clk-provider.h> -#include <linux/pinctrl/consumer.h> - #include <asm/mach_desc.h> - -static void __init tb10x_platform_init(void) -{ - of_clk_init(NULL); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char *tb10x_compat[] __initdata = { "abilis,arc-tb10x", NULL, @@ -41,5 +29,4 @@ static const char *tb10x_compat[] __initdata = { MACHINE_START(TB10x, "tb10x") .dt_compat = tb10x_compat, - .init_machine = tb10x_platform_init, MACHINE_END diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 709ecc9009ec..f1dc7fc668f3 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -261,6 +261,7 @@ CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y CONFIG_ORION_WATCHDOG=y CONFIG_SUNXI_WATCHDOG=y +CONFIG_MESON_WATCHDOG=y CONFIG_MFD_AS3722=y CONFIG_MFD_BCM590XX=y CONFIG_MFD_CROS_EC=y diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 8c35ae4ff176..07a09570175d 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -20,7 +20,7 @@ #include <linux/input.h> #include <linux/io.h> #include <linux/irqchip.h> -#include <linux/mailbox.h> +#include <linux/pl320-ipc.h> #include <linux/of.h> #include <linux/of_irq.h> #include <linux/of_platform.h> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac9afde76dea..9532f8d5857e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,5 +1,6 @@ config ARM64 def_bool y + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST @@ -232,7 +233,7 @@ config ARM64_VA_BITS_42 config ARM64_VA_BITS_48 bool "48-bit" - depends on BROKEN + depends on !ARM_SMMU endchoice diff --git a/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi b/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi index ac2cb2418025..c46cbb29f3c6 100644 --- a/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi +++ b/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi @@ -22,7 +22,7 @@ bank-width = <4>; }; - vram@2,00000000 { + v2m_video_ram: vram@2,00000000 { compatible = "arm,vexpress-vram"; reg = <2 0x00000000 0x00800000>; }; @@ -179,9 +179,42 @@ clcd@1f0000 { compatible = "arm,pl111", "arm,primecell"; reg = <0x1f0000 0x1000>; + interrupt-names = "combined"; interrupts = <14>; clocks = <&v2m_oscclk1>, <&v2m_clk24mhz>; clock-names = "clcdclk", "apb_pclk"; + arm,pl11x,framebuffer = <0x18000000 0x00180000>; + memory-region = <&v2m_video_ram>; + max-memory-bandwidth = <130000000>; /* 16bpp @ 63.5MHz */ + + port { + v2m_clcd_pads: endpoint { + remote-endpoint = <&v2m_clcd_panel>; + arm,pl11x,tft-r0g0b0-pads = <0 8 16>; + }; + }; + + panel { + compatible = "panel-dpi"; + + port { + v2m_clcd_panel: endpoint { + remote-endpoint = <&v2m_clcd_pads>; + }; + }; + + panel-timing { + clock-frequency = <63500127>; + hactive = <1024>; + hback-porch = <152>; + hfront-porch = <48>; + hsync-len = <104>; + vactive = <768>; + vback-porch = <23>; + vfront-porch = <3>; + vsync-len = <4>; + }; + }; }; virtio_block@0130000 { diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 9cd37de9aa8d..4ce602c2c6de 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -78,6 +78,7 @@ CONFIG_NET_XGENE=y # CONFIG_WLAN is not set CONFIG_INPUT_EVDEV=y # CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y CONFIG_LEGACY_PTY_COUNT=16 CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y @@ -90,6 +91,7 @@ CONFIG_VIRTIO_CONSOLE=y CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_FB=y +CONFIG_FB_ARMCLCD=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 253e33bc94fb..56de5aadede2 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -37,8 +37,8 @@ typedef s32 compat_ssize_t; typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; typedef u16 __compat_uid16_t; typedef u16 __compat_gid16_t; typedef u32 __compat_uid32_t; diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 01d3aab64b79..1f65be393139 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -126,7 +126,7 @@ typedef struct user_fpsimd_state elf_fpregset_t; * that it will "exec", and that there is sufficient room for the brk. */ extern unsigned long randomize_et_dyn(unsigned long base); -#define ELF_ET_DYN_BASE (randomize_et_dyn(2 * TASK_SIZE_64 / 3)) +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) /* * When the program starts, a1 contains a pointer to a function to be @@ -169,7 +169,7 @@ extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define COMPAT_ELF_PLATFORM ("v8l") #endif -#define COMPAT_ELF_ET_DYN_BASE (randomize_et_dyn(2 * TASK_SIZE_32 / 3)) +#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3) /* AArch32 registers. */ #define COMPAT_ELF_NGREG 18 diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h index 8e24ef3f7c82..b4f6b19a8a68 100644 --- a/arch/arm64/include/asm/irq_work.h +++ b/arch/arm64/include/asm/irq_work.h @@ -1,6 +1,8 @@ #ifndef __ASM_IRQ_WORK_H #define __ASM_IRQ_WORK_H +#ifdef CONFIG_SMP + #include <asm/smp.h> static inline bool arch_irq_work_has_interrupt(void) @@ -8,4 +10,13 @@ static inline bool arch_irq_work_has_interrupt(void) return !!__smp_cross_call; } +#else + +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} + +#endif + #endif /* __ASM_IRQ_WORK_H */ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 03aaa99e1ea0..95c49ebc660d 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -89,7 +89,8 @@ static int __init uefi_init(void) */ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { pr_err("System table signature incorrect\n"); - return -EINVAL; + retval = -EINVAL; + goto out; } if ((efi.systab->hdr.revision >> 16) < 2) pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", @@ -103,6 +104,7 @@ static int __init uefi_init(void) for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) vendor[i] = c16[i]; vendor[i] = '\0'; + early_memunmap(c16, sizeof(vendor)); } pr_info("EFI v%u.%.02u by %s\n", @@ -113,29 +115,11 @@ static int __init uefi_init(void) if (retval == 0) set_bit(EFI_CONFIG_TABLES, &efi.flags); - early_memunmap(c16, sizeof(vendor)); +out: early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; } -static __initdata char memory_type_name[][32] = { - {"Reserved"}, - {"Loader Code"}, - {"Loader Data"}, - {"Boot Code"}, - {"Boot Data"}, - {"Runtime Code"}, - {"Runtime Data"}, - {"Conventional Memory"}, - {"Unusable Memory"}, - {"ACPI Reclaim Memory"}, - {"ACPI Memory NVS"}, - {"Memory Mapped I/O"}, - {"MMIO Port Space"}, - {"PAL Code"}, -}; - /* * Return true for RAM regions we want to permanently reserve. */ @@ -166,10 +150,13 @@ static __init void reserve_regions(void) paddr = md->phys_addr; npages = md->num_pages; - if (uefi_debug) - pr_info(" 0x%012llx-0x%012llx [%s]", + if (uefi_debug) { + char buf[64]; + + pr_info(" 0x%012llx-0x%012llx %s", paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - memory_type_name[md->type]); + efi_md_typeattr_format(buf, sizeof(buf), md)); + } memrange_efi_to_native(&paddr, &npages); size = npages << PAGE_SHIFT; @@ -393,11 +380,16 @@ static int __init arm64_enter_virtual_mode(void) return -1; } - pr_info("Remapping and enabling EFI services.\n"); - - /* replace early memmap mapping with permanent mapping */ mapsize = memmap.map_end - memmap.map; early_memunmap(memmap.map, mapsize); + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return -1; + } + + pr_info("Remapping and enabling EFI services.\n"); + /* replace early memmap mapping with permanent mapping */ memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map, mapsize); memmap.map_end = memmap.map + mapsize; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c3065dbc4fa2..fde9923af859 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -378,8 +378,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) { return randomize_base(mm->brk); } - -unsigned long randomize_et_dyn(unsigned long base) -{ - return randomize_base(base); -} diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index fa324bd5a5c4..4a07630a6616 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -105,10 +105,10 @@ EXPORT_SYMBOL(ioremap_cache); static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; #if CONFIG_ARM64_PGTABLE_LEVELS > 2 -static pte_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; #endif #if CONFIG_ARM64_PGTABLE_LEVELS > 3 -static pte_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; #endif static inline pud_t * __init early_ioremap_pud(unsigned long addr) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6894ef3e6234..0bf90d26e745 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -297,11 +297,15 @@ static void __init map_mem(void) * create_mapping requires puds, pmds and ptes to be allocated from * memory addressable from the initial direct kernel mapping. * - * The initial direct kernel mapping, located at swapper_pg_dir, - * gives us PUD_SIZE memory starting from PHYS_OFFSET (which must be - * aligned to 2MB as per Documentation/arm64/booting.txt). + * The initial direct kernel mapping, located at swapper_pg_dir, gives + * us PUD_SIZE (4K pages) or PMD_SIZE (64K pages) memory starting from + * PHYS_OFFSET (which must be aligned to 2MB as per + * Documentation/arm64/booting.txt). */ - limit = PHYS_OFFSET + PUD_SIZE; + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) + limit = PHYS_OFFSET + PMD_SIZE; + else + limit = PHYS_OFFSET + PUD_SIZE; memblock_set_current_limit(limit); /* map all the memory banks */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 62c6101df260..6682b361d3ac 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -30,12 +30,14 @@ #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +static struct kmem_cache *pgd_cache; + pgd_t *pgd_alloc(struct mm_struct *mm) { if (PGD_SIZE == PAGE_SIZE) return (pgd_t *)get_zeroed_page(GFP_KERNEL); else - return kzalloc(PGD_SIZE, GFP_KERNEL); + return kmem_cache_zalloc(pgd_cache, GFP_KERNEL); } void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -43,5 +45,17 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) if (PGD_SIZE == PAGE_SIZE) free_page((unsigned long)pgd); else - kfree(pgd); + kmem_cache_free(pgd_cache, pgd); +} + +static int __init pgd_cache_init(void) +{ + /* + * Naturally aligned pgds required by the architecture. + */ + if (PGD_SIZE != PAGE_SIZE) + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); + return 0; } +core_initcall(pgd_cache_init); diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 2134f7e6c288..de0a81a539a0 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -144,8 +144,12 @@ /* Data-processing (2 source) */ /* Rd = Rn OP Rm */ -#define A64_UDIV(sf, Rd, Rn, Rm) aarch64_insn_gen_data2(Rd, Rn, Rm, \ - A64_VARIANT(sf), AARCH64_INSN_DATA2_UDIV) +#define A64_DATA2(sf, Rd, Rn, Rm, type) aarch64_insn_gen_data2(Rd, Rn, Rm, \ + A64_VARIANT(sf), AARCH64_INSN_DATA2_##type) +#define A64_UDIV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, UDIV) +#define A64_LSLV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSLV) +#define A64_LSRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSRV) +#define A64_ASRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, ASRV) /* Data-processing (3 source) */ /* Rd = Ra + Rn * Rm */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7ae33545535b..41f1e3e2ea24 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -19,12 +19,13 @@ #define pr_fmt(fmt) "bpf_jit: " fmt #include <linux/filter.h> -#include <linux/moduleloader.h> #include <linux/printk.h> #include <linux/skbuff.h> #include <linux/slab.h> + #include <asm/byteorder.h> #include <asm/cacheflush.h> +#include <asm/debug-monitors.h> #include "bpf_jit.h" @@ -119,6 +120,14 @@ static inline int bpf2a64_offset(int bpf_to, int bpf_from, return to - from; } +static void jit_fill_hole(void *area, unsigned int size) +{ + u32 *ptr; + /* We are guaranteed to have aligned memory. */ + for (ptr = area; size >= sizeof(u32); size -= sizeof(u32)) + *ptr++ = cpu_to_le32(AARCH64_BREAK_FAULT); +} + static inline int epilogue_offset(const struct jit_ctx *ctx) { int to = ctx->offset[ctx->prog->len - 1]; @@ -196,6 +205,12 @@ static void build_epilogue(struct jit_ctx *ctx) emit(A64_RET(A64_LR), ctx); } +/* JITs an eBPF instruction. + * Returns: + * 0 - successfully JITed an 8-byte eBPF instruction. + * >0 - successfully JITed a 16-byte eBPF instruction. + * <0 - failed to JIT. + */ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) { const u8 code = insn->code; @@ -252,6 +267,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit(A64_MUL(is64, tmp, tmp, src), ctx); emit(A64_SUB(is64, dst, dst, tmp), ctx); break; + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU64 | BPF_LSH | BPF_X: + emit(A64_LSLV(is64, dst, dst, src), ctx); + break; + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU64 | BPF_RSH | BPF_X: + emit(A64_LSRV(is64, dst, dst, src), ctx); + break; + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit(A64_ASRV(is64, dst, dst, src), ctx); + break; /* dst = -dst */ case BPF_ALU | BPF_NEG: case BPF_ALU64 | BPF_NEG: @@ -443,6 +470,27 @@ emit_cond_jmp: emit(A64_B(jmp_offset), ctx); break; + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: + { + const struct bpf_insn insn1 = insn[1]; + u64 imm64; + + if (insn1.code != 0 || insn1.src_reg != 0 || + insn1.dst_reg != 0 || insn1.off != 0) { + /* Note: verifier in BPF core must catch invalid + * instructions. + */ + pr_err_once("Invalid BPF_LD_IMM64 instruction\n"); + return -EINVAL; + } + + imm64 = (u64)insn1.imm << 32 | imm; + emit_a64_mov_i64(dst, imm64, ctx); + + return 1; + } + /* LDX: dst = *(size *)(src + off) */ case BPF_LDX | BPF_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_H: @@ -594,6 +642,10 @@ static int build_body(struct jit_ctx *ctx) ctx->offset[i] = ctx->idx; ret = build_insn(insn, ctx); + if (ret > 0) { + i++; + continue; + } if (ret) return ret; } @@ -613,8 +665,10 @@ void bpf_jit_compile(struct bpf_prog *prog) void bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_binary_header *header; struct jit_ctx ctx; int image_size; + u8 *image_ptr; if (!bpf_jit_enable) return; @@ -636,23 +690,25 @@ void bpf_int_jit_compile(struct bpf_prog *prog) goto out; build_prologue(&ctx); - build_epilogue(&ctx); /* Now we know the actual image size. */ image_size = sizeof(u32) * ctx.idx; - ctx.image = module_alloc(image_size); - if (unlikely(ctx.image == NULL)) + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + if (header == NULL) goto out; /* 2. Now, the actual pass. */ + ctx.image = (u32 *)image_ptr; ctx.idx = 0; + build_prologue(&ctx); ctx.body_offset = ctx.idx; if (build_body(&ctx)) { - module_free(NULL, ctx.image); + bpf_jit_binary_free(header); goto out; } @@ -663,17 +719,25 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_dump(prog->len, image_size, 2, ctx.image); bpf_flush_icache(ctx.image, ctx.image + ctx.idx); - prog->bpf_func = (void *)ctx.image; - prog->jited = 1; + set_memory_ro((unsigned long)header, header->pages); + prog->bpf_func = (void *)ctx.image; + prog->jited = true; out: kfree(ctx.offset); } void bpf_jit_free(struct bpf_prog *prog) { - if (prog->jited) - module_free(NULL, prog->bpf_func); + unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; + + if (!prog->jited) + goto free_filter; + + set_memory_rw(addr, header->pages); + bpf_jit_binary_free(header); - kfree(prog); +free_filter: + bpf_prog_unlock_free(prog); } diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 741b99c1a0b1..c52d7540dc05 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -568,6 +568,7 @@ efi_init (void) { const char *unit; unsigned long size; + char buf[64]; md = p; size = md->num_pages << EFI_PAGE_SHIFT; @@ -586,9 +587,10 @@ efi_init (void) unit = "KB"; } - printk("mem%02d: type=%2u, attr=0x%016lx, " + printk("mem%02d: %s " "range=[0x%016lx-0x%016lx) (%4lu%s)\n", - i, md->type, md->attribute, md->phys_addr, + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, md->phys_addr + efi_md_size(md), size, unit); } } diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ad6badb6be71..f43aa536c517 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2066,6 +2066,7 @@ config MIPS_CPS support is unavailable. config MIPS_CPS_PM + depends on MIPS_CPS select MIPS_CPC bool diff --git a/arch/mips/ath79/mach-db120.c b/arch/mips/ath79/mach-db120.c index 4d661a1d2dae..9423f5aed287 100644 --- a/arch/mips/ath79/mach-db120.c +++ b/arch/mips/ath79/mach-db120.c @@ -113,7 +113,7 @@ static void __init db120_pci_init(u8 *eeprom) ath79_register_pci(); } #else -static inline void db120_pci_init(void) {} +static inline void db120_pci_init(u8 *eeprom) {} #endif /* CONFIG_PCI */ static void __init db120_setup(void) diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 38f4c32e2816..5ebdb32d9a2b 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -806,15 +806,6 @@ void __init prom_init(void) #endif } - if (octeon_is_simulation()) { - /* - * The simulator uses a mtdram device pre filled with - * the filesystem. Also specify the calibration delay - * to avoid calculating it every time. - */ - strcat(arcs_cmdline, " rw root=1f00 slram=root,0x40000000,+1073741824"); - } - mips_hpt_frequency = octeon_get_clock_rate(); octeon_init_cvmcount(); diff --git a/arch/mips/include/asm/cop2.h b/arch/mips/include/asm/cop2.h index 51f80bd36fcc..63b3468ede4c 100644 --- a/arch/mips/include/asm/cop2.h +++ b/arch/mips/include/asm/cop2.h @@ -37,15 +37,15 @@ extern void nlm_cop2_restore(struct nlm_cop2_state *); #define cop2_present 1 #define cop2_lazy_restore 1 -#define cop2_save(r) do { (r); } while (0) -#define cop2_restore(r) do { (r); } while (0) +#define cop2_save(r) do { (void)(r); } while (0) +#define cop2_restore(r) do { (void)(r); } while (0) #else #define cop2_present 0 #define cop2_lazy_restore 0 -#define cop2_save(r) do { (r); } while (0) -#define cop2_restore(r) do { (r); } while (0) +#define cop2_save(r) do { (void)(r); } while (0) +#define cop2_restore(r) do { (void)(r); } while (0) #endif enum cu2_ops { diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h index 992aaba603b5..b463f2aa5a61 100644 --- a/arch/mips/include/asm/ftrace.h +++ b/arch/mips/include/asm/ftrace.h @@ -24,7 +24,7 @@ do { \ asm volatile ( \ "1: " load " %[tmp_dst], 0(%[tmp_src])\n" \ " li %[tmp_err], 0\n" \ - "2:\n" \ + "2: .insn\n" \ \ ".section .fixup, \"ax\"\n" \ "3: li %[tmp_err], 1\n" \ @@ -46,7 +46,7 @@ do { \ asm volatile ( \ "1: " store " %[tmp_src], 0(%[tmp_dst])\n"\ " li %[tmp_err], 0\n" \ - "2:\n" \ + "2: .insn\n" \ \ ".section .fixup, \"ax\"\n" \ "3: li %[tmp_err], 1\n" \ diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h index d9f932de80e9..1c967abd545c 100644 --- a/arch/mips/include/asm/idle.h +++ b/arch/mips/include/asm/idle.h @@ -8,19 +8,12 @@ extern void (*cpu_wait)(void); extern void r4k_wait(void); extern asmlinkage void __r4k_wait(void); extern void r4k_wait_irqoff(void); -extern void __pastwait(void); static inline int using_rollback_handler(void) { return cpu_wait == r4k_wait; } -static inline int address_is_in_r4k_wait_irqoff(unsigned long addr) -{ - return addr >= (unsigned long)r4k_wait_irqoff && - addr < (unsigned long)__pastwait; -} - extern int mips_cpuidle_wait_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); diff --git a/arch/mips/include/uapi/asm/ptrace.h b/arch/mips/include/uapi/asm/ptrace.h index bbcfb8ba8106..91a3d197ede3 100644 --- a/arch/mips/include/uapi/asm/ptrace.h +++ b/arch/mips/include/uapi/asm/ptrace.h @@ -9,6 +9,8 @@ #ifndef _UAPI_ASM_PTRACE_H #define _UAPI_ASM_PTRACE_H +#include <linux/types.h> + /* 0 - 31 are integer registers, 32 - 63 are fp registers. */ #define FPR_BASE 32 #define PC 64 diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c index 09ce45980758..0b9082b6b683 100644 --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -68,9 +68,6 @@ void r4k_wait_irqoff(void) " wait \n" " .set pop \n"); local_irq_enable(); - __asm__( - " .globl __pastwait \n" - "__pastwait: \n"); } /* diff --git a/arch/mips/lasat/Kconfig b/arch/mips/lasat/Kconfig index 1d2ee8a9be13..8776d0a34274 100644 --- a/arch/mips/lasat/Kconfig +++ b/arch/mips/lasat/Kconfig @@ -4,7 +4,7 @@ config PICVUE config PICVUE_PROC tristate "PICVUE LCD display driver /proc interface" - depends on PICVUE + depends on PICVUE && PROC_FS config DS1603 bool "DS1603 RTC driver" diff --git a/arch/mips/loongson/lemote-2f/clock.c b/arch/mips/loongson/lemote-2f/clock.c index a217061beee3..462e34d46b4a 100644 --- a/arch/mips/loongson/lemote-2f/clock.c +++ b/arch/mips/loongson/lemote-2f/clock.c @@ -91,6 +91,7 @@ EXPORT_SYMBOL(clk_put); int clk_set_rate(struct clk *clk, unsigned long rate) { + unsigned int rate_khz = rate / 1000; struct cpufreq_frequency_table *pos; int ret = 0; int regval; @@ -107,9 +108,9 @@ int clk_set_rate(struct clk *clk, unsigned long rate) propagate_rate(clk); cpufreq_for_each_valid_entry(pos, loongson2_clockmod_table) - if (rate == pos->frequency) + if (rate_khz == pos->frequency) break; - if (rate != pos->frequency) + if (rate_khz != pos->frequency) return -ENOTSUPP; clk->rate = rate; diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 7a4727795a70..51a0fde4bec1 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -1023,7 +1023,7 @@ emul: goto emul; case cop1x_op: - if (cpu_has_mips_4_5 || cpu_has_mips64) + if (cpu_has_mips_4_5 || cpu_has_mips64 || cpu_has_mips32r2) /* its one of ours */ goto emul; @@ -1068,7 +1068,7 @@ emul: break; case cop1x_op: - if (!cpu_has_mips_4_5 && !cpu_has_mips64) + if (!cpu_has_mips_4_5 && !cpu_has_mips64 && !cpu_has_mips32r2) return SIGILL; sig = fpux_emu(xcp, ctx, ir, fault_addr); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index a08dd53a1cc5..b5f228e7eae6 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -1062,6 +1062,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) struct mips_huge_tlb_info { int huge_pte; int restore_scratch; + bool need_reload_pte; }; static struct mips_huge_tlb_info @@ -1076,6 +1077,7 @@ build_fast_tlb_refill_handler (u32 **p, struct uasm_label **l, rv.huge_pte = scratch; rv.restore_scratch = 0; + rv.need_reload_pte = false; if (check_for_high_segbits) { UASM_i_MFC0(p, tmp, C0_BADVADDR); @@ -1264,6 +1266,7 @@ static void build_r4000_tlb_refill_handler(void) } else { htlb_info.huge_pte = K0; htlb_info.restore_scratch = 0; + htlb_info.need_reload_pte = true; vmalloc_mode = refill_noscratch; /* * create the plain linear handler @@ -1300,7 +1303,8 @@ static void build_r4000_tlb_refill_handler(void) } #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT uasm_l_tlb_huge_update(&l, p); - UASM_i_LW(&p, K0, 0, K1); + if (htlb_info.need_reload_pte) + UASM_i_LW(&p, htlb_info.huge_pte, 0, K1); build_huge_update_entries(&p, htlb_info.huge_pte, K1); build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random, htlb_info.restore_scratch); diff --git a/arch/mips/mti-malta/Makefile b/arch/mips/mti-malta/Makefile index b9510ea8db56..6510ace272d4 100644 --- a/arch/mips/mti-malta/Makefile +++ b/arch/mips/mti-malta/Makefile @@ -5,8 +5,9 @@ # Copyright (C) 2008 Wind River Systems, Inc. # written by Ralf Baechle <ralf@linux-mips.org> # -obj-y := malta-amon.o malta-display.o malta-init.o \ +obj-y := malta-display.o malta-init.o \ malta-int.o malta-memory.o malta-platform.o \ malta-reset.o malta-setup.o malta-time.o +obj-$(CONFIG_MIPS_CMP) += malta-amon.o obj-$(CONFIG_MIPS_MALTA_PM) += malta-pm.o diff --git a/arch/mips/mti-sead3/Makefile b/arch/mips/mti-sead3/Makefile index febf4334545e..2ae49e99eb67 100644 --- a/arch/mips/mti-sead3/Makefile +++ b/arch/mips/mti-sead3/Makefile @@ -14,7 +14,6 @@ obj-y := sead3-lcd.o sead3-display.o sead3-init.o \ sead3-setup.o sead3-time.o obj-y += sead3-i2c-dev.o sead3-i2c.o \ - sead3-pic32-i2c-drv.o sead3-pic32-bus.o \ leds-sead3.o sead3-leds.o obj-$(CONFIG_EARLY_PRINTK) += sead3-console.o diff --git a/arch/mips/mti-sead3/sead3-i2c.c b/arch/mips/mti-sead3/sead3-i2c.c index f70d5fc58ef5..795ae83894e0 100644 --- a/arch/mips/mti-sead3/sead3-i2c.c +++ b/arch/mips/mti-sead3/sead3-i2c.c @@ -5,10 +5,8 @@ * * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. */ -#include <linux/module.h> #include <linux/init.h> #include <linux/platform_device.h> -#include <irq.h> struct resource sead3_i2c_resources[] = { { @@ -30,8 +28,4 @@ static int __init sead3_i2c_init(void) return platform_device_register(&sead3_i2c_device); } -module_init(sead3_i2c_init); - -MODULE_AUTHOR("Chris Dearman <chris@mips.com>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("I2C probe driver for SEAD3"); +device_initcall(sead3_i2c_init); diff --git a/arch/mips/mti-sead3/sead3-pic32-bus.c b/arch/mips/mti-sead3/sead3-pic32-bus.c deleted file mode 100644 index 3b12aa5a7c88..000000000000 --- a/arch/mips/mti-sead3/sead3-pic32-bus.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - */ -#include <linux/delay.h> -#include <linux/kernel.h> -#include <linux/spinlock.h> -#include <linux/io.h> -#include <linux/errno.h> - -#define PIC32_NULL 0x00 -#define PIC32_RD 0x01 -#define PIC32_SYSRD 0x02 -#define PIC32_WR 0x10 -#define PIC32_SYSWR 0x20 -#define PIC32_IRQ_CLR 0x40 -#define PIC32_STATUS 0x80 - -#define DELAY() udelay(100) /* FIXME: needed? */ - -/* spinlock to ensure atomic access to PIC32 */ -static DEFINE_SPINLOCK(pic32_bus_lock); - -/* FIXME: io_remap these */ -static void __iomem *bus_xfer = (void __iomem *)0xbf000600; -static void __iomem *bus_status = (void __iomem *)0xbf000060; - -static inline unsigned int ioready(void) -{ - return readl(bus_status) & 1; -} - -static inline void wait_ioready(void) -{ - do { } while (!ioready()); -} - -static inline void wait_ioclear(void) -{ - do { } while (ioready()); -} - -static inline void check_ioclear(void) -{ - if (ioready()) { - pr_debug("ioclear: initially busy\n"); - do { - (void) readl(bus_xfer); - DELAY(); - } while (ioready()); - pr_debug("ioclear: cleared busy\n"); - } -} - -u32 pic32_bus_readl(u32 reg) -{ - unsigned long flags; - u32 status, val; - - spin_lock_irqsave(&pic32_bus_lock, flags); - - check_ioclear(); - - writel((PIC32_RD << 24) | (reg & 0x00ffffff), bus_xfer); - DELAY(); - wait_ioready(); - status = readl(bus_xfer); - DELAY(); - val = readl(bus_xfer); - wait_ioclear(); - - pr_debug("pic32_bus_readl: *%x -> %x (status=%x)\n", reg, val, status); - - spin_unlock_irqrestore(&pic32_bus_lock, flags); - - return val; -} - -void pic32_bus_writel(u32 val, u32 reg) -{ - unsigned long flags; - u32 status; - - spin_lock_irqsave(&pic32_bus_lock, flags); - - check_ioclear(); - - writel((PIC32_WR << 24) | (reg & 0x00ffffff), bus_xfer); - DELAY(); - writel(val, bus_xfer); - DELAY(); - wait_ioready(); - status = readl(bus_xfer); - wait_ioclear(); - - pr_debug("pic32_bus_writel: *%x <- %x (status=%x)\n", reg, val, status); - - spin_unlock_irqrestore(&pic32_bus_lock, flags); -} diff --git a/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c b/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c deleted file mode 100644 index 80fe194cfa53..000000000000 --- a/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - */ -#include <linux/delay.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/platform_device.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/i2c.h> -#include <linux/slab.h> - -#define PIC32_I2CxCON 0x0000 -#define PIC32_I2CxCONCLR 0x0004 -#define PIC32_I2CxCONSET 0x0008 -#define PIC32_I2CxCONINV 0x000C -#define I2CCON_ON (1<<15) -#define I2CCON_FRZ (1<<14) -#define I2CCON_SIDL (1<<13) -#define I2CCON_SCLREL (1<<12) -#define I2CCON_STRICT (1<<11) -#define I2CCON_A10M (1<<10) -#define I2CCON_DISSLW (1<<9) -#define I2CCON_SMEN (1<<8) -#define I2CCON_GCEN (1<<7) -#define I2CCON_STREN (1<<6) -#define I2CCON_ACKDT (1<<5) -#define I2CCON_ACKEN (1<<4) -#define I2CCON_RCEN (1<<3) -#define I2CCON_PEN (1<<2) -#define I2CCON_RSEN (1<<1) -#define I2CCON_SEN (1<<0) - -#define PIC32_I2CxSTAT 0x0010 -#define PIC32_I2CxSTATCLR 0x0014 -#define PIC32_I2CxSTATSET 0x0018 -#define PIC32_I2CxSTATINV 0x001C -#define I2CSTAT_ACKSTAT (1<<15) -#define I2CSTAT_TRSTAT (1<<14) -#define I2CSTAT_BCL (1<<10) -#define I2CSTAT_GCSTAT (1<<9) -#define I2CSTAT_ADD10 (1<<8) -#define I2CSTAT_IWCOL (1<<7) -#define I2CSTAT_I2COV (1<<6) -#define I2CSTAT_DA (1<<5) -#define I2CSTAT_P (1<<4) -#define I2CSTAT_S (1<<3) -#define I2CSTAT_RW (1<<2) -#define I2CSTAT_RBF (1<<1) -#define I2CSTAT_TBF (1<<0) - -#define PIC32_I2CxADD 0x0020 -#define PIC32_I2CxADDCLR 0x0024 -#define PIC32_I2CxADDSET 0x0028 -#define PIC32_I2CxADDINV 0x002C -#define PIC32_I2CxMSK 0x0030 -#define PIC32_I2CxMSKCLR 0x0034 -#define PIC32_I2CxMSKSET 0x0038 -#define PIC32_I2CxMSKINV 0x003C -#define PIC32_I2CxBRG 0x0040 -#define PIC32_I2CxBRGCLR 0x0044 -#define PIC32_I2CxBRGSET 0x0048 -#define PIC32_I2CxBRGINV 0x004C -#define PIC32_I2CxTRN 0x0050 -#define PIC32_I2CxTRNCLR 0x0054 -#define PIC32_I2CxTRNSET 0x0058 -#define PIC32_I2CxTRNINV 0x005C -#define PIC32_I2CxRCV 0x0060 - -struct i2c_platform_data { - u32 base; - struct i2c_adapter adap; - u32 xfer_timeout; - u32 ack_timeout; - u32 ctl_timeout; -}; - -extern u32 pic32_bus_readl(u32 reg); -extern void pic32_bus_writel(u32 val, u32 reg); - -static inline void -StartI2C(struct i2c_platform_data *adap) -{ - pr_debug("StartI2C\n"); - pic32_bus_writel(I2CCON_SEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -StopI2C(struct i2c_platform_data *adap) -{ - pr_debug("StopI2C\n"); - pic32_bus_writel(I2CCON_PEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -AckI2C(struct i2c_platform_data *adap) -{ - pr_debug("AckI2C\n"); - pic32_bus_writel(I2CCON_ACKDT, adap->base + PIC32_I2CxCONCLR); - pic32_bus_writel(I2CCON_ACKEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -NotAckI2C(struct i2c_platform_data *adap) -{ - pr_debug("NakI2C\n"); - pic32_bus_writel(I2CCON_ACKDT, adap->base + PIC32_I2CxCONSET); - pic32_bus_writel(I2CCON_ACKEN, adap->base + PIC32_I2CxCONSET); -} - -static inline int -IdleI2C(struct i2c_platform_data *adap) -{ - int i; - - pr_debug("IdleI2C\n"); - for (i = 0; i < adap->ctl_timeout; i++) { - if (((pic32_bus_readl(adap->base + PIC32_I2CxCON) & - (I2CCON_ACKEN | I2CCON_RCEN | I2CCON_PEN | I2CCON_RSEN | - I2CCON_SEN)) == 0) && - ((pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & - (I2CSTAT_TRSTAT)) == 0)) - return 0; - udelay(1); - } - return -ETIMEDOUT; -} - -static inline u32 -MasterWriteI2C(struct i2c_platform_data *adap, u32 byte) -{ - pr_debug("MasterWriteI2C\n"); - - pic32_bus_writel(byte, adap->base + PIC32_I2CxTRN); - - return pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & I2CSTAT_IWCOL; -} - -static inline u32 -MasterReadI2C(struct i2c_platform_data *adap) -{ - pr_debug("MasterReadI2C\n"); - - pic32_bus_writel(I2CCON_RCEN, adap->base + PIC32_I2CxCONSET); - - while (pic32_bus_readl(adap->base + PIC32_I2CxCON) & I2CCON_RCEN) - ; - - pic32_bus_writel(I2CSTAT_I2COV, adap->base + PIC32_I2CxSTATCLR); - - return pic32_bus_readl(adap->base + PIC32_I2CxRCV); -} - -static int -do_address(struct i2c_platform_data *adap, unsigned int addr, int rd) -{ - pr_debug("doaddress\n"); - - IdleI2C(adap); - StartI2C(adap); - IdleI2C(adap); - - addr <<= 1; - if (rd) - addr |= 1; - - if (MasterWriteI2C(adap, addr)) - return -EIO; - IdleI2C(adap); - if (pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & I2CSTAT_ACKSTAT) - return -EIO; - return 0; -} - -static int -i2c_read(struct i2c_platform_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - u32 data; - - pr_debug("i2c_read\n"); - - i = 0; - while (i < len) { - data = MasterReadI2C(adap); - buf[i++] = data; - if (i < len) - AckI2C(adap); - else - NotAckI2C(adap); - } - - StopI2C(adap); - IdleI2C(adap); - return 0; -} - -static int -i2c_write(struct i2c_platform_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - u32 data; - - pr_debug("i2c_write\n"); - - i = 0; - while (i < len) { - data = buf[i]; - if (MasterWriteI2C(adap, data)) - return -EIO; - IdleI2C(adap); - if (pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & - I2CSTAT_ACKSTAT) - return -EIO; - i++; - } - - StopI2C(adap); - IdleI2C(adap); - return 0; -} - -static int -platform_xfer(struct i2c_adapter *i2c_adap, struct i2c_msg *msgs, int num) -{ - struct i2c_platform_data *adap = i2c_adap->algo_data; - struct i2c_msg *p; - int i, err = 0; - - pr_debug("platform_xfer\n"); - for (i = 0; i < num; i++) { -#define __BUFSIZE 80 - int ii; - static char buf[__BUFSIZE]; - char *b = buf; - - p = &msgs[i]; - b += sprintf(buf, " [%d bytes]", p->len); - if ((p->flags & I2C_M_RD) == 0) { - for (ii = 0; ii < p->len; ii++) { - if (b < &buf[__BUFSIZE-4]) { - b += sprintf(b, " %02x", p->buf[ii]); - } else { - strcat(b, "..."); - break; - } - } - } - pr_debug("xfer%d: DevAddr: %04x Op:%s Data:%s\n", i, p->addr, - (p->flags & I2C_M_RD) ? "Rd" : "Wr", buf); - } - - - for (i = 0; !err && i < num; i++) { - p = &msgs[i]; - err = do_address(adap, p->addr, p->flags & I2C_M_RD); - if (err || !p->len) - continue; - if (p->flags & I2C_M_RD) - err = i2c_read(adap, p->buf, p->len); - else - err = i2c_write(adap, p->buf, p->len); - } - - /* Return the number of messages processed, or the error code. */ - if (err == 0) - err = num; - - return err; -} - -static u32 -platform_func(struct i2c_adapter *adap) -{ - pr_debug("platform_algo\n"); - return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; -} - -static const struct i2c_algorithm platform_algo = { - .master_xfer = platform_xfer, - .functionality = platform_func, -}; - -static void i2c_platform_setup(struct i2c_platform_data *priv) -{ - pr_debug("i2c_platform_setup\n"); - - pic32_bus_writel(500, priv->base + PIC32_I2CxBRG); - pic32_bus_writel(I2CCON_ON, priv->base + PIC32_I2CxCONCLR); - pic32_bus_writel(I2CCON_ON, priv->base + PIC32_I2CxCONSET); - pic32_bus_writel((I2CSTAT_BCL | I2CSTAT_IWCOL), - (priv->base + PIC32_I2CxSTATCLR)); -} - -static void i2c_platform_disable(struct i2c_platform_data *priv) -{ - pr_debug("i2c_platform_disable\n"); -} - -static int i2c_platform_probe(struct platform_device *pdev) -{ - struct i2c_platform_data *priv; - struct resource *r; - int ret; - - pr_debug("i2c_platform_probe\n"); - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!r) - return -ENODEV; - - priv = devm_kzalloc(&pdev->dev, sizeof(struct i2c_platform_data), - GFP_KERNEL); - if (!priv) - return -ENOMEM; - - /* FIXME: need to allocate resource in PIC32 space */ -#if 0 - priv->base = bus_request_region(r->start, resource_size(r), - pdev->name); -#else - priv->base = r->start; -#endif - if (!priv->base) - return -EBUSY; - - priv->xfer_timeout = 200; - priv->ack_timeout = 200; - priv->ctl_timeout = 200; - - priv->adap.nr = pdev->id; - priv->adap.algo = &platform_algo; - priv->adap.algo_data = priv; - priv->adap.dev.parent = &pdev->dev; - strlcpy(priv->adap.name, "PIC32 I2C", sizeof(priv->adap.name)); - - i2c_platform_setup(priv); - - ret = i2c_add_numbered_adapter(&priv->adap); - if (ret) { - i2c_platform_disable(priv); - return ret; - } - - platform_set_drvdata(pdev, priv); - return 0; -} - -static int i2c_platform_remove(struct platform_device *pdev) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - pr_debug("i2c_platform_remove\n"); - platform_set_drvdata(pdev, NULL); - i2c_del_adapter(&priv->adap); - i2c_platform_disable(priv); - return 0; -} - -#ifdef CONFIG_PM -static int -i2c_platform_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - dev_dbg(&pdev->dev, "i2c_platform_disable\n"); - i2c_platform_disable(priv); - - return 0; -} - -static int -i2c_platform_resume(struct platform_device *pdev) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - dev_dbg(&pdev->dev, "i2c_platform_setup\n"); - i2c_platform_setup(priv); - - return 0; -} -#else -#define i2c_platform_suspend NULL -#define i2c_platform_resume NULL -#endif - -static struct platform_driver i2c_platform_driver = { - .driver = { - .name = "i2c_pic32", - .owner = THIS_MODULE, - }, - .probe = i2c_platform_probe, - .remove = i2c_platform_remove, - .suspend = i2c_platform_suspend, - .resume = i2c_platform_resume, -}; - -static int __init -i2c_platform_init(void) -{ - pr_debug("i2c_platform_init\n"); - return platform_driver_register(&i2c_platform_driver); -} - -static void __exit -i2c_platform_exit(void) -{ - pr_debug("i2c_platform_exit\n"); - platform_driver_unregister(&i2c_platform_driver); -} - -MODULE_AUTHOR("Chris Dearman, MIPS Technologies INC."); -MODULE_DESCRIPTION("PIC32 I2C driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_platform_init); -module_exit(i2c_platform_exit); diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 37fe8e7887e2..d3ed15b2b2d1 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -215,17 +215,12 @@ static int ltq_pci_probe(struct platform_device *pdev) pci_clear_flags(PCI_PROBE_ONLY); - res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); res_bridge = platform_get_resource(pdev, IORESOURCE_MEM, 1); - if (!res_cfg || !res_bridge) { - dev_err(&pdev->dev, "missing memory resources\n"); - return -EINVAL; - } - ltq_pci_membase = devm_ioremap_resource(&pdev->dev, res_bridge); if (IS_ERR(ltq_pci_membase)) return PTR_ERR(ltq_pci_membase); + res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); ltq_pci_mapped_cfg = devm_ioremap_resource(&pdev->dev, res_cfg); if (IS_ERR(ltq_pci_mapped_cfg)) return PTR_ERR(ltq_pci_mapped_cfg); diff --git a/arch/mips/pmcs-msp71xx/msp_irq.c b/arch/mips/pmcs-msp71xx/msp_irq.c index f914c753de21..8d53d7a2ed45 100644 --- a/arch/mips/pmcs-msp71xx/msp_irq.c +++ b/arch/mips/pmcs-msp71xx/msp_irq.c @@ -16,6 +16,7 @@ #include <linux/time.h> #include <asm/irq_cpu.h> +#include <asm/setup.h> #include <msp_int.h> diff --git a/arch/mips/pmcs-msp71xx/msp_irq_cic.c b/arch/mips/pmcs-msp71xx/msp_irq_cic.c index b8df2f7b3328..1207ec4dfb77 100644 --- a/arch/mips/pmcs-msp71xx/msp_irq_cic.c +++ b/arch/mips/pmcs-msp71xx/msp_irq_cic.c @@ -131,11 +131,11 @@ static int msp_cic_irq_set_affinity(struct irq_data *d, int cpu; unsigned long flags; unsigned int mtflags; - unsigned long imask = (1 << (irq - MSP_CIC_INTBASE)); + unsigned long imask = (1 << (d->irq - MSP_CIC_INTBASE)); volatile u32 *cic_mask = (volatile u32 *)CIC_VPE0_MSK_REG; /* timer balancing should be disabled in kernel code */ - BUG_ON(irq == MSP_INT_VPE0_TIMER || irq == MSP_INT_VPE1_TIMER); + BUG_ON(d->irq == MSP_INT_VPE0_TIMER || d->irq == MSP_INT_VPE1_TIMER); LOCK_CORE(flags, mtflags); /* enable if any of each VPE's TCs require this IRQ */ diff --git a/arch/mips/sibyte/Makefile b/arch/mips/sibyte/Makefile index c8ed2c807e69..455c40d6d625 100644 --- a/arch/mips/sibyte/Makefile +++ b/arch/mips/sibyte/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SIBYTE_RHONE) += swarm/ obj-$(CONFIG_SIBYTE_SENTOSA) += swarm/ obj-$(CONFIG_SIBYTE_SWARM) += swarm/ obj-$(CONFIG_SIBYTE_BIGSUR) += swarm/ +obj-$(CONFIG_SIBYTE_LITTLESUR) += swarm/ diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig index 63392f4b29a4..d2008887eb8c 100644 --- a/arch/powerpc/configs/pseries_le_defconfig +++ b/arch/powerpc/configs/pseries_le_defconfig @@ -48,7 +48,6 @@ CONFIG_KEXEC=y CONFIG_IRQ_ALL_CPUS=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y -CONFIG_CMA=y CONFIG_PPC_64K_PAGES=y CONFIG_PPC_SUBPAGE_PROT=y CONFIG_SCHED_SMT=y @@ -138,6 +137,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL_TRAP=y CONFIG_TUN=m CONFIG_VIRTIO_NET=m +CONFIG_VHOST_NET=m CONFIG_VORTEX=y CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y @@ -303,4 +303,9 @@ CONFIG_CRYPTO_LZO=m # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m +CONFIG_VIRTUALIZATION=y +CONFIG_KVM_BOOK3S_64=m +CONFIG_KVM_BOOK3S_64_HV=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 3b260efbfbf9..ca07f9c27335 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -71,9 +71,10 @@ struct device_node; #define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */ #define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */ -#define EEH_PE_RESET (1 << 2) /* PE reset in progress */ +#define EEH_PE_CFG_BLOCKED (1 << 2) /* Block config access */ #define EEH_PE_KEEP (1 << 8) /* Keep PE on hotplug */ +#define EEH_PE_CFG_RESTRICTED (1 << 9) /* Block config on error */ struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index 0bb23725b1e7..8bf1b6351716 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -34,7 +34,7 @@ do { \ (regs)->result = 0; \ (regs)->nip = __ip; \ - (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + (regs)->gpr[1] = current_stack_pointer(); \ asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ } while (0) #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index fe3f9488f321..c998279bd85b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1265,8 +1265,7 @@ static inline unsigned long mfvtb (void) #define proc_trap() asm volatile("trap") -#define __get_SP() ({unsigned long sp; \ - asm volatile("mr %0,1": "=r" (sp)); sp;}) +extern unsigned long current_stack_pointer(void); extern unsigned long scom970_read(unsigned int address); extern void scom970_write(unsigned int address, unsigned long value); diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 6fa2708da153..6240698fee9a 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -19,7 +19,7 @@ /* ftrace syscalls requires exporting the sys_call_table */ #ifdef CONFIG_FTRACE_SYSCALLS -extern const unsigned long *sys_call_table; +extern const unsigned long sys_call_table[]; #endif /* CONFIG_FTRACE_SYSCALLS */ static inline long syscall_get_nr(struct task_struct *task, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index adac9dc54aee..484b2d4462c1 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -53,9 +53,16 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size, #else struct page *page; int node = dev_to_node(dev); +#ifdef CONFIG_FSL_SOC u64 pfn = get_pfn_limit(dev); int zone; + /* + * This code should be OK on other platforms, but we have drivers that + * don't set coherent_dma_mask. As a workaround we just ifdef it. This + * whole routine needs some serious cleanup. + */ + zone = dma_pfn_limit_to_zone(pfn); if (zone < 0) { dev_err(dev, "%s: No suitable zone for pfn %#llx\n", @@ -73,6 +80,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size, break; #endif }; +#endif /* CONFIG_FSL_SOC */ /* ignore region specifiers */ flag &= ~(__GFP_HIGHMEM); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index d543e4179c18..2248a1999c64 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -257,6 +257,13 @@ static void *eeh_dump_pe_log(void *data, void *flag) struct eeh_dev *edev, *tmp; size_t *plen = flag; + /* If the PE's config space is blocked, 0xFF's will be + * returned. It's pointless to collect the log in this + * case. + */ + if (pe->state & EEH_PE_CFG_BLOCKED) + return NULL; + eeh_pe_for_each_dev(pe, edev, tmp) *plen += eeh_dump_dev_log(edev, pci_regs_buf + *plen, EEH_PCI_REGS_LOG_LEN - *plen); @@ -673,18 +680,18 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat switch (state) { case pcie_deassert_reset: eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); break; case pcie_hot_reset: - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); return -EINVAL; }; @@ -1523,7 +1530,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); if (ret) break; @@ -1538,7 +1545,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) */ eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); ret = eeh_ops->reset(pe, option); break; default: diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3fd514f8e4b2..6535936bdf27 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -528,13 +528,13 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_report_error, &result); /* Issue reset */ - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); ret = eeh_reset_pe(pe); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING | EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING | EEH_PE_CFG_BLOCKED); return ret; } - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); /* Unfreeze the PE */ ret = eeh_clear_pe_frozen_state(pe, true); @@ -601,10 +601,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); rc = eeh_reset_pe(pe); if (rc) { - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); return rc; } @@ -613,7 +613,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) /* Restore PE */ eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); /* Clear frozen state */ rc = eeh_clear_pe_frozen_state(pe, false); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 53dd0915e690..5a63e2b0f65b 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -525,7 +525,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag) pe->state |= state; /* Offline PCI devices if applicable */ - if (state != EEH_PE_ISOLATED) + if (!(state & EEH_PE_ISOLATED)) return NULL; eeh_pe_for_each_dev(pe, edev, tmp) { @@ -534,6 +534,10 @@ static void *__eeh_pe_state_mark(void *data, void *flag) pdev->error_state = pci_channel_io_frozen; } + /* Block PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state |= EEH_PE_CFG_BLOCKED; + return NULL; } @@ -611,6 +615,10 @@ static void *__eeh_pe_state_clear(void *data, void *flag) pdev->error_state = pci_channel_io_normal; } + /* Unblock PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state &= ~EEH_PE_CFG_BLOCKED; + return NULL; } diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 050f79a4a168..72e783ea0681 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1270,11 +1270,6 @@ hmi_exception_early: addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode /* Windup the stack. */ - /* Clear MSR_RI before setting SRR0 and SRR1. */ - li r0,MSR_RI - mfmsr r9 /* get MSR value */ - andc r9,r9,r0 - mtmsrd r9,1 /* Clear MSR_RI */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) mtspr SPRN_HSRR1,r9 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8eb857f216c1..c14383575fe8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -466,7 +466,7 @@ static inline void check_stack_overflow(void) #ifdef CONFIG_DEBUG_STACKOVERFLOW long sp; - sp = __get_SP() & (THREAD_SIZE-1); + sp = current_stack_pointer() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? */ if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 7ce26d45777e..0d432194c018 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -114,3 +114,7 @@ _GLOBAL(longjmp) mtlr r0 mr r3,r4 blr + +_GLOBAL(current_stack_pointer) + PPC_LL r3,0(r1) + blr diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index c4dfff6c2719..202963ee013a 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -41,3 +41,5 @@ EXPORT_SYMBOL(giveup_spe); #ifdef CONFIG_EPAPR_PARAVIRT EXPORT_SYMBOL(epapr_hypercall_start); #endif + +EXPORT_SYMBOL(current_stack_pointer); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index aa1df89c8b2a..923cd2daba89 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1545,7 +1545,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) tsk = current; if (sp == 0) { if (tsk == current) - asm("mr %0,1" : "=r" (sp)); + sp = current_stack_pointer(); else sp = tsk->thread.ksp; } diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index c168337aef9d..7c55b86206b3 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -66,6 +66,11 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) return PCIBIOS_DEVICE_NOT_FOUND; if (!config_access_valid(pdn, where)) return PCIBIOS_BAD_REGISTER_NUMBER; +#ifdef CONFIG_EEH + if (pdn->edev && pdn->edev->pe && + (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED)) + return PCIBIOS_SET_FAILED; +#endif addr = rtas_config_addr(pdn->busno, pdn->devfn, where); buid = pdn->phb->buid; @@ -90,9 +95,6 @@ static int rtas_pci_read_config(struct pci_bus *bus, struct device_node *busdn, *dn; struct pci_dn *pdn; bool found = false; -#ifdef CONFIG_EEH - struct eeh_dev *edev; -#endif int ret; /* Search only direct children of the bus */ @@ -109,11 +111,6 @@ static int rtas_pci_read_config(struct pci_bus *bus, if (!found) return PCIBIOS_DEVICE_NOT_FOUND; -#ifdef CONFIG_EEH - edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && edev->pe->state & EEH_PE_RESET) - return PCIBIOS_DEVICE_NOT_FOUND; -#endif ret = rtas_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && @@ -132,6 +129,11 @@ int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) return PCIBIOS_DEVICE_NOT_FOUND; if (!config_access_valid(pdn, where)) return PCIBIOS_BAD_REGISTER_NUMBER; +#ifdef CONFIG_EEH + if (pdn->edev && pdn->edev->pe && + (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED)) + return PCIBIOS_SET_FAILED; +#endif addr = rtas_config_addr(pdn->busno, pdn->devfn, where); buid = pdn->phb->buid; @@ -155,10 +157,6 @@ static int rtas_pci_write_config(struct pci_bus *bus, struct device_node *busdn, *dn; struct pci_dn *pdn; bool found = false; -#ifdef CONFIG_EEH - struct eeh_dev *edev; -#endif - int ret; /* Search only direct children of the bus */ busdn = pci_bus_to_OF_node(bus); @@ -173,14 +171,8 @@ static int rtas_pci_write_config(struct pci_bus *bus, if (!found) return PCIBIOS_DEVICE_NOT_FOUND; -#ifdef CONFIG_EEH - edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && (edev->pe->state & EEH_PE_RESET)) - return PCIBIOS_DEVICE_NOT_FOUND; -#endif - ret = rtas_write_config(pdn, where, size, val); - return ret; + return rtas_write_config(pdn, where, size, val); } static struct pci_ops rtas_pci_ops = { diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index cd07d79ad21c..4f3cfe1b6a33 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -522,36 +522,36 @@ void __init setup_system(void) smp_release_cpus(); #endif - printk("Starting Linux PPC64 %s\n", init_utsname()->version); + pr_info("Starting Linux PPC64 %s\n", init_utsname()->version); - printk("-----------------------------------------------------\n"); - printk("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); - printk("phys_mem_size = 0x%llx\n", memblock_phys_mem_size()); + pr_info("-----------------------------------------------------\n"); + pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + pr_info("phys_mem_size = 0x%llx\n", memblock_phys_mem_size()); if (ppc64_caches.dline_size != 0x80) - printk("dcache_line_size = 0x%x\n", ppc64_caches.dline_size); + pr_info("dcache_line_size = 0x%x\n", ppc64_caches.dline_size); if (ppc64_caches.iline_size != 0x80) - printk("icache_line_size = 0x%x\n", ppc64_caches.iline_size); + pr_info("icache_line_size = 0x%x\n", ppc64_caches.iline_size); - printk("cpu_features = 0x%016lx\n", cur_cpu_spec->cpu_features); - printk(" possible = 0x%016lx\n", CPU_FTRS_POSSIBLE); - printk(" always = 0x%016lx\n", CPU_FTRS_ALWAYS); - printk("cpu_user_features = 0x%08x 0x%08x\n", cur_cpu_spec->cpu_user_features, + pr_info("cpu_features = 0x%016lx\n", cur_cpu_spec->cpu_features); + pr_info(" possible = 0x%016lx\n", CPU_FTRS_POSSIBLE); + pr_info(" always = 0x%016lx\n", CPU_FTRS_ALWAYS); + pr_info("cpu_user_features = 0x%08x 0x%08x\n", cur_cpu_spec->cpu_user_features, cur_cpu_spec->cpu_user_features2); - printk("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); - printk("firmware_features = 0x%016lx\n", powerpc_firmware_features); + pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); + pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #ifdef CONFIG_PPC_STD_MMU_64 if (htab_address) - printk("htab_address = 0x%p\n", htab_address); + pr_info("htab_address = 0x%p\n", htab_address); - printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); + pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); #endif if (PHYSICAL_START > 0) - printk("physical_start = 0x%llx\n", + pr_info("physical_start = 0x%llx\n", (unsigned long long)PHYSICAL_START); - printk("-----------------------------------------------------\n"); + pr_info("-----------------------------------------------------\n"); DBG(" <- setup_system()\n"); } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 3d30ef1038e5..ea43a347a104 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -50,7 +50,7 @@ void save_stack_trace(struct stack_trace *trace) { unsigned long sp; - asm("mr %0,1" : "=r" (sp)); + sp = current_stack_pointer(); save_context_stack(trace, sp, current, 1); } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 649666d5d1c2..e5236c24dc07 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -8,6 +8,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "numa: " fmt + #include <linux/threads.h> #include <linux/bootmem.h> #include <linux/init.h> @@ -1153,6 +1155,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); +static bool topology_updates_enabled = true; + +static int __init early_topology_updates(char *p) +{ + if (!p) + return 0; + + if (!strcmp(p, "off")) { + pr_info("Disabling topology updates\n"); + topology_updates_enabled = false; + } + + return 0; +} +early_param("topology_updates", early_topology_updates); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Find the node associated with a hot added memory section for @@ -1442,8 +1460,11 @@ static long hcall_vphn(unsigned long cpu, __be32 *associativity) long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; u64 flags = 1; int hwcpu = get_hard_smp_processor_id(cpu); + int i; rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + for (i = 0; i < 6; i++) + retbuf[i] = cpu_to_be64(retbuf[i]); vphn_unpack_associativity(retbuf, associativity); return rc; @@ -1539,6 +1560,9 @@ int arch_update_cpu_topology(void) struct device *dev; int weight, new_nid, i = 0; + if (!prrn_enabled && !vphn_enabled) + return 0; + weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) return 0; @@ -1592,6 +1616,15 @@ int arch_update_cpu_topology(void) cpu = cpu_last_thread_sibling(cpu); } + pr_debug("Topology update for the following CPUs:\n"); + if (cpumask_weight(&updated_cpus)) { + for (ud = &updates[0]; ud; ud = ud->next) { + pr_debug("cpu %d moving from node %d " + "to %d\n", ud->cpu, + ud->old_nid, ud->new_nid); + } + } + /* * In cases where we have nothing to update (because the updates list * is too short or because the new topology is same as the old one), @@ -1800,8 +1833,12 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - start_topology_update(); - proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops); + /* Do not poll for changes if disabled at boot */ + if (topology_updates_enabled) + start_topology_update(); + + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) + return -ENOMEM; return 0; } diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 426814a2ede3..eba9cb10619c 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -373,7 +373,7 @@ static int ioda_eeh_get_pe_state(struct eeh_pe *pe) * moving forward, we have to return operational * state during PE reset. */ - if (pe->state & EEH_PE_RESET) { + if (pe->state & EEH_PE_CFG_BLOCKED) { result = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE | EEH_STATE_MMIO_ENABLED | diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3e89cbf55885..1d19e7917d7f 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -169,6 +169,26 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) } /* + * If the PE contains any one of following adapters, the + * PCI config space can't be accessed when dumping EEH log. + * Otherwise, we will run into fenced PHB caused by shortage + * of outbound credits in the adapter. The PCI config access + * should be blocked until PE reset. MMIO access is dropped + * by hardware certainly. In order to drop PCI config requests, + * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which + * will be checked in the backend for PE state retrival. If + * the PE becomes frozen for the first time and the flag has + * been set for the PE, we will set EEH_PE_CFG_BLOCKED for + * that PE to block its config space. + * + * Broadcom Austin 4-ports NICs (14e4:1657) + * Broadcom Shiner 2-ports 10G NICs (14e4:168e) + */ + if ((dev->vendor == PCI_VENDOR_ID_BROADCOM && dev->device == 0x1657) || + (dev->vendor == PCI_VENDOR_ID_BROADCOM && dev->device == 0x168e)) + edev->pe->state |= EEH_PE_CFG_RESTRICTED; + + /* * Cache the PE primary bus, which can't be fetched when * full hotplug is in progress. In that case, all child * PCI devices of the PE are expected to be removed prior @@ -383,6 +403,39 @@ static int powernv_eeh_err_inject(struct eeh_pe *pe, int type, int func, return ret; } +static inline bool powernv_eeh_cfg_blocked(struct device_node *dn) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + + if (!edev || !edev->pe) + return false; + + if (edev->pe->state & EEH_PE_CFG_BLOCKED) + return true; + + return false; +} + +static int powernv_eeh_read_config(struct device_node *dn, + int where, int size, u32 *val) +{ + if (powernv_eeh_cfg_blocked(dn)) { + *val = 0xFFFFFFFF; + return PCIBIOS_SET_FAILED; + } + + return pnv_pci_cfg_read(dn, where, size, val); +} + +static int powernv_eeh_write_config(struct device_node *dn, + int where, int size, u32 val) +{ + if (powernv_eeh_cfg_blocked(dn)) + return PCIBIOS_SET_FAILED; + + return pnv_pci_cfg_write(dn, where, size, val); +} + /** * powernv_eeh_next_error - Retrieve next EEH error to handle * @pe: Affected PE @@ -440,8 +493,8 @@ static struct eeh_ops powernv_eeh_ops = { .get_log = powernv_eeh_get_log, .configure_bridge = powernv_eeh_configure_bridge, .err_inject = powernv_eeh_err_inject, - .read_config = pnv_pci_cfg_read, - .write_config = pnv_pci_cfg_write, + .read_config = powernv_eeh_read_config, + .write_config = powernv_eeh_write_config, .next_error = powernv_eeh_next_error, .restore_config = powernv_eeh_restore_config }; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index b642b0562f5a..d019b081df9d 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -194,6 +194,27 @@ static int __init opal_register_exception_handlers(void) * fwnmi area at 0x7000 to provide the glue space to OPAL */ glue = 0x7000; + + /* + * Check if we are running on newer firmware that exports + * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch + * the HMI interrupt and we catch it directly in Linux. + * + * For older firmware (i.e currently released POWER8 System Firmware + * as of today <= SV810_087), we fallback to old behavior and let OPAL + * patch the HMI vector and handle it inside OPAL firmware. + * + * For newer firmware (in development/yet to be released) we will + * start catching/handling HMI directly in Linux. + */ + if (!opal_check_token(OPAL_HANDLE_HMI)) { + pr_info("opal: Old firmware detected, OPAL handles HMIs.\n"); + opal_register_exception_handler( + OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + } + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); #endif diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b3ca77ddf36d..b2187d0068b8 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -505,7 +505,7 @@ static bool pnv_pci_cfg_check(struct pci_controller *hose, edev = of_node_to_eeh_dev(dn); if (edev) { if (edev->pe && - (edev->pe->state & EEH_PE_RESET)) + (edev->pe->state & EEH_PE_CFG_BLOCKED)) return false; if (edev->mode & EEH_DEV_REMOVED) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index fdf01b660d59..6ad83bd11fe2 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -25,11 +25,11 @@ #include <asm/rtas.h> struct cc_workarea { - u32 drc_index; - u32 zero; - u32 name_offset; - u32 prop_length; - u32 prop_offset; + __be32 drc_index; + __be32 zero; + __be32 name_offset; + __be32 prop_length; + __be32 prop_offset; }; void dlpar_free_cc_property(struct property *prop) @@ -49,11 +49,11 @@ static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) if (!prop) return NULL; - name = (char *)ccwa + ccwa->name_offset; + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset); prop->name = kstrdup(name, GFP_KERNEL); - prop->length = ccwa->prop_length; - value = (char *)ccwa + ccwa->prop_offset; + prop->length = be32_to_cpu(ccwa->prop_length); + value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset); prop->value = kmemdup(value, prop->length, GFP_KERNEL); if (!prop->value) { dlpar_free_cc_property(prop); @@ -79,7 +79,7 @@ static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, if (!dn) return NULL; - name = (char *)ccwa + ccwa->name_offset; + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset); dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name); if (!dn->full_name) { kfree(dn); @@ -126,7 +126,7 @@ void dlpar_free_cc_nodes(struct device_node *dn) #define CALL_AGAIN -2 #define ERR_CFG_USE -9003 -struct device_node *dlpar_configure_connector(u32 drc_index, +struct device_node *dlpar_configure_connector(__be32 drc_index, struct device_node *parent) { struct device_node *dn; @@ -414,7 +414,7 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count) if (!parent) return -ENODEV; - dn = dlpar_configure_connector(drc_index, parent); + dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); if (!dn) return -EINVAL; diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index b174fa751d26..5c375f93c669 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -247,7 +247,7 @@ static int pseries_add_processor(struct device_node *np) unsigned int cpu; cpumask_var_t candidate_mask, tmp; int err = -ENOSPC, len, nthreads, i; - const u32 *intserv; + const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) @@ -293,7 +293,7 @@ static int pseries_add_processor(struct device_node *np) for_each_cpu(cpu, tmp) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); - set_hard_smp_processor_id(cpu, *intserv++); + set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } err = 0; out_unlock: diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index de1ec54a2a57..e32e00976a94 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -30,7 +30,6 @@ #include <linux/mm.h> #include <linux/memblock.h> #include <linux/spinlock.h> -#include <linux/sched.h> /* for show_stack */ #include <linux/string.h> #include <linux/pci.h> #include <linux/dma-mapping.h> @@ -168,7 +167,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); printk("\ttce val = 0x%llx\n", tce ); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } tcenum++; @@ -257,7 +256,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\tnpages = 0x%llx\n", (u64)npages); printk("\ttce[0] val = 0x%llx\n", tcep[0]); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } return ret; } @@ -273,7 +272,7 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } tcenum++; @@ -292,7 +291,7 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n printk("\trc = %lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\tnpages = 0x%llx\n", (u64)npages); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } } @@ -307,7 +306,7 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } return tce_ret; diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 361add62abf1..1796c5438cc6 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -56,7 +56,8 @@ extern void hvc_vio_init_early(void); /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); -extern struct device_node *dlpar_configure_connector(u32, struct device_node *); +extern struct device_node *dlpar_configure_connector(__be32, + struct device_node *); extern int dlpar_attach_node(struct device_node *); extern int dlpar_detach_node(struct device_node *); diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 0c75214b6f92..73b64c73505b 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -145,59 +145,64 @@ void msi_bitmap_free(struct msi_bitmap *bmp) #ifdef CONFIG_MSI_BITMAP_SELFTEST -#define check(x) \ - if (!(x)) printk("msi_bitmap: test failed at line %d\n", __LINE__); - static void __init test_basics(void) { struct msi_bitmap bmp; - int i, size = 512; + int rc, i, size = 512; /* Can't allocate a bitmap of 0 irqs */ - check(msi_bitmap_alloc(&bmp, 0, NULL) != 0); + WARN_ON(msi_bitmap_alloc(&bmp, 0, NULL) == 0); /* of_node may be NULL */ - check(0 == msi_bitmap_alloc(&bmp, size, NULL)); + WARN_ON(msi_bitmap_alloc(&bmp, size, NULL)); /* Should all be free by default */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* With no node, there's no msi-available-ranges, so expect > 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) > 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp) <= 0); /* Should all still be free */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* Check we can fill it up and then no more */ for (i = 0; i < size; i++) - check(msi_bitmap_alloc_hwirqs(&bmp, 1) >= 0); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) < 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 1) < 0); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) >= 0); /* Should all be allocated */ - check(bitmap_find_free_region(bmp.bitmap, size, 0) < 0); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, 0) >= 0); /* And if we free one we can then allocate another */ msi_bitmap_free_hwirqs(&bmp, size / 2, 1); - check(msi_bitmap_alloc_hwirqs(&bmp, 1) == size / 2); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) != size / 2); + + /* Free most of them for the alignment tests */ + msi_bitmap_free_hwirqs(&bmp, 3, size - 3); /* Check we get a naturally aligned offset */ - check(msi_bitmap_alloc_hwirqs(&bmp, 2) % 2 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 4) % 4 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 8) % 8 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 9) % 16 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 3) % 4 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 7) % 8 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 121) % 128 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 2); + WARN_ON(rc < 0 && rc % 2 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 4); + WARN_ON(rc < 0 && rc % 4 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 8); + WARN_ON(rc < 0 && rc % 8 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 9); + WARN_ON(rc < 0 && rc % 16 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 3); + WARN_ON(rc < 0 && rc % 4 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 7); + WARN_ON(rc < 0 && rc % 8 != 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 121); + WARN_ON(rc < 0 && rc % 128 != 0); msi_bitmap_free(&bmp); - /* Clients may check bitmap == NULL for "not-allocated" */ - check(bmp.bitmap == NULL); + /* Clients may WARN_ON bitmap == NULL for "not-allocated" */ + WARN_ON(bmp.bitmap != NULL); kfree(bmp.bitmap); } @@ -219,14 +224,13 @@ static void __init test_of_node(void) of_node_init(&of_node); of_node.full_name = node_name; - check(0 == msi_bitmap_alloc(&bmp, size, &of_node)); + WARN_ON(msi_bitmap_alloc(&bmp, size, &of_node)); /* No msi-available-ranges, so expect > 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) > 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp) <= 0); /* Should all still be free */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* Now create a fake msi-available-ranges property */ @@ -240,11 +244,11 @@ static void __init test_of_node(void) of_node.properties = ∝ /* msi-available-ranges, so expect == 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) == 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp)); /* Check we got the expected result */ - check(0 == bitmap_parselist(expected_str, expected, size)); - check(bitmap_equal(expected, bmp.bitmap, size)); + WARN_ON(bitmap_parselist(expected_str, expected, size)); + WARN_ON(!bitmap_equal(expected, bmp.bitmap, size)); msi_bitmap_free(&bmp); kfree(bmp.bitmap); diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 940ac49198db..4197c89c52d4 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -286,7 +286,8 @@ #define __NR_seccomp 348 #define __NR_getrandom 349 #define __NR_memfd_create 350 -#define NR_syscalls 351 +#define __NR_bpf 351 +#define NR_syscalls 352 /* * There are some system calls that are not present on 64 bit, some diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index faf6caa510dc..c4f7a3d655b8 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -217,3 +217,4 @@ COMPAT_SYSCALL_WRAP5(renameat2, int, olddfd, const char __user *, oldname, int, COMPAT_SYSCALL_WRAP3(seccomp, unsigned int, op, unsigned int, flags, const char __user *, uargs) COMPAT_SYSCALL_WRAP3(getrandom, char __user *, buf, size_t, count, unsigned int, flags) COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, flags) +COMPAT_SYSCALL_WRAP3(bpf, int, cmd, union bpf_attr *, attr, unsigned int, size); diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 6fe886ac2db5..9f7087fd58de 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -359,3 +359,4 @@ SYSCALL(sys_renameat2,sys_renameat2,compat_sys_renameat2) SYSCALL(sys_seccomp,sys_seccomp,compat_sys_seccomp) SYSCALL(sys_getrandom,sys_getrandom,compat_sys_getrandom) SYSCALL(sys_memfd_create,sys_memfd_create,compat_sys_memfd_create) /* 350 */ +SYSCALL(sys_bpf,sys_bpf,compat_sys_bpf) diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 956f4f7a591c..f6b3cd056ec2 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -5,13 +5,13 @@ * Author(s): Jan Willeke, */ -#include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/kdebug.h> #include <asm/switch_to.h> #include <asm/facility.h> +#include <asm/kprobes.h> #include <asm/dis.h> #include "entry.h" diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c index c5d64a099719..ae90e1ae3607 100644 --- a/arch/s390/lib/probes.c +++ b/arch/s390/lib/probes.c @@ -4,7 +4,7 @@ * Copyright IBM Corp. 2014 */ -#include <linux/kprobes.h> +#include <asm/kprobes.h> #include <asm/dis.h> int probe_is_prohibited_opcode(u16 *insn) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 296b61a4af59..1b79ca67392f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -656,7 +656,7 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) } pgste_set_unlock(ptep, pgste); out_pte: - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -943,7 +943,7 @@ retry: } if (!(pte_val(*ptep) & _PAGE_INVALID) && (pte_val(*ptep) & _PAGE_PROTECT)) { - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { up_read(&mm->mmap_sem); return -EFAULT; @@ -974,7 +974,7 @@ retry: pgste_val(new) |= PGSTE_UC_BIT; pgste_set_unlock(ptep, new); - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); up_read(&mm->mmap_sem); return 0; } diff --git a/arch/sparc/include/asm/oplib_64.h b/arch/sparc/include/asm/oplib_64.h index f34682430fcf..2e3a4add8591 100644 --- a/arch/sparc/include/asm/oplib_64.h +++ b/arch/sparc/include/asm/oplib_64.h @@ -62,7 +62,8 @@ struct linux_mem_p1275 { /* You must call prom_init() before using any of the library services, * preferably as early as possible. Pass it the romvec pointer. */ -void prom_init(void *cif_handler, void *cif_stack); +void prom_init(void *cif_handler); +void prom_init_report(void); /* Boot argument acquisition, returns the boot command line string. */ char *prom_getbootargs(void); diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index f5fffd84d0dd..29d64b1758ed 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -48,6 +48,8 @@ unsigned long safe_compute_effective_address(struct pt_regs *, unsigned int); #endif #ifdef CONFIG_SPARC64 +void __init start_early_boot(void); + /* unaligned_64.c */ int handle_ldf_stq(u32 insn, struct pt_regs *regs); void handle_ld_nf(u32 insn, struct pt_regs *regs); diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h index ebaba6167dd4..88d322b67fac 100644 --- a/arch/sparc/kernel/entry.h +++ b/arch/sparc/kernel/entry.h @@ -65,13 +65,10 @@ struct pause_patch_entry { extern struct pause_patch_entry __pause_3insn_patch, __pause_3insn_patch_end; -void __init per_cpu_patch(void); void sun4v_patch_1insn_range(struct sun4v_1insn_patch_entry *, struct sun4v_1insn_patch_entry *); void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *, struct sun4v_2insn_patch_entry *); -void __init sun4v_patch(void); -void __init boot_cpu_id_too_large(int cpu); extern unsigned int dcache_parity_tl1_occurred; extern unsigned int icache_parity_tl1_occurred; diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index 4fdeb8040d4d..3d61fcae7ee3 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -672,14 +672,12 @@ tlb_fixup_done: sethi %hi(init_thread_union), %g6 or %g6, %lo(init_thread_union), %g6 ldx [%g6 + TI_TASK], %g4 - mov %sp, %l6 wr %g0, ASI_P, %asi mov 1, %g1 sllx %g1, THREAD_SHIFT, %g1 sub %g1, (STACKFRAME_SZ + STACK_BIAS), %g1 add %g6, %g1, %sp - mov 0, %fp /* Set per-cpu pointer initially to zero, this makes * the boot-cpu use the in-kernel-image per-cpu areas @@ -706,44 +704,14 @@ tlb_fixup_done: nop #endif - mov %l6, %o1 ! OpenPROM stack call prom_init mov %l7, %o0 ! OpenPROM cif handler - /* Initialize current_thread_info()->cpu as early as possible. - * In order to do that accurately we have to patch up the get_cpuid() - * assembler sequences. And that, in turn, requires that we know - * if we are on a Starfire box or not. While we're here, patch up - * the sun4v sequences as well. + /* To create a one-register-window buffer between the kernel's + * initial stack and the last stack frame we use from the firmware, + * do the rest of the boot from a C helper function. */ - call check_if_starfire - nop - call per_cpu_patch - nop - call sun4v_patch - nop - -#ifdef CONFIG_SMP - call hard_smp_processor_id - nop - cmp %o0, NR_CPUS - blu,pt %xcc, 1f - nop - call boot_cpu_id_too_large - nop - /* Not reached... */ - -1: -#else - mov 0, %o0 -#endif - sth %o0, [%g6 + TI_CPU] - - call prom_init_report - nop - - /* Off we go.... */ - call start_kernel + call start_early_boot nop /* Not reached... */ diff --git a/arch/sparc/kernel/hvtramp.S b/arch/sparc/kernel/hvtramp.S index b7ddcdd1dea9..cdbfec299f2f 100644 --- a/arch/sparc/kernel/hvtramp.S +++ b/arch/sparc/kernel/hvtramp.S @@ -109,7 +109,6 @@ hv_cpu_startup: sllx %g5, THREAD_SHIFT, %g5 sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 add %g6, %g5, %sp - mov 0, %fp call init_irqwork_curcpu nop diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index e629b8377587..c38d19fc27ba 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -30,6 +30,7 @@ #include <linux/cpu.h> #include <linux/initrd.h> #include <linux/module.h> +#include <linux/start_kernel.h> #include <asm/io.h> #include <asm/processor.h> @@ -162,7 +163,7 @@ char reboot_command[COMMAND_LINE_SIZE]; static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 }; -void __init per_cpu_patch(void) +static void __init per_cpu_patch(void) { struct cpuid_patch_entry *p; unsigned long ver; @@ -254,7 +255,7 @@ void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *start, } } -void __init sun4v_patch(void) +static void __init sun4v_patch(void) { extern void sun4v_hvapi_init(void); @@ -323,14 +324,25 @@ static void __init pause_patch(void) } } -#ifdef CONFIG_SMP -void __init boot_cpu_id_too_large(int cpu) +void __init start_early_boot(void) { - prom_printf("Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n", - cpu, NR_CPUS); - prom_halt(); + int cpu; + + check_if_starfire(); + per_cpu_patch(); + sun4v_patch(); + + cpu = hard_smp_processor_id(); + if (cpu >= NR_CPUS) { + prom_printf("Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n", + cpu, NR_CPUS); + prom_halt(); + } + current_thread_info()->cpu = cpu; + + prom_init_report(); + start_kernel(); } -#endif /* On Ultra, we support all of the v8 capabilities. */ unsigned long sparc64_elf_hwcap = (HWCAP_SPARC_FLUSH | HWCAP_SPARC_STBAR | diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S index 737f8cbc7d56..88ede1d53b4c 100644 --- a/arch/sparc/kernel/trampoline_64.S +++ b/arch/sparc/kernel/trampoline_64.S @@ -109,10 +109,13 @@ startup_continue: brnz,pn %g1, 1b nop - sethi %hi(p1275buf), %g2 - or %g2, %lo(p1275buf), %g2 - ldx [%g2 + 0x10], %l2 - add %l2, -(192 + 128), %sp + /* Get onto temporary stack which will be in the locked + * kernel image. + */ + sethi %hi(tramp_stack), %g1 + or %g1, %lo(tramp_stack), %g1 + add %g1, TRAMP_STACK_SIZE, %g1 + sub %g1, STACKFRAME_SZ + STACK_BIAS + 256, %sp flushw /* Setup the loop variables: @@ -394,7 +397,6 @@ after_lock_tlb: sllx %g5, THREAD_SHIFT, %g5 sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 add %g6, %g5, %sp - mov 0, %fp rdpr %pstate, %o1 or %o1, PSTATE_IE, %o1 diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 1aed0432c64b..ae6ce383d4df 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -160,6 +160,36 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { diff --git a/arch/sparc/prom/cif.S b/arch/sparc/prom/cif.S index 9c86b4b7d429..8050f381f518 100644 --- a/arch/sparc/prom/cif.S +++ b/arch/sparc/prom/cif.S @@ -11,11 +11,10 @@ .text .globl prom_cif_direct prom_cif_direct: + save %sp, -192, %sp sethi %hi(p1275buf), %o1 or %o1, %lo(p1275buf), %o1 - ldx [%o1 + 0x0010], %o2 ! prom_cif_stack - save %o2, -192, %sp - ldx [%i1 + 0x0008], %l2 ! prom_cif_handler + ldx [%o1 + 0x0008], %l2 ! prom_cif_handler mov %g4, %l0 mov %g5, %l1 mov %g6, %l3 diff --git a/arch/sparc/prom/init_64.c b/arch/sparc/prom/init_64.c index d95db755828f..110b0d78b864 100644 --- a/arch/sparc/prom/init_64.c +++ b/arch/sparc/prom/init_64.c @@ -26,13 +26,13 @@ phandle prom_chosen_node; * It gets passed the pointer to the PROM vector. */ -extern void prom_cif_init(void *, void *); +extern void prom_cif_init(void *); -void __init prom_init(void *cif_handler, void *cif_stack) +void __init prom_init(void *cif_handler) { phandle node; - prom_cif_init(cif_handler, cif_stack); + prom_cif_init(cif_handler); prom_chosen_node = prom_finddevice(prom_chosen_path); if (!prom_chosen_node || (s32)prom_chosen_node == -1) diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c index b2340f008ae0..545d8bb79b65 100644 --- a/arch/sparc/prom/p1275.c +++ b/arch/sparc/prom/p1275.c @@ -20,7 +20,6 @@ struct { long prom_callback; /* 0x00 */ void (*prom_cif_handler)(long *); /* 0x08 */ - unsigned long prom_cif_stack; /* 0x10 */ } p1275buf; extern void prom_world(int); @@ -52,5 +51,4 @@ void p1275_cmd_direct(unsigned long *args) void prom_cif_init(void *cif_handler, void *cif_stack) { p1275buf.prom_cif_handler = (void (*)(long *))cif_handler; - p1275buf.prom_cif_stack = (unsigned long)cif_stack; } diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index de8eebd6f67c..1acf605a646d 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -330,8 +330,10 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } memset(rom, 0, sizeof(*rom)); @@ -344,14 +346,18 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -432,8 +438,10 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } rom->data.type = SETUP_PCI; rom->data.len = size - sizeof(struct setup_data); @@ -444,14 +452,18 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -538,8 +550,10 @@ static void setup_efi_pci(struct boot_params *params) EFI_LOADER_DATA, size, (void **)&pci_handle); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); return; + } status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, &pci_proto, @@ -1105,6 +1119,10 @@ struct boot_params *make_boot_params(struct efi_config *c) memset(sdt, 0, sizeof(*sdt)); + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + goto fail2; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", hdr->initrd_addr_max, diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0ec241ede5a2..9b11757975d0 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -81,24 +81,23 @@ extern u64 asmlinkage efi_call(void *fp, ...); */ #define __efi_call_virt(f, args...) efi_call_virt(f, args) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type, u64 attribute); +extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ -extern int add_efi_memmap; extern struct efi_scratch efi_scratch; -extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern int efi_memblock_x86_reserve_range(void); -extern void efi_call_phys_prelog(void); -extern void efi_call_phys_epilog(void); -extern void efi_unmap_memmap(void); -extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); +extern int __init efi_memblock_x86_reserve_range(void); +extern void __init efi_call_phys_prolog(void); +extern void __init efi_call_phys_epilog(void); +extern void __init efi_unmap_memmap(void); +extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); -extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); @@ -162,16 +161,6 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( extern bool efi_reboot_required(void); #else -/* - * IF EFI is not configured, have the EFI calls return -ENOSYS. - */ -#define efi_call0(_f) (-ENOSYS) -#define efi_call1(_f, _a1) (-ENOSYS) -#define efi_call2(_f, _a1, _a2) (-ENOSYS) -#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) -#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) -#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) -#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7d603a71ab3a..6ed0c30d6a0c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -989,6 +989,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } +static inline u64 get_canonical(u64 la) +{ + return ((int64_t)la << 16) >> 16; +} + +static inline bool is_noncanonical_address(u64 la) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la) != la; +#else + return false; +#endif +} + #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -1050,7 +1064,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 0e79420376eb..990a2fe1588d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -67,6 +67,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 +#define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 @@ -114,6 +115,7 @@ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a46207a05835..749f9fa38254 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -504,11 +504,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - register_address_increment(ctxt, &ctxt->_eip, rel); -} - static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); @@ -569,6 +564,38 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + int cs_l) +{ + switch (ctxt->op_bytes) { + case 2: + ctxt->_eip = (u16)dst; + break; + case 4: + ctxt->_eip = (u32)dst; + break; + case 8: + if ((cs_l && is_noncanonical_address(dst)) || + (!cs_l && (dst & ~(u32)-1))) + return emulate_gp(ctxt, 0); + ctxt->_eip = dst; + break; + default: + WARN(1, "unsupported eip assignment size\n"); + } + return X86EMUL_CONTINUE; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); +} + +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + return assign_eip_near(ctxt, ctxt->_eip + rel); +} + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; @@ -751,8 +778,10 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) - return __do_insn_fetch_bytes(ctxt, size); + unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; + + if (unlikely(done_size < size)) + return __do_insn_fetch_bytes(ctxt, size - done_size); else return X86EMUL_CONTINUE; } @@ -1416,7 +1445,9 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg, u8 cpl, bool in_task_switch) + u16 selector, int seg, u8 cpl, + bool in_task_switch, + struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; @@ -1557,6 +1588,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); + if (desc) + *desc = seg_desc; return X86EMUL_CONTINUE; exception: return emulate_exception(ctxt, err_vec, err_code, true); @@ -1566,7 +1599,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); } static void write_register_operand(struct operand *op) @@ -1960,17 +1993,31 @@ static int em_iret(struct x86_emulate_ctxt *ctxt) static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned short sel; + unsigned short sel, old_sel; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + u8 cpl = ctxt->ops->cpl(ctxt); + + /* Assignment of RIP may only fail in 64-bit mode */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_sel, &old_desc, NULL, + VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - return X86EMUL_CONTINUE; + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + /* assigning eip failed; restore the old cs */ + ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); + return rc; + } + return rc; } static int em_grp45(struct x86_emulate_ctxt *ctxt) @@ -1981,13 +2028,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + break; ctxt->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -2022,30 +2071,47 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - return em_pop(ctxt); + int rc; + unsigned long eip; + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + return assign_eip_near(ctxt, eip); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long cs; + unsigned long eip, cs; + u16 old_cs; int cpl = ctxt->ops->cpl(ctxt); + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; - rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, + VCPU_SREG_CS); + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->op_bytes == 4) - ctxt->_eip = (u32)ctxt->_eip; rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_far(ctxt, eip, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + } return rc; } @@ -2306,7 +2372,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; - u64 msr_data; + u64 msr_data, rcx, rdx; int usermode; u16 cs_sel = 0, ss_sel = 0; @@ -2322,6 +2388,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) else usermode = X86EMUL_MODE_PROT32; + rcx = reg_read(ctxt, VCPU_REGS_RCX); + rdx = reg_read(ctxt, VCPU_REGS_RDX); + cs.dpl = 3; ss.dpl = 3; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); @@ -2339,6 +2408,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; + if (is_noncanonical_address(rcx) || + is_noncanonical_address(rdx)) + return emulate_gp(ctxt, 0); break; } cs_sel |= SELECTOR_RPL_MASK; @@ -2347,8 +2419,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); - *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); + ctxt->_eip = rdx; + *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; } @@ -2466,19 +2538,24 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2603,25 +2680,32 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, + cpl, true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2888,10 +2972,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt) static int em_call(struct x86_emulate_ctxt *ctxt) { + int rc; long rel = ctxt->src.val; ctxt->src.val = (unsigned long)ctxt->_eip; - jmp_rel(ctxt, rel); + rc = jmp_rel(ctxt, rel); + if (rc != X86EMUL_CONTINUE) + return rc; return em_push(ctxt); } @@ -2900,34 +2987,50 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) u16 sel, old_cs; ulong old_eip; int rc; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + int cpl = ctxt->ops->cpl(ctxt); - old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = ctxt->_eip; + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) + goto fail; ctxt->src.val = old_cs; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) - return rc; + goto fail; ctxt->src.val = old_eip; - return em_push(ctxt); + rc = em_push(ctxt); + /* If we failed, we tainted the memory, but the very least we should + restore cs */ + if (rc != X86EMUL_CONTINUE) + goto fail; + return rc; +fail: + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + return rc; + } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; + unsigned long eip; - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_near(ctxt, eip); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); @@ -3254,20 +3357,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_in(struct x86_emulate_ctxt *ctxt) @@ -3355,6 +3462,12 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_clflush(struct x86_emulate_ctxt *ctxt) +{ + /* emulating clflush regardless of cpuid */ + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3693,6 +3806,16 @@ static const struct opcode group11[] = { X7(D(Undefined)), }; +static const struct gprefix pfx_0f_ae_7 = { + I(SrcMem | ByteOp, em_clflush), N, N, N, +}; + +static const struct group_dual group15 = { { + N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), +}, { + N, N, N, N, N, N, N, N, +} }; + static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; @@ -3901,10 +4024,11 @@ static const struct opcode twobyte_table[256] = { N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, - N, D(ImplicitOps | ModRM), N, N, + N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, - D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), + D(ImplicitOps | ModRM | SrcMem | NoAccess), + N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), @@ -3956,7 +4080,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), - D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), + GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), @@ -4473,10 +4597,10 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); -done: if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; +done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } @@ -4726,7 +4850,7 @@ special_insn: break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; @@ -4756,7 +4880,7 @@ special_insn: break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -4881,13 +5005,11 @@ twobyte_insn: break; case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xae: /* clflush */ - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 518d86471b76..298781d4cfb4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) return; timer = &pit->pit_state.timer; + mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_pit *pit) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 806d58e3c320..fd49c867b25a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -298,7 +298,7 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); + ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 65510f624dfe..7527cefc5a43 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3251,7 +3251,7 @@ static int wrmsr_interception(struct vcpu_svm *svm) msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, &msr)) { + if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3551,9 +3551,9 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } return svm_exit_handlers[exit_code](svm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0acac81f198b..a8b76c4c95e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,12 +2659,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } @@ -5291,7 +5294,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -6743,6 +6746,12 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6788,6 +6797,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, }; static const int kvm_vmx_max_exit_handlers = @@ -7023,7 +7033,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -7164,10 +7174,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c8f94331f8..0033df32a745 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -229,20 +229,25 @@ static void kvm_shared_msr_cpu_online(void) shared_msr_update(i, shared_msrs_global.msrs[i]); } -void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + int err; if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; + return 0; smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + if (err) + return 1; + if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); smsr->registered = true; } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); @@ -987,7 +992,6 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -995,8 +999,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); */ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { + switch (msr->index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(msr->data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + msr->data = get_canonical(msr->data); + } return kvm_x86_ops->set_msr(vcpu, msr); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index f15103dff4b4..d143d216d52b 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -40,20 +40,40 @@ void __init efi_bgrt_init(void) if (ACPI_FAILURE(status)) return; - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) + if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n", + bgrt_tab->header.length, sizeof(*bgrt_tab)); return; - if (bgrt_tab->version != 1 || bgrt_tab->status != 1) + } + if (bgrt_tab->version != 1) { + pr_err("Ignoring BGRT: invalid version %u (expected 1)\n", + bgrt_tab->version); + return; + } + if (bgrt_tab->status != 1) { + pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } + if (bgrt_tab->image_type != 0) { + pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", + bgrt_tab->image_type); return; - if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) + } + if (!bgrt_tab->image_address) { + pr_err("Ignoring BGRT: null image address\n"); return; + } image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header)); ioremapped = true; - if (!image) + if (!image) { + pr_err("Ignoring BGRT: failed to map image header memory\n"); return; + } } memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); @@ -61,14 +81,18 @@ void __init efi_bgrt_init(void) early_iounmap(image, sizeof(bmp_header)); bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); - if (!bgrt_image) + bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + if (!bgrt_image) { + pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", + bgrt_image_size); return; + } if (ioremapped) { image = early_memremap(bgrt_tab->image_address, bmp_header.size); if (!image) { + pr_err("Ignoring BGRT: failed to map image memory\n"); kfree(bgrt_image); bgrt_image = NULL; return; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 850da94fef30..dbc8627a5cdf 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,17 +70,7 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -static bool disable_runtime __initdata = false; -static int __init setup_noefi(char *arg) -{ - disable_runtime = true; - return 0; -} -early_param("noefi", setup_noefi); - -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - +static int add_efi_memmap __initdata; static int __init setup_add_efi_memmap(char *arg) { add_efi_memmap = 1; @@ -96,7 +86,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map( { efi_status_t status; - efi_call_phys_prelog(); + efi_call_phys_prolog(); status = efi_call_phys(efi_phys.set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); @@ -210,9 +200,12 @@ static void __init print_efi_memmap(void) for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + char buf[64]; + md = p; - pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } @@ -344,9 +337,9 @@ static int __init efi_runtime_init32(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) @@ -368,9 +361,9 @@ static int __init efi_runtime_init64(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) @@ -492,7 +485,7 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (disable_runtime || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) return; } if (efi_memmap_init()) @@ -537,7 +530,7 @@ void __init runtime_code_page_mkexec(void) } } -void efi_memory_uc(u64 addr, unsigned long size) +void __init efi_memory_uc(u64 addr, unsigned long size) { unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; u64 npages; @@ -732,6 +725,7 @@ static void __init kexec_enter_virtual_mode(void) */ if (!efi_is_native()) { efi_unmap_memmap(); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -805,6 +799,7 @@ static void __init __efi_enter_virtual_mode(void) new_memmap = efi_map_regions(&count, &pg_shift); if (!new_memmap) { pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -812,8 +807,10 @@ static void __init __efi_enter_virtual_mode(void) BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) + if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; + } efi_sync_low_kernel_mappings(); efi_dump_pagetable(); @@ -938,14 +935,11 @@ u64 efi_mem_attributes(unsigned long phys_addr) return 0; } -static int __init parse_efi_cmdline(char *str) +static int __init arch_parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - if (!strncmp(str, "old_map", 7)) + if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } -early_param("efi", parse_efi_cmdline); +early_param("efi", arch_parse_efi_cmdline); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 9ee3491e31fb..40e7cda52936 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -33,7 +33,7 @@ /* * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to + * prolog/epilog before/after the invocation to disable interrupt, to * claim EFI runtime service handler exclusively and to duplicate a memory in * low memory space say 0 - 3G. */ @@ -41,11 +41,13 @@ static unsigned long efi_rt_eflags; void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {} +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +{ +} void __init efi_map_region(efi_memory_desc_t *md) { @@ -55,7 +57,7 @@ void __init efi_map_region(efi_memory_desc_t *md) void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} -void efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { struct desc_ptr gdt_descr; @@ -69,7 +71,7 @@ void efi_call_phys_prelog(void) load_gdt(&gdt_descr); } -void efi_call_phys_epilog(void) +void __init efi_call_phys_epilog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 290d397e1dd9..35aecb6042fb 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -79,7 +79,7 @@ static void __init early_code_mapping_set_exec(int executable) } } -void __init efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { unsigned long vaddress; int pgd; @@ -139,7 +139,7 @@ void efi_sync_low_kernel_mappings(void) sizeof(pgd_t) * num_pgds); } -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long text; struct page *page; @@ -192,7 +192,7 @@ int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index fbe66e626c09..040192b50d02 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -27,13 +27,13 @@ ENTRY(efi_call_phys) * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found * the values of these registers are the same. And, the corresponding * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. + * and GDT, but change GDT base register in prolog and epilog. */ /* * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET. * But to make it smoothly switch from virtual mode to flat mode. - * The mapping of lower virtual memory has been created in prelog and + * The mapping of lower virtual memory has been created in prolog and * epilog. */ movl $1f, %edx diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index 46aa25c8ce06..3c1c3866d82b 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -10,10 +10,9 @@ */ -/* __attribute__((weak)) makes these declarations overridable */ /* For every CPU addition a new get_<cpuname>_ops interface needs * to be added. */ -extern void *get_penwell_ops(void) __attribute__((weak)); -extern void *get_cloverview_ops(void) __attribute__((weak)); -extern void *get_tangier_ops(void) __attribute__((weak)); +extern void *get_penwell_ops(void); +extern void *get_cloverview_ops(void); +extern void *get_tangier_ops(void); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a3f0445432a..fac5e4f9607c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1636,9 +1636,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f62af7647ec9..a8a1a3d08d4d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1217,10 +1217,13 @@ static void __init xen_pagetable_p2m_copy(void) static void __init xen_pagetable_init(void) { paging_init(); - xen_setup_shared_info(); #ifdef CONFIG_X86_64 xen_pagetable_p2m_copy(); #endif + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9f5983b01ed9..b456b048eca9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -163,6 +163,7 @@ #include <linux/hash.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/bootmem.h> #include <asm/cache.h> #include <asm/setup.h> @@ -181,21 +182,20 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; +static unsigned long *p2m_mid_missing_mfn; +static unsigned long *p2m_top_mfn; +static unsigned long **p2m_top_mfn_p; + /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* For each I/O range remapped we may lose up to two leaf pages for the boundary * violations and three mid pages to cover up to 3GB. With @@ -272,11 +272,11 @@ static void p2m_init(unsigned long *p2m) * Build the parallel p2m_top_mfn and p2m_mid_mfn structures * * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() + * - At boot time we're called rather early, and must use alloc_bootmem*() * to allocate memory. * * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. + * tree should already be completely allocated. */ void __ref xen_build_mfn_list_list(void) { @@ -287,20 +287,17 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -328,10 +325,9 @@ void __ref xen_build_mfn_list_list(void) /* * XXX boot-time only! We should never find * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. + * runtime. */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; @@ -415,7 +411,6 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } #ifdef CONFIG_X86_64 -#include <linux/bootmem.h> unsigned long __init xen_revector_p2m_tree(void) { unsigned long va_start; @@ -477,7 +472,6 @@ unsigned long __init xen_revector_p2m_tree(void) copy_page(new, mid_p); p2m_top[topidx][mididx] = &mfn_list[pfn_free]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); pfn_free += P2M_PER_PAGE; @@ -538,12 +532,13 @@ static bool alloc_p2m(unsigned long pfn) unsigned topidx, mididx; unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; + unsigned long *p2m_orig; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); top_p = &p2m_top[topidx]; - mid = *top_p; + mid = ACCESS_ONCE(*top_p); if (mid == p2m_mid_missing) { /* Mid level is missing, allocate a new one */ @@ -558,7 +553,7 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); @@ -566,6 +561,7 @@ static bool alloc_p2m(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; + unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -575,17 +571,19 @@ static bool alloc_p2m(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { free_p2m_page(mid_mfn); - else + mid_mfn = mfn_to_virt(old_mfn); + } else { p2m_top_mfn_p[topidx] = mid_mfn; + } } - if (p2m_top[topidx][mididx] == p2m_identity || - p2m_top[topidx][mididx] == p2m_missing) { + p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); + if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; - unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -606,7 +604,6 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; - unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -633,43 +630,21 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ - - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); - return true; } static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; unsigned long **mid; mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p, p2m_missing); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m() */ } return true; } @@ -680,14 +655,13 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) * replace the P2M leaf with a p2m_missing or p2m_identity. * Stick the old page in the new P2M tree location. */ -bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) { unsigned topidx; unsigned mididx; unsigned ident_pfns; unsigned inv_pfns; unsigned long *p2m; - unsigned long *mid_mfn_p; unsigned idx; unsigned long pfn; @@ -733,11 +707,6 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ found: /* Found one, replace old with p2m_identity or p2m_missing */ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - /* And the other for save/restore.. */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - /* NOTE: Even if it is a p2m_identity it should still be point to - * a page filled with INVALID_P2M_ENTRY entries. */ - mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); /* Reset where we want to stick the old page in. */ topidx = p2m_top_index(set_pfn); @@ -752,8 +721,6 @@ found: p2m_init(p2m); p2m_top[topidx][mididx] = p2m; - mid_mfn_p = p2m_top_mfn_p[topidx]; - mid_mfn_p[mididx] = virt_to_mfn(p2m); return true; } @@ -763,7 +730,7 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) if (!early_alloc_p2m_middle(pfn)) return false; - if (early_can_reuse_p2m_middle(pfn, mfn)) + if (early_can_reuse_p2m_middle(pfn)) return __set_phys_to_machine(pfn, mfn); if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index af7216128d93..29834b3fd87f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -595,6 +595,7 @@ char * __init xen_memory_setup(void) rc = 0; } BUG_ON(rc); + BUG_ON(memmap.nr_entries == 0); /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1d430b112b3..f473d268d387 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void) cycle_t ret; preempt_disable_notrace(); - src = this_cpu_ptr(&xen_vcpu->time); + src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; diff --git a/crypto/cts.c b/crypto/cts.c index 042223f8e733..133f0874c95e 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -202,7 +202,8 @@ static int cts_cbc_decrypt(struct crypto_cts_ctx *ctx, /* 5. Append the tail (BB - Ln) bytes of Xn (tmp) to Cn to create En */ memcpy(s + bsize + lastn, tmp + lastn, bsize - lastn); /* 6. Decrypt En to create Pn-1 */ - memset(iv, 0, sizeof(iv)); + memzero_explicit(iv, sizeof(iv)); + sg_set_buf(&sgsrc[0], s + bsize, bsize); sg_set_buf(&sgdst[0], d, bsize); err = crypto_blkcipher_decrypt_iv(&lcldesc, sgdst, sgsrc, bsize); diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index 42794803c480..7bb047432782 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -64,7 +64,7 @@ int crypto_sha1_update(struct shash_desc *desc, const u8 *data, src = data + done; } while (done + SHA1_BLOCK_SIZE <= len); - memset(temp, 0, sizeof(temp)); + memzero_explicit(temp, sizeof(temp)); partial = 0; } memcpy(sctx->buffer + partial, src, len - done); diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c index 0bb558344699..65e7b76b057f 100644 --- a/crypto/sha256_generic.c +++ b/crypto/sha256_generic.c @@ -211,10 +211,9 @@ static void sha256_transform(u32 *state, const u8 *input) /* clear any sensitive info... */ a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); + memzero_explicit(W, 64 * sizeof(u32)); } - static int sha224_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -317,7 +316,7 @@ static int sha224_final(struct shash_desc *desc, u8 *hash) sha256_final(desc, D); memcpy(hash, D, SHA224_DIGEST_SIZE); - memset(D, 0, SHA256_DIGEST_SIZE); + memzero_explicit(D, SHA256_DIGEST_SIZE); return 0; } diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c index 6dde57dc511b..95db67197cd9 100644 --- a/crypto/sha512_generic.c +++ b/crypto/sha512_generic.c @@ -239,7 +239,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash) sha512_final(desc, D); memcpy(hash, D, 48); - memset(D, 0, 64); + memzero_explicit(D, 64); return 0; } diff --git a/crypto/tgr192.c b/crypto/tgr192.c index 87403556fd0b..3c7af0d1ff7a 100644 --- a/crypto/tgr192.c +++ b/crypto/tgr192.c @@ -612,7 +612,7 @@ static int tgr160_final(struct shash_desc *desc, u8 * out) tgr192_final(desc, D); memcpy(out, D, TGR160_DIGEST_SIZE); - memset(D, 0, TGR192_DIGEST_SIZE); + memzero_explicit(D, TGR192_DIGEST_SIZE); return 0; } @@ -623,7 +623,7 @@ static int tgr128_final(struct shash_desc *desc, u8 * out) tgr192_final(desc, D); memcpy(out, D, TGR128_DIGEST_SIZE); - memset(D, 0, TGR192_DIGEST_SIZE); + memzero_explicit(D, TGR192_DIGEST_SIZE); return 0; } diff --git a/crypto/vmac.c b/crypto/vmac.c index 2eb11a30c29c..d84c24bd7ff7 100644 --- a/crypto/vmac.c +++ b/crypto/vmac.c @@ -613,7 +613,7 @@ static int vmac_final(struct shash_desc *pdesc, u8 *out) } mac = vmac(ctx->partial, ctx->partial_size, nonce, NULL, ctx); memcpy(out, &mac, sizeof(vmac_t)); - memset(&mac, 0, sizeof(vmac_t)); + memzero_explicit(&mac, sizeof(vmac_t)); memset(&ctx->__vmac_ctx, 0, sizeof(struct vmac_ctx)); ctx->partial_size = 0; return 0; diff --git a/crypto/wp512.c b/crypto/wp512.c index 180f1d6e03f4..ec64e7762fbb 100644 --- a/crypto/wp512.c +++ b/crypto/wp512.c @@ -1102,8 +1102,8 @@ static int wp384_final(struct shash_desc *desc, u8 *out) u8 D[64]; wp512_final(desc, D); - memcpy (out, D, WP384_DIGEST_SIZE); - memset (D, 0, WP512_DIGEST_SIZE); + memcpy(out, D, WP384_DIGEST_SIZE); + memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } @@ -1113,8 +1113,8 @@ static int wp256_final(struct shash_desc *desc, u8 *out) u8 D[64]; wp512_final(desc, D); - memcpy (out, D, WP256_DIGEST_SIZE); - memset (D, 0, WP512_DIGEST_SIZE); + memcpy(out, D, WP256_DIGEST_SIZE); + memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index d0f3265fb85d..b23fe37f67c0 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -144,7 +144,7 @@ config ACPI_VIDEO config ACPI_FAN tristate "Fan" - select THERMAL + depends on THERMAL default y help This driver supports ACPI fan devices, allowing user-mode diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 505d4d79fe3e..c3b2fcb729f3 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -43,6 +43,7 @@ acpi-y += pci_root.o pci_link.o pci_irq.o acpi-y += acpi_lpss.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o +acpi-y += int340x_thermal.o acpi-y += power.o acpi-y += event.o acpi-y += sysfs.o diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 2bf9082f7523..6ba8beb6b9d2 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/dma-mapping.h> #include <linux/platform_device.h> #include "internal.h" @@ -102,6 +103,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) pdevinfo.res = resources; pdevinfo.num_res = count; pdevinfo.acpi_node.companion = adev; + pdevinfo.dma_mask = DMA_BIT_MASK(32); pdev = platform_device_register_full(&pdevinfo); if (IS_ERR(pdev)) dev_err(&adev->dev, "platform device creation failed: %ld\n", @@ -113,3 +115,4 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) kfree(resources); return pdev; } +EXPORT_SYMBOL_GPL(acpi_create_platform_device); diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 2ad2351a9833..c318d3e27893 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -127,7 +127,7 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, - acpi_event_status * event_status); + acpi_event_status *event_status); acpi_status acpi_hw_disable_all_gpes(void); diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 2747279fbe3c..c00e7e41ad75 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -413,8 +413,8 @@ struct acpi_gpe_handler_info { acpi_gpe_handler address; /* Address of handler, if any */ void *context; /* Context to be passed to handler */ struct acpi_namespace_node *method_node; /* Method node for this GPE level (saved) */ - u8 original_flags; /* Original (pre-handler) GPE info */ - u8 originally_enabled; /* True if GPE was originally enabled */ + u8 original_flags; /* Original (pre-handler) GPE info */ + u8 originally_enabled; /* True if GPE was originally enabled */ }; /* Notify info for implicit notify, multiple device objects */ diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h index f14882788eee..1afe46e44dac 100644 --- a/drivers/acpi/acpica/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -49,6 +49,8 @@ acpi_status acpi_allocate_root_table(u32 initial_table_count); /* * tbxfroot - Root pointer utilities */ +u32 acpi_tb_get_rsdp_length(struct acpi_table_rsdp *rsdp); + acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp); u8 *acpi_tb_scan_memory_for_rsdp(u8 *start_address, u32 length); diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h index f3f834408441..3a0beeb86ba5 100644 --- a/drivers/acpi/acpica/amlresrc.h +++ b/drivers/acpi/acpica/amlresrc.h @@ -117,6 +117,12 @@ struct asl_resource_node { struct asl_resource_node *next; }; +struct asl_resource_info { + union acpi_parse_object *descriptor_type_op; /* Resource descriptor parse node */ + union acpi_parse_object *mapping_op; /* Used for mapfile support */ + u32 current_byte_offset; /* Offset in resource template */ +}; + /* Macros used to generate AML resource length fields */ #define ACPI_AML_SIZE_LARGE(r) (sizeof (r) - sizeof (struct aml_resource_large_header)) @@ -449,4 +455,32 @@ union aml_resource { u8 byte_item; }; +/* Interfaces used by both the disassembler and compiler */ + +void +mp_save_gpio_info(union acpi_parse_object *op, + union aml_resource *resource, + u32 pin_count, u16 *pin_list, char *device_name); + +void +mp_save_serial_info(union acpi_parse_object *op, + union aml_resource *resource, char *device_name); + +char *mp_get_hid_from_parse_tree(struct acpi_namespace_node *hid_node); + +char *mp_get_hid_via_namestring(char *device_name); + +char *mp_get_connection_info(union acpi_parse_object *op, + u32 pin_index, + struct acpi_namespace_node **target_node, + char **target_name); + +char *mp_get_parent_device_hid(union acpi_parse_object *op, + struct acpi_namespace_node **target_node, + char **parent_device_name); + +char *mp_get_ddn_value(char *device_name); + +char *mp_get_hid_value(struct acpi_namespace_node *device_node); + #endif diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index e4ba4dec86af..2095dfb72bcb 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -100,13 +100,14 @@ acpi_ev_update_gpe_enable_mask(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_enable_gpe * - * PARAMETERS: gpe_event_info - GPE to enable + * PARAMETERS: gpe_event_info - GPE to enable * * RETURN: Status * * DESCRIPTION: Clear a GPE of stale events and enable it. * ******************************************************************************/ + acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) { acpi_status status; @@ -125,6 +126,7 @@ acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) } /* Clear the GPE (of stale events) */ + status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); @@ -136,7 +138,6 @@ acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) return_ACPI_STATUS(status); } - /******************************************************************************* * * FUNCTION: acpi_ev_add_gpe_reference @@ -212,7 +213,7 @@ acpi_ev_remove_gpe_reference(struct acpi_gpe_event_info *gpe_event_info) if (ACPI_SUCCESS(status)) { status = acpi_hw_low_set_gpe(gpe_event_info, - ACPI_GPE_DISABLE); + ACPI_GPE_DISABLE); } if (ACPI_FAILURE(status)) { @@ -334,7 +335,7 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, * ******************************************************************************/ -u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) +u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info *gpe_xrupt_list) { acpi_status status; struct acpi_gpe_block_info *gpe_block; @@ -427,7 +428,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Check if there is anything active at all in this register */ - enabled_status_byte = (u8) (status_reg & enable_reg); + enabled_status_byte = (u8)(status_reg & enable_reg); if (!enabled_status_byte) { /* No active GPEs in this register, move on */ @@ -450,7 +451,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) acpi_ev_gpe_dispatch(gpe_block-> node, &gpe_block-> - event_info[((acpi_size) i * ACPI_GPE_REGISTER_WIDTH) + j], j + gpe_register_info->base_gpe_number); + event_info[((acpi_size) i * ACPI_GPE_REGISTER_WIDTH) + j], j + gpe_register_info->base_gpe_number); } } } @@ -636,7 +637,7 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_enable_gpe(void *context) * ******************************************************************************/ -acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) +acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info * gpe_event_info) { acpi_status status; @@ -666,9 +667,9 @@ acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_gpe_dispatch * - * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 - * gpe_event_info - Info for this GPE - * gpe_number - Number relative to the parent GPE block + * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 + * gpe_event_info - Info for this GPE + * gpe_number - Number relative to the parent GPE block * * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED * @@ -681,7 +682,7 @@ acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) u32 acpi_ev_gpe_dispatch(struct acpi_namespace_node *gpe_device, - struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) + struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) { acpi_status status; u32 return_value; diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 49fc7effd961..7be928379879 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -424,6 +424,7 @@ acpi_ev_match_gpe_method(acpi_handle obj_handle, } /* Disable the GPE in case it's been enabled already. */ + (void)acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); /* diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index 11e5803b8b41..55a58f3ec8df 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -786,18 +786,26 @@ acpi_install_gpe_handler(acpi_handle gpe_device, handler->method_node = gpe_event_info->dispatch.method_node; handler->original_flags = (u8)(gpe_event_info->flags & (ACPI_GPE_XRUPT_TYPE_MASK | - ACPI_GPE_DISPATCH_MASK)); + ACPI_GPE_DISPATCH_MASK)); /* * If the GPE is associated with a method, it may have been enabled * automatically during initialization, in which case it has to be * disabled now to avoid spurious execution of the handler. */ - - if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) - && gpe_event_info->runtime_count) { - handler->originally_enabled = 1; + if (((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) || + (handler->original_flags & ACPI_GPE_DISPATCH_NOTIFY)) && + gpe_event_info->runtime_count) { + handler->originally_enabled = TRUE; (void)acpi_ev_remove_gpe_reference(gpe_event_info); + + /* Sanity check of original type against new type */ + + if (type != + (u32)(gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK)) { + ACPI_WARNING((AE_INFO, + "GPE type mismatch (level/edge)")); + } } /* Install the handler */ @@ -808,7 +816,7 @@ acpi_install_gpe_handler(acpi_handle gpe_device, gpe_event_info->flags &= ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); - gpe_event_info->flags |= (u8) (type | ACPI_GPE_DISPATCH_HANDLER); + gpe_event_info->flags |= (u8)(type | ACPI_GPE_DISPATCH_HANDLER); acpi_os_release_lock(acpi_gbl_gpe_lock, flags); @@ -893,7 +901,7 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, gpe_event_info->dispatch.method_node = handler->method_node; gpe_event_info->flags &= - ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); + ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); gpe_event_info->flags |= handler->original_flags; /* @@ -901,7 +909,8 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, * enabled, it should be enabled at this point to restore the * post-initialization configuration. */ - if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) && + if (((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) || + (handler->original_flags & ACPI_GPE_DISPATCH_NOTIFY)) && handler->originally_enabled) { (void)acpi_ev_add_gpe_reference(gpe_event_info); } @@ -946,7 +955,7 @@ ACPI_EXPORT_SYMBOL(acpi_remove_gpe_handler) * handle is returned. * ******************************************************************************/ -acpi_status acpi_acquire_global_lock(u16 timeout, u32 * handle) +acpi_status acpi_acquire_global_lock(u16 timeout, u32 *handle) { acpi_status status; diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index e286640ad4ff..bb8cbf5961bf 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -324,8 +324,9 @@ ACPI_EXPORT_SYMBOL(acpi_clear_event) ******************************************************************************/ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) { - acpi_status status = AE_OK; - u32 value; + acpi_status status; + acpi_event_status local_event_status = 0; + u32 in_byte; ACPI_FUNCTION_TRACE(acpi_get_event_status); @@ -339,29 +340,40 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) return_ACPI_STATUS(AE_BAD_PARAMETER); } - /* Get the status of the requested fixed event */ + /* Fixed event currently can be dispatched? */ + + if (acpi_gbl_fixed_event_handlers[event].handler) { + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; + } + + /* Fixed event currently enabled? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, &value); - if (ACPI_FAILURE(status)) + enable_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - *event_status = value; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_ENABLED; + } + + /* Fixed event currently active? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - status_register_id, &value); - if (ACPI_FAILURE(status)) + status_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - if (value) - *event_status |= ACPI_EVENT_FLAG_SET; - - if (acpi_gbl_fixed_event_handlers[event].handler) - *event_status |= ACPI_EVENT_FLAG_HANDLE; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_SET; + } - return_ACPI_STATUS(status); + (*event_status) = local_event_status; + return_ACPI_STATUS(AE_OK); } ACPI_EXPORT_SYMBOL(acpi_get_event_status) diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 56710a03c9b0..e889a5304abd 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -106,8 +106,8 @@ ACPI_EXPORT_SYMBOL(acpi_update_all_gpes) * * FUNCTION: acpi_enable_gpe * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block * * RETURN: Status * @@ -115,7 +115,6 @@ ACPI_EXPORT_SYMBOL(acpi_update_all_gpes) * hardware-enabled. * ******************************************************************************/ - acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number) { acpi_status status = AE_BAD_PARAMETER; @@ -490,8 +489,8 @@ ACPI_EXPORT_SYMBOL(acpi_clear_gpe) * * FUNCTION: acpi_get_gpe_status * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block * event_status - Where the current status of the event * will be returned * @@ -524,9 +523,6 @@ acpi_get_gpe_status(acpi_handle gpe_device, status = acpi_hw_get_gpe_status(gpe_event_info, event_status); - if (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) - *event_status |= ACPI_EVENT_FLAG_HANDLE; - unlock_and_exit: acpi_os_release_lock(acpi_gbl_gpe_lock, flags); return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index ea62d40fd161..48ac7b7b59cd 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -202,7 +202,7 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info * gpe_event_info) acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, - acpi_event_status * event_status) + acpi_event_status *event_status) { u32 in_byte; u32 register_bit; @@ -216,6 +216,13 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, return (AE_BAD_PARAMETER); } + /* GPE currently handled? */ + + if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) != + ACPI_GPE_DISPATCH_NONE) { + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; + } + /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index 65ab8fed3d5e..43a54af2b548 100644 --- a/drivers/acpi/acpica/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -50,6 +50,36 @@ ACPI_MODULE_NAME("tbxfroot") /******************************************************************************* * + * FUNCTION: acpi_tb_get_rsdp_length + * + * PARAMETERS: rsdp - Pointer to RSDP + * + * RETURN: Table length + * + * DESCRIPTION: Get the length of the RSDP + * + ******************************************************************************/ +u32 acpi_tb_get_rsdp_length(struct acpi_table_rsdp *rsdp) +{ + + if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature)) { + + /* BAD Signature */ + + return (0); + } + + /* "Length" field is available if table version >= 2 */ + + if (rsdp->revision >= 2) { + return (rsdp->length); + } else { + return (ACPI_RSDP_CHECKSUM_LENGTH); + } +} + +/******************************************************************************* + * * FUNCTION: acpi_tb_validate_rsdp * * PARAMETERS: rsdp - Pointer to unvalidated RSDP @@ -59,7 +89,8 @@ ACPI_MODULE_NAME("tbxfroot") * DESCRIPTION: Validate the RSDP (ptr) * ******************************************************************************/ -acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp) + +acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp * rsdp) { /* diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index bea6896be122..143ec6ea1468 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -343,6 +343,7 @@ int acpi_device_update_power(struct acpi_device *device, int *state_p) return 0; } +EXPORT_SYMBOL_GPL(acpi_device_update_power); int acpi_bus_update_power(acpi_handle handle, int *state_p) { @@ -710,7 +711,7 @@ int acpi_pm_device_run_wake(struct device *phys_dev, bool enable) return -ENODEV; } - return acpi_device_wakeup(adev, enable, ACPI_STATE_S0); + return acpi_device_wakeup(adev, ACPI_STATE_S0, enable); } EXPORT_SYMBOL(acpi_pm_device_run_wake); #endif /* CONFIG_PM_RUNTIME */ diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index cb6066c809ea..3d304ff7f095 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -128,12 +128,13 @@ static int EC_FLAGS_SKIP_DSDT_SCAN; /* Not all BIOS survive early DSDT scan */ static int EC_FLAGS_CLEAR_ON_RESUME; /* Needs acpi_ec_clear() on boot/resume */ /* -------------------------------------------------------------------------- - Transaction Management - -------------------------------------------------------------------------- */ + * Transaction Management + * -------------------------------------------------------------------------- */ static inline u8 acpi_ec_read_status(struct acpi_ec *ec) { u8 x = inb(ec->command_addr); + pr_debug("EC_SC(R) = 0x%2.2x " "SCI_EVT=%d BURST=%d CMD=%d IBF=%d OBF=%d\n", x, @@ -148,6 +149,7 @@ static inline u8 acpi_ec_read_status(struct acpi_ec *ec) static inline u8 acpi_ec_read_data(struct acpi_ec *ec) { u8 x = inb(ec->data_addr); + pr_debug("EC_DATA(R) = 0x%2.2x\n", x); return x; } @@ -164,10 +166,32 @@ static inline void acpi_ec_write_data(struct acpi_ec *ec, u8 data) outb(data, ec->data_addr); } +#ifdef DEBUG +static const char *acpi_ec_cmd_string(u8 cmd) +{ + switch (cmd) { + case 0x80: + return "RD_EC"; + case 0x81: + return "WR_EC"; + case 0x82: + return "BE_EC"; + case 0x83: + return "BD_EC"; + case 0x84: + return "QR_EC"; + } + return "UNKNOWN"; +} +#else +#define acpi_ec_cmd_string(cmd) "UNDEF" +#endif + static int ec_transaction_completed(struct acpi_ec *ec) { unsigned long flags; int ret = 0; + spin_lock_irqsave(&ec->lock, flags); if (ec->curr && (ec->curr->flags & ACPI_EC_COMMAND_COMPLETE)) ret = 1; @@ -181,7 +205,8 @@ static bool advance_transaction(struct acpi_ec *ec) u8 status; bool wakeup = false; - pr_debug("===== %s =====\n", in_interrupt() ? "IRQ" : "TASK"); + pr_debug("===== %s (%d) =====\n", + in_interrupt() ? "IRQ" : "TASK", smp_processor_id()); status = acpi_ec_read_status(ec); t = ec->curr; if (!t) @@ -198,7 +223,8 @@ static bool advance_transaction(struct acpi_ec *ec) if (t->rlen == t->ri) { t->flags |= ACPI_EC_COMMAND_COMPLETE; if (t->command == ACPI_EC_COMMAND_QUERY) - pr_debug("hardware QR_EC completion\n"); + pr_debug("***** Command(%s) hardware completion *****\n", + acpi_ec_cmd_string(t->command)); wakeup = true; } } else @@ -221,7 +247,8 @@ static bool advance_transaction(struct acpi_ec *ec) t->flags |= ACPI_EC_COMMAND_POLL; t->rdata[t->ri++] = 0x00; t->flags |= ACPI_EC_COMMAND_COMPLETE; - pr_debug("software QR_EC completion\n"); + pr_debug("***** Command(%s) software completion *****\n", + acpi_ec_cmd_string(t->command)); wakeup = true; } else if ((status & ACPI_EC_FLAG_IBF) == 0) { acpi_ec_write_cmd(ec, t->command); @@ -264,6 +291,7 @@ static int ec_poll(struct acpi_ec *ec) { unsigned long flags; int repeat = 5; /* number of command restarts */ + while (repeat--) { unsigned long delay = jiffies + msecs_to_jiffies(ec_delay); @@ -296,18 +324,25 @@ static int acpi_ec_transaction_unlocked(struct acpi_ec *ec, { unsigned long tmp; int ret = 0; + if (EC_FLAGS_MSI) udelay(ACPI_EC_MSI_UDELAY); /* start transaction */ spin_lock_irqsave(&ec->lock, tmp); /* following two actions should be kept atomic */ ec->curr = t; + pr_debug("***** Command(%s) started *****\n", + acpi_ec_cmd_string(t->command)); start_transaction(ec); spin_unlock_irqrestore(&ec->lock, tmp); ret = ec_poll(ec); spin_lock_irqsave(&ec->lock, tmp); - if (ec->curr->command == ACPI_EC_COMMAND_QUERY) + if (ec->curr->command == ACPI_EC_COMMAND_QUERY) { clear_bit(EC_FLAGS_QUERY_PENDING, &ec->flags); + pr_debug("***** Event stopped *****\n"); + } + pr_debug("***** Command(%s) stopped *****\n", + acpi_ec_cmd_string(t->command)); ec->curr = NULL; spin_unlock_irqrestore(&ec->lock, tmp); return ret; @@ -317,6 +352,7 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) { int status; u32 glk; + if (!ec || (!t) || (t->wlen && !t->wdata) || (t->rlen && !t->rdata)) return -EINVAL; if (t->rdata) @@ -333,8 +369,6 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) goto unlock; } } - pr_debug("transaction start (cmd=0x%02x, addr=0x%02x)\n", - t->command, t->wdata ? t->wdata[0] : 0); /* disable GPE during transaction if storm is detected */ if (test_bit(EC_FLAGS_GPE_STORM, &ec->flags)) { /* It has to be disabled, so that it doesn't trigger. */ @@ -355,7 +389,6 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) t->irq_count); set_bit(EC_FLAGS_GPE_STORM, &ec->flags); } - pr_debug("transaction end\n"); if (ec->global_lock) acpi_release_global_lock(glk); unlock: @@ -383,7 +416,7 @@ static int acpi_ec_burst_disable(struct acpi_ec *ec) acpi_ec_transaction(ec, &t) : 0; } -static int acpi_ec_read(struct acpi_ec *ec, u8 address, u8 * data) +static int acpi_ec_read(struct acpi_ec *ec, u8 address, u8 *data) { int result; u8 d; @@ -419,10 +452,9 @@ int ec_read(u8 addr, u8 *val) if (!err) { *val = temp_data; return 0; - } else - return err; + } + return err; } - EXPORT_SYMBOL(ec_read); int ec_write(u8 addr, u8 val) @@ -436,22 +468,21 @@ int ec_write(u8 addr, u8 val) return err; } - EXPORT_SYMBOL(ec_write); int ec_transaction(u8 command, - const u8 * wdata, unsigned wdata_len, - u8 * rdata, unsigned rdata_len) + const u8 *wdata, unsigned wdata_len, + u8 *rdata, unsigned rdata_len) { struct transaction t = {.command = command, .wdata = wdata, .rdata = rdata, .wlen = wdata_len, .rlen = rdata_len}; + if (!first_ec) return -ENODEV; return acpi_ec_transaction(first_ec, &t); } - EXPORT_SYMBOL(ec_transaction); /* Get the handle to the EC device */ @@ -461,7 +492,6 @@ acpi_handle ec_get_handle(void) return NULL; return first_ec->handle; } - EXPORT_SYMBOL(ec_get_handle); /* @@ -525,13 +555,14 @@ void acpi_ec_unblock_transactions_early(void) clear_bit(EC_FLAGS_BLOCKED, &first_ec->flags); } -static int acpi_ec_query_unlocked(struct acpi_ec *ec, u8 * data) +static int acpi_ec_query_unlocked(struct acpi_ec *ec, u8 *data) { int result; u8 d; struct transaction t = {.command = ACPI_EC_COMMAND_QUERY, .wdata = NULL, .rdata = &d, .wlen = 0, .rlen = 1}; + if (!ec || !data) return -EINVAL; /* @@ -557,6 +588,7 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, { struct acpi_ec_query_handler *handler = kzalloc(sizeof(struct acpi_ec_query_handler), GFP_KERNEL); + if (!handler) return -ENOMEM; @@ -569,12 +601,12 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, mutex_unlock(&ec->mutex); return 0; } - EXPORT_SYMBOL_GPL(acpi_ec_add_query_handler); void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) { struct acpi_ec_query_handler *handler, *tmp; + mutex_lock(&ec->mutex); list_for_each_entry_safe(handler, tmp, &ec->list, node) { if (query_bit == handler->query_bit) { @@ -584,20 +616,20 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) } mutex_unlock(&ec->mutex); } - EXPORT_SYMBOL_GPL(acpi_ec_remove_query_handler); static void acpi_ec_run(void *cxt) { struct acpi_ec_query_handler *handler = cxt; + if (!handler) return; - pr_debug("start query execution\n"); + pr_debug("##### Query(0x%02x) started #####\n", handler->query_bit); if (handler->func) handler->func(handler->data); else if (handler->handle) acpi_evaluate_object(handler->handle, NULL, NULL, NULL); - pr_debug("stop query execution\n"); + pr_debug("##### Query(0x%02x) stopped #####\n", handler->query_bit); kfree(handler); } @@ -620,8 +652,8 @@ static int acpi_ec_sync_query(struct acpi_ec *ec, u8 *data) if (!copy) return -ENOMEM; memcpy(copy, handler, sizeof(*copy)); - pr_debug("push query execution (0x%2x) on queue\n", - value); + pr_debug("##### Query(0x%02x) scheduled #####\n", + handler->query_bit); return acpi_os_execute((copy->func) ? OSL_NOTIFY_HANDLER : OSL_GPE_HANDLER, acpi_ec_run, copy); @@ -633,6 +665,7 @@ static int acpi_ec_sync_query(struct acpi_ec *ec, u8 *data) static void acpi_ec_gpe_query(void *ec_cxt) { struct acpi_ec *ec = ec_cxt; + if (!ec) return; mutex_lock(&ec->mutex); @@ -644,7 +677,7 @@ static int ec_check_sci(struct acpi_ec *ec, u8 state) { if (state & ACPI_EC_FLAG_SCI) { if (!test_and_set_bit(EC_FLAGS_QUERY_PENDING, &ec->flags)) { - pr_debug("push gpe query to the queue\n"); + pr_debug("***** Event started *****\n"); return acpi_os_execute(OSL_NOTIFY_HANDLER, acpi_ec_gpe_query, ec); } @@ -667,8 +700,8 @@ static u32 acpi_ec_gpe_handler(acpi_handle gpe_device, } /* -------------------------------------------------------------------------- - Address Space Management - -------------------------------------------------------------------------- */ + * Address Space Management + * -------------------------------------------------------------------------- */ static acpi_status acpi_ec_space_handler(u32 function, acpi_physical_address address, @@ -699,27 +732,26 @@ acpi_ec_space_handler(u32 function, acpi_physical_address address, switch (result) { case -EINVAL: return AE_BAD_PARAMETER; - break; case -ENODEV: return AE_NOT_FOUND; - break; case -ETIME: return AE_TIME; - break; default: return AE_OK; } } /* -------------------------------------------------------------------------- - Driver Interface - -------------------------------------------------------------------------- */ + * Driver Interface + * -------------------------------------------------------------------------- */ + static acpi_status ec_parse_io_ports(struct acpi_resource *resource, void *context); static struct acpi_ec *make_acpi_ec(void) { struct acpi_ec *ec = kzalloc(sizeof(struct acpi_ec), GFP_KERNEL); + if (!ec) return NULL; ec->flags = 1 << EC_FLAGS_QUERY_PENDING; @@ -742,9 +774,8 @@ acpi_ec_register_query_methods(acpi_handle handle, u32 level, status = acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); - if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) { + if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) acpi_ec_add_query_handler(ec, value, handle, NULL, NULL); - } return AE_OK; } @@ -753,7 +784,6 @@ ec_parse_device(acpi_handle handle, u32 Level, void *context, void **retval) { acpi_status status; unsigned long long tmp = 0; - struct acpi_ec *ec = context; /* clear addr values, ec_parse_io_ports depend on it */ @@ -781,6 +811,7 @@ ec_parse_device(acpi_handle handle, u32 Level, void *context, void **retval) static int ec_install_handlers(struct acpi_ec *ec) { acpi_status status; + if (test_bit(EC_FLAGS_HANDLERS_INSTALLED, &ec->flags)) return 0; status = acpi_install_gpe_handler(NULL, ec->gpe, @@ -1078,7 +1109,8 @@ int __init acpi_ec_ecdt_probe(void) boot_ec->data_addr = ecdt_ptr->data.address; boot_ec->gpe = ecdt_ptr->gpe; boot_ec->handle = ACPI_ROOT_OBJECT; - acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, &boot_ec->handle); + acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, + &boot_ec->handle); /* Don't trust ECDT, which comes from ASUSTek */ if (!EC_FLAGS_VALIDATE_ECDT) goto install; @@ -1162,6 +1194,5 @@ static void __exit acpi_ec_exit(void) { acpi_bus_unregister_driver(&acpi_ec_driver); - return; } #endif /* 0 */ diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 5328b1090e08..caf9b76b7ef8 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -30,22 +30,19 @@ #include <linux/uaccess.h> #include <linux/thermal.h> #include <linux/acpi.h> - -#define ACPI_FAN_CLASS "fan" -#define ACPI_FAN_FILE_STATE "state" - -#define _COMPONENT ACPI_FAN_COMPONENT -ACPI_MODULE_NAME("fan"); +#include <linux/platform_device.h> +#include <linux/sort.h> MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Fan Driver"); MODULE_LICENSE("GPL"); -static int acpi_fan_add(struct acpi_device *device); -static int acpi_fan_remove(struct acpi_device *device); +static int acpi_fan_probe(struct platform_device *pdev); +static int acpi_fan_remove(struct platform_device *pdev); static const struct acpi_device_id fan_device_ids[] = { {"PNP0C0B", 0}, + {"INT3404", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, fan_device_ids); @@ -64,37 +61,100 @@ static struct dev_pm_ops acpi_fan_pm = { #define FAN_PM_OPS_PTR NULL #endif -static struct acpi_driver acpi_fan_driver = { - .name = "fan", - .class = ACPI_FAN_CLASS, - .ids = fan_device_ids, - .ops = { - .add = acpi_fan_add, - .remove = acpi_fan_remove, - }, - .drv.pm = FAN_PM_OPS_PTR, +struct acpi_fan_fps { + u64 control; + u64 trip_point; + u64 speed; + u64 noise_level; + u64 power; +}; + +struct acpi_fan_fif { + u64 revision; + u64 fine_grain_ctrl; + u64 step_size; + u64 low_speed_notification; +}; + +struct acpi_fan { + bool acpi4; + struct acpi_fan_fif fif; + struct acpi_fan_fps *fps; + int fps_count; + struct thermal_cooling_device *cdev; +}; + +static struct platform_driver acpi_fan_driver = { + .probe = acpi_fan_probe, + .remove = acpi_fan_remove, + .driver = { + .name = "acpi-fan", + .acpi_match_table = fan_device_ids, + .pm = FAN_PM_OPS_PTR, + }, }; /* thermal cooling device callbacks */ static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) { - /* ACPI fan device only support two states: ON/OFF */ - *state = 1; + struct acpi_device *device = cdev->devdata; + struct acpi_fan *fan = acpi_driver_data(device); + + if (fan->acpi4) + *state = fan->fps_count - 1; + else + *state = 1; return 0; } -static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long - *state) +static int fan_get_state_acpi4(struct acpi_device *device, unsigned long *state) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_fan *fan = acpi_driver_data(device); + union acpi_object *obj; + acpi_status status; + int control, i; + + status = acpi_evaluate_object(device->handle, "_FST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Get fan state failed\n"); + return status; + } + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE || + obj->package.count != 3 || + obj->package.elements[1].type != ACPI_TYPE_INTEGER) { + dev_err(&device->dev, "Invalid _FST data\n"); + status = -EINVAL; + goto err; + } + + control = obj->package.elements[1].integer.value; + for (i = 0; i < fan->fps_count; i++) { + if (control == fan->fps[i].control) + break; + } + if (i == fan->fps_count) { + dev_dbg(&device->dev, "Invalid control value returned\n"); + status = -EINVAL; + goto err; + } + + *state = i; + +err: + kfree(obj); + return status; +} + +static int fan_get_state(struct acpi_device *device, unsigned long *state) { - struct acpi_device *device = cdev->devdata; int result; int acpi_state = ACPI_STATE_D0; - if (!device) - return -EINVAL; - - result = acpi_bus_update_power(device->handle, &acpi_state); + result = acpi_device_update_power(device, &acpi_state); if (result) return result; @@ -103,21 +163,57 @@ static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long return 0; } -static int -fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) { struct acpi_device *device = cdev->devdata; - int result; + struct acpi_fan *fan = acpi_driver_data(device); - if (!device || (state != 0 && state != 1)) + if (fan->acpi4) + return fan_get_state_acpi4(device, state); + else + return fan_get_state(device, state); +} + +static int fan_set_state(struct acpi_device *device, unsigned long state) +{ + if (state != 0 && state != 1) return -EINVAL; - result = acpi_bus_set_power(device->handle, - state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); + return acpi_device_set_power(device, + state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); +} - return result; +static int fan_set_state_acpi4(struct acpi_device *device, unsigned long state) +{ + struct acpi_fan *fan = acpi_driver_data(device); + acpi_status status; + + if (state >= fan->fps_count) + return -EINVAL; + + status = acpi_execute_simple_method(device->handle, "_FSL", + fan->fps[state].control); + if (ACPI_FAILURE(status)) { + dev_dbg(&device->dev, "Failed to set state by _FSL\n"); + return status; + } + + return 0; } +static int +fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +{ + struct acpi_device *device = cdev->devdata; + struct acpi_fan *fan = acpi_driver_data(device); + + if (fan->acpi4) + return fan_set_state_acpi4(device, state); + else + return fan_set_state(device, state); + } + static const struct thermal_cooling_device_ops fan_cooling_ops = { .get_max_state = fan_get_max_state, .get_cur_state = fan_get_cur_state, @@ -129,21 +225,125 @@ static const struct thermal_cooling_device_ops fan_cooling_ops = { * -------------------------------------------------------------------------- */ -static int acpi_fan_add(struct acpi_device *device) +static bool acpi_fan_is_acpi4(struct acpi_device *device) { - int result = 0; - struct thermal_cooling_device *cdev; + return acpi_has_method(device->handle, "_FIF") && + acpi_has_method(device->handle, "_FPS") && + acpi_has_method(device->handle, "_FSL") && + acpi_has_method(device->handle, "_FST"); +} - if (!device) - return -EINVAL; +static int acpi_fan_get_fif(struct acpi_device *device) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_fan *fan = acpi_driver_data(device); + struct acpi_buffer format = { sizeof("NNNN"), "NNNN" }; + struct acpi_buffer fif = { sizeof(fan->fif), &fan->fif }; + union acpi_object *obj; + acpi_status status; + + status = acpi_evaluate_object(device->handle, "_FIF", NULL, &buffer); + if (ACPI_FAILURE(status)) + return status; + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE) { + dev_err(&device->dev, "Invalid _FIF data\n"); + status = -EINVAL; + goto err; + } - strcpy(acpi_device_name(device), "Fan"); - strcpy(acpi_device_class(device), ACPI_FAN_CLASS); + status = acpi_extract_package(obj, &format, &fif); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Invalid _FIF element\n"); + status = -EINVAL; + } - result = acpi_bus_update_power(device->handle, NULL); - if (result) { - dev_err(&device->dev, "Setting initial power state\n"); - goto end; +err: + kfree(obj); + return status; +} + +static int acpi_fan_speed_cmp(const void *a, const void *b) +{ + const struct acpi_fan_fps *fps1 = a; + const struct acpi_fan_fps *fps2 = b; + return fps1->speed - fps2->speed; +} + +static int acpi_fan_get_fps(struct acpi_device *device) +{ + struct acpi_fan *fan = acpi_driver_data(device); + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + acpi_status status; + int i; + + status = acpi_evaluate_object(device->handle, "_FPS", NULL, &buffer); + if (ACPI_FAILURE(status)) + return status; + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE || obj->package.count < 2) { + dev_err(&device->dev, "Invalid _FPS data\n"); + status = -EINVAL; + goto err; + } + + fan->fps_count = obj->package.count - 1; /* minus revision field */ + fan->fps = devm_kzalloc(&device->dev, + fan->fps_count * sizeof(struct acpi_fan_fps), + GFP_KERNEL); + if (!fan->fps) { + dev_err(&device->dev, "Not enough memory\n"); + status = -ENOMEM; + goto err; + } + for (i = 0; i < fan->fps_count; i++) { + struct acpi_buffer format = { sizeof("NNNNN"), "NNNNN" }; + struct acpi_buffer fps = { sizeof(fan->fps[i]), &fan->fps[i] }; + status = acpi_extract_package(&obj->package.elements[i + 1], + &format, &fps); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Invalid _FPS element\n"); + break; + } + } + + /* sort the state array according to fan speed in increase order */ + sort(fan->fps, fan->fps_count, sizeof(*fan->fps), + acpi_fan_speed_cmp, NULL); + +err: + kfree(obj); + return status; +} + +static int acpi_fan_probe(struct platform_device *pdev) +{ + int result = 0; + struct thermal_cooling_device *cdev; + struct acpi_fan *fan; + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + + fan = devm_kzalloc(&pdev->dev, sizeof(*fan), GFP_KERNEL); + if (!fan) { + dev_err(&device->dev, "No memory for fan\n"); + return -ENOMEM; + } + device->driver_data = fan; + platform_set_drvdata(pdev, fan); + + if (acpi_fan_is_acpi4(device)) { + if (acpi_fan_get_fif(device) || acpi_fan_get_fps(device)) + goto end; + fan->acpi4 = true; + } else { + result = acpi_device_update_power(device, NULL); + if (result) { + dev_err(&device->dev, "Setting initial power state\n"); + goto end; + } } cdev = thermal_cooling_device_register("Fan", device, @@ -153,44 +353,32 @@ static int acpi_fan_add(struct acpi_device *device) goto end; } - dev_dbg(&device->dev, "registered as cooling_device%d\n", cdev->id); + dev_dbg(&pdev->dev, "registered as cooling_device%d\n", cdev->id); - device->driver_data = cdev; - result = sysfs_create_link(&device->dev.kobj, + fan->cdev = cdev; + result = sysfs_create_link(&pdev->dev.kobj, &cdev->device.kobj, "thermal_cooling"); if (result) - dev_err(&device->dev, "Failed to create sysfs link " - "'thermal_cooling'\n"); + dev_err(&pdev->dev, "Failed to create sysfs link 'thermal_cooling'\n"); result = sysfs_create_link(&cdev->device.kobj, - &device->dev.kobj, + &pdev->dev.kobj, "device"); if (result) - dev_err(&device->dev, "Failed to create sysfs link 'device'\n"); - - dev_info(&device->dev, "ACPI: %s [%s] (%s)\n", - acpi_device_name(device), acpi_device_bid(device), - !device->power.state ? "on" : "off"); + dev_err(&pdev->dev, "Failed to create sysfs link 'device'\n"); end: return result; } -static int acpi_fan_remove(struct acpi_device *device) +static int acpi_fan_remove(struct platform_device *pdev) { - struct thermal_cooling_device *cdev; - - if (!device) - return -EINVAL; - - cdev = acpi_driver_data(device); - if (!cdev) - return -EINVAL; + struct acpi_fan *fan = platform_get_drvdata(pdev); - sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); - sysfs_remove_link(&cdev->device.kobj, "device"); - thermal_cooling_device_unregister(cdev); + sysfs_remove_link(&pdev->dev.kobj, "thermal_cooling"); + sysfs_remove_link(&fan->cdev->device.kobj, "device"); + thermal_cooling_device_unregister(fan->cdev); return 0; } @@ -198,10 +386,11 @@ static int acpi_fan_remove(struct acpi_device *device) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { - if (!dev) - return -EINVAL; + struct acpi_fan *fan = dev_get_drvdata(dev); + if (fan->acpi4) + return 0; - acpi_bus_set_power(to_acpi_device(dev)->handle, ACPI_STATE_D0); + acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D0); return AE_OK; } @@ -209,11 +398,12 @@ static int acpi_fan_suspend(struct device *dev) static int acpi_fan_resume(struct device *dev) { int result; + struct acpi_fan *fan = dev_get_drvdata(dev); - if (!dev) - return -EINVAL; + if (fan->acpi4) + return 0; - result = acpi_bus_update_power(to_acpi_device(dev)->handle, NULL); + result = acpi_device_update_power(ACPI_COMPANION(dev), NULL); if (result) dev_err(dev, "Error updating fan power state\n"); @@ -221,4 +411,4 @@ static int acpi_fan_resume(struct device *dev) } #endif -module_acpi_driver(acpi_fan_driver); +module_platform_driver(acpi_fan_driver); diff --git a/drivers/acpi/int340x_thermal.c b/drivers/acpi/int340x_thermal.c new file mode 100644 index 000000000000..a27d31d1ba24 --- /dev/null +++ b/drivers/acpi/int340x_thermal.c @@ -0,0 +1,51 @@ +/* + * ACPI support for int340x thermal drivers + * + * Copyright (C) 2014, Intel Corporation + * Authors: Zhang Rui <rui.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/acpi.h> +#include <linux/module.h> + +#include "internal.h" + +#define DO_ENUMERATION 0x01 +static const struct acpi_device_id int340x_thermal_device_ids[] = { + {"INT3400", DO_ENUMERATION }, + {"INT3401"}, + {"INT3402"}, + {"INT3403"}, + {"INT3404"}, + {"INT3406"}, + {"INT3407"}, + {"INT3408"}, + {"INT3409"}, + {"INT340A"}, + {"INT340B"}, + {""}, +}; + +static int int340x_thermal_handler_attach(struct acpi_device *adev, + const struct acpi_device_id *id) +{ +#if defined(CONFIG_INT340X_THERMAL) || defined(CONFIG_INT340X_THERMAL_MODULE) + if (id->driver_data == DO_ENUMERATION) + acpi_create_platform_device(adev); +#endif + return 1; +} + +static struct acpi_scan_handler int340x_thermal_handler = { + .ids = int340x_thermal_device_ids, + .attach = int340x_thermal_handler_attach, +}; + +void __init acpi_int340x_thermal_init(void) +{ + acpi_scan_add_handler(&int340x_thermal_handler); +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 4c5cf77e7576..447f6d679b29 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -31,6 +31,7 @@ void acpi_pci_link_init(void); void acpi_processor_init(void); void acpi_platform_init(void); void acpi_pnp_init(void); +void acpi_int340x_thermal_init(void); int acpi_sysfs_init(void); void acpi_container_init(void); void acpi_memory_hotplug_init(void); @@ -103,8 +104,6 @@ int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); -int acpi_device_update_power(struct acpi_device *device, int *state_p); - int acpi_wakeup_device_init(void); #ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC @@ -168,13 +167,6 @@ static inline void suspend_nvs_restore(void) {} #endif /*-------------------------------------------------------------------------- - Platform bus support - -------------------------------------------------------------------------- */ -struct platform_device; - -struct platform_device *acpi_create_platform_device(struct acpi_device *adev); - -/*-------------------------------------------------------------------------- Video -------------------------------------------------------------------------- */ #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ae44d8654c82..d670158a26c5 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1470,7 +1470,7 @@ static void acpi_wakeup_gpe_init(struct acpi_device *device) if (ACPI_FAILURE(status)) return; - wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HANDLE); + wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HAS_HANDLER); } static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) @@ -2315,6 +2315,7 @@ int __init acpi_scan_init(void) acpi_container_init(); acpi_memory_hotplug_init(); acpi_pnp_init(); + acpi_int340x_thermal_init(); mutex_lock(&acpi_scan_lock); /* diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 38cb9782d4b8..13e577c80201 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -537,7 +537,7 @@ static ssize_t counter_show(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) size += sprintf(buf + size, " invalid"); else if (status & ACPI_EVENT_FLAG_ENABLED) size += sprintf(buf + size, " enabled"); @@ -581,7 +581,7 @@ static ssize_t counter_set(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) { + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) { printk(KERN_WARNING PREFIX "Can not change Invalid GPE/Fixed Event status\n"); return -EINVAL; diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 112817e963e0..d24fa1964eb8 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -528,7 +528,6 @@ static void acpi_thermal_check(void *data) } /* sys I/F for generic thermal sysfs support */ -#define KELVIN_TO_MILLICELSIUS(t, off) (((t) - (off)) * 100) static int thermal_get_temp(struct thermal_zone_device *thermal, unsigned long *temp) @@ -543,7 +542,8 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, if (result) return result; - *temp = KELVIN_TO_MILLICELSIUS(tz->temperature, tz->kelvin_offset); + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(tz->temperature, + tz->kelvin_offset); return 0; } @@ -647,7 +647,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.critical.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -657,7 +657,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.hot.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.hot.temperature, tz->kelvin_offset); return 0; @@ -667,7 +667,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.passive.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.passive.temperature, tz->kelvin_offset); return 0; @@ -678,7 +678,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.active[i].temperature, tz->kelvin_offset); return 0; @@ -694,7 +694,7 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, struct acpi_thermal *tz = thermal->devdata; if (tz->trips.critical.flags.valid) { - *temperature = KELVIN_TO_MILLICELSIUS( + *temperature = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -714,8 +714,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal, if (type == THERMAL_TRIP_ACTIVE) { unsigned long trip_temp; - unsigned long temp = KELVIN_TO_MILLICELSIUS(tz->temperature, - tz->kelvin_offset); + unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( + tz->temperature, tz->kelvin_offset); if (thermal_get_trip_temp(thermal, trip, &trip_temp)) return -EINVAL; diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 834f35c4bf8d..371ac12d25b1 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -149,6 +149,21 @@ acpi_extract_package(union acpi_object *package, break; } break; + case ACPI_TYPE_LOCAL_REFERENCE: + switch (format_string[i]) { + case 'R': + size_required += sizeof(void *); + tail_offset += sizeof(void *); + break; + default: + printk(KERN_WARNING PREFIX "Invalid package element" + " [%d] got reference," + " expecting [%c]\n", + i, format_string[i]); + return AE_BAD_DATA; + break; + } + break; case ACPI_TYPE_PACKAGE: default: @@ -247,7 +262,18 @@ acpi_extract_package(union acpi_object *package, break; } break; - + case ACPI_TYPE_LOCAL_REFERENCE: + switch (format_string[i]) { + case 'R': + *(void **)head = + (void *)element->reference.handle; + head += sizeof(void *); + break; + default: + /* Should never get here */ + break; + } + break; case ACPI_TYPE_PACKAGE: /* TBD: handle nested packages... */ default: diff --git a/drivers/char/random.c b/drivers/char/random.c index 82759cef9043..04645c09fe5e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __mix_pool_bytes(r, hash.w, sizeof(hash.w)); spin_unlock_irqrestore(&r->lock, flags); - memset(workspace, 0, sizeof(workspace)); + memzero_explicit(workspace, sizeof(workspace)); /* * In case the hash function has some recognizable output @@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) hash.w[2] ^= rol32(hash.w[2], 16); memcpy(out, &hash, EXTRACT_SIZE); - memset(&hash, 0, sizeof(hash)); + memzero_explicit(&hash, sizeof(hash)); } /* @@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } @@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 6bbb8b913446..92c162af5045 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -18,6 +18,7 @@ #include <linux/cpu.h> #include <linux/cpu_cooling.h> #include <linux/cpufreq.h> +#include <linux/cpufreq-dt.h> #include <linux/cpumask.h> #include <linux/err.h> #include <linux/module.h> @@ -146,8 +147,8 @@ try_again: goto try_again; } - dev_warn(cpu_dev, "failed to get cpu%d regulator: %ld\n", - cpu, PTR_ERR(cpu_reg)); + dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n", + cpu, PTR_ERR(cpu_reg)); } cpu_clk = clk_get(cpu_dev, NULL); @@ -178,6 +179,7 @@ try_again: static int cpufreq_init(struct cpufreq_policy *policy) { + struct cpufreq_dt_platform_data *pd; struct cpufreq_frequency_table *freq_table; struct thermal_cooling_device *cdev; struct device_node *np; @@ -265,9 +267,18 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->driver_data = priv; policy->clk = cpu_clk; - ret = cpufreq_generic_init(policy, freq_table, transition_latency); - if (ret) + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__, + ret); goto out_cooling_unregister; + } + + policy->cpuinfo.transition_latency = transition_latency; + + pd = cpufreq_get_driver_data(); + if (pd && !pd->independent_clocks) + cpumask_setall(policy->cpus); of_node_put(np); @@ -335,6 +346,8 @@ static int dt_cpufreq_probe(struct platform_device *pdev) if (!IS_ERR(cpu_reg)) regulator_put(cpu_reg); + dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); + ret = cpufreq_register_driver(&dt_cpufreq_driver); if (ret) dev_err(cpu_dev, "failed register driver: %d\n", ret); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 24bf76fba141..644b54e1e7d1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -512,7 +512,18 @@ show_one(cpuinfo_max_freq, cpuinfo.max_freq); show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); -show_one(scaling_cur_freq, cur); + +static ssize_t show_scaling_cur_freq( + struct cpufreq_policy *policy, char *buf) +{ + ssize_t ret; + + if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) + ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu)); + else + ret = sprintf(buf, "%u\n", policy->cur); + return ret; +} static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_policy *new_policy); @@ -906,11 +917,11 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy, if (ret) goto err_out_kobj_put; } - if (has_target()) { - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - goto err_out_kobj_put; - } + + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); + if (ret) + goto err_out_kobj_put; + if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) @@ -1731,6 +1742,21 @@ const char *cpufreq_get_current_driver(void) } EXPORT_SYMBOL_GPL(cpufreq_get_current_driver); +/** + * cpufreq_get_driver_data - return current driver data + * + * Return the private data of the currently loaded cpufreq + * driver, or NULL if no cpufreq driver is loaded. + */ +void *cpufreq_get_driver_data(void) +{ + if (cpufreq_driver) + return cpufreq_driver->driver_data; + + return NULL; +} +EXPORT_SYMBOL_GPL(cpufreq_get_driver_data); + /********************************************************************* * NOTIFIER LISTS INTERFACE * *********************************************************************/ diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c index ec399ad2f059..1608f7105c9f 100644 --- a/drivers/cpufreq/highbank-cpufreq.c +++ b/drivers/cpufreq/highbank-cpufreq.c @@ -19,7 +19,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/of.h> -#include <linux/mailbox.h> +#include <linux/pl320-ipc.h> #include <linux/platform_device.h> #define HB_CPUFREQ_CHANGE_NOTE 0x80000001 diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0668b389c516..27bb6d3877ed 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -52,6 +52,17 @@ static inline int32_t div_fp(int32_t x, int32_t y) return div_s64((int64_t)x << FRAC_BITS, y); } +static inline int ceiling_fp(int32_t x) +{ + int mask, ret; + + ret = fp_toint(x); + mask = (1 << FRAC_BITS) - 1; + if (x & mask) + ret += 1; + return ret; +} + struct sample { int32_t core_pct_busy; u64 aperf; @@ -64,6 +75,7 @@ struct pstate_data { int current_pstate; int min_pstate; int max_pstate; + int scaling; int turbo_pstate; }; @@ -113,6 +125,7 @@ struct pstate_funcs { int (*get_max)(void); int (*get_min)(void); int (*get_turbo)(void); + int (*get_scaling)(void); void (*set)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); }; @@ -138,6 +151,7 @@ struct perf_limits { static struct perf_limits limits = { .no_turbo = 0, + .turbo_disabled = 0, .max_perf_pct = 100, .max_perf = int_tofp(1), .min_perf_pct = 0, @@ -218,6 +232,18 @@ static inline void intel_pstate_reset_all_pid(void) } } +static inline void update_turbo_state(void) +{ + u64 misc_en; + struct cpudata *cpu; + + cpu = all_cpu_data[0]; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); + limits.turbo_disabled = + (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || + cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); +} + /************************** debugfs begin ************************/ static int pid_param_set(void *data, u64 val) { @@ -274,6 +300,20 @@ static void __init intel_pstate_debug_expose_params(void) return sprintf(buf, "%u\n", limits.object); \ } +static ssize_t show_no_turbo(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + ssize_t ret; + + update_turbo_state(); + if (limits.turbo_disabled) + ret = sprintf(buf, "%u\n", limits.turbo_disabled); + else + ret = sprintf(buf, "%u\n", limits.no_turbo); + + return ret; +} + static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, const char *buf, size_t count) { @@ -283,11 +323,14 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - limits.no_turbo = clamp_t(int, input, 0 , 1); + + update_turbo_state(); if (limits.turbo_disabled) { pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); - limits.no_turbo = limits.turbo_disabled; + return -EPERM; } + limits.no_turbo = clamp_t(int, input, 0, 1); + return count; } @@ -323,7 +366,6 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, return count; } -show_one(no_turbo, no_turbo); show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -394,7 +436,7 @@ static void byt_set_pstate(struct cpudata *cpudata, int pstate) cpudata->vid.ratio); vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); - vid = fp_toint(vid_fp); + vid = ceiling_fp(vid_fp); if (pstate > cpudata->pstate.max_pstate) vid = cpudata->vid.turbo; @@ -404,6 +446,22 @@ static void byt_set_pstate(struct cpudata *cpudata, int pstate) wrmsrl(MSR_IA32_PERF_CTL, val); } +#define BYT_BCLK_FREQS 5 +static int byt_freq_table[BYT_BCLK_FREQS] = { 833, 1000, 1333, 1167, 800}; + +static int byt_get_scaling(void) +{ + u64 value; + int i; + + rdmsrl(MSR_FSB_FREQ, value); + i = value & 0x3; + + BUG_ON(i > BYT_BCLK_FREQS); + + return byt_freq_table[i] * 100; +} + static void byt_get_vid(struct cpudata *cpudata) { u64 value; @@ -449,6 +507,11 @@ static int core_get_turbo_pstate(void) return ret; } +static inline int core_get_scaling(void) +{ + return 100000; +} + static void core_set_pstate(struct cpudata *cpudata, int pstate) { u64 val; @@ -473,6 +536,7 @@ static struct cpu_defaults core_params = { .get_max = core_get_max_pstate, .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, + .get_scaling = core_get_scaling, .set = core_set_pstate, }, }; @@ -491,6 +555,7 @@ static struct cpu_defaults byt_params = { .get_min = byt_get_min_pstate, .get_turbo = byt_get_turbo_pstate, .set = byt_set_pstate, + .get_scaling = byt_get_scaling, .get_vid = byt_get_vid, }, }; @@ -501,7 +566,7 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) int max_perf_adj; int min_perf; - if (limits.no_turbo) + if (limits.no_turbo || limits.turbo_disabled) max_perf = cpu->pstate.max_pstate; max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); @@ -516,6 +581,8 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) { int max_perf, min_perf; + update_turbo_state(); + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); pstate = clamp_t(int, pstate, min_perf, max_perf); @@ -523,7 +590,7 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) if (pstate == cpu->pstate.current_pstate) return; - trace_cpu_frequency(pstate * 100000, cpu->cpu); + trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); cpu->pstate.current_pstate = pstate; @@ -535,6 +602,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.min_pstate = pstate_funcs.get_min(); cpu->pstate.max_pstate = pstate_funcs.get_max(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); + cpu->pstate.scaling = pstate_funcs.get_scaling(); if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); @@ -550,7 +618,9 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); sample->freq = fp_toint( - mul_fp(int_tofp(cpu->pstate.max_pstate * 1000), core_pct)); + mul_fp(int_tofp( + cpu->pstate.max_pstate * cpu->pstate.scaling / 100), + core_pct)); sample->core_pct_busy = (int32_t)core_pct; } @@ -671,7 +741,9 @@ static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; - all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL); + if (!all_cpu_data[cpunum]) + all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), + GFP_KERNEL); if (!all_cpu_data[cpunum]) return -ENOMEM; @@ -714,9 +786,10 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { limits.min_perf_pct = 100; limits.min_perf = int_tofp(1); + limits.max_policy_pct = 100; limits.max_perf_pct = 100; limits.max_perf = int_tofp(1); - limits.no_turbo = limits.turbo_disabled; + limits.no_turbo = 0; return 0; } limits.min_perf_pct = (policy->min * 100) / policy->cpuinfo.max_freq; @@ -751,15 +824,12 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) del_timer_sync(&all_cpu_data[cpu_num]->timer); intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); - kfree(all_cpu_data[cpu_num]); - all_cpu_data[cpu_num] = NULL; } static int intel_pstate_cpu_init(struct cpufreq_policy *policy) { struct cpudata *cpu; int rc; - u64 misc_en; rc = intel_pstate_init_cpu(policy->cpu); if (rc) @@ -767,23 +837,18 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); - if (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || - cpu->pstate.max_pstate == cpu->pstate.turbo_pstate) { - limits.turbo_disabled = 1; - limits.no_turbo = 1; - } if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100) policy->policy = CPUFREQ_POLICY_PERFORMANCE; else policy->policy = CPUFREQ_POLICY_POWERSAVE; - policy->min = cpu->pstate.min_pstate * 100000; - policy->max = cpu->pstate.turbo_pstate * 100000; + policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = cpu->pstate.min_pstate * 100000; - policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * 100000; + policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; cpumask_set_cpu(policy->cpu, policy->cpus); @@ -841,6 +906,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_max = funcs->get_max; pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; + pstate_funcs.get_scaling = funcs->get_scaling; pstate_funcs.set = funcs->set; pstate_funcs.get_vid = funcs->get_vid; } diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips index 0e70ee28a5ca..4102be01d06a 100644 --- a/drivers/cpuidle/Kconfig.mips +++ b/drivers/cpuidle/Kconfig.mips @@ -3,7 +3,7 @@ # config MIPS_CPS_CPUIDLE bool "CPU Idle driver for MIPS CPS platforms" - depends on CPU_IDLE + depends on CPU_IDLE && MIPS_CPS depends on SYS_SUPPORTS_MIPS_CPS select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index a64be578dab2..7d3a3497dd4c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -163,7 +163,8 @@ static int powernv_add_idle_states(void) int nr_idle_states = 1; /* Snooze */ int dt_idle_states; const __be32 *idle_state_flags; - u32 len_flags, flags; + const __be32 *idle_state_latency; + u32 len_flags, flags, latency_ns; int i; /* Currently we have snooze statically defined */ @@ -180,18 +181,32 @@ static int powernv_add_idle_states(void) return nr_idle_states; } + idle_state_latency = of_get_property(power_mgt, + "ibm,cpu-idle-state-latencies-ns", NULL); + if (!idle_state_latency) { + pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-latencies-ns\n"); + return nr_idle_states; + } + dt_idle_states = len_flags / sizeof(u32); for (i = 0; i < dt_idle_states; i++) { flags = be32_to_cpu(idle_state_flags[i]); + + /* Cpuidle accepts exit_latency in us and we estimate + * target residency to be 10x exit_latency + */ + latency_ns = be32_to_cpu(idle_state_latency[i]); if (flags & IDLE_USE_INST_NAP) { /* Add NAP state */ strcpy(powernv_states[nr_idle_states].name, "Nap"); strcpy(powernv_states[nr_idle_states].desc, "Nap"); powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID; - powernv_states[nr_idle_states].exit_latency = 10; - powernv_states[nr_idle_states].target_residency = 100; + powernv_states[nr_idle_states].exit_latency = + ((unsigned int)latency_ns) / 1000; + powernv_states[nr_idle_states].target_residency = + ((unsigned int)latency_ns / 100); powernv_states[nr_idle_states].enter = &nap_loop; nr_idle_states++; } @@ -202,8 +217,10 @@ static int powernv_add_idle_states(void) strcpy(powernv_states[nr_idle_states].desc, "FastSleep"); powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TIMER_STOP; - powernv_states[nr_idle_states].exit_latency = 300; - powernv_states[nr_idle_states].target_residency = 1000000; + powernv_states[nr_idle_states].exit_latency = + ((unsigned int)latency_ns) / 1000; + powernv_states[nr_idle_states].target_residency = + ((unsigned int)latency_ns / 100); powernv_states[nr_idle_states].enter = &fastsleep_loop; nr_idle_states++; } diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 64ecbb501c50..8590099ac148 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -41,6 +41,28 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static bool disable_runtime; +static int __init setup_noefi(char *arg) +{ + disable_runtime = true; + return 0; +} +early_param("noefi", setup_noefi); + +bool efi_runtime_disabled(void) +{ + return disable_runtime; +} + +static int __init parse_efi_cmdline(char *str) +{ + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + + return 0; +} +early_param("efi", parse_efi_cmdline); + static struct kobject *efi_kobj; static struct kobject *efivars_kobj; @@ -423,3 +445,60 @@ int __init efi_get_fdt_params(struct efi_fdt_params *params, int verbose) return ret; } #endif /* CONFIG_EFI_PARAMS_FROM_FDT */ + +static __initdata char memory_type_name[][20] = { + "Reserved", + "Loader Code", + "Loader Data", + "Boot Code", + "Boot Data", + "Runtime Code", + "Runtime Data", + "Conventional Memory", + "Unusable Memory", + "ACPI Reclaim Memory", + "ACPI Memory NVS", + "Memory Mapped I/O", + "MMIO Port Space", + "PAL Code" +}; + +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md) +{ + char *pos; + int type_len; + u64 attr; + + pos = buf; + if (md->type >= ARRAY_SIZE(memory_type_name)) + type_len = snprintf(pos, size, "[type=%u", md->type); + else + type_len = snprintf(pos, size, "[%-*s", + (int)(sizeof(memory_type_name[0]) - 1), + memory_type_name[md->type]); + if (type_len >= size) + return buf; + + pos += type_len; + size -= type_len; + + attr = md->attribute; + if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | + EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP | + EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME)) + snprintf(pos, size, "|attr=0x%016llx]", + (unsigned long long)attr); + else + snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_XP ? "XP" : "", + attr & EFI_MEMORY_RP ? "RP" : "", + attr & EFI_MEMORY_WP ? "WP" : "", + attr & EFI_MEMORY_UCE ? "UCE" : "", + attr & EFI_MEMORY_WB ? "WB" : "", + attr & EFI_MEMORY_WT ? "WT" : "", + attr & EFI_MEMORY_WC ? "WC" : "", + attr & EFI_MEMORY_UC ? "UC" : ""); + return buf; +} diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 480339b6b110..75ee05964cbc 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -226,6 +226,10 @@ unsigned long __init efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_image; } + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 32d5cca30f49..a920fec8fe88 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -15,8 +15,23 @@ #include "efistub.h" +/* + * Some firmware implementations have problems reading files in one go. + * A read chunk size of 1MB seems to work for most platforms. + * + * Unfortunately, reading files in chunks triggers *other* bugs on some + * platforms, so we provide a way to disable this workaround, which can + * be done by passing "efi=nochunk" on the EFI boot stub command line. + * + * If you experience issues with initrd images being corrupt it's worth + * trying efi=nochunk, but chunking is enabled by default because there + * are far more machines that require the workaround than those that + * break with it enabled. + */ #define EFI_READ_CHUNK_SIZE (1024 * 1024) +static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; + struct file_info { efi_file_handle_t *handle; u64 size; @@ -281,6 +296,49 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, efi_call_early(free_pages, addr, nr_pages); } +/* + * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi= + * option, e.g. efi=nochunk. + * + * It should be noted that efi= is parsed in two very different + * environments, first in the early boot environment of the EFI boot + * stub, and subsequently during the kernel boot. + */ +efi_status_t efi_parse_options(char *cmdline) +{ + char *str; + + /* + * If no EFI parameters were specified on the cmdline we've got + * nothing to do. + */ + str = strstr(cmdline, "efi="); + if (!str) + return EFI_SUCCESS; + + /* Skip ahead to first argument */ + str += strlen("efi="); + + /* + * Remember, because efi= is also used by the kernel we need to + * skip over arguments we don't understand. + */ + while (*str) { + if (!strncmp(str, "nochunk", 7)) { + str += strlen("nochunk"); + __chunk_size = -1UL; + } + + /* Group words together, delimited by "," */ + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return EFI_SUCCESS; +} /* * Check the cmdline for a LILO-style file= arguments. @@ -423,8 +481,8 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, size = files[j].size; while (size) { unsigned long chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; + if (size > __chunk_size) + chunksize = __chunk_size; else chunksize = size; diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 10daa4bbb258..228bbf910461 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -14,11 +14,80 @@ * This file is released under the GPLv2. */ +#include <linux/bug.h> #include <linux/efi.h> -#include <linux/spinlock.h> /* spinlock_t */ +#include <linux/mutex.h> +#include <linux/spinlock.h> #include <asm/efi.h> /* + * According to section 7.1 of the UEFI spec, Runtime Services are not fully + * reentrant, and there are particular combinations of calls that need to be + * serialized. (source: UEFI Specification v2.4A) + * + * Table 31. Rules for Reentry Into Runtime Services + * +------------------------------------+-------------------------------+ + * | If previous call is busy in | Forbidden to call | + * +------------------------------------+-------------------------------+ + * | Any | SetVirtualAddressMap() | + * +------------------------------------+-------------------------------+ + * | ConvertPointer() | ConvertPointer() | + * +------------------------------------+-------------------------------+ + * | SetVariable() | ResetSystem() | + * | UpdateCapsule() | | + * | SetTime() | | + * | SetWakeupTime() | | + * | GetNextHighMonotonicCount() | | + * +------------------------------------+-------------------------------+ + * | GetVariable() | GetVariable() | + * | GetNextVariableName() | GetNextVariableName() | + * | SetVariable() | SetVariable() | + * | QueryVariableInfo() | QueryVariableInfo() | + * | UpdateCapsule() | UpdateCapsule() | + * | QueryCapsuleCapabilities() | QueryCapsuleCapabilities() | + * | GetNextHighMonotonicCount() | GetNextHighMonotonicCount() | + * +------------------------------------+-------------------------------+ + * | GetTime() | GetTime() | + * | SetTime() | SetTime() | + * | GetWakeupTime() | GetWakeupTime() | + * | SetWakeupTime() | SetWakeupTime() | + * +------------------------------------+-------------------------------+ + * + * Due to the fact that the EFI pstore may write to the variable store in + * interrupt context, we need to use a spinlock for at least the groups that + * contain SetVariable() and QueryVariableInfo(). That leaves little else, as + * none of the remaining functions are actually ever called at runtime. + * So let's just use a single spinlock to serialize all Runtime Services calls. + */ +static DEFINE_SPINLOCK(efi_runtime_lock); + +/* + * Some runtime services calls can be reentrant under NMI, even if the table + * above says they are not. (source: UEFI Specification v2.4A) + * + * Table 32. Functions that may be called after Machine Check, INIT and NMI + * +----------------------------+------------------------------------------+ + * | Function | Called after Machine Check, INIT and NMI | + * +----------------------------+------------------------------------------+ + * | GetTime() | Yes, even if previously busy. | + * | GetVariable() | Yes, even if previously busy | + * | GetNextVariableName() | Yes, even if previously busy | + * | QueryVariableInfo() | Yes, even if previously busy | + * | SetVariable() | Yes, even if previously busy | + * | UpdateCapsule() | Yes, even if previously busy | + * | QueryCapsuleCapabilities() | Yes, even if previously busy | + * | ResetSystem() | Yes, even if previously busy | + * +----------------------------+------------------------------------------+ + * + * In order to prevent deadlocks under NMI, the wrappers for these functions + * may only grab the efi_runtime_lock or rtc_lock spinlocks if !efi_in_nmi(). + * However, not all of the services listed are reachable through NMI code paths, + * so the the special handling as suggested by the UEFI spec is only implemented + * for QueryVariableInfo() and SetVariable(), as these can be reached in NMI + * context through efi_pstore_write(). + */ + +/* * As per commit ef68c8f87ed1 ("x86: Serialize EFI time accesses on rtc_lock"), * the EFI specification requires that callers of the time related runtime * functions serialize with other CMOS accesses in the kernel, as the EFI time @@ -32,7 +101,9 @@ static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(get_time, tm, tc); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -43,7 +114,9 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(set_time, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -56,7 +129,9 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(get_wakeup_time, enabled, pending, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -67,7 +142,9 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(set_wakeup_time, enabled, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -78,14 +155,27 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name, unsigned long *data_size, void *data) { - return efi_call_virt(get_variable, name, vendor, attr, data_size, data); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - return efi_call_virt(get_next_variable, name_size, name, vendor); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_next_variable, name_size, name, vendor); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -94,24 +184,61 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - return efi_call_virt(set_variable, name, vendor, attr, data_size, data); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } +static efi_status_t +virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; +} + + static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *storage_space, u64 *remaining_space, u64 *max_variable_size) { + unsigned long flags; + efi_status_t status; + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { - return efi_call_virt(get_next_high_mono_count, count); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_next_high_mono_count, count); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static void virt_efi_reset_system(int reset_type, @@ -119,17 +246,27 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); __efi_call_virt(reset_system, reset_type, status, data_size, data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, unsigned long count, unsigned long sg_list) { + unsigned long flags; + efi_status_t status; + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(update_capsule, capsules, count, sg_list); + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(update_capsule, capsules, count, sg_list); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, @@ -137,11 +274,17 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, u64 *max_size, int *reset_type) { + unsigned long flags; + efi_status_t status; + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(query_capsule_caps, capsules, count, max_size, - reset_type); + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(query_capsule_caps, capsules, count, max_size, + reset_type); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } void efi_native_runtime_setup(void) @@ -153,6 +296,7 @@ void efi_native_runtime_setup(void) efi.get_variable = virt_efi_get_variable; efi.get_next_variable = virt_efi_get_next_variable; efi.set_variable = virt_efi_set_variable; + efi.set_variable_nonblocking = virt_efi_set_variable_nonblocking; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.query_variable_info = virt_efi_query_variable_info; diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 5abe943e3404..70a0fb10517f 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -321,11 +321,11 @@ static unsigned long var_name_strnsize(efi_char16_t *variable_name, * Print a warning when duplicate EFI variables are encountered and * disable the sysfs workqueue since the firmware is buggy. */ -static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, +static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, unsigned long len16) { size_t i, len8 = len16 / sizeof(efi_char16_t); - char *s8; + char *str8; /* * Disable the workqueue since the algorithm it uses for @@ -334,16 +334,16 @@ static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, */ efivar_wq_enabled = false; - s8 = kzalloc(len8, GFP_KERNEL); - if (!s8) + str8 = kzalloc(len8, GFP_KERNEL); + if (!str8) return; for (i = 0; i < len8; i++) - s8[i] = s16[i]; + str8[i] = str16[i]; printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n", - s8, vendor_guid); - kfree(s8); + str8, vendor_guid); + kfree(str8); } /** @@ -595,6 +595,39 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, } EXPORT_SYMBOL_GPL(efivar_entry_set); +/* + * efivar_entry_set_nonblocking - call set_variable_nonblocking() + * + * This function is guaranteed to not block and is suitable for calling + * from crash/panic handlers. + * + * Crucially, this function will not block if it cannot acquire + * __efivars->lock. Instead, it returns -EBUSY. + */ +static int +efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, + u32 attributes, unsigned long size, void *data) +{ + const struct efivar_operations *ops = __efivars->ops; + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&__efivars->lock, flags)) + return -EBUSY; + + status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + spin_unlock_irqrestore(&__efivars->lock, flags); + return -ENOSPC; + } + + status = ops->set_variable_nonblocking(name, &vendor, attributes, + size, data); + + spin_unlock_irqrestore(&__efivars->lock, flags); + return efi_status_to_err(status); +} + /** * efivar_entry_set_safe - call set_variable() if enough space in firmware * @name: buffer containing the variable name @@ -622,6 +655,20 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, if (!ops->query_variable_store) return -ENOSYS; + /* + * If the EFI variable backend provides a non-blocking + * ->set_variable() operation and we're in a context where we + * cannot block, then we need to use it to avoid live-locks, + * since the implication is that the regular ->set_variable() + * will block. + * + * If no ->set_variable_nonblocking() is provided then + * ->set_variable() is assumed to be non-blocking. + */ + if (!block && ops->set_variable_nonblocking) + return efivar_entry_set_nonblocking(name, vendor, attributes, + size, data); + if (!block) { if (!spin_trylock_irqsave(&__efivars->lock, flags)) return -EBUSY; diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.c b/drivers/gpu/drm/cirrus/cirrus_drv.c index e705335101a5..c2a1cba1e984 100644 --- a/drivers/gpu/drm/cirrus/cirrus_drv.c +++ b/drivers/gpu/drm/cirrus/cirrus_drv.c @@ -32,6 +32,8 @@ static struct drm_driver driver; static const struct pci_device_id pciidlist[] = { { PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_5446, 0x1af4, 0x1100, 0, 0, 0 }, + { PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_5446, PCI_VENDOR_ID_XEN, + 0x0001, 0, 0, 0 }, {0,} }; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 3201986bf25e..f66392b6e287 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1711,7 +1711,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev, #define HPD_STORM_DETECT_PERIOD 1000 #define HPD_STORM_THRESHOLD 5 -static int ilk_port_to_hotplug_shift(enum port port) +static int pch_port_to_hotplug_shift(enum port port) { switch (port) { case PORT_A: @@ -1727,7 +1727,7 @@ static int ilk_port_to_hotplug_shift(enum port port) } } -static int g4x_port_to_hotplug_shift(enum port port) +static int i915_port_to_hotplug_shift(enum port port) { switch (port) { case PORT_A: @@ -1785,12 +1785,12 @@ static inline void intel_hpd_irq_handler(struct drm_device *dev, if (port && dev_priv->hpd_irq_port[port]) { bool long_hpd; - if (IS_G4X(dev)) { - dig_shift = g4x_port_to_hotplug_shift(port); - long_hpd = (hotplug_trigger >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; - } else { - dig_shift = ilk_port_to_hotplug_shift(port); + if (HAS_PCH_SPLIT(dev)) { + dig_shift = pch_port_to_hotplug_shift(port); long_hpd = (dig_hotplug_reg >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; + } else { + dig_shift = i915_port_to_hotplug_shift(port); + long_hpd = (hotplug_trigger >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; } DRM_DEBUG_DRIVER("digital hpd port %c - %s\n", @@ -3458,12 +3458,13 @@ static void gen8_irq_reset(struct drm_device *dev) void gen8_irq_power_well_post_enable(struct drm_i915_private *dev_priv) { unsigned long irqflags; + uint32_t extra_ier = GEN8_PIPE_VBLANK | GEN8_PIPE_FIFO_UNDERRUN; spin_lock_irqsave(&dev_priv->irq_lock, irqflags); GEN8_IRQ_INIT_NDX(DE_PIPE, PIPE_B, dev_priv->de_irq_mask[PIPE_B], - ~dev_priv->de_irq_mask[PIPE_B]); + ~dev_priv->de_irq_mask[PIPE_B] | extra_ier); GEN8_IRQ_INIT_NDX(DE_PIPE, PIPE_C, dev_priv->de_irq_mask[PIPE_C], - ~dev_priv->de_irq_mask[PIPE_C]); + ~dev_priv->de_irq_mask[PIPE_C] | extra_ier); spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags); } diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 507370513f3d..c9e220963a78 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -73,9 +73,6 @@ static const uint32_t intel_cursor_formats[] = { DRM_FORMAT_ARGB8888, }; -#define DIV_ROUND_CLOSEST_ULL(ll, d) \ -({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) - static void intel_increase_pllclock(struct drm_device *dev, enum pipe pipe); static void intel_crtc_update_cursor(struct drm_crtc *crtc, bool on); @@ -12357,27 +12354,36 @@ static void intel_setup_outputs(struct drm_device *dev) if (I915_READ(PCH_DP_D) & DP_DETECTED) intel_dp_init(dev, PCH_DP_D, PORT_D); } else if (IS_VALLEYVIEW(dev)) { - if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIB) & SDVO_DETECTED) { + /* + * The DP_DETECTED bit is the latched state of the DDC + * SDA pin at boot. However since eDP doesn't require DDC + * (no way to plug in a DP->HDMI dongle) the DDC pins for + * eDP ports may have been muxed to an alternate function. + * Thus we can't rely on the DP_DETECTED bit alone to detect + * eDP ports. Consult the VBT as well as DP_DETECTED to + * detect eDP ports. + */ + if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIB) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + GEN4_HDMIB, PORT_B); - if (I915_READ(VLV_DISPLAY_BASE + DP_B) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_B, PORT_B); - } + if (I915_READ(VLV_DISPLAY_BASE + DP_B) & DP_DETECTED || + intel_dp_is_edp(dev, PORT_B)) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_B, PORT_B); - if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIC) & SDVO_DETECTED) { + if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIC) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + GEN4_HDMIC, PORT_C); - if (I915_READ(VLV_DISPLAY_BASE + DP_C) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_C, PORT_C); - } + if (I915_READ(VLV_DISPLAY_BASE + DP_C) & DP_DETECTED || + intel_dp_is_edp(dev, PORT_C)) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_C, PORT_C); if (IS_CHERRYVIEW(dev)) { - if (I915_READ(VLV_DISPLAY_BASE + CHV_HDMID) & SDVO_DETECTED) { + if (I915_READ(VLV_DISPLAY_BASE + CHV_HDMID) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + CHV_HDMID, PORT_D); - if (I915_READ(VLV_DISPLAY_BASE + DP_D) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_D, PORT_D); - } + /* eDP not supported on port D, so don't check VBT */ + if (I915_READ(VLV_DISPLAY_BASE + DP_D) & DP_DETECTED) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_D, PORT_D); } intel_dsi_init(dev); diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 07ce04683c30..ba715229a540 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -35,6 +35,9 @@ #include <drm/drm_fb_helper.h> #include <drm/drm_dp_mst_helper.h> +#define DIV_ROUND_CLOSEST_ULL(ll, d) \ +({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) + /** * _wait_for - magic (register) wait macro * diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index 18784470a760..0e018cb49147 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -419,9 +419,8 @@ static uint32_t scale(uint32_t source_val, source_val = clamp(source_val, source_min, source_max); /* avoid overflows */ - target_val = (uint64_t)(source_val - source_min) * - (target_max - target_min); - do_div(target_val, source_max - source_min); + target_val = DIV_ROUND_CLOSEST_ULL((uint64_t)(source_val - source_min) * + (target_max - target_min), source_max - source_min); target_val += target_min; return target_val; diff --git a/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c b/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c index 552fdbd45ebe..1d0e33fb5f61 100644 --- a/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c +++ b/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c @@ -113,6 +113,8 @@ #define IS_NVA3F(x) (((x) > 0xa0 && (x) < 0xaa) || (x) == 0xaf) #define IS_NVAAF(x) ((x) >= 0xaa && (x) <= 0xac) +#include <subdev/fb.h> + /* * This code deals with PGRAPH contexts on NV50 family cards. Like NV40, it's * the GPU itself that does context-switching, but it needs a special @@ -569,8 +571,12 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx) gr_def(ctx, 0x407d08, 0x00010040); else if (device->chipset < 0xa0) gr_def(ctx, 0x407d08, 0x00390040); - else - gr_def(ctx, 0x407d08, 0x003d0040); + else { + if (nouveau_fb(device)->ram->type != NV_MEM_TYPE_GDDR5) + gr_def(ctx, 0x407d08, 0x003d0040); + else + gr_def(ctx, 0x407d08, 0x003c0040); + } gr_def(ctx, 0x407d0c, 0x00000022); } diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 589dbb582da2..fd3dbd59d73e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -400,15 +400,20 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device, struct nouveau_channel **pchan) { struct nouveau_cli *cli = (void *)nvif_client(&device->base); + bool super; int ret; + /* hack until fencenv50 is fixed, and agp access relaxed */ + super = cli->base.super; + cli->base.super = true; + ret = nouveau_channel_ind(drm, device, handle, arg0, pchan); if (ret) { NV_PRINTK(debug, cli, "ib channel create, %d\n", ret); ret = nouveau_channel_dma(drm, device, handle, pchan); if (ret) { NV_PRINTK(debug, cli, "dma channel create, %d\n", ret); - return ret; + goto done; } } @@ -416,8 +421,9 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device, if (ret) { NV_PRINTK(error, cli, "channel failed to initialise, %d\n", ret); nouveau_channel_del(pchan); - return ret; } - return 0; +done: + cli->base.super = super; + return ret; } diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index af9e78546688..0d1396266857 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -572,7 +572,6 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, struct qxl_framebuffer *qfb; struct qxl_bo *bo, *old_bo = NULL; struct qxl_crtc *qcrtc = to_qxl_crtc(crtc); - uint32_t width, height, base_offset; bool recreate_primary = false; int ret; int surf_id; @@ -602,9 +601,10 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, if (qcrtc->index == 0) recreate_primary = true; - width = mode->hdisplay; - height = mode->vdisplay; - base_offset = 0; + if (bo->surf.stride * bo->surf.height > qdev->vram_size) { + DRM_ERROR("Mode doesn't fit in vram size (vgamem)"); + return -EINVAL; + } ret = qxl_bo_reserve(bo, false); if (ret != 0) @@ -618,10 +618,10 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, if (recreate_primary) { qxl_io_destroy_primary(qdev); qxl_io_log(qdev, - "recreate primary: %dx%d (was %dx%d,%d,%d)\n", - width, height, bo->surf.width, - bo->surf.height, bo->surf.stride, bo->surf.format); - qxl_io_create_primary(qdev, base_offset, bo); + "recreate primary: %dx%d,%d,%d\n", + bo->surf.width, bo->surf.height, + bo->surf.stride, bo->surf.format); + qxl_io_create_primary(qdev, 0, bo); bo->is_primary = true; } diff --git a/drivers/gpu/drm/radeon/btc_dpm.c b/drivers/gpu/drm/radeon/btc_dpm.c index 300d971187c4..0b2929de9f41 100644 --- a/drivers/gpu/drm/radeon/btc_dpm.c +++ b/drivers/gpu/drm/radeon/btc_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "btcd.h" #include "r600_dpm.h" #include "cypress_dpm.h" @@ -1170,6 +1171,23 @@ static const struct radeon_blacklist_clocks btc_blacklist_clocks[] = { 25000, 30000, RADEON_SCLK_UP } }; +void btc_get_max_clock_from_voltage_dependency_table(struct radeon_clock_voltage_dependency_table *table, + u32 *max_clock) +{ + u32 i, clock = 0; + + if ((table == NULL) || (table->count == 0)) { + *max_clock = clock; + return; + } + + for (i = 0; i < table->count; i++) { + if (clock < table->entries[i].clk) + clock = table->entries[i].clk; + } + *max_clock = clock; +} + void btc_apply_voltage_dependency_rules(struct radeon_clock_voltage_dependency_table *table, u32 clock, u16 max_voltage, u16 *voltage) { diff --git a/drivers/gpu/drm/radeon/btc_dpm.h b/drivers/gpu/drm/radeon/btc_dpm.h index 1a15e0e41950..3b6f12b7760b 100644 --- a/drivers/gpu/drm/radeon/btc_dpm.h +++ b/drivers/gpu/drm/radeon/btc_dpm.h @@ -46,6 +46,8 @@ void btc_adjust_clock_combinations(struct radeon_device *rdev, struct rv7xx_pl *pl); void btc_apply_voltage_dependency_rules(struct radeon_clock_voltage_dependency_table *table, u32 clock, u16 max_voltage, u16 *voltage); +void btc_get_max_clock_from_voltage_dependency_table(struct radeon_clock_voltage_dependency_table *table, + u32 *max_clock); void btc_apply_voltage_delta_rules(struct radeon_device *rdev, u16 max_vddc, u16 max_vddci, u16 *vddc, u16 *vddci); diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index f5c8c0445a94..11a55e9dad7f 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -24,6 +24,7 @@ #include <linux/firmware.h> #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "radeon_ucode.h" #include "cikd.h" #include "r600_dpm.h" diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c index c77dad1a4576..4e8432d07f15 100644 --- a/drivers/gpu/drm/radeon/cik_sdma.c +++ b/drivers/gpu/drm/radeon/cik_sdma.c @@ -611,16 +611,19 @@ int cik_sdma_ring_test(struct radeon_device *rdev, { unsigned i; int r; - void __iomem *ptr = (void *)rdev->vram_scratch.ptr; + unsigned index; u32 tmp; + u64 gpu_addr; - if (!ptr) { - DRM_ERROR("invalid vram scratch pointer\n"); - return -EINVAL; - } + if (ring->idx == R600_RING_TYPE_DMA_INDEX) + index = R600_WB_DMA_RING_TEST_OFFSET; + else + index = CAYMAN_WB_DMA1_RING_TEST_OFFSET; + + gpu_addr = rdev->wb.gpu_addr + index; tmp = 0xCAFEDEAD; - writel(tmp, ptr); + rdev->wb.wb[index/4] = cpu_to_le32(tmp); r = radeon_ring_lock(rdev, ring, 5); if (r) { @@ -628,14 +631,14 @@ int cik_sdma_ring_test(struct radeon_device *rdev, return r; } radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_WRITE, SDMA_WRITE_SUB_OPCODE_LINEAR, 0)); - radeon_ring_write(ring, rdev->vram_scratch.gpu_addr & 0xfffffffc); - radeon_ring_write(ring, upper_32_bits(rdev->vram_scratch.gpu_addr)); + radeon_ring_write(ring, lower_32_bits(gpu_addr)); + radeon_ring_write(ring, upper_32_bits(gpu_addr)); radeon_ring_write(ring, 1); /* number of DWs to follow */ radeon_ring_write(ring, 0xDEADBEEF); radeon_ring_unlock_commit(rdev, ring, false); for (i = 0; i < rdev->usec_timeout; i++) { - tmp = readl(ptr); + tmp = le32_to_cpu(rdev->wb.wb[index/4]); if (tmp == 0xDEADBEEF) break; DRM_UDELAY(1); diff --git a/drivers/gpu/drm/radeon/cypress_dpm.c b/drivers/gpu/drm/radeon/cypress_dpm.c index 47d31e915758..9aad0327e4d1 100644 --- a/drivers/gpu/drm/radeon/cypress_dpm.c +++ b/drivers/gpu/drm/radeon/cypress_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "evergreend.h" #include "r600_dpm.h" #include "cypress_dpm.h" diff --git a/drivers/gpu/drm/radeon/dce3_1_afmt.c b/drivers/gpu/drm/radeon/dce3_1_afmt.c index 950af153f30e..2fe8cfc966d9 100644 --- a/drivers/gpu/drm/radeon/dce3_1_afmt.c +++ b/drivers/gpu/drm/radeon/dce3_1_afmt.c @@ -32,7 +32,7 @@ static void dce3_2_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { @@ -49,8 +49,8 @@ static void dce3_2_afmt_write_speaker_allocation(struct drm_encoder *encoder) sad_count = drm_edid_to_speaker_allocation(radeon_connector->edid, &sadb); if (sad_count < 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ diff --git a/drivers/gpu/drm/radeon/dce6_afmt.c b/drivers/gpu/drm/radeon/dce6_afmt.c index c0bbf68dbc27..f312edf4d50e 100644 --- a/drivers/gpu/drm/radeon/dce6_afmt.c +++ b/drivers/gpu/drm/radeon/dce6_afmt.c @@ -155,7 +155,7 @@ void dce6_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 offset, tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; if (!dig || !dig->afmt || !dig->afmt->pin) @@ -176,9 +176,9 @@ void dce6_afmt_write_speaker_allocation(struct drm_encoder *encoder) } sad_count = drm_edid_to_speaker_allocation(radeon_connector_edid(connector), &sadb); - if (sad_count <= 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + if (sad_count < 0) { + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ diff --git a/drivers/gpu/drm/radeon/evergreen_hdmi.c b/drivers/gpu/drm/radeon/evergreen_hdmi.c index 2514d659b1ba..53abd9b17a50 100644 --- a/drivers/gpu/drm/radeon/evergreen_hdmi.c +++ b/drivers/gpu/drm/radeon/evergreen_hdmi.c @@ -133,7 +133,7 @@ static void dce4_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { @@ -149,9 +149,9 @@ static void dce4_afmt_write_speaker_allocation(struct drm_encoder *encoder) } sad_count = drm_edid_to_speaker_allocation(radeon_connector_edid(connector), &sadb); - if (sad_count <= 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + if (sad_count < 0) { + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ diff --git a/drivers/gpu/drm/radeon/ni_dpm.c b/drivers/gpu/drm/radeon/ni_dpm.c index 715b181c6243..6d2f16cf2c1c 100644 --- a/drivers/gpu/drm/radeon/ni_dpm.c +++ b/drivers/gpu/drm/radeon/ni_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "nid.h" #include "r600_dpm.h" #include "ni_dpm.h" diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c index 100189ec5fa8..aabc343b9a8f 100644 --- a/drivers/gpu/drm/radeon/r600_dma.c +++ b/drivers/gpu/drm/radeon/r600_dma.c @@ -232,16 +232,19 @@ int r600_dma_ring_test(struct radeon_device *rdev, { unsigned i; int r; - void __iomem *ptr = (void *)rdev->vram_scratch.ptr; + unsigned index; u32 tmp; + u64 gpu_addr; - if (!ptr) { - DRM_ERROR("invalid vram scratch pointer\n"); - return -EINVAL; - } + if (ring->idx == R600_RING_TYPE_DMA_INDEX) + index = R600_WB_DMA_RING_TEST_OFFSET; + else + index = CAYMAN_WB_DMA1_RING_TEST_OFFSET; + + gpu_addr = rdev->wb.gpu_addr + index; tmp = 0xCAFEDEAD; - writel(tmp, ptr); + rdev->wb.wb[index/4] = cpu_to_le32(tmp); r = radeon_ring_lock(rdev, ring, 4); if (r) { @@ -249,13 +252,13 @@ int r600_dma_ring_test(struct radeon_device *rdev, return r; } radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 1)); - radeon_ring_write(ring, rdev->vram_scratch.gpu_addr & 0xfffffffc); - radeon_ring_write(ring, upper_32_bits(rdev->vram_scratch.gpu_addr) & 0xff); + radeon_ring_write(ring, lower_32_bits(gpu_addr)); + radeon_ring_write(ring, upper_32_bits(gpu_addr) & 0xff); radeon_ring_write(ring, 0xDEADBEEF); radeon_ring_unlock_commit(rdev, ring, false); for (i = 0; i < rdev->usec_timeout; i++) { - tmp = readl(ptr); + tmp = le32_to_cpu(rdev->wb.wb[index/4]); if (tmp == 0xDEADBEEF) break; DRM_UDELAY(1); diff --git a/drivers/gpu/drm/radeon/r600_dpm.c b/drivers/gpu/drm/radeon/r600_dpm.c index 9c61b74ef441..f6309bd23e01 100644 --- a/drivers/gpu/drm/radeon/r600_dpm.c +++ b/drivers/gpu/drm/radeon/r600_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "r600d.h" #include "r600_dpm.h" #include "atom.h" diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index f7c4b226a284..a9717b3fbf1b 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1133,6 +1133,8 @@ struct radeon_wb { #define R600_WB_EVENT_OFFSET 3072 #define CIK_WB_CP1_WPTR_OFFSET 3328 #define CIK_WB_CP2_WPTR_OFFSET 3584 +#define R600_WB_DMA_RING_TEST_OFFSET 3588 +#define CAYMAN_WB_DMA1_RING_TEST_OFFSET 3592 /** * struct radeon_pm - power management datas diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index f41cc1538e48..ea2676954dde 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1130,7 +1130,7 @@ static void radeon_check_arguments(struct radeon_device *rdev) if (radeon_vm_block_size == -1) { /* Total bits covered by PD + PTs */ - unsigned bits = ilog2(radeon_vm_size) + 17; + unsigned bits = ilog2(radeon_vm_size) + 18; /* Make sure the PD is 4K in size up to 8GB address space. Above that split equal between PD and PTs */ diff --git a/drivers/gpu/drm/radeon/rs780_dpm.c b/drivers/gpu/drm/radeon/rs780_dpm.c index 02f7710de470..9031f4b69824 100644 --- a/drivers/gpu/drm/radeon/rs780_dpm.c +++ b/drivers/gpu/drm/radeon/rs780_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rs780d.h" #include "r600_dpm.h" #include "rs780_dpm.h" diff --git a/drivers/gpu/drm/radeon/rv6xx_dpm.c b/drivers/gpu/drm/radeon/rv6xx_dpm.c index e7045b085715..6a5c233361e9 100644 --- a/drivers/gpu/drm/radeon/rv6xx_dpm.c +++ b/drivers/gpu/drm/radeon/rv6xx_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rv6xxd.h" #include "r600_dpm.h" #include "rv6xx_dpm.h" diff --git a/drivers/gpu/drm/radeon/rv770_dpm.c b/drivers/gpu/drm/radeon/rv770_dpm.c index 3c76e1dcdf04..755a8f96fe46 100644 --- a/drivers/gpu/drm/radeon/rv770_dpm.c +++ b/drivers/gpu/drm/radeon/rv770_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rv770d.h" #include "r600_dpm.h" #include "rv770_dpm.h" diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 9e4d5d7d348f..a53c2e79d9cb 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "sid.h" #include "r600_dpm.h" #include "si_dpm.h" @@ -2916,6 +2917,7 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, bool disable_sclk_switching = false; u32 mclk, sclk; u16 vddc, vddci; + u32 max_sclk_vddc, max_mclk_vddci, max_mclk_vddc; int i; if ((rdev->pm.dpm.new_active_crtc_count > 1) || @@ -2949,6 +2951,29 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, } } + /* limit clocks to max supported clocks based on voltage dependency tables */ + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddc_dependency_on_sclk, + &max_sclk_vddc); + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddci_dependency_on_mclk, + &max_mclk_vddci); + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddc_dependency_on_mclk, + &max_mclk_vddc); + + for (i = 0; i < ps->performance_level_count; i++) { + if (max_sclk_vddc) { + if (ps->performance_levels[i].sclk > max_sclk_vddc) + ps->performance_levels[i].sclk = max_sclk_vddc; + } + if (max_mclk_vddci) { + if (ps->performance_levels[i].mclk > max_mclk_vddci) + ps->performance_levels[i].mclk = max_mclk_vddci; + } + if (max_mclk_vddc) { + if (ps->performance_levels[i].mclk > max_mclk_vddc) + ps->performance_levels[i].mclk = max_mclk_vddc; + } + } + /* XXX validate the min clocks required for display */ if (disable_mclk_switching) { diff --git a/drivers/gpu/drm/radeon/sumo_dpm.c b/drivers/gpu/drm/radeon/sumo_dpm.c index 3f0e8d7b8dbe..1f8a8833e1be 100644 --- a/drivers/gpu/drm/radeon/sumo_dpm.c +++ b/drivers/gpu/drm/radeon/sumo_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "sumod.h" #include "r600_dpm.h" #include "cypress_dpm.h" diff --git a/drivers/gpu/drm/radeon/trinity_dpm.c b/drivers/gpu/drm/radeon/trinity_dpm.c index 57f780053b3e..b4ec5c4e7969 100644 --- a/drivers/gpu/drm/radeon/trinity_dpm.c +++ b/drivers/gpu/drm/radeon/trinity_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "trinityd.h" #include "r600_dpm.h" #include "trinity_dpm.h" diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 8f5cec67c47d..d395b0bef73b 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -709,6 +709,7 @@ out: static int ttm_mem_evict_first(struct ttm_bo_device *bdev, uint32_t mem_type, + const struct ttm_place *place, bool interruptible, bool no_wait_gpu) { @@ -720,8 +721,21 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev, spin_lock(&glob->lru_lock); list_for_each_entry(bo, &man->lru, lru) { ret = __ttm_bo_reserve(bo, false, true, false, NULL); - if (!ret) + if (!ret) { + if (place && (place->fpfn || place->lpfn)) { + /* Don't evict this BO if it's outside of the + * requested placement range + */ + if (place->fpfn >= (bo->mem.start + bo->mem.size) || + (place->lpfn && place->lpfn <= bo->mem.start)) { + __ttm_bo_unreserve(bo); + ret = -EBUSY; + continue; + } + } + break; + } } if (ret) { @@ -782,7 +796,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo, return ret; if (mem->mm_node) break; - ret = ttm_mem_evict_first(bdev, mem_type, + ret = ttm_mem_evict_first(bdev, mem_type, place, interruptible, no_wait_gpu); if (unlikely(ret != 0)) return ret; @@ -994,9 +1008,9 @@ static bool ttm_bo_mem_compat(struct ttm_placement *placement, for (i = 0; i < placement->num_placement; i++) { const struct ttm_place *heap = &placement->placement[i]; - if (mem->mm_node && heap->lpfn != 0 && + if (mem->mm_node && (mem->start < heap->fpfn || - mem->start + mem->num_pages > heap->lpfn)) + (heap->lpfn != 0 && (mem->start + mem->num_pages) > heap->lpfn))) continue; *new_flags = heap->flags; @@ -1007,9 +1021,9 @@ static bool ttm_bo_mem_compat(struct ttm_placement *placement, for (i = 0; i < placement->num_busy_placement; i++) { const struct ttm_place *heap = &placement->busy_placement[i]; - if (mem->mm_node && heap->lpfn != 0 && + if (mem->mm_node && (mem->start < heap->fpfn || - mem->start + mem->num_pages > heap->lpfn)) + (heap->lpfn != 0 && (mem->start + mem->num_pages) > heap->lpfn))) continue; *new_flags = heap->flags; @@ -1233,7 +1247,7 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev, spin_lock(&glob->lru_lock); while (!list_empty(&man->lru)) { spin_unlock(&glob->lru_lock); - ret = ttm_mem_evict_first(bdev, mem_type, false, false); + ret = ttm_mem_evict_first(bdev, mem_type, NULL, false, false); if (ret) { if (allow_errors) { return ret; diff --git a/drivers/hwmon/menf21bmc_hwmon.c b/drivers/hwmon/menf21bmc_hwmon.c index c92229d321c9..afc6b58eaa62 100644 --- a/drivers/hwmon/menf21bmc_hwmon.c +++ b/drivers/hwmon/menf21bmc_hwmon.c @@ -21,6 +21,7 @@ #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/i2c.h> +#include <linux/err.h> #define DRV_NAME "menf21bmc_hwmon" diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 0bea5776bcbc..3effa931fce2 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2185,7 +2185,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -2871,7 +2871,7 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, - &isert_cmd->tx_desc.send_wr, true); + &isert_cmd->tx_desc.send_wr, false); isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; wr->send_wr_num += 1; } @@ -3140,7 +3140,7 @@ isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) accept_wait: ret = down_interruptible(&isert_np->np_sem); - if (max_accept > 5) + if (ret || max_accept > 5) return -ENODEV; spin_lock_bh(&np->np_thread_lock); diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index aa29198fca3e..7440c58b8e6f 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -9,26 +9,21 @@ * published by the Free Software Foundation. */ -#include <linux/module.h> -#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/leds.h> #include <linux/list.h> +#include <linux/module.h> +#include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/device.h> #include <linux/timer.h> -#include <linux/err.h> -#include <linux/ctype.h> -#include <linux/leds.h> #include "leds.h" static struct class *leds_class; -static void led_update_brightness(struct led_classdev *led_cdev) -{ - if (led_cdev->brightness_get) - led_cdev->brightness = led_cdev->brightness_get(led_cdev); -} - static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -59,14 +54,14 @@ static ssize_t brightness_store(struct device *dev, } static DEVICE_ATTR_RW(brightness); -static ssize_t led_max_brightness_show(struct device *dev, +static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); return sprintf(buf, "%u\n", led_cdev->max_brightness); } -static DEVICE_ATTR(max_brightness, 0444, led_max_brightness_show, NULL); +static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static DEVICE_ATTR(trigger, 0644, led_trigger_show, led_trigger_store); diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 71b40d3bf776..aaa8eba9099f 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -12,10 +12,11 @@ */ #include <linux/kernel.h> +#include <linux/leds.h> #include <linux/list.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/rwsem.h> -#include <linux/leds.h> #include "leds.h" DECLARE_RWSEM(leds_list_lock); @@ -126,3 +127,19 @@ void led_set_brightness(struct led_classdev *led_cdev, __led_set_brightness(led_cdev, brightness); } EXPORT_SYMBOL(led_set_brightness); + +int led_update_brightness(struct led_classdev *led_cdev) +{ + int ret = 0; + + if (led_cdev->brightness_get) { + ret = led_cdev->brightness_get(led_cdev); + if (ret >= 0) { + led_cdev->brightness = ret; + return 0; + } + } + + return ret; +} +EXPORT_SYMBOL(led_update_brightness); diff --git a/drivers/leds/leds-gpio-register.c b/drivers/leds/leds-gpio-register.c index 1c4ed5510f35..75717ba68ae0 100644 --- a/drivers/leds/leds-gpio-register.c +++ b/drivers/leds/leds-gpio-register.c @@ -7,9 +7,9 @@ * Free Software Foundation. */ #include <linux/err.h> +#include <linux/leds.h> #include <linux/platform_device.h> #include <linux/slab.h> -#include <linux/leds.h> /** * gpio_led_register_device - register a gpio-led device @@ -28,6 +28,9 @@ struct platform_device *__init gpio_led_register_device( struct platform_device *ret; struct gpio_led_platform_data _pdata = *pdata; + if (!pdata->num_leds) + return ERR_PTR(-EINVAL); + _pdata.leds = kmemdup(pdata->leds, pdata->num_leds * sizeof(*pdata->leds), GFP_KERNEL); if (!_pdata.leds) diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 57ff20fecf57..b4518c8751c8 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -10,17 +10,17 @@ * published by the Free Software Foundation. * */ -#include <linux/kernel.h> -#include <linux/platform_device.h> +#include <linux/err.h> #include <linux/gpio.h> +#include <linux/kernel.h> #include <linux/leds.h> +#include <linux/module.h> #include <linux/of.h> -#include <linux/of_platform.h> #include <linux/of_gpio.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/slab.h> #include <linux/workqueue.h> -#include <linux/module.h> -#include <linux/err.h> struct gpio_led_data { struct led_classdev cdev; @@ -36,7 +36,7 @@ struct gpio_led_data { static void gpio_led_work(struct work_struct *work) { - struct gpio_led_data *led_dat = + struct gpio_led_data *led_dat = container_of(work, struct gpio_led_data, work); if (led_dat->blinking) { @@ -235,14 +235,12 @@ static struct gpio_leds_priv *gpio_leds_create_of(struct platform_device *pdev) } #endif /* CONFIG_OF_GPIO */ - static int gpio_led_probe(struct platform_device *pdev) { struct gpio_led_platform_data *pdata = dev_get_platdata(&pdev->dev); struct gpio_leds_priv *priv; int i, ret = 0; - if (pdata && pdata->num_leds) { priv = devm_kzalloc(&pdev->dev, sizeof_gpio_leds_priv(pdata->num_leds), diff --git a/drivers/leds/leds-lp3944.c b/drivers/leds/leds-lp3944.c index 8e1abdcd4c9d..53144fb96167 100644 --- a/drivers/leds/leds-lp3944.c +++ b/drivers/leds/leds-lp3944.c @@ -335,7 +335,8 @@ static int lp3944_configure(struct i2c_client *client, } /* to expose the default value to userspace */ - led->ldev.brightness = led->status; + led->ldev.brightness = + (enum led_brightness) led->status; /* Set the default led status */ err = lp3944_led_set(led, led->status); diff --git a/drivers/leds/trigger/ledtrig-gpio.c b/drivers/leds/trigger/ledtrig-gpio.c index 35812e3a37f2..c86c41826476 100644 --- a/drivers/leds/trigger/ledtrig-gpio.c +++ b/drivers/leds/trigger/ledtrig-gpio.c @@ -48,7 +48,7 @@ static void gpio_trig_work(struct work_struct *work) if (!gpio_data->gpio) return; - tmp = gpio_get_value(gpio_data->gpio); + tmp = gpio_get_value_cansleep(gpio_data->gpio); if (gpio_data->inverted) tmp = !tmp; diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 6d184dbcaca8..94ed7cefb14d 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -1,3 +1,7 @@ +# Generic MAILBOX API + +obj-$(CONFIG_MAILBOX) += mailbox.o + obj-$(CONFIG_PL320_MBOX) += pl320-ipc.o obj-$(CONFIG_OMAP2PLUS_MBOX) += omap-mailbox.o diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c new file mode 100644 index 000000000000..afcb430508ec --- /dev/null +++ b/drivers/mailbox/mailbox.c @@ -0,0 +1,465 @@ +/* + * Mailbox: Common code for Mailbox controllers and users + * + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar <jassisinghbrar@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/bitops.h> +#include <linux/mailbox_client.h> +#include <linux/mailbox_controller.h> + +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK recevied by Client ticks the TX */ + +static LIST_HEAD(mbox_cons); +static DEFINE_MUTEX(con_mutex); + +static int add_to_rbuf(struct mbox_chan *chan, void *mssg) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + /* See if there is any space left */ + if (chan->msg_count == MBOX_TX_QUEUE_LEN) { + spin_unlock_irqrestore(&chan->lock, flags); + return -ENOBUFS; + } + + idx = chan->msg_free; + chan->msg_data[idx] = mssg; + chan->msg_count++; + + if (idx == MBOX_TX_QUEUE_LEN - 1) + chan->msg_free = 0; + else + chan->msg_free++; + + spin_unlock_irqrestore(&chan->lock, flags); + + return idx; +} + +static void msg_submit(struct mbox_chan *chan) +{ + unsigned count, idx; + unsigned long flags; + void *data; + int err; + + spin_lock_irqsave(&chan->lock, flags); + + if (!chan->msg_count || chan->active_req) + goto exit; + + count = chan->msg_count; + idx = chan->msg_free; + if (idx >= count) + idx -= count; + else + idx += MBOX_TX_QUEUE_LEN - count; + + data = chan->msg_data[idx]; + + /* Try to submit a message to the MBOX controller */ + err = chan->mbox->ops->send_data(chan, data); + if (!err) { + chan->active_req = data; + chan->msg_count--; + } +exit: + spin_unlock_irqrestore(&chan->lock, flags); +} + +static void tx_tick(struct mbox_chan *chan, int r) +{ + unsigned long flags; + void *mssg; + + spin_lock_irqsave(&chan->lock, flags); + mssg = chan->active_req; + chan->active_req = NULL; + spin_unlock_irqrestore(&chan->lock, flags); + + /* Submit next message */ + msg_submit(chan); + + /* Notify the client */ + if (mssg && chan->cl->tx_done) + chan->cl->tx_done(chan->cl, mssg, r); + + if (chan->cl->tx_block) + complete(&chan->tx_complete); +} + +static void poll_txdone(unsigned long data) +{ + struct mbox_controller *mbox = (struct mbox_controller *)data; + bool txdone, resched = false; + int i; + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + if (chan->active_req && chan->cl) { + resched = true; + txdone = chan->mbox->ops->last_tx_done(chan); + if (txdone) + tx_tick(chan, 0); + } + } + + if (resched) + mod_timer(&mbox->poll, jiffies + + msecs_to_jiffies(mbox->txpoll_period)); +} + +/** + * mbox_chan_received_data - A way for controller driver to push data + * received from remote to the upper layer. + * @chan: Pointer to the mailbox channel on which RX happened. + * @mssg: Client specific message typecasted as void * + * + * After startup and before shutdown any data received on the chan + * is passed on to the API via atomic mbox_chan_received_data(). + * The controller should ACK the RX only after this call returns. + */ +void mbox_chan_received_data(struct mbox_chan *chan, void *mssg) +{ + /* No buffering the received data */ + if (chan->cl->rx_callback) + chan->cl->rx_callback(chan->cl, mssg); +} +EXPORT_SYMBOL_GPL(mbox_chan_received_data); + +/** + * mbox_chan_txdone - A way for controller driver to notify the + * framework that the last TX has completed. + * @chan: Pointer to the mailbox chan on which TX happened. + * @r: Status of last TX - OK or ERROR + * + * The controller that has IRQ for TX ACK calls this atomic API + * to tick the TX state machine. It works only if txdone_irq + * is set by the controller. + */ +void mbox_chan_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_IRQ))) { + dev_err(chan->mbox->dev, + "Controller can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_chan_txdone); + +/** + * mbox_client_txdone - The way for a client to run the TX state machine. + * @chan: Mailbox channel assigned to this client. + * @r: Success status of last transmission. + * + * The client/protocol had received some 'ACK' packet and it notifies + * the API that the last packet was sent successfully. This only works + * if the controller can't sense TX-Done. + */ +void mbox_client_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_ACK))) { + dev_err(chan->mbox->dev, "Client can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_client_txdone); + +/** + * mbox_client_peek_data - A way for client driver to pull data + * received from remote by the controller. + * @chan: Mailbox channel assigned to this client. + * + * A poke to controller driver for any received data. + * The data is actually passed onto client via the + * mbox_chan_received_data() + * The call can be made from atomic context, so the controller's + * implementation of peek_data() must not sleep. + * + * Return: True, if controller has, and is going to push after this, + * some data. + * False, if controller doesn't have any data to be read. + */ +bool mbox_client_peek_data(struct mbox_chan *chan) +{ + if (chan->mbox->ops->peek_data) + return chan->mbox->ops->peek_data(chan); + + return false; +} +EXPORT_SYMBOL_GPL(mbox_client_peek_data); + +/** + * mbox_send_message - For client to submit a message to be + * sent to the remote. + * @chan: Mailbox channel assigned to this client. + * @mssg: Client specific message typecasted. + * + * For client to submit data to the controller destined for a remote + * processor. If the client had set 'tx_block', the call will return + * either when the remote receives the data or when 'tx_tout' millisecs + * run out. + * In non-blocking mode, the requests are buffered by the API and a + * non-negative token is returned for each queued request. If the request + * is not queued, a negative token is returned. Upon failure or successful + * TX, the API calls 'tx_done' from atomic context, from which the client + * could submit yet another request. + * The pointer to message should be preserved until it is sent + * over the chan, i.e, tx_done() is made. + * This function could be called from atomic context as it simply + * queues the data and returns a token against the request. + * + * Return: Non-negative integer for successful submission (non-blocking mode) + * or transmission over chan (blocking mode). + * Negative value denotes failure. + */ +int mbox_send_message(struct mbox_chan *chan, void *mssg) +{ + int t; + + if (!chan || !chan->cl) + return -EINVAL; + + t = add_to_rbuf(chan, mssg); + if (t < 0) { + dev_err(chan->mbox->dev, "Try increasing MBOX_TX_QUEUE_LEN\n"); + return t; + } + + msg_submit(chan); + + if (chan->txdone_method == TXDONE_BY_POLL) + poll_txdone((unsigned long)chan->mbox); + + if (chan->cl->tx_block && chan->active_req) { + unsigned long wait; + int ret; + + if (!chan->cl->tx_tout) /* wait forever */ + wait = msecs_to_jiffies(3600000); + else + wait = msecs_to_jiffies(chan->cl->tx_tout); + + ret = wait_for_completion_timeout(&chan->tx_complete, wait); + if (ret == 0) { + t = -EIO; + tx_tick(chan, -EIO); + } + } + + return t; +} +EXPORT_SYMBOL_GPL(mbox_send_message); + +/** + * mbox_request_channel - Request a mailbox channel. + * @cl: Identity of the client requesting the channel. + * @index: Index of mailbox specifier in 'mboxes' property. + * + * The Client specifies its requirements and capabilities while asking for + * a mailbox channel. It can't be called from atomic context. + * The channel is exclusively allocated and can't be used by another + * client before the owner calls mbox_free_channel. + * After assignment, any packet received on this channel will be + * handed over to the client via the 'rx_callback'. + * The framework holds reference to the client, so the mbox_client + * structure shouldn't be modified until the mbox_free_channel returns. + * + * Return: Pointer to the channel assigned to the client if successful. + * ERR_PTR for request failure. + */ +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) +{ + struct device *dev = cl->dev; + struct mbox_controller *mbox; + struct of_phandle_args spec; + struct mbox_chan *chan; + unsigned long flags; + int ret; + + if (!dev || !dev->of_node) { + pr_debug("%s: No owner device node\n", __func__); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&con_mutex); + + if (of_parse_phandle_with_args(dev->of_node, "mboxes", + "#mbox-cells", index, &spec)) { + dev_dbg(dev, "%s: can't parse \"mboxes\" property\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-ENODEV); + } + + chan = NULL; + list_for_each_entry(mbox, &mbox_cons, node) + if (mbox->dev->of_node == spec.np) { + chan = mbox->of_xlate(mbox, &spec); + break; + } + + of_node_put(spec.np); + + if (!chan || chan->cl || !try_module_get(mbox->dev->driver->owner)) { + dev_dbg(dev, "%s: mailbox not free\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-EBUSY); + } + + spin_lock_irqsave(&chan->lock, flags); + chan->msg_free = 0; + chan->msg_count = 0; + chan->active_req = NULL; + chan->cl = cl; + init_completion(&chan->tx_complete); + + if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) + chan->txdone_method |= TXDONE_BY_ACK; + + spin_unlock_irqrestore(&chan->lock, flags); + + ret = chan->mbox->ops->startup(chan); + if (ret) { + dev_err(dev, "Unable to startup the chan (%d)\n", ret); + mbox_free_channel(chan); + chan = ERR_PTR(ret); + } + + mutex_unlock(&con_mutex); + return chan; +} +EXPORT_SYMBOL_GPL(mbox_request_channel); + +/** + * mbox_free_channel - The client relinquishes control of a mailbox + * channel by this call. + * @chan: The mailbox channel to be freed. + */ +void mbox_free_channel(struct mbox_chan *chan) +{ + unsigned long flags; + + if (!chan || !chan->cl) + return; + + chan->mbox->ops->shutdown(chan); + + /* The queued TX requests are simply aborted, no callbacks are made */ + spin_lock_irqsave(&chan->lock, flags); + chan->cl = NULL; + chan->active_req = NULL; + if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + chan->txdone_method = TXDONE_BY_POLL; + + module_put(chan->mbox->dev->driver->owner); + spin_unlock_irqrestore(&chan->lock, flags); +} +EXPORT_SYMBOL_GPL(mbox_free_channel); + +static struct mbox_chan * +of_mbox_index_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *sp) +{ + int ind = sp->args[0]; + + if (ind >= mbox->num_chans) + return NULL; + + return &mbox->chans[ind]; +} + +/** + * mbox_controller_register - Register the mailbox controller + * @mbox: Pointer to the mailbox controller. + * + * The controller driver registers its communication channels + */ +int mbox_controller_register(struct mbox_controller *mbox) +{ + int i, txdone; + + /* Sanity check */ + if (!mbox || !mbox->dev || !mbox->ops || !mbox->num_chans) + return -EINVAL; + + if (mbox->txdone_irq) + txdone = TXDONE_BY_IRQ; + else if (mbox->txdone_poll) + txdone = TXDONE_BY_POLL; + else /* It has to be ACK then */ + txdone = TXDONE_BY_ACK; + + if (txdone == TXDONE_BY_POLL) { + mbox->poll.function = &poll_txdone; + mbox->poll.data = (unsigned long)mbox; + init_timer(&mbox->poll); + } + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + chan->cl = NULL; + chan->mbox = mbox; + chan->txdone_method = txdone; + spin_lock_init(&chan->lock); + } + + if (!mbox->of_xlate) + mbox->of_xlate = of_mbox_index_xlate; + + mutex_lock(&con_mutex); + list_add_tail(&mbox->node, &mbox_cons); + mutex_unlock(&con_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mbox_controller_register); + +/** + * mbox_controller_unregister - Unregister the mailbox controller + * @mbox: Pointer to the mailbox controller. + */ +void mbox_controller_unregister(struct mbox_controller *mbox) +{ + int i; + + if (!mbox) + return; + + mutex_lock(&con_mutex); + + list_del(&mbox->node); + + for (i = 0; i < mbox->num_chans; i++) + mbox_free_channel(&mbox->chans[i]); + + if (mbox->txdone_poll) + del_timer_sync(&mbox->poll); + + mutex_unlock(&con_mutex); +} +EXPORT_SYMBOL_GPL(mbox_controller_unregister); diff --git a/drivers/mailbox/pl320-ipc.c b/drivers/mailbox/pl320-ipc.c index d873cbae2fbb..f3755e0aa935 100644 --- a/drivers/mailbox/pl320-ipc.c +++ b/drivers/mailbox/pl320-ipc.c @@ -26,7 +26,7 @@ #include <linux/device.h> #include <linux/amba/bus.h> -#include <linux/mailbox.h> +#include <linux/pl320-ipc.h> #define IPCMxSOURCE(m) ((m) * 0x40) #define IPCMxDSET(m) (((m) * 0x40) + 0x004) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index a9f9c46e5022..63fc63911295 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -397,6 +397,7 @@ static int pcie_pme_suspend(struct pcie_device *srv) struct pcie_pme_service_data *data = get_service_data(srv); struct pci_dev *port = srv->port; bool wakeup; + int ret; if (device_may_wakeup(&port->dev)) { wakeup = true; @@ -407,9 +408,10 @@ static int pcie_pme_suspend(struct pcie_device *srv) } spin_lock_irq(&data->lock); if (wakeup) { - enable_irq_wake(srv->irq); + ret = enable_irq_wake(srv->irq); data->suspend_level = PME_SUSPEND_WAKEUP; - } else { + } + if (!wakeup || ret) { struct pci_dev *port = srv->port; pcie_pme_interrupt_enable(port, false); diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index b800783800a3..ef2dd2e4754b 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -83,6 +83,7 @@ config PWM_BFIN config PWM_CLPS711X tristate "CLPS711X PWM support" depends on ARCH_CLPS711X || COMPILE_TEST + depends on HAS_IOMEM help Generic PWM framework driver for Cirrus Logic CLPS711X. @@ -101,6 +102,7 @@ config PWM_EP93XX config PWM_FSL_FTM tristate "Freescale FlexTimer Module (FTM) PWM support" depends on OF + select REGMAP_MMIO help Generic FTM PWM framework driver for Freescale VF610 and Layerscape LS-1 SoCs. @@ -149,7 +151,7 @@ config PWM_LPC32XX config PWM_LPSS tristate "Intel LPSS PWM support" - depends on ACPI + depends on X86 help Generic PWM framework driver for Intel Low Power Subsystem PWM controller. @@ -157,6 +159,24 @@ config PWM_LPSS To compile this driver as a module, choose M here: the module will be called pwm-lpss. +config PWM_LPSS_PCI + tristate "Intel LPSS PWM PCI driver" + depends on PWM_LPSS && PCI + help + The PCI driver for Intel Low Power Subsystem PWM controller. + + To compile this driver as a module, choose M here: the module + will be called pwm-lpss-pci. + +config PWM_LPSS_PLATFORM + tristate "Intel LPSS PWM platform driver" + depends on PWM_LPSS && ACPI + help + The platform driver for Intel Low Power Subsystem PWM controller. + + To compile this driver as a module, choose M here: the module + will be called pwm-lpss-platform. + config PWM_MXS tristate "Freescale MXS PWM support" depends on ARCH_MXS && OF diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index f8c577d41091..c458606c3755 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -13,6 +13,8 @@ obj-$(CONFIG_PWM_JZ4740) += pwm-jz4740.o obj-$(CONFIG_PWM_LP3943) += pwm-lp3943.o obj-$(CONFIG_PWM_LPC32XX) += pwm-lpc32xx.o obj-$(CONFIG_PWM_LPSS) += pwm-lpss.o +obj-$(CONFIG_PWM_LPSS_PCI) += pwm-lpss-pci.o +obj-$(CONFIG_PWM_LPSS_PLATFORM) += pwm-lpss-platform.o obj-$(CONFIG_PWM_MXS) += pwm-mxs.o obj-$(CONFIG_PWM_PCA9685) += pwm-pca9685.o obj-$(CONFIG_PWM_PUV3) += pwm-puv3.o diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d2c35920ff08..966497d10c6e 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -236,7 +236,7 @@ int pwmchip_add(struct pwm_chip *chip) int ret; if (!chip || !chip->dev || !chip->ops || !chip->ops->config || - !chip->ops->enable || !chip->ops->disable) + !chip->ops->enable || !chip->ops->disable || !chip->npwm) return -EINVAL; mutex_lock(&pwm_lock); @@ -602,12 +602,9 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) struct pwm_device *pwm = ERR_PTR(-EPROBE_DEFER); const char *dev_id = dev ? dev_name(dev) : NULL; struct pwm_chip *chip = NULL; - unsigned int index = 0; unsigned int best = 0; - struct pwm_lookup *p; + struct pwm_lookup *p, *chosen = NULL; unsigned int match; - unsigned int period; - enum pwm_polarity polarity; /* look up via DT first */ if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node) @@ -653,10 +650,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } if (match > best) { - chip = pwmchip_find_by_name(p->provider); - index = p->index; - period = p->period; - polarity = p->polarity; + chosen = p; if (match != 3) best = match; @@ -665,17 +659,22 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } } - mutex_unlock(&pwm_lookup_lock); + if (!chosen) + goto out; - if (chip) - pwm = pwm_request_from_chip(chip, index, con_id ?: dev_id); - if (IS_ERR(pwm)) - return pwm; + chip = pwmchip_find_by_name(chosen->provider); + if (!chip) + goto out; - pwm_set_period(pwm, period); - pwm_set_polarity(pwm, polarity); + pwm = pwm_request_from_chip(chip, chosen->index, con_id ?: dev_id); + if (IS_ERR(pwm)) + goto out; + pwm_set_period(pwm, chosen->period); + pwm_set_polarity(pwm, chosen->polarity); +out: + mutex_unlock(&pwm_lookup_lock); return pwm; } EXPORT_SYMBOL_GPL(pwm_get); diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 6e700a541ca3..d3c22de9ee47 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -102,7 +102,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip); - unsigned long clk_rate, prd, dty; + unsigned long prd, dty; unsigned long long div; unsigned int pres = 0; u32 val; @@ -113,20 +113,18 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -EBUSY; } - clk_rate = clk_get_rate(atmel_pwm->clk); - div = clk_rate; + /* Calculate the period cycles and prescale value */ + div = (unsigned long long)clk_get_rate(atmel_pwm->clk) * period_ns; + do_div(div, NSEC_PER_SEC); - /* Calculate the period cycles */ while (div > PWM_MAX_PRD) { - div = clk_rate / (1 << pres); - div = div * period_ns; - /* 1/Hz = 100000000 ns */ - do_div(div, 1000000000); - - if (pres++ > PRD_MAX_PRES) { - dev_err(chip->dev, "pres exceeds the maximum value\n"); - return -EINVAL; - } + div >>= 1; + pres++; + } + + if (pres > PRD_MAX_PRES) { + dev_err(chip->dev, "pres exceeds the maximum value\n"); + return -EINVAL; } /* Calculate the duty cycles */ diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index a18bc8fea385..0f2cc7ef7784 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -18,14 +18,14 @@ #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/pwm.h> +#include <linux/regmap.h> #include <linux/slab.h> #define FTM_SC 0x00 -#define FTM_SC_CLK_MASK 0x3 -#define FTM_SC_CLK_SHIFT 3 -#define FTM_SC_CLK(c) (((c) + 1) << FTM_SC_CLK_SHIFT) +#define FTM_SC_CLK_MASK_SHIFT 3 +#define FTM_SC_CLK_MASK (3 << FTM_SC_CLK_MASK_SHIFT) +#define FTM_SC_CLK(c) (((c) + 1) << FTM_SC_CLK_MASK_SHIFT) #define FTM_SC_PS_MASK 0x7 -#define FTM_SC_PS_SHIFT 0 #define FTM_CNT 0x04 #define FTM_MOD 0x08 @@ -83,7 +83,7 @@ struct fsl_pwm_chip { unsigned int cnt_select; unsigned int clk_ps; - void __iomem *base; + struct regmap *regmap; int period_ns; @@ -219,10 +219,11 @@ static unsigned long fsl_pwm_calculate_duty(struct fsl_pwm_chip *fpc, unsigned long period_ns, unsigned long duty_ns) { - unsigned long long val, duty; + unsigned long long duty; + u32 val; - val = readl(fpc->base + FTM_MOD); - duty = duty_ns * (val + 1); + regmap_read(fpc->regmap, FTM_MOD, &val); + duty = (unsigned long long)duty_ns * (val + 1); do_div(duty, period_ns); return (unsigned long)duty; @@ -232,7 +233,7 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); - u32 val, period, duty; + u32 period, duty; mutex_lock(&fpc->lock); @@ -257,11 +258,9 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -EINVAL; } - val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_PS_MASK << FTM_SC_PS_SHIFT); - val |= fpc->clk_ps; - writel(val, fpc->base + FTM_SC); - writel(period - 1, fpc->base + FTM_MOD); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_PS_MASK, + fpc->clk_ps); + regmap_write(fpc->regmap, FTM_MOD, period - 1); fpc->period_ns = period_ns; } @@ -270,8 +269,9 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty = fsl_pwm_calculate_duty(fpc, period_ns, duty_ns); - writel(FTM_CSC_MSB | FTM_CSC_ELSB, fpc->base + FTM_CSC(pwm->hwpwm)); - writel(duty, fpc->base + FTM_CV(pwm->hwpwm)); + regmap_write(fpc->regmap, FTM_CSC(pwm->hwpwm), + FTM_CSC_MSB | FTM_CSC_ELSB); + regmap_write(fpc->regmap, FTM_CV(pwm->hwpwm), duty); return 0; } @@ -283,31 +283,28 @@ static int fsl_pwm_set_polarity(struct pwm_chip *chip, struct fsl_pwm_chip *fpc = to_fsl_chip(chip); u32 val; - val = readl(fpc->base + FTM_POL); + regmap_read(fpc->regmap, FTM_POL, &val); if (polarity == PWM_POLARITY_INVERSED) val |= BIT(pwm->hwpwm); else val &= ~BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_POL); + regmap_write(fpc->regmap, FTM_POL, val); return 0; } static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) { - u32 val; int ret; if (fpc->use_count != 0) return 0; /* select counter clock source */ - val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_CLK_MASK << FTM_SC_CLK_SHIFT); - val |= FTM_SC_CLK(fpc->cnt_select); - writel(val, fpc->base + FTM_SC); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, + FTM_SC_CLK(fpc->cnt_select)); ret = clk_prepare_enable(fpc->clk[fpc->cnt_select]); if (ret) @@ -327,13 +324,10 @@ static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) static int fsl_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); - u32 val; int ret; mutex_lock(&fpc->lock); - val = readl(fpc->base + FTM_OUTMASK); - val &= ~BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_OUTMASK); + regmap_update_bits(fpc->regmap, FTM_OUTMASK, BIT(pwm->hwpwm), 0); ret = fsl_counter_clock_enable(fpc); mutex_unlock(&fpc->lock); @@ -343,8 +337,6 @@ static int fsl_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) { - u32 val; - /* * already disabled, do nothing */ @@ -356,9 +348,7 @@ static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) return; /* no users left, disable PWM counter clock */ - val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_CLK_MASK << FTM_SC_CLK_SHIFT); - writel(val, fpc->base + FTM_SC); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, 0); clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); clk_disable_unprepare(fpc->clk[fpc->cnt_select]); @@ -370,14 +360,12 @@ static void fsl_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) u32 val; mutex_lock(&fpc->lock); - val = readl(fpc->base + FTM_OUTMASK); - val |= BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_OUTMASK); + regmap_update_bits(fpc->regmap, FTM_OUTMASK, BIT(pwm->hwpwm), + BIT(pwm->hwpwm)); fsl_counter_clock_disable(fpc); - val = readl(fpc->base + FTM_OUTMASK); - + regmap_read(fpc->regmap, FTM_OUTMASK, &val); if ((val & 0xFF) == 0xFF) fpc->period_ns = 0; @@ -402,19 +390,28 @@ static int fsl_pwm_init(struct fsl_pwm_chip *fpc) if (ret) return ret; - writel(0x00, fpc->base + FTM_CNTIN); - writel(0x00, fpc->base + FTM_OUTINIT); - writel(0xFF, fpc->base + FTM_OUTMASK); + regmap_write(fpc->regmap, FTM_CNTIN, 0x00); + regmap_write(fpc->regmap, FTM_OUTINIT, 0x00); + regmap_write(fpc->regmap, FTM_OUTMASK, 0xFF); clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_SYS]); return 0; } +static const struct regmap_config fsl_pwm_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + + .max_register = FTM_PWMLOAD, +}; + static int fsl_pwm_probe(struct platform_device *pdev) { struct fsl_pwm_chip *fpc; struct resource *res; + void __iomem *base; int ret; fpc = devm_kzalloc(&pdev->dev, sizeof(*fpc), GFP_KERNEL); @@ -426,9 +423,16 @@ static int fsl_pwm_probe(struct platform_device *pdev) fpc->chip.dev = &pdev->dev; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - fpc->base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(fpc->base)) - return PTR_ERR(fpc->base); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); + + fpc->regmap = devm_regmap_init_mmio_clk(&pdev->dev, NULL, base, + &fsl_pwm_regmap_config); + if (IS_ERR(fpc->regmap)) { + dev_err(&pdev->dev, "regmap init failed\n"); + return PTR_ERR(fpc->regmap); + } fpc->clk[FSL_PWM_CLK_SYS] = devm_clk_get(&pdev->dev, "ftm_sys"); if (IS_ERR(fpc->clk[FSL_PWM_CLK_SYS])) { diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index 5449d9150d40..f8b5f109c1ab 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/clk.h> +#include <linux/delay.h> #include <linux/io.h> #include <linux/pwm.h> #include <linux/of.h> @@ -21,24 +22,30 @@ /* i.MX1 and i.MX21 share the same PWM function block: */ -#define MX1_PWMC 0x00 /* PWM Control Register */ -#define MX1_PWMS 0x04 /* PWM Sample Register */ -#define MX1_PWMP 0x08 /* PWM Period Register */ +#define MX1_PWMC 0x00 /* PWM Control Register */ +#define MX1_PWMS 0x04 /* PWM Sample Register */ +#define MX1_PWMP 0x08 /* PWM Period Register */ -#define MX1_PWMC_EN (1 << 4) +#define MX1_PWMC_EN (1 << 4) /* i.MX27, i.MX31, i.MX35 share the same PWM function block: */ -#define MX3_PWMCR 0x00 /* PWM Control Register */ -#define MX3_PWMSAR 0x0C /* PWM Sample Register */ -#define MX3_PWMPR 0x10 /* PWM Period Register */ -#define MX3_PWMCR_PRESCALER(x) (((x - 1) & 0xFFF) << 4) -#define MX3_PWMCR_DOZEEN (1 << 24) -#define MX3_PWMCR_WAITEN (1 << 23) +#define MX3_PWMCR 0x00 /* PWM Control Register */ +#define MX3_PWMSR 0x04 /* PWM Status Register */ +#define MX3_PWMSAR 0x0C /* PWM Sample Register */ +#define MX3_PWMPR 0x10 /* PWM Period Register */ +#define MX3_PWMCR_PRESCALER(x) ((((x) - 1) & 0xFFF) << 4) +#define MX3_PWMCR_DOZEEN (1 << 24) +#define MX3_PWMCR_WAITEN (1 << 23) #define MX3_PWMCR_DBGEN (1 << 22) -#define MX3_PWMCR_CLKSRC_IPG_HIGH (2 << 16) -#define MX3_PWMCR_CLKSRC_IPG (1 << 16) -#define MX3_PWMCR_EN (1 << 0) +#define MX3_PWMCR_CLKSRC_IPG_HIGH (2 << 16) +#define MX3_PWMCR_CLKSRC_IPG (1 << 16) +#define MX3_PWMCR_SWR (1 << 3) +#define MX3_PWMCR_EN (1 << 0) +#define MX3_PWMSR_FIFOAV_4WORDS 0x4 +#define MX3_PWMSR_FIFOAV_MASK 0x7 + +#define MX3_PWM_SWR_LOOP 5 struct imx_chip { struct clk *clk_per; @@ -103,9 +110,43 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct imx_chip *imx = to_imx_chip(chip); + struct device *dev = chip->dev; unsigned long long c; unsigned long period_cycles, duty_cycles, prescale; - u32 cr; + unsigned int period_ms; + bool enable = test_bit(PWMF_ENABLED, &pwm->flags); + int wait_count = 0, fifoav; + u32 cr, sr; + + /* + * i.MX PWMv2 has a 4-word sample FIFO. + * In order to avoid FIFO overflow issue, we do software reset + * to clear all sample FIFO if the controller is disabled or + * wait for a full PWM cycle to get a relinquished FIFO slot + * when the controller is enabled and the FIFO is fully loaded. + */ + if (enable) { + sr = readl(imx->mmio_base + MX3_PWMSR); + fifoav = sr & MX3_PWMSR_FIFOAV_MASK; + if (fifoav == MX3_PWMSR_FIFOAV_4WORDS) { + period_ms = DIV_ROUND_UP(pwm->period, NSEC_PER_MSEC); + msleep(period_ms); + + sr = readl(imx->mmio_base + MX3_PWMSR); + if (fifoav == (sr & MX3_PWMSR_FIFOAV_MASK)) + dev_warn(dev, "there is no free FIFO slot\n"); + } + } else { + writel(MX3_PWMCR_SWR, imx->mmio_base + MX3_PWMCR); + do { + usleep_range(200, 1000); + cr = readl(imx->mmio_base + MX3_PWMCR); + } while ((cr & MX3_PWMCR_SWR) && + (wait_count++ < MX3_PWM_SWR_LOOP)); + + if (cr & MX3_PWMCR_SWR) + dev_warn(dev, "software reset timeout\n"); + } c = clk_get_rate(imx->clk_per); c = c * period_ns; @@ -135,7 +176,7 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, MX3_PWMCR_DOZEEN | MX3_PWMCR_WAITEN | MX3_PWMCR_DBGEN | MX3_PWMCR_CLKSRC_IPG_HIGH; - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (enable) cr |= MX3_PWMCR_EN; writel(cr, imx->mmio_base + MX3_PWMCR); diff --git a/drivers/pwm/pwm-lpss-pci.c b/drivers/pwm/pwm-lpss-pci.c new file mode 100644 index 000000000000..cf20d2beacdd --- /dev/null +++ b/drivers/pwm/pwm-lpss-pci.c @@ -0,0 +1,64 @@ +/* + * Intel Low Power Subsystem PWM controller PCI driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include "pwm-lpss.h" + +static int pwm_lpss_probe_pci(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + const struct pwm_lpss_boardinfo *info; + struct pwm_lpss_chip *lpwm; + int err; + + err = pcim_enable_device(pdev); + if (err < 0) + return err; + + info = (struct pwm_lpss_boardinfo *)id->driver_data; + lpwm = pwm_lpss_probe(&pdev->dev, &pdev->resource[0], info); + if (IS_ERR(lpwm)) + return PTR_ERR(lpwm); + + pci_set_drvdata(pdev, lpwm); + return 0; +} + +static void pwm_lpss_remove_pci(struct pci_dev *pdev) +{ + struct pwm_lpss_chip *lpwm = pci_get_drvdata(pdev); + + pwm_lpss_remove(lpwm); +} + +static const struct pci_device_id pwm_lpss_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&pwm_lpss_byt_info}, + { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&pwm_lpss_byt_info}, + { PCI_VDEVICE(INTEL, 0x2288), (unsigned long)&pwm_lpss_bsw_info}, + { PCI_VDEVICE(INTEL, 0x2289), (unsigned long)&pwm_lpss_bsw_info}, + { }, +}; +MODULE_DEVICE_TABLE(pci, pwm_lpss_pci_ids); + +static struct pci_driver pwm_lpss_driver_pci = { + .name = "pwm-lpss", + .id_table = pwm_lpss_pci_ids, + .probe = pwm_lpss_probe_pci, + .remove = pwm_lpss_remove_pci, +}; +module_pci_driver(pwm_lpss_driver_pci); + +MODULE_DESCRIPTION("PWM PCI driver for Intel LPSS"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pwm/pwm-lpss-platform.c b/drivers/pwm/pwm-lpss-platform.c new file mode 100644 index 000000000000..18a9c880a76d --- /dev/null +++ b/drivers/pwm/pwm-lpss-platform.c @@ -0,0 +1,68 @@ +/* + * Intel Low Power Subsystem PWM controller driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/acpi.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> + +#include "pwm-lpss.h" + +static int pwm_lpss_probe_platform(struct platform_device *pdev) +{ + const struct pwm_lpss_boardinfo *info; + const struct acpi_device_id *id; + struct pwm_lpss_chip *lpwm; + struct resource *r; + + id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); + if (!id) + return -ENODEV; + + info = (const struct pwm_lpss_boardinfo *)id->driver_data; + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + lpwm = pwm_lpss_probe(&pdev->dev, r, info); + if (IS_ERR(lpwm)) + return PTR_ERR(lpwm); + + platform_set_drvdata(pdev, lpwm); + return 0; +} + +static int pwm_lpss_remove_platform(struct platform_device *pdev) +{ + struct pwm_lpss_chip *lpwm = platform_get_drvdata(pdev); + + return pwm_lpss_remove(lpwm); +} + +static const struct acpi_device_id pwm_lpss_acpi_match[] = { + { "80860F09", (unsigned long)&pwm_lpss_byt_info }, + { "80862288", (unsigned long)&pwm_lpss_bsw_info }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, pwm_lpss_acpi_match); + +static struct platform_driver pwm_lpss_driver_platform = { + .driver = { + .name = "pwm-lpss", + .acpi_match_table = pwm_lpss_acpi_match, + }, + .probe = pwm_lpss_probe_platform, + .remove = pwm_lpss_remove_platform, +}; +module_platform_driver(pwm_lpss_driver_platform); + +MODULE_DESCRIPTION("PWM platform driver for Intel LPSS"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:pwm-lpss"); diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 4df994f72d96..e9798253a16f 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -13,15 +13,11 @@ * published by the Free Software Foundation. */ -#include <linux/acpi.h> -#include <linux/device.h> +#include <linux/io.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/pwm.h> -#include <linux/platform_device.h> -#include <linux/pci.h> -static int pci_drv, plat_drv; /* So we know which drivers registered */ +#include "pwm-lpss.h" #define PWM 0x00000000 #define PWM_ENABLE BIT(31) @@ -39,14 +35,17 @@ struct pwm_lpss_chip { unsigned long clk_rate; }; -struct pwm_lpss_boardinfo { - unsigned long clk_rate; +/* BayTrail */ +const struct pwm_lpss_boardinfo pwm_lpss_byt_info = { + .clk_rate = 25000000 }; +EXPORT_SYMBOL_GPL(pwm_lpss_byt_info); -/* BayTrail */ -static const struct pwm_lpss_boardinfo byt_info = { - 25000000 +/* Braswell */ +const struct pwm_lpss_boardinfo pwm_lpss_bsw_info = { + .clk_rate = 19200000 }; +EXPORT_SYMBOL_GPL(pwm_lpss_bsw_info); static inline struct pwm_lpss_chip *to_lpwm(struct pwm_chip *chip) { @@ -118,9 +117,8 @@ static const struct pwm_ops pwm_lpss_ops = { .owner = THIS_MODULE, }; -static struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, - struct resource *r, - const struct pwm_lpss_boardinfo *info) +struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, + const struct pwm_lpss_boardinfo *info) { struct pwm_lpss_chip *lpwm; int ret; @@ -147,8 +145,9 @@ static struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, return lpwm; } +EXPORT_SYMBOL_GPL(pwm_lpss_probe); -static int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) +int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) { u32 ctrl; @@ -157,114 +156,8 @@ static int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) return pwmchip_remove(&lpwm->chip); } - -static int pwm_lpss_probe_pci(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - const struct pwm_lpss_boardinfo *info; - struct pwm_lpss_chip *lpwm; - int err; - - err = pci_enable_device(pdev); - if (err < 0) - return err; - - info = (struct pwm_lpss_boardinfo *)id->driver_data; - lpwm = pwm_lpss_probe(&pdev->dev, &pdev->resource[0], info); - if (IS_ERR(lpwm)) - return PTR_ERR(lpwm); - - pci_set_drvdata(pdev, lpwm); - return 0; -} - -static void pwm_lpss_remove_pci(struct pci_dev *pdev) -{ - struct pwm_lpss_chip *lpwm = pci_get_drvdata(pdev); - - pwm_lpss_remove(lpwm); - pci_disable_device(pdev); -} - -static struct pci_device_id pwm_lpss_pci_ids[] = { - { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&byt_info}, - { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&byt_info}, - { }, -}; -MODULE_DEVICE_TABLE(pci, pwm_lpss_pci_ids); - -static struct pci_driver pwm_lpss_driver_pci = { - .name = "pwm-lpss", - .id_table = pwm_lpss_pci_ids, - .probe = pwm_lpss_probe_pci, - .remove = pwm_lpss_remove_pci, -}; - -static int pwm_lpss_probe_platform(struct platform_device *pdev) -{ - const struct pwm_lpss_boardinfo *info; - const struct acpi_device_id *id; - struct pwm_lpss_chip *lpwm; - struct resource *r; - - id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); - if (!id) - return -ENODEV; - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - - info = (struct pwm_lpss_boardinfo *)id->driver_data; - lpwm = pwm_lpss_probe(&pdev->dev, r, info); - if (IS_ERR(lpwm)) - return PTR_ERR(lpwm); - - platform_set_drvdata(pdev, lpwm); - return 0; -} - -static int pwm_lpss_remove_platform(struct platform_device *pdev) -{ - struct pwm_lpss_chip *lpwm = platform_get_drvdata(pdev); - - return pwm_lpss_remove(lpwm); -} - -static const struct acpi_device_id pwm_lpss_acpi_match[] = { - { "80860F09", (unsigned long)&byt_info }, - { }, -}; -MODULE_DEVICE_TABLE(acpi, pwm_lpss_acpi_match); - -static struct platform_driver pwm_lpss_driver_platform = { - .driver = { - .name = "pwm-lpss", - .acpi_match_table = pwm_lpss_acpi_match, - }, - .probe = pwm_lpss_probe_platform, - .remove = pwm_lpss_remove_platform, -}; - -static int __init pwm_init(void) -{ - pci_drv = pci_register_driver(&pwm_lpss_driver_pci); - plat_drv = platform_driver_register(&pwm_lpss_driver_platform); - if (pci_drv && plat_drv) - return pci_drv; - - return 0; -} -module_init(pwm_init); - -static void __exit pwm_exit(void) -{ - if (!pci_drv) - pci_unregister_driver(&pwm_lpss_driver_pci); - if (!plat_drv) - platform_driver_unregister(&pwm_lpss_driver_platform); -} -module_exit(pwm_exit); +EXPORT_SYMBOL_GPL(pwm_lpss_remove); MODULE_DESCRIPTION("PWM driver for Intel LPSS"); MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>"); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:pwm-lpss"); diff --git a/drivers/pwm/pwm-lpss.h b/drivers/pwm/pwm-lpss.h new file mode 100644 index 000000000000..aa041bb1b67d --- /dev/null +++ b/drivers/pwm/pwm-lpss.h @@ -0,0 +1,32 @@ +/* + * Intel Low Power Subsystem PWM controller driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __PWM_LPSS_H +#define __PWM_LPSS_H + +#include <linux/device.h> +#include <linux/pwm.h> + +struct pwm_lpss_chip; + +struct pwm_lpss_boardinfo { + unsigned long clk_rate; +}; + +extern const struct pwm_lpss_boardinfo pwm_lpss_byt_info; +extern const struct pwm_lpss_boardinfo pwm_lpss_bsw_info; + +struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, + const struct pwm_lpss_boardinfo *info); +int pwm_lpss_remove(struct pwm_lpss_chip *lpwm); + +#endif /* __PWM_LPSS_H */ diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index bdd8644c01cf..9442df244101 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -24,7 +24,9 @@ #define PWM_ENABLE (1 << 0) #define PWM_CONTINUOUS (1 << 1) #define PWM_DUTY_POSITIVE (1 << 3) +#define PWM_DUTY_NEGATIVE (0 << 3) #define PWM_INACTIVE_NEGATIVE (0 << 4) +#define PWM_INACTIVE_POSITIVE (1 << 4) #define PWM_OUTPUT_LEFT (0 << 5) #define PWM_LP_DISABLE (0 << 8) @@ -45,8 +47,10 @@ struct rockchip_pwm_regs { struct rockchip_pwm_data { struct rockchip_pwm_regs regs; unsigned int prescaler; + const struct pwm_ops *ops; - void (*set_enable)(struct pwm_chip *chip, bool enable); + void (*set_enable)(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable); }; static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) @@ -54,7 +58,8 @@ static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) return container_of(c, struct rockchip_pwm_chip, chip); } -static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, bool enable) +static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN; @@ -70,14 +75,19 @@ static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, bool enable) writel_relaxed(val, pc->base + pc->data->regs.ctrl); } -static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip, bool enable) +static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | - PWM_CONTINUOUS | PWM_DUTY_POSITIVE | - PWM_INACTIVE_NEGATIVE; + PWM_CONTINUOUS; u32 val; + if (pwm->polarity == PWM_POLARITY_INVERSED) + enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE; + else + enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE; + val = readl_relaxed(pc->base + pc->data->regs.ctrl); if (enable) @@ -124,6 +134,19 @@ static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static int rockchip_pwm_set_polarity(struct pwm_chip *chip, + struct pwm_device *pwm, + enum pwm_polarity polarity) +{ + /* + * No action needed here because pwm->polarity will be set by the core + * and the core will only change polarity when the PWM is not enabled. + * We'll handle things in set_enable(). + */ + + return 0; +} + static int rockchip_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); @@ -133,7 +156,7 @@ static int rockchip_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) if (ret) return ret; - pc->data->set_enable(chip, true); + pc->data->set_enable(chip, pwm, true); return 0; } @@ -142,18 +165,26 @@ static void rockchip_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - pc->data->set_enable(chip, false); + pc->data->set_enable(chip, pwm, false); clk_disable(pc->clk); } -static const struct pwm_ops rockchip_pwm_ops = { +static const struct pwm_ops rockchip_pwm_ops_v1 = { .config = rockchip_pwm_config, .enable = rockchip_pwm_enable, .disable = rockchip_pwm_disable, .owner = THIS_MODULE, }; +static const struct pwm_ops rockchip_pwm_ops_v2 = { + .config = rockchip_pwm_config, + .set_polarity = rockchip_pwm_set_polarity, + .enable = rockchip_pwm_enable, + .disable = rockchip_pwm_disable, + .owner = THIS_MODULE, +}; + static const struct rockchip_pwm_data pwm_data_v1 = { .regs = { .duty = 0x04, @@ -162,6 +193,7 @@ static const struct rockchip_pwm_data pwm_data_v1 = { .ctrl = 0x0c, }, .prescaler = 2, + .ops = &rockchip_pwm_ops_v1, .set_enable = rockchip_pwm_set_enable_v1, }; @@ -173,6 +205,7 @@ static const struct rockchip_pwm_data pwm_data_v2 = { .ctrl = 0x0c, }, .prescaler = 1, + .ops = &rockchip_pwm_ops_v2, .set_enable = rockchip_pwm_set_enable_v2, }; @@ -184,6 +217,7 @@ static const struct rockchip_pwm_data pwm_data_vop = { .ctrl = 0x00, }, .prescaler = 1, + .ops = &rockchip_pwm_ops_v2, .set_enable = rockchip_pwm_set_enable_v2, }; @@ -227,10 +261,15 @@ static int rockchip_pwm_probe(struct platform_device *pdev) pc->data = id->data; pc->chip.dev = &pdev->dev; - pc->chip.ops = &rockchip_pwm_ops; + pc->chip.ops = pc->data->ops; pc->chip.base = -1; pc->chip.npwm = 1; + if (pc->data->ops->set_polarity) { + pc->chip.of_xlate = of_pwm_xlate_with_flags; + pc->chip.of_pwm_n_cells = 3; + } + ret = pwmchip_add(&pc->chip); if (ret < 0) { clk_unprepare(pc->clk); diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 8cd0beebdc3f..94ae1798d48a 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -830,7 +830,7 @@ config RTC_DRV_DA9063 config RTC_DRV_EFI tristate "EFI RTC" - depends on EFI + depends on EFI && !X86 help If you say yes here you will get support for the EFI Real Time Clock. diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index c384fec6d173..53b589dc34eb 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -236,3 +236,4 @@ MODULE_ALIAS("platform:rtc-efi"); MODULE_AUTHOR("dann frazier <dannf@hp.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("EFI RTC driver"); +MODULE_ALIAS("platform:rtc-efi"); diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index dc24ecfac2d1..db2cb1f8a1b5 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -105,7 +105,7 @@ config SCLP_ASYNC config HMC_DRV def_tristate m prompt "Support for file transfers from HMC drive CD/DVD-ROM" - depends on 64BIT + depends on S390 && 64BIT select CRC16 help This option enables support for file transfers from a Hardware diff --git a/drivers/scsi/osd/Kbuild b/drivers/scsi/osd/Kbuild index 5fd73d77c3af..58cecd45b0f5 100644 --- a/drivers/scsi/osd/Kbuild +++ b/drivers/scsi/osd/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh <bharrosh@panasas.com> +# Boaz Harrosh <ooo@electrozaur.com> # Benny Halevy <bhalevy@panasas.com> # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index a0703514eb0f..347cc5e33749 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh <bharrosh@panasas.com> +# Boaz Harrosh <ooo@electrozaur.com> # Benny Halevy <bhalevy@panasas.com> # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_debug.h b/drivers/scsi/osd/osd_debug.h index 579e491f11df..26341261bb5c 100644 --- a/drivers/scsi/osd/osd_debug.h +++ b/drivers/scsi/osd/osd_debug.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index fd19fd8468ac..488c3929f19a 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -7,7 +7,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify @@ -57,7 +57,7 @@ enum { OSD_REQ_RETRIES = 1 }; -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); MODULE_DESCRIPTION("open-osd initiator library libosd.ko"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c index e1d9a4c4c4b3..92cdd4b06526 100644 --- a/drivers/scsi/osd/osd_uld.c +++ b/drivers/scsi/osd/osd_uld.c @@ -10,7 +10,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify @@ -74,7 +74,7 @@ static const char osd_name[] = "osd"; static const char *osd_version_string = "open-osd 0.2.1"; -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); MODULE_DESCRIPTION("open-osd Upper-Layer-Driver osd.ko"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(SCSI_OSD_MAJOR); diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 829752cfd73f..a902fa1db7af 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -112,6 +112,7 @@ static void qlt_abort_cmd_on_host_reset(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd); static void qlt_alloc_qfull_cmd(struct scsi_qla_host *vha, struct atio_from_isp *atio, uint16_t status, int qfull); +static void qlt_disable_vha(struct scsi_qla_host *vha); /* * Global Variables */ @@ -210,7 +211,7 @@ static inline void qlt_decr_num_pend_cmds(struct scsi_qla_host *vha) spin_unlock_irqrestore(&vha->hw->tgt.q_full_lock, flags); } -void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, +static void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, struct atio_from_isp *atio) { ql_dbg(ql_dbg_tgt, vha, 0xe072, @@ -433,7 +434,7 @@ static int qlt_reset(struct scsi_qla_host *vha, void *iocb, int mcmd) #if 0 /* FIXME: Re-enable Global event handling.. */ /* Global event */ atomic_inc(&ha->tgt.qla_tgt->tgt_global_resets_count); - qlt_clear_tgt_db(ha->tgt.qla_tgt, 1); + qlt_clear_tgt_db(ha->tgt.qla_tgt); if (!list_empty(&ha->tgt.qla_tgt->sess_list)) { sess = list_entry(ha->tgt.qla_tgt->sess_list.next, typeof(*sess), sess_list_entry); @@ -515,7 +516,7 @@ static void qlt_schedule_sess_for_deletion(struct qla_tgt_sess *sess, } /* ha->hardware_lock supposed to be held on entry */ -static void qlt_clear_tgt_db(struct qla_tgt *tgt, bool local_only) +static void qlt_clear_tgt_db(struct qla_tgt *tgt) { struct qla_tgt_sess *sess; @@ -867,7 +868,7 @@ int qlt_stop_phase1(struct qla_tgt *tgt) mutex_lock(&vha->vha_tgt.tgt_mutex); spin_lock_irqsave(&ha->hardware_lock, flags); tgt->tgt_stop = 1; - qlt_clear_tgt_db(tgt, true); + qlt_clear_tgt_db(tgt); spin_unlock_irqrestore(&ha->hardware_lock, flags); mutex_unlock(&vha->vha_tgt.tgt_mutex); mutex_unlock(&qla_tgt_mutex); @@ -1462,12 +1463,13 @@ out_err: return -1; } -static inline void qlt_unmap_sg(struct scsi_qla_host *vha, - struct qla_tgt_cmd *cmd) +static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd) { struct qla_hw_data *ha = vha->hw; - BUG_ON(!cmd->sg_mapped); + if (!cmd->sg_mapped) + return; + pci_unmap_sg(ha->pdev, cmd->sg, cmd->sg_cnt, cmd->dma_data_direction); cmd->sg_mapped = 0; @@ -2428,8 +2430,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, return 0; out_unmap_unlock: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2506,8 +2507,7 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) return res; out_unlock_free_unmap: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2741,8 +2741,7 @@ done: if (!ha_locked && !in_interrupt()) msleep(250); /* just in case */ - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); vha->hw->tgt.tgt_ops->free_cmd(cmd); } return; @@ -3087,8 +3086,7 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, tfo = se_cmd->se_tfo; cmd->cmd_sent_to_fw = 0; - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); if (unlikely(status != CTIO_SUCCESS)) { switch (status & 0xFFFF) { @@ -5343,7 +5341,7 @@ void qlt_lport_deregister(struct scsi_qla_host *vha) EXPORT_SYMBOL(qlt_lport_deregister); /* Must be called under HW lock */ -void qlt_set_mode(struct scsi_qla_host *vha) +static void qlt_set_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5364,7 +5362,7 @@ void qlt_set_mode(struct scsi_qla_host *vha) } /* Must be called under HW lock */ -void qlt_clear_mode(struct scsi_qla_host *vha) +static void qlt_clear_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5428,8 +5426,7 @@ EXPORT_SYMBOL(qlt_enable_vha); * * Disable Target Mode and reset the adapter */ -void -qlt_disable_vha(struct scsi_qla_host *vha) +static void qlt_disable_vha(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; struct qla_tgt *tgt = vha->vha_tgt.qla_tgt; diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index 8ff330f7d6f5..332086776dfe 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -1001,11 +1001,11 @@ struct qla_tgt_prm { struct qla_tgt *tgt; void *pkt; struct scatterlist *sg; /* cmd data buffer SG vector */ + unsigned char *sense_buffer; int seg_cnt; int req_cnt; uint16_t rq_result; uint16_t scsi_status; - unsigned char *sense_buffer; int sense_buffer_len; int residual; int add_status_pkt; @@ -1033,10 +1033,6 @@ struct qla_tgt_srr_ctio { extern struct qla_tgt_data qla_target; -/* - * Internal function prototypes - */ -void qlt_disable_vha(struct scsi_qla_host *); /* * Function prototypes for qla_target.c logic used by qla2xxx LLD code. @@ -1049,8 +1045,6 @@ extern void qlt_lport_deregister(struct scsi_qla_host *); extern void qlt_unreg_sess(struct qla_tgt_sess *); extern void qlt_fc_port_added(struct scsi_qla_host *, fc_port_t *); extern void qlt_fc_port_deleted(struct scsi_qla_host *, fc_port_t *); -extern void qlt_set_mode(struct scsi_qla_host *ha); -extern void qlt_clear_mode(struct scsi_qla_host *ha); extern int __init qlt_init(void); extern void qlt_exit(void); extern void qlt_update_vp_map(struct scsi_qla_host *, int); @@ -1083,13 +1077,9 @@ static inline void qla_reverse_ini_mode(struct scsi_qla_host *ha) /* * Exported symbols from qla_target.c LLD logic used by qla2xxx code.. */ -extern void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *, - struct atio_from_isp *); extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, response_t *); extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *); extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t); -extern int qlt_rdy_to_xfer_dif(struct qla_tgt_cmd *); -extern int qlt_xmit_response_dif(struct qla_tgt_cmd *, int, uint8_t); extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *); extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *); extern void qlt_free_cmd(struct qla_tgt_cmd *cmd); diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index 031b2961c6b7..73f9feecda72 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -786,7 +786,16 @@ static void tcm_qla2xxx_clear_nacl_from_fcport_map(struct qla_tgt_sess *sess) pr_debug("fc_rport domain: port_id 0x%06x\n", nacl->nport_id); node = btree_remove32(&lport->lport_fcport_map, nacl->nport_id); - WARN_ON(node && (node != se_nacl)); + if (WARN_ON(node && (node != se_nacl))) { + /* + * The nacl no longer matches what we think it should be. + * Most likely a new dynamic acl has been added while + * someone dropped the hardware lock. It clearly is a + * bug elsewhere, but this bit can't make things worse. + */ + btree_insert32(&lport->lport_fcport_map, nacl->nport_id, + node, GFP_ATOMIC); + } pr_debug("Removed from fcport_map: %p for WWNN: 0x%016LX, port_id: 0x%06x\n", se_nacl, nacl->nport_wwnn, nacl->nport_id); diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index dc2d84ac5a0e..81d44c477a5b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,6 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device +config TCM_USER + tristate "TCM/USER Subsystem Plugin for Linux" + depends on UIO && NET + help + Say Y here to enable the TCM/USER subsystem plugin for a userspace + process to handle requests + source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" source "drivers/target/iscsi/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 85b012d2f89b..bbb4a7d638ef 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o +obj-$(CONFIG_TCM_USER) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 260c3e1e312c..b19e4329ba00 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3709,7 +3709,6 @@ static inline void iscsit_thread_check_cpumask( struct task_struct *p, int mode) { - char buf[128]; /* * mode == 1 signals iscsi_target_tx_thread() usage. * mode == 0 signals iscsi_target_rx_thread() usage. @@ -3728,8 +3727,6 @@ static inline void iscsit_thread_check_cpumask( * both TX and RX kthreads are scheduled to run on the * same CPU. */ - memset(buf, 0, 128); - cpumask_scnprintf(buf, 128, conn->conn_cpumask); set_cpus_allowed_ptr(p, conn->conn_cpumask); } @@ -4326,8 +4323,7 @@ int iscsit_close_connection( if (conn->conn_tx_hash.tfm) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); conn->conn_ops = NULL; diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index ae03f3e5de1e..9059c1e0b26e 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -669,12 +669,10 @@ static ssize_t lio_target_nacl_show_info( } else { sess = se_sess->fabric_sess_ptr; - if (sess->sess_ops->InitiatorName) - rb += sprintf(page+rb, "InitiatorName: %s\n", - sess->sess_ops->InitiatorName); - if (sess->sess_ops->InitiatorAlias) - rb += sprintf(page+rb, "InitiatorAlias: %s\n", - sess->sess_ops->InitiatorAlias); + rb += sprintf(page+rb, "InitiatorName: %s\n", + sess->sess_ops->InitiatorName); + rb += sprintf(page+rb, "InitiatorAlias: %s\n", + sess->sess_ops->InitiatorAlias); rb += sprintf(page+rb, "LIO Session ID: %u " "ISID: 0x%02x %02x %02x %02x %02x %02x " diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index 0d1e6ee3e992..a0ae5fc0ad75 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -345,7 +345,6 @@ static int iscsit_dataout_check_datasn( struct iscsi_cmd *cmd, unsigned char *buf) { - int dump = 0, recovery = 0; u32 data_sn = 0; struct iscsi_conn *conn = cmd->conn; struct iscsi_data *hdr = (struct iscsi_data *) buf; @@ -370,13 +369,11 @@ static int iscsit_dataout_check_datasn( pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " higher than expected 0x%08x.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - recovery = 1; goto recover; } else if (be32_to_cpu(hdr->datasn) < data_sn) { pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " lower than expected 0x%08x, discarding payload.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - dump = 1; goto dump; } @@ -392,8 +389,7 @@ dump: if (iscsit_dump_data_payload(conn, payload_length, 1) < 0) return DATAOUT_CANNOT_RECOVER; - return (recovery || dump) ? DATAOUT_WITHIN_COMMAND_RECOVERY : - DATAOUT_NORMAL; + return DATAOUT_WITHIN_COMMAND_RECOVERY; } static int iscsit_dataout_pre_datapduinorder_yes( diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 5e71ac609418..480f2e0ecc11 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -978,8 +978,7 @@ int iscsit_setup_np( return 0; fail: np->np_socket = NULL; - if (sock) - sock_release(sock); + sock_release(sock); return ret; } @@ -1190,8 +1189,7 @@ old_sess_out: if (!IS_ERR(conn->conn_tx_hash.tfm)) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); @@ -1268,8 +1266,6 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) iscsit_put_transport(conn->conn_transport); kfree(conn); conn = NULL; - if (ret == -ENODEV) - goto out; /* Get another socket */ return 1; } diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 73355f4fca74..ce87ce9bdb9c 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -1481,8 +1481,9 @@ void iscsit_collect_login_stats( if (conn->param_list) intrname = iscsi_find_param_from_key(INITIATORNAME, conn->param_list); - strcpy(ls->last_intr_fail_name, - (intrname ? intrname->value : "Unknown")); + strlcpy(ls->last_intr_fail_name, + (intrname ? intrname->value : "Unknown"), + sizeof(ls->last_intr_fail_name)); ls->last_intr_fail_ip_family = conn->login_family; diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index 340de9d92b15..ab3ab27d49b7 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -153,18 +153,11 @@ static int tcm_loop_change_queue_type(struct scsi_device *sdev, int tag) /* * Locate the SAM Task Attr from struct scsi_cmnd * */ -static int tcm_loop_sam_attr(struct scsi_cmnd *sc) -{ - if (sc->device->tagged_supported) { - switch (sc->tag) { - case HEAD_OF_QUEUE_TAG: - return MSG_HEAD_TAG; - case ORDERED_QUEUE_TAG: - return MSG_ORDERED_TAG; - default: - break; - } - } +static int tcm_loop_sam_attr(struct scsi_cmnd *sc, int tag) +{ + if (sc->device->tagged_supported && + sc->device->ordered_tags && tag >= 0) + return MSG_ORDERED_TAG; return MSG_SIMPLE_TAG; } @@ -227,7 +220,7 @@ static void tcm_loop_submission_work(struct work_struct *work) rc = target_submit_cmd_map_sgls(se_cmd, tl_nexus->se_sess, sc->cmnd, &tl_cmd->tl_sense_buf[0], tl_cmd->sc->device->lun, - transfer_length, tcm_loop_sam_attr(sc), + transfer_length, tcm_loop_sam_attr(sc, tl_cmd->sc_cmd_tag), sc->sc_data_direction, 0, scsi_sglist(sc), scsi_sg_count(sc), sgl_bidi, sgl_bidi_count, @@ -266,7 +259,7 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) } tl_cmd->sc = sc; - tl_cmd->sc_cmd_tag = sc->tag; + tl_cmd->sc_cmd_tag = sc->request->tag; INIT_WORK(&tl_cmd->work, tcm_loop_submission_work); queue_work(tcm_loop_workqueue, &tl_cmd->work); return 0; @@ -370,7 +363,7 @@ static int tcm_loop_abort_task(struct scsi_cmnd *sc) */ tl_tpg = &tl_hba->tl_hba_tpgs[sc->device->id]; ret = tcm_loop_issue_tmr(tl_tpg, tl_nexus, sc->device->lun, - sc->tag, TMR_ABORT_TASK); + sc->request->tag, TMR_ABORT_TASK); return (ret == TMR_FUNCTION_COMPLETE) ? SUCCESS : FAILED; } @@ -960,8 +953,7 @@ static int tcm_loop_port_link( struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; - atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tl_tpg->tl_tpg_port_count); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -995,8 +987,7 @@ static void tcm_loop_port_unlink( scsi_remove_device(sd); scsi_device_put(sd); - atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tl_tpg->tl_tpg_port_count); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fbc5ebb5f761..fb87780929d2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -392,8 +392,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) if (tg_pt_id != tg_pt_gp->tg_pt_gp_id) continue; - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -403,8 +402,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) found = true; spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -998,8 +996,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * every I_T nexus other than the I_T nexus on which the SET * TARGET PORT GROUPS command */ - atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&mem->tg_pt_gp_mem_ref_cnt); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1028,8 +1025,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_unlock_bh(&port->sep_alua_lock); spin_lock(&tg_pt_gp->tg_pt_gp_lock); - atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&mem->tg_pt_gp_mem_ref_cnt); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1063,7 +1059,6 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (tg_pt_gp->tg_pt_gp_transition_complete) @@ -1125,7 +1120,6 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1168,7 +1162,6 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1184,8 +1177,7 @@ int core_alua_do_port_transition( l_tg_pt_gp->tg_pt_gp_alua_nacl = l_nacl; rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } /* @@ -1198,8 +1190,7 @@ int core_alua_do_port_transition( lu_gp_mem_list) { dev = lu_gp_mem->lu_gp_mem_dev; - atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1227,8 +1218,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_port = NULL; tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1238,16 +1228,14 @@ int core_alua_do_port_transition( new_state, explicit); spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); if (rc) break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); spin_lock(&lu_gp->lu_gp_lock); - atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1260,8 +1248,7 @@ int core_alua_do_port_transition( core_alua_dump_state(new_state)); } - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 756def38c77a..79f9296a08ae 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -665,6 +665,9 @@ SE_DEV_ATTR(is_nonrot, S_IRUGO | S_IWUSR); DEF_DEV_ATTRIB(emulate_rest_reord); SE_DEV_ATTR(emulate_rest_reord, S_IRUGO | S_IWUSR); +DEF_DEV_ATTRIB(force_pr_aptpl); +SE_DEV_ATTR(force_pr_aptpl, S_IRUGO | S_IWUSR); + DEF_DEV_ATTRIB_RO(hw_block_size); SE_DEV_ATTR_RO(hw_block_size); @@ -719,6 +722,7 @@ static struct configfs_attribute *target_core_dev_attrib_attrs[] = { &target_core_dev_attrib_hw_pi_prot_type.attr, &target_core_dev_attrib_pi_prot_format.attr, &target_core_dev_attrib_enforce_pr_isids.attr, + &target_core_dev_attrib_force_pr_aptpl.attr, &target_core_dev_attrib_is_nonrot.attr, &target_core_dev_attrib_emulate_rest_reord.attr, &target_core_dev_attrib_hw_block_size.attr, @@ -1263,7 +1267,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( { unsigned char *i_fabric = NULL, *i_port = NULL, *isid = NULL; unsigned char *t_fabric = NULL, *t_port = NULL; - char *orig, *ptr, *arg_p, *opts; + char *orig, *ptr, *opts; substring_t args[MAX_OPT_ARGS]; unsigned long long tmp_ll; u64 sa_res_key = 0; @@ -1295,14 +1299,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( token = match_token(ptr, tokens, args); switch (token) { case Opt_initiator_fabric: - i_fabric = match_strdup(&args[0]); + i_fabric = match_strdup(args); if (!i_fabric) { ret = -ENOMEM; goto out; } break; case Opt_initiator_node: - i_port = match_strdup(&args[0]); + i_port = match_strdup(args); if (!i_port) { ret = -ENOMEM; goto out; @@ -1316,7 +1320,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_initiator_sid: - isid = match_strdup(&args[0]); + isid = match_strdup(args); if (!isid) { ret = -ENOMEM; goto out; @@ -1330,15 +1334,9 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_sa_res_key: - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; - goto out; - } - ret = kstrtoull(arg_p, 0, &tmp_ll); + ret = kstrtoull(args->from, 0, &tmp_ll); if (ret < 0) { - pr_err("kstrtoull() failed for" - " sa_res_key=\n"); + pr_err("kstrtoull() failed for sa_res_key=\n"); goto out; } sa_res_key = (u64)tmp_ll; @@ -1370,14 +1368,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( * PR APTPL Metadata for Target Port */ case Opt_target_fabric: - t_fabric = match_strdup(&args[0]); + t_fabric = match_strdup(args); if (!t_fabric) { ret = -ENOMEM; goto out; } break; case Opt_target_node: - t_port = match_strdup(&args[0]); + t_port = match_strdup(args); if (!t_port) { ret = -ENOMEM; goto out; diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 98da90167159..c45f9e907e44 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -224,8 +224,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( if (port->sep_rtpi != rtpi) continue; - atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->pr_ref_count); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1019,6 +1018,23 @@ int se_dev_set_enforce_pr_isids(struct se_device *dev, int flag) return 0; } +int se_dev_set_force_pr_aptpl(struct se_device *dev, int flag) +{ + if ((flag != 0) && (flag != 1)) { + printk(KERN_ERR "Illegal value %d\n", flag); + return -EINVAL; + } + if (dev->export_count) { + pr_err("dev[%p]: Unable to set force_pr_aptpl while" + " export_count is %d\n", dev, dev->export_count); + return -EINVAL; + } + + dev->dev_attrib.force_pr_aptpl = flag; + pr_debug("dev[%p]: SE Device force_pr_aptpl: %d\n", dev, flag); + return 0; +} + int se_dev_set_is_nonrot(struct se_device *dev, int flag) { if ((flag != 0) && (flag != 1)) { @@ -1250,24 +1266,16 @@ struct se_lun *core_dev_add_lun( * * */ -int core_dev_del_lun( +void core_dev_del_lun( struct se_portal_group *tpg, - u32 unpacked_lun) + struct se_lun *lun) { - struct se_lun *lun; - - lun = core_tpg_pre_dellun(tpg, unpacked_lun); - if (IS_ERR(lun)) - return PTR_ERR(lun); - - core_tpg_post_dellun(tpg, lun); - - pr_debug("%s_TPG[%u]_LUN[%u] - Deactivated %s Logical Unit from" + pr_debug("%s_TPG[%u]_LUN[%u] - Deactivating %s Logical Unit from" " device object\n", tpg->se_tpg_tfo->get_fabric_name(), - tpg->se_tpg_tfo->tpg_get_tag(tpg), unpacked_lun, + tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun, tpg->se_tpg_tfo->get_fabric_name()); - return 0; + core_tpg_remove_lun(tpg, lun); } struct se_lun *core_get_lun_from_tpg(struct se_portal_group *tpg, u32 unpacked_lun) @@ -1396,8 +1404,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); - atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_inc_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1409,7 +1416,8 @@ int core_dev_add_initiator_node_lun_acl( * Check to see if there are any existing persistent reservation APTPL * pre-registrations that need to be enabled for this LUN ACL.. */ - core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, lacl); + core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, nacl, + lacl->mapped_lun); return 0; } @@ -1430,8 +1438,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); - atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_dec_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, @@ -1554,6 +1561,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) dev->dev_attrib.emulate_3pc = DA_EMULATE_3PC; dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE0_PROT; dev->dev_attrib.enforce_pr_isids = DA_ENFORCE_PR_ISIDS; + dev->dev_attrib.force_pr_aptpl = DA_FORCE_PR_APTPL; dev->dev_attrib.is_nonrot = DA_IS_NONROT; dev->dev_attrib.emulate_rest_reord = DA_EMULATE_REST_REORD; dev->dev_attrib.max_unmap_lba_count = DA_MAX_UNMAP_LBA_COUNT; diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 7de9f0475d05..0c3f90130b7d 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -320,7 +320,7 @@ static struct config_group *target_fabric_make_mappedlun( struct se_node_acl, acl_group); struct se_portal_group *se_tpg = se_nacl->se_tpg; struct target_fabric_configfs *tf = se_tpg->se_tpg_wwn->wwn_tf; - struct se_lun_acl *lacl; + struct se_lun_acl *lacl = NULL; struct config_item *acl_ci; struct config_group *lacl_cg = NULL, *ml_stat_grp = NULL; char *buf; @@ -406,6 +406,7 @@ static struct config_group *target_fabric_make_mappedlun( out: if (lacl_cg) kfree(lacl_cg->default_groups); + kfree(lacl); kfree(buf); return ERR_PTR(ret); } @@ -821,7 +822,7 @@ static int target_fabric_port_unlink( tf->tf_ops.fabric_pre_unlink(se_tpg, lun); } - core_dev_del_lun(se_tpg, lun->unpacked_lun); + core_dev_del_lun(se_tpg, lun); return 0; } @@ -910,16 +911,12 @@ static struct config_group *target_fabric_make_lun( GFP_KERNEL); if (!port_stat_grp->default_groups) { pr_err("Unable to allocate port_stat_grp->default_groups\n"); - errno = -ENOMEM; - goto out; + kfree(lun_cg->default_groups); + return ERR_PTR(-ENOMEM); } target_stat_setup_port_default_groups(lun); return &lun->lun_group; -out: - if (lun_cg) - kfree(lun_cg->default_groups); - return ERR_PTR(errno); } static void target_fabric_drop_lun( diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c index 0d1cf8b4f49f..35bfe77160d8 100644 --- a/drivers/target/target_core_fabric_lib.c +++ b/drivers/target/target_core_fabric_lib.c @@ -394,9 +394,9 @@ char *iscsi_parse_pr_out_transport_id( * If the caller wants the TransportID Length, we set that value for the * entire iSCSI Tarnsport ID now. */ - if (out_tid_len != NULL) { - add_len = ((buf[2] >> 8) & 0xff); - add_len |= (buf[3] & 0xff); + if (out_tid_len) { + /* The shift works thanks to integer promotion rules */ + add_len = (buf[2] << 8) | buf[3]; tid_len = strlen(&buf[4]); tid_len += 4; /* Add four bytes for iSCSI Transport ID header */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7d6cddaec525..72c83d98662b 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -415,7 +415,7 @@ fd_execute_sync_cache(struct se_cmd *cmd) } else { start = cmd->t_task_lba * dev->dev_attrib.block_size; if (cmd->data_length) - end = start + cmd->data_length; + end = start + cmd->data_length - 1; else end = LLONG_MAX; } @@ -680,7 +680,12 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct fd_dev *fd_dev = FD_DEV(dev); loff_t start = cmd->t_task_lba * dev->dev_attrib.block_size; - loff_t end = start + cmd->data_length; + loff_t end; + + if (cmd->data_length) + end = start + cmd->data_length - 1; + else + end = LLONG_MAX; vfs_fsync_range(fd_dev->fd_file, start, end, 1); } @@ -762,7 +767,9 @@ static ssize_t fd_set_configfs_dev_params(struct se_device *dev, fd_dev->fbd_flags |= FBDF_HAS_SIZE; break; case Opt_fd_buffered_io: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; if (arg != 1) { pr_err("bogus fd_buffered_io=%d value\n", arg); ret = -EINVAL; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index de9cab708f45..e31f42f369ff 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -38,6 +38,7 @@ int se_dev_set_emulate_3pc(struct se_device *, int); int se_dev_set_pi_prot_type(struct se_device *, int); int se_dev_set_pi_prot_format(struct se_device *, int); int se_dev_set_enforce_pr_isids(struct se_device *, int); +int se_dev_set_force_pr_aptpl(struct se_device *, int); int se_dev_set_is_nonrot(struct se_device *, int); int se_dev_set_emulate_rest_reord(struct se_device *dev, int); int se_dev_set_queue_depth(struct se_device *, u32); @@ -46,7 +47,7 @@ int se_dev_set_fabric_max_sectors(struct se_device *, u32); int se_dev_set_optimal_sectors(struct se_device *, u32); int se_dev_set_block_size(struct se_device *, u32); struct se_lun *core_dev_add_lun(struct se_portal_group *, struct se_device *, u32); -int core_dev_del_lun(struct se_portal_group *, u32); +void core_dev_del_lun(struct se_portal_group *, struct se_lun *); struct se_lun *core_get_lun_from_tpg(struct se_portal_group *, u32); struct se_lun_acl *core_dev_init_initiator_node_lun_acl(struct se_portal_group *, struct se_node_acl *, u32, int *); @@ -82,8 +83,7 @@ void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *); struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u32); int core_tpg_add_lun(struct se_portal_group *, struct se_lun *, u32, struct se_device *); -struct se_lun *core_tpg_pre_dellun(struct se_portal_group *, u32 unpacked_lun); -int core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); +void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *); /* target_core_transport.c */ extern struct kmem_cache *se_tmr_req_cache; diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index df357862286e..8c60a1a1ae8d 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -674,8 +674,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( */ spin_lock(&dev->se_port_lock); list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { - atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&port->sep_tg_pt_ref_cnt); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -709,8 +708,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (strcmp(nacl->initiatorname, nacl_tmp->initiatorname)) continue; - atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve_tmp->pr_ref_count); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -722,10 +720,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (ret < 0) { pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); goto out; } /* @@ -739,10 +735,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( nacl_tmp, deve_tmp, NULL, sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -754,8 +748,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_unlock_bh(&port->sep_alua_lock); spin_lock(&dev->se_port_lock); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); } spin_unlock(&dev->se_port_lock); @@ -902,6 +895,7 @@ static int __core_scsi3_check_aptpl_registration( spin_lock(&pr_tmpl->aptpl_reg_lock); list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->aptpl_reg_list, pr_reg_aptpl_list) { + if (!strcmp(pr_reg->pr_iport, i_port) && (pr_reg->pr_res_mapped_lun == deve->mapped_lun) && !(strcmp(pr_reg->pr_tport, t_port)) && @@ -944,10 +938,10 @@ int core_scsi3_check_aptpl_registration( struct se_device *dev, struct se_portal_group *tpg, struct se_lun *lun, - struct se_lun_acl *lun_acl) + struct se_node_acl *nacl, + u32 mapped_lun) { - struct se_node_acl *nacl = lun_acl->se_lun_nacl; - struct se_dev_entry *deve = nacl->device_list[lun_acl->mapped_lun]; + struct se_dev_entry *deve = nacl->device_list[mapped_lun]; if (dev->dev_reservation_flags & DRF_SPC2_RESERVATIONS) return 0; @@ -1109,8 +1103,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (dev->dev_attrib.enforce_pr_isids) continue; } - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1124,8 +1117,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (strcmp(isid, pr_reg->pr_reg_isid)) continue; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1154,8 +1146,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); } static int core_scsi3_check_implicit_release( @@ -1348,8 +1339,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &tpg->tpg_group.cg_item); - atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tpg->tpg_pr_ref_count); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1368,16 +1358,14 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) struct se_portal_group *tpg = nacl->se_tpg; if (nacl->dynamic_node_acl) { - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); return; } configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &nacl->acl_group.cg_item); - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1407,8 +1395,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ if (!lun_acl) { - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); return; } nacl = lun_acl->se_lun_nacl; @@ -1417,8 +1404,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &lun_acl->se_lun_group.cg_item); - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); } static sense_reason_t @@ -1551,15 +1537,13 @@ core_scsi3_decode_spec_i_port( if (!i_str) continue; - atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tmp_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); - atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tmp_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1571,10 +1555,8 @@ core_scsi3_decode_spec_i_port( spin_lock_irq(&tmp_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl( tmp_tpg, i_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&tmp_tpg->acl_node_lock); if (!dest_node_acl) { @@ -1586,8 +1568,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1646,8 +1627,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3167,15 +3147,13 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, if (!dest_tf_ops) continue; - atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dest_se_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); - atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3271,10 +3249,8 @@ after_iport_check: spin_lock_irq(&dest_se_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl(dest_se_tpg, initiator_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&dest_se_tpg->acl_node_lock); if (!dest_node_acl) { @@ -3288,8 +3264,7 @@ after_iport_check: if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3313,8 +3288,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3497,6 +3471,7 @@ static unsigned long long core_scsi3_extract_reservation_key(unsigned char *cdb) sense_reason_t target_scsi3_emulate_pr_out(struct se_cmd *cmd) { + struct se_device *dev = cmd->se_dev; unsigned char *cdb = &cmd->t_task_cdb[0]; unsigned char *buf; u64 res_key, sa_res_key; @@ -3561,6 +3536,13 @@ target_scsi3_emulate_pr_out(struct se_cmd *cmd) aptpl = (buf[17] & 0x01); unreg = (buf[17] & 0x02); } + /* + * If the backend device has been configured to force APTPL metadata + * write-out, go ahead and propigate aptpl=1 down now. + */ + if (dev->dev_attrib.force_pr_aptpl) + aptpl = 1; + transport_kunmap_data_sg(cmd); buf = NULL; @@ -3803,7 +3785,7 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd) if (!buf) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - buf[0] = ((add_len << 8) & 0xff); + buf[0] = ((add_len >> 8) & 0xff); buf[1] = (add_len & 0xff); buf[2] |= 0x10; /* CRH: Compatible Reservation Hanlding bit. */ buf[2] |= 0x08; /* SIP_C: Specify Initiator Ports Capable bit */ @@ -3879,8 +3861,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_tpg = pr_reg->pr_reg_nacl->se_tpg; add_desc_len = 0; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3893,8 +3874,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) pr_warn("SPC-3 PRIN READ_FULL_STATUS ran" " out of buffer: %d\n", cmd->data_length); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); break; } /* @@ -3955,8 +3935,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_nacl, pr_reg, &format_code, &buf[off+4]); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_pr.h b/drivers/target/target_core_pr.h index 2ee2936fa0bd..749fd7bb7510 100644 --- a/drivers/target/target_core_pr.h +++ b/drivers/target/target_core_pr.h @@ -60,7 +60,7 @@ extern int core_scsi3_alloc_aptpl_registration( unsigned char *, u16, u32, int, int, u8); extern int core_scsi3_check_aptpl_registration(struct se_device *, struct se_portal_group *, struct se_lun *, - struct se_lun_acl *); + struct se_node_acl *, u32); extern void core_scsi3_free_pr_reg_from_nacl(struct se_device *, struct se_node_acl *); extern void core_scsi3_free_all_registrations(struct se_device *); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 70d9f6dabba0..7c8291f0bbbc 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -749,14 +749,18 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, ret = -EINVAL; goto out; } - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_host_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Host ID:" " %d\n", phv->phv_host_id, pdv->pdv_host_id); pdv->pdv_flags |= PDF_HAS_VIRT_HOST_ID; break; case Opt_scsi_channel_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_channel_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Channel" " ID: %d\n", phv->phv_host_id, @@ -764,7 +768,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_CHANNEL_ID; break; case Opt_scsi_target_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_target_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Target" " ID: %d\n", phv->phv_host_id, @@ -772,7 +778,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_TARGET_ID; break; case Opt_scsi_lun_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_lun_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI LUN ID:" " %d\n", phv->phv_host_id, pdv->pdv_lun_id); diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index bd78d9235ac6..ebe62afb957d 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -948,7 +948,7 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops) } /* reject any command that we don't have a handler for */ - if (!(cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) && !cmd->execute_cmd) + if (!cmd->execute_cmd) return TCM_UNSUPPORTED_SCSI_OPCODE; if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) { diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index f7cd95e8111a..fa5e157db47b 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -64,21 +64,17 @@ int core_tmr_alloc_req( } EXPORT_SYMBOL(core_tmr_alloc_req); -void core_tmr_release_req( - struct se_tmr_req *tmr) +void core_tmr_release_req(struct se_tmr_req *tmr) { struct se_device *dev = tmr->tmr_dev; unsigned long flags; - if (!dev) { - kfree(tmr); - return; + if (dev) { + spin_lock_irqsave(&dev->se_tmr_lock, flags); + list_del(&tmr->tmr_list); + spin_unlock_irqrestore(&dev->se_tmr_lock, flags); } - spin_lock_irqsave(&dev->se_tmr_lock, flags); - list_del(&tmr->tmr_list); - spin_unlock_irqrestore(&dev->se_tmr_lock, flags); - kfree(tmr); } @@ -90,9 +86,8 @@ static void core_tmr_handle_tas_abort( bool remove = true; /* * TASK ABORTED status (TAS) bit support - */ - if ((tmr_nacl && - (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { + */ + if ((tmr_nacl && (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { remove = false; transport_send_task_abort(cmd); } @@ -120,13 +115,12 @@ void core_tmr_abort_task( struct se_tmr_req *tmr, struct se_session *se_sess) { - struct se_cmd *se_cmd, *tmp_cmd; + struct se_cmd *se_cmd; unsigned long flags; int ref_tag; spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); - list_for_each_entry_safe(se_cmd, tmp_cmd, - &se_sess->sess_cmd_list, se_cmd_list) { + list_for_each_entry(se_cmd, &se_sess->sess_cmd_list, se_cmd_list) { if (dev != se_cmd->se_dev) continue; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index be783f717f19..0696de9553d3 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -40,6 +40,7 @@ #include <target/target_core_fabric.h> #include "target_core_internal.h" +#include "target_core_pr.h" extern struct se_device *g_lun0_dev; @@ -166,6 +167,13 @@ void core_tpg_add_node_to_devs( core_enable_device_list_for_node(lun, NULL, lun->unpacked_lun, lun_access, acl, tpg); + /* + * Check to see if there are any existing persistent reservation + * APTPL pre-registrations that need to be enabled for this dynamic + * LUN ACL now.. + */ + core_scsi3_check_aptpl_registration(dev, tpg, lun, acl, + lun->unpacked_lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); @@ -335,7 +343,7 @@ void core_tpg_clear_object_luns(struct se_portal_group *tpg) continue; spin_unlock(&tpg->tpg_lun_lock); - core_dev_del_lun(tpg, lun->unpacked_lun); + core_dev_del_lun(tpg, lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); @@ -663,13 +671,6 @@ static int core_tpg_setup_virtual_lun0(struct se_portal_group *se_tpg) return 0; } -static void core_tpg_release_virtual_lun0(struct se_portal_group *se_tpg) -{ - struct se_lun *lun = &se_tpg->tpg_virt_lun0; - - core_tpg_post_dellun(se_tpg, lun); -} - int core_tpg_register( struct target_core_fabric_ops *tfo, struct se_wwn *se_wwn, @@ -773,7 +774,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg) spin_unlock_irq(&se_tpg->acl_node_lock); if (se_tpg->se_tpg_type == TRANSPORT_TPG_TYPE_NORMAL) - core_tpg_release_virtual_lun0(se_tpg); + core_tpg_remove_lun(se_tpg, &se_tpg->tpg_virt_lun0); se_tpg->se_tpg_fabric_ptr = NULL; array_free(se_tpg->tpg_lun_list, TRANSPORT_MAX_LUNS_PER_TPG); @@ -838,37 +839,7 @@ int core_tpg_add_lun( return 0; } -struct se_lun *core_tpg_pre_dellun( - struct se_portal_group *tpg, - u32 unpacked_lun) -{ - struct se_lun *lun; - - if (unpacked_lun > (TRANSPORT_MAX_LUNS_PER_TPG-1)) { - pr_err("%s LUN: %u exceeds TRANSPORT_MAX_LUNS_PER_TPG" - "-1: %u for Target Portal Group: %u\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - TRANSPORT_MAX_LUNS_PER_TPG-1, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - return ERR_PTR(-EOVERFLOW); - } - - spin_lock(&tpg->tpg_lun_lock); - lun = tpg->tpg_lun_list[unpacked_lun]; - if (lun->lun_status != TRANSPORT_LUN_STATUS_ACTIVE) { - pr_err("%s Logical Unit Number: %u is not active on" - " Target Portal Group: %u, ignoring request.\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - spin_unlock(&tpg->tpg_lun_lock); - return ERR_PTR(-ENODEV); - } - spin_unlock(&tpg->tpg_lun_lock); - - return lun; -} - -int core_tpg_post_dellun( +void core_tpg_remove_lun( struct se_portal_group *tpg, struct se_lun *lun) { @@ -882,6 +853,4 @@ int core_tpg_post_dellun( spin_unlock(&tpg->tpg_lun_lock); percpu_ref_exit(&lun->lun_ref); - - return 0; } diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7fa62fc93e0b..9ea0d5f03f7a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -232,6 +232,10 @@ void transport_subsystem_check_init(void) if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); + ret = request_module("target_core_user"); + if (ret != 0) + pr_err("Unable to load target_core_user\n"); + sub_api_initialized = 1; } @@ -752,8 +756,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); - atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_qf_count); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1166,7 +1169,6 @@ transport_check_alloc_task_attr(struct se_cmd *cmd) * Dormant to Active status. */ cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id); - smp_mb__after_atomic(); pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n", cmd->se_ordered_id, cmd->sam_task_attr, dev->transport->name); @@ -1722,8 +1724,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) cmd->t_task_cdb[0], cmd->se_ordered_id); return false; case MSG_ORDERED_TAG: - atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_ordered_sync); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1740,8 +1741,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) /* * For SIMPLE and UNTAGGED Task Attribute commands */ - atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->simple_cmds); break; } @@ -1845,8 +1845,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) return; if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { - atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->simple_cmds); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1857,8 +1856,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { - atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_ordered_sync); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1877,8 +1875,7 @@ static void transport_complete_qf(struct se_cmd *cmd) if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) { trace_target_cmd_complete(cmd); ret = cmd->se_tfo->queue_status(cmd); - if (ret) - goto out; + goto out; } switch (cmd->data_direction) { @@ -1916,8 +1913,7 @@ static void transport_handle_queue_full( { spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); - atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_qf_count); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); @@ -2896,7 +2892,6 @@ void transport_send_task_abort(struct se_cmd *cmd) if (cmd->se_tfo->write_pending_status(cmd) != 0) { cmd->transport_state |= CMD_T_ABORTED; cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS; - smp_mb__after_atomic(); return; } } diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 101858e245b3..1738b1646988 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -161,8 +161,7 @@ int core_scsi3_ua_allocate( spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -174,8 +173,7 @@ int core_scsi3_ua_allocate( nacl->se_tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, asc, ascq); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } @@ -189,8 +187,7 @@ void core_scsi3_ua_release_all( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); } @@ -250,8 +247,7 @@ void core_scsi3_ua_for_check_condition( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -309,8 +305,7 @@ int core_scsi3_ua_clear_for_request_sense( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/drivers/target/target_core_ua.h b/drivers/target/target_core_ua.h index be912b36daae..a6b56b364e7a 100644 --- a/drivers/target/target_core_ua.h +++ b/drivers/target/target_core_ua.h @@ -1,4 +1,5 @@ #ifndef TARGET_CORE_UA_H +#define TARGET_CORE_UA_H /* * From spc4r17, Table D.1: ASC and ASCQ Assignement diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c new file mode 100644 index 000000000000..9a1b314f6482 --- /dev/null +++ b/drivers/target/target_core_user.c @@ -0,0 +1,1167 @@ +/* + * Copyright (C) 2013 Shaohua Li <shli@kernel.org> + * Copyright (C) 2014 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/idr.h> +#include <linux/timer.h> +#include <linux/parser.h> +#include <scsi/scsi.h> +#include <scsi/scsi_host.h> +#include <linux/uio_driver.h> +#include <net/genetlink.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/target_core_backend.h> +#include <linux/target_core_user.h> + +/* + * Define a shared-memory interface for LIO to pass SCSI commands and + * data to userspace for processing. This is to allow backends that + * are too complex for in-kernel support to be possible. + * + * It uses the UIO framework to do a lot of the device-creation and + * introspection work for us. + * + * See the .h file for how the ring is laid out. Note that while the + * command ring is defined, the particulars of the data area are + * not. Offset values in the command entry point to other locations + * internal to the mmap()ed area. There is separate space outside the + * command ring for data buffers. This leaves maximum flexibility for + * moving buffer allocations, or even page flipping or other + * allocation techniques, without altering the command ring layout. + * + * SECURITY: + * The user process must be assumed to be malicious. There's no way to + * prevent it breaking the command ring protocol if it wants, but in + * order to prevent other issues we must only ever read *data* from + * the shared memory area, not offsets or sizes. This applies to + * command ring entries as well as the mailbox. Extra code needed for + * this may have a 'UAM' comment. + */ + + +#define TCMU_TIME_OUT (30 * MSEC_PER_SEC) + +#define CMDR_SIZE (16 * 4096) +#define DATA_SIZE (257 * 4096) + +#define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) + +static struct device *tcmu_root_device; + +struct tcmu_hba { + u32 host_id; +}; + +/* User wants all cmds or just some */ +enum passthru_level { + TCMU_PASS_ALL = 0, + TCMU_PASS_IO, + TCMU_PASS_INVALID, +}; + +#define TCMU_CONFIG_LEN 256 + +struct tcmu_dev { + struct se_device se_dev; + + char *name; + struct se_hba *hba; + +#define TCMU_DEV_BIT_OPEN 0 +#define TCMU_DEV_BIT_BROKEN 1 + unsigned long flags; + enum passthru_level pass_level; + + struct uio_info uio_info; + + struct tcmu_mailbox *mb_addr; + size_t dev_size; + u32 cmdr_size; + u32 cmdr_last_cleaned; + /* Offset of data ring from start of mb */ + size_t data_off; + size_t data_size; + /* Ring head + tail values. */ + /* Must add data_off and mb_addr to get the address */ + size_t data_head; + size_t data_tail; + + wait_queue_head_t wait_cmdr; + /* TODO should this be a mutex? */ + spinlock_t cmdr_lock; + + struct idr commands; + spinlock_t commands_lock; + + struct timer_list timeout; + + char dev_config[TCMU_CONFIG_LEN]; +}; + +#define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev) + +#define CMDR_OFF sizeof(struct tcmu_mailbox) + +struct tcmu_cmd { + struct se_cmd *se_cmd; + struct tcmu_dev *tcmu_dev; + + uint16_t cmd_id; + + /* Can't use se_cmd->data_length when cleaning up expired cmds, because if + cmd has been completed then accessing se_cmd is off limits */ + size_t data_length; + + unsigned long deadline; + +#define TCMU_CMD_BIT_EXPIRED 0 + unsigned long flags; +}; + +static struct kmem_cache *tcmu_cmd_cache; + +/* multicast group */ +enum tcmu_multicast_groups { + TCMU_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group tcmu_mcgrps[] = { + [TCMU_MCGRP_CONFIG] = { .name = "config", }, +}; + +/* Our generic netlink family */ +static struct genl_family tcmu_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "TCM-USER", + .version = 1, + .maxattr = TCMU_ATTR_MAX, + .mcgrps = tcmu_mcgrps, + .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), +}; + +static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int cmd_id; + + tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); + if (!tcmu_cmd) + return NULL; + + tcmu_cmd->se_cmd = se_cmd; + tcmu_cmd->tcmu_dev = udev; + tcmu_cmd->data_length = se_cmd->data_length; + + tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&udev->commands_lock); + cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, + USHRT_MAX, GFP_NOWAIT); + spin_unlock_irq(&udev->commands_lock); + idr_preload_end(); + + if (cmd_id < 0) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + tcmu_cmd->cmd_id = cmd_id; + + return tcmu_cmd; +} + +static inline void tcmu_flush_dcache_range(void *vaddr, size_t size) +{ + unsigned long offset = (unsigned long) vaddr & ~PAGE_MASK; + + size = round_up(size+offset, PAGE_SIZE); + vaddr -= offset; + + while (size) { + flush_dcache_page(virt_to_page(vaddr)); + size -= PAGE_SIZE; + } +} + +/* + * Some ring helper functions. We don't assume size is a power of 2 so + * we can't use circ_buf.h. + */ +static inline size_t spc_used(size_t head, size_t tail, size_t size) +{ + int diff = head - tail; + + if (diff >= 0) + return diff; + else + return size + diff; +} + +static inline size_t spc_free(size_t head, size_t tail, size_t size) +{ + /* Keep 1 byte unused or we can't tell full from empty */ + return (size - spc_used(head, tail, size) - 1); +} + +static inline size_t head_to_end(size_t head, size_t size) +{ + return size - head; +} + +#define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) + +/* + * We can't queue a command until we have space available on the cmd ring *and* space + * space avail on the data ring. + * + * Called with ring lock held. + */ +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed) +{ + struct tcmu_mailbox *mb = udev->mb_addr; + size_t space; + u32 cmd_head; + size_t cmd_needed; + + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + + /* + * If cmd end-of-ring space is too small then we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= cmd_size) + cmd_needed = cmd_size; + else + cmd_needed = cmd_size + head_to_end(cmd_head, udev->cmdr_size); + + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); + if (space < cmd_needed) { + pr_debug("no cmd space: %u %u %u\n", cmd_head, + udev->cmdr_last_cleaned, udev->cmdr_size); + return false; + } + + space = spc_free(udev->data_head, udev->data_tail, udev->data_size); + if (space < data_needed) { + pr_debug("no data space: %zu %zu %zu\n", udev->data_head, + udev->data_tail, udev->data_size); + return false; + } + + return true; +} + +static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t base_command_size, command_size; + struct tcmu_mailbox *mb; + struct tcmu_cmd_entry *entry; + int i; + struct scatterlist *sg; + struct iovec *iov; + int iov_cnt = 0; + uint32_t cmd_head; + uint64_t cdb_off; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + return -EINVAL; + + /* + * Must be a certain minimum size for response sense info, but + * also may be larger if the iov array is large. + * + * iovs = sgl_nents+1, for end-of-ring case, plus another 1 + * b/c size == offsetof one-past-element. + */ + base_command_size = max(offsetof(struct tcmu_cmd_entry, + req.iov[se_cmd->t_data_nents + 2]), + sizeof(struct tcmu_cmd_entry)); + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + spin_lock_irq(&udev->cmdr_lock); + + mb = udev->mb_addr; + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + if ((command_size > (udev->cmdr_size / 2)) + || tcmu_cmd->data_length > (udev->data_size - 1)) + pr_warn("TCMU: Request of size %zu/%zu may be too big for %u/%zu " + "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, + udev->cmdr_size, udev->data_size); + + while (!is_ring_space_avail(udev, command_size, tcmu_cmd->data_length)) { + int ret; + DEFINE_WAIT(__wait); + + prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); + + pr_debug("sleeping for ring space\n"); + spin_unlock_irq(&udev->cmdr_lock); + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); + return -ETIMEDOUT; + } + + spin_lock_irq(&udev->cmdr_lock); + + /* We dropped cmdr_lock, cmd_head is stale */ + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + } + + /* Insert a PAD if end-of-ring space is too small */ + if (head_to_end(cmd_head, udev->cmdr_size) < command_size) { + size_t pad_size = head_to_end(cmd_head, udev->cmdr_size); + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr, pad_size); + + UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + WARN_ON(cmd_head != 0); + } + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr, command_size); + entry->cmd_id = tcmu_cmd->cmd_id; + + /* + * Fix up iovecs, and handle if allocation in data ring wrapped. + */ + iov = &entry->req.iov[0]; + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_head, udev->data_size)); + void *from = kmap_atomic(sg_page(sg)) + sg->offset; + void *to = (void *) mb + udev->data_off + udev->data_head; + + if (tcmu_cmd->se_cmd->data_direction == DMA_TO_DEVICE) { + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + /* Even iov_base is relative to mb_addr */ + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + + /* Uh oh, we wrapped the buffer. Must split sg across 2 iovs. */ + if (sg->length != copy_bytes) { + from += copy_bytes; + copy_bytes = sg->length - copy_bytes; + + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + + if (se_cmd->data_direction == DMA_TO_DEVICE) { + to = (void *) mb + udev->data_off + udev->data_head; + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + } + + kunmap_atomic(from); + } + entry->req.iov_cnt = iov_cnt; + + /* All offsets relative to mb_addr, not start of entry! */ + cdb_off = CMDR_OFF + cmd_head + base_command_size; + memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb)); + entry->req.cdb_off = cdb_off; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + spin_unlock_irq(&udev->cmdr_lock); + + /* TODO: only if FLUSH and FUA? */ + uio_event_notify(&udev->uio_info); + + mod_timer(&udev->timeout, + round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + + return 0; +} + +static int tcmu_queue_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int ret; + + tcmu_cmd = tcmu_alloc_cmd(se_cmd); + if (!tcmu_cmd) + return -ENOMEM; + + ret = tcmu_queue_cmd_ring(tcmu_cmd); + if (ret < 0) { + pr_err("TCMU: Could not queue command\n"); + spin_lock_irq(&udev->commands_lock); + idr_remove(&udev->commands, tcmu_cmd->cmd_id); + spin_unlock_irq(&udev->commands_lock); + + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + } + + return ret; +} + +static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +{ + struct se_cmd *se_cmd = cmd->se_cmd; + struct tcmu_dev *udev = cmd->tcmu_dev; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + /* cmd has been completed already from timeout, just reclaim data + ring space */ + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + return; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, + se_cmd->scsi_sense_length); + + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } + else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + struct scatterlist *sg; + int i; + + /* It'd be easier to look at entry's iovec again, but UAM */ + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes; + void *to; + void *from; + + copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_tail, udev->data_size)); + + to = kmap_atomic(sg_page(sg)) + sg->offset; + WARN_ON(sg->length + sg->offset > PAGE_SIZE); + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + + /* Uh oh, wrapped the data buffer for this sg's data */ + if (sg->length != copy_bytes) { + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + WARN_ON(udev->data_tail); + to += copy_bytes; + copy_bytes = sg->length - copy_bytes; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + } + + kunmap_atomic(to); + } + + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } else { + pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); + } + + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); +} + +static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +{ + struct tcmu_mailbox *mb; + LIST_HEAD(cpl_cmds); + unsigned long flags; + int handled = 0; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { + pr_err("ring broken, not handling completions\n"); + return 0; + } + + spin_lock_irqsave(&udev->cmdr_lock, flags); + + mb = udev->mb_addr; + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) { + + struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; + struct tcmu_cmd *cmd; + + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + continue; + } + WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + + spin_lock(&udev->commands_lock); + cmd = idr_find(&udev->commands, entry->cmd_id); + if (cmd) + idr_remove(&udev->commands, cmd->cmd_id); + spin_unlock(&udev->commands_lock); + + if (!cmd) { + pr_err("cmd_id not found, ring is broken\n"); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + break; + } + + tcmu_handle_completion(cmd, entry); + + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + + handled++; + } + + if (mb->cmd_tail == mb->cmd_head) + del_timer(&udev->timeout); /* no more pending cmds */ + + spin_unlock_irqrestore(&udev->cmdr_lock, flags); + + wake_up(&udev->wait_cmdr); + + return handled; +} + +static int tcmu_check_expired_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + + if (!time_after(cmd->deadline, jiffies)) + return 0; + + set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); + + return 0; +} + +static void tcmu_device_timedout(unsigned long data) +{ + struct tcmu_dev *udev = (struct tcmu_dev *)data; + unsigned long flags; + int handled; + + handled = tcmu_handle_completions(udev); + + pr_warn("%d completions handled from timeout\n", handled); + + spin_lock_irqsave(&udev->commands_lock, flags); + idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); + spin_unlock_irqrestore(&udev->commands_lock, flags); + + /* + * We don't need to wakeup threads on wait_cmdr since they have their + * own timeout. + */ +} + +static int tcmu_attach_hba(struct se_hba *hba, u32 host_id) +{ + struct tcmu_hba *tcmu_hba; + + tcmu_hba = kzalloc(sizeof(struct tcmu_hba), GFP_KERNEL); + if (!tcmu_hba) + return -ENOMEM; + + tcmu_hba->host_id = host_id; + hba->hba_ptr = tcmu_hba; + + return 0; +} + +static void tcmu_detach_hba(struct se_hba *hba) +{ + kfree(hba->hba_ptr); + hba->hba_ptr = NULL; +} + +static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) +{ + struct tcmu_dev *udev; + + udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL); + if (!udev) + return NULL; + + udev->name = kstrdup(name, GFP_KERNEL); + if (!udev->name) { + kfree(udev); + return NULL; + } + + udev->hba = hba; + + init_waitqueue_head(&udev->wait_cmdr); + spin_lock_init(&udev->cmdr_lock); + + idr_init(&udev->commands); + spin_lock_init(&udev->commands_lock); + + setup_timer(&udev->timeout, tcmu_device_timedout, + (unsigned long)udev); + + udev->pass_level = TCMU_PASS_ALL; + + return &udev->se_dev; +} + +static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + + tcmu_handle_completions(tcmu_dev); + + return 0; +} + +/* + * mmap code from uio.c. Copied here because we want to hook mmap() + * and this stuff must come along. + */ +static int tcmu_find_mem_index(struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (info->mem[vma->vm_pgoff].size == 0) + return -1; + return (int)vma->vm_pgoff; + } + return -1; +} + +static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + struct page *page; + unsigned long offset; + void *addr; + + int mi = tcmu_find_mem_index(vma); + if (mi < 0) + return VM_FAULT_SIGBUS; + + /* + * We need to subtract mi because userspace uses offset = N*PAGE_SIZE + * to use mem[N]. + */ + offset = (vmf->pgoff - mi) << PAGE_SHIFT; + + addr = (void *)(unsigned long)info->mem[mi].addr + offset; + if (info->mem[mi].memtype == UIO_MEM_LOGICAL) + page = virt_to_page(addr); + else + page = vmalloc_to_page(addr); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct tcmu_vm_ops = { + .fault = tcmu_vma_fault, +}; + +static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &tcmu_vm_ops; + + vma->vm_private_data = udev; + + /* Ensure the mmap is exactly the right size */ + if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT)) + return -EINVAL; + + return 0; +} + +static int tcmu_open(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + /* O_EXCL not supported for char devs, so fake it? */ + if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) + return -EBUSY; + + pr_debug("open\n"); + + return 0; +} + +static int tcmu_release(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); + + pr_debug("close\n"); + + return 0; +} + +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +{ + struct sk_buff *skb; + void *msg_header; + int ret = -ENOMEM; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return ret; + + msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); + if (!msg_header) + goto free_skb; + + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + if (ret < 0) + goto free_skb; + + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + if (ret < 0) + goto free_skb; + + ret = genlmsg_end(skb, msg_header); + if (ret < 0) + goto free_skb; + + ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, + TCMU_MCGRP_CONFIG, GFP_KERNEL); + + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + + return ret; +free_skb: + nlmsg_free(skb); + return ret; +} + +static int tcmu_configure_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + struct tcmu_hba *hba = udev->hba->hba_ptr; + struct uio_info *info; + struct tcmu_mailbox *mb; + size_t size; + size_t used; + int ret = 0; + char *str; + + info = &udev->uio_info; + + size = snprintf(NULL, 0, "tcm-user/%u/%s/%s", hba->host_id, udev->name, + udev->dev_config); + size += 1; /* for \0 */ + str = kmalloc(size, GFP_KERNEL); + if (!str) + return -ENOMEM; + + used = snprintf(str, size, "tcm-user/%u/%s", hba->host_id, udev->name); + + if (udev->dev_config[0]) + snprintf(str + used, size - used, "/%s", udev->dev_config); + + info->name = str; + + udev->mb_addr = vzalloc(TCMU_RING_SIZE); + if (!udev->mb_addr) { + ret = -ENOMEM; + goto err_vzalloc; + } + + /* mailbox fits in first part of CMDR space */ + udev->cmdr_size = CMDR_SIZE - CMDR_OFF; + udev->data_off = CMDR_SIZE; + udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + + mb = udev->mb_addr; + mb->version = 1; + mb->cmdr_off = CMDR_OFF; + mb->cmdr_size = udev->cmdr_size; + + WARN_ON(!PAGE_ALIGNED(udev->data_off)); + WARN_ON(udev->data_size % PAGE_SIZE); + + info->version = "1"; + + info->mem[0].name = "tcm-user command & data buffer"; + info->mem[0].addr = (phys_addr_t) udev->mb_addr; + info->mem[0].size = TCMU_RING_SIZE; + info->mem[0].memtype = UIO_MEM_VIRTUAL; + + info->irqcontrol = tcmu_irqcontrol; + info->irq = UIO_IRQ_CUSTOM; + + info->mmap = tcmu_mmap; + info->open = tcmu_open; + info->release = tcmu_release; + + ret = uio_register_device(tcmu_root_device, info); + if (ret) + goto err_register; + + /* Other attributes can be configured in userspace */ + dev->dev_attrib.hw_block_size = 512; + dev->dev_attrib.hw_max_sectors = 128; + dev->dev_attrib.hw_queue_depth = 128; + + ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + if (ret) + goto err_netlink; + + return 0; + +err_netlink: + uio_unregister_device(&udev->uio_info); +err_register: + vfree(udev->mb_addr); +err_vzalloc: + kfree(info->name); + + return ret; +} + +static int tcmu_check_pending_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + return -EINVAL; +} + +static void tcmu_free_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + int i; + + del_timer_sync(&udev->timeout); + + vfree(udev->mb_addr); + + /* Upper layer should drain all requests before calling this */ + spin_lock_irq(&udev->commands_lock); + i = idr_for_each(&udev->commands, tcmu_check_pending_cmd, NULL); + idr_destroy(&udev->commands); + spin_unlock_irq(&udev->commands_lock); + WARN_ON(i); + + /* Device was configured */ + if (udev->uio_info.uio_dev) { + tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + + uio_unregister_device(&udev->uio_info); + kfree(udev->uio_info.name); + kfree(udev->name); + } + + kfree(udev); +} + +enum { + Opt_dev_config, Opt_dev_size, Opt_err, Opt_pass_level, +}; + +static match_table_t tokens = { + {Opt_dev_config, "dev_config=%s"}, + {Opt_dev_size, "dev_size=%u"}, + {Opt_pass_level, "pass_level=%u"}, + {Opt_err, NULL} +}; + +static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, + const char *page, ssize_t count) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + char *orig, *ptr, *opts, *arg_p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0, token; + int arg; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + orig = opts; + + while ((ptr = strsep(&opts, ",\n")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, tokens, args); + switch (token) { + case Opt_dev_config: + if (match_strlcpy(udev->dev_config, &args[0], + TCMU_CONFIG_LEN) == 0) { + ret = -EINVAL; + break; + } + pr_debug("TCMU: Referencing Path: %s\n", udev->dev_config); + break; + case Opt_dev_size: + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtoul(arg_p, 0, (unsigned long *) &udev->dev_size); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtoul() failed for dev_size=\n"); + break; + case Opt_pass_level: + match_int(args, &arg); + if (arg >= TCMU_PASS_INVALID) { + pr_warn("TCMU: Invalid pass_level: %d\n", arg); + break; + } + + pr_debug("TCMU: Setting pass_level to %d\n", arg); + udev->pass_level = arg; + break; + default: + break; + } + } + + kfree(orig); + return (!ret) ? count : ret; +} + +static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + ssize_t bl = 0; + + bl = sprintf(b + bl, "Config: %s ", + udev->dev_config[0] ? udev->dev_config : "NULL"); + bl += sprintf(b + bl, "Size: %zu PassLevel: %u\n", + udev->dev_size, udev->pass_level); + + return bl; +} + +static sector_t tcmu_get_blocks(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + + return div_u64(udev->dev_size - dev->dev_attrib.block_size, + dev->dev_attrib.block_size); +} + +static sense_reason_t +tcmu_execute_rw(struct se_cmd *se_cmd, struct scatterlist *sgl, u32 sgl_nents, + enum dma_data_direction data_direction) +{ + int ret; + + ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static sense_reason_t +tcmu_pass_op(struct se_cmd *se_cmd) +{ + int ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static struct sbc_ops tcmu_sbc_ops = { + .execute_rw = tcmu_execute_rw, + .execute_sync_cache = tcmu_pass_op, + .execute_write_same = tcmu_pass_op, + .execute_write_same_unmap = tcmu_pass_op, + .execute_unmap = tcmu_pass_op, +}; + +static sense_reason_t +tcmu_parse_cdb(struct se_cmd *cmd) +{ + unsigned char *cdb = cmd->t_task_cdb; + struct tcmu_dev *udev = TCMU_DEV(cmd->se_dev); + sense_reason_t ret; + + switch (udev->pass_level) { + case TCMU_PASS_ALL: + /* We're just like pscsi, then */ + /* + * For REPORT LUNS we always need to emulate the response, for everything + * else, pass it up. + */ + switch (cdb[0]) { + case REPORT_LUNS: + cmd->execute_cmd = spc_emulate_report_luns; + break; + case READ_6: + case READ_10: + case READ_12: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case WRITE_VERIFY: + cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB; + /* FALLTHROUGH */ + default: + cmd->execute_cmd = tcmu_pass_op; + } + ret = TCM_NO_SENSE; + break; + case TCMU_PASS_IO: + ret = sbc_parse_cdb(cmd, &tcmu_sbc_ops); + break; + default: + pr_err("Unknown tcm-user pass level %d\n", udev->pass_level); + ret = TCM_CHECK_CONDITION_ABORT_CMD; + } + + return ret; +} + +static struct se_subsystem_api tcmu_template = { + .name = "user", + .inquiry_prod = "USER", + .inquiry_rev = TCMU_VERSION, + .owner = THIS_MODULE, + .transport_type = TRANSPORT_PLUGIN_VHBA_PDEV, + .attach_hba = tcmu_attach_hba, + .detach_hba = tcmu_detach_hba, + .alloc_device = tcmu_alloc_device, + .configure_device = tcmu_configure_device, + .free_device = tcmu_free_device, + .parse_cdb = tcmu_parse_cdb, + .set_configfs_dev_params = tcmu_set_configfs_dev_params, + .show_configfs_dev_params = tcmu_show_configfs_dev_params, + .get_device_type = sbc_get_device_type, + .get_blocks = tcmu_get_blocks, +}; + +static int __init tcmu_module_init(void) +{ + int ret; + + BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); + + tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache", + sizeof(struct tcmu_cmd), + __alignof__(struct tcmu_cmd), + 0, NULL); + if (!tcmu_cmd_cache) + return -ENOMEM; + + tcmu_root_device = root_device_register("tcm_user"); + if (IS_ERR(tcmu_root_device)) { + ret = PTR_ERR(tcmu_root_device); + goto out_free_cache; + } + + ret = genl_register_family(&tcmu_genl_family); + if (ret < 0) { + goto out_unreg_device; + } + + ret = transport_subsystem_register(&tcmu_template); + if (ret) + goto out_unreg_genl; + + return 0; + +out_unreg_genl: + genl_unregister_family(&tcmu_genl_family); +out_unreg_device: + root_device_unregister(tcmu_root_device); +out_free_cache: + kmem_cache_destroy(tcmu_cmd_cache); + + return ret; +} + +static void __exit tcmu_module_exit(void) +{ + transport_subsystem_release(&tcmu_template); + genl_unregister_family(&tcmu_genl_family); + root_device_unregister(tcmu_root_device); + kmem_cache_destroy(tcmu_cmd_cache); +} + +MODULE_DESCRIPTION("TCM USER subsystem plugin"); +MODULE_AUTHOR("Shaohua Li <shli@kernel.org>"); +MODULE_AUTHOR("Andy Grover <agrover@redhat.com>"); +MODULE_LICENSE("GPL"); + +module_init(tcmu_module_init); +module_exit(tcmu_module_exit); diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c index 21ce50880c79..ccee7e332a4d 100644 --- a/drivers/target/tcm_fc/tfc_sess.c +++ b/drivers/target/tcm_fc/tfc_sess.c @@ -98,7 +98,7 @@ static void ft_tport_delete(struct ft_tport *tport) ft_sess_delete_all(tport); lport = tport->lport; BUG_ON(tport != lport->prov[FC_TYPE_FCP]); - rcu_assign_pointer(lport->prov[FC_TYPE_FCP], NULL); + RCU_INIT_POINTER(lport->prov[FC_TYPE_FCP], NULL); tpg = tport->tpg; if (tpg) { diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index ef5587fe2c69..f554d25b4399 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -84,6 +84,16 @@ config THERMAL_GOV_STEP_WISE Enable this to manage platform thermals using a simple linear governor. +config THERMAL_GOV_BANG_BANG + bool "Bang Bang thermal governor" + default n + help + Enable this to manage platform thermals using bang bang governor. + + Say 'Y' here if you want to use two point temperature regulation + used for fans without throttling. Some fan drivers depend on this + governor to be enabled (e.g. acerhdf). + config THERMAL_GOV_USER_SPACE bool "User_space thermal governor" help @@ -207,21 +217,6 @@ config X86_PKG_TEMP_THERMAL two trip points which can be set by user to get notifications via thermal notification methods. -config ACPI_INT3403_THERMAL - tristate "ACPI INT3403 thermal driver" - depends on X86 && ACPI - help - Newer laptops and tablets that use ACPI may have thermal sensors - outside the core CPU/SOC for thermal safety reasons. These - temperature sensors are also exposed for the OS to use via the so - called INT3403 ACPI object. This driver will, on devices that have - such sensors, expose the temperature information from these sensors - to userspace via the normal thermal framework. This means that a wide - range of applications and GUI widgets can show this information to - the user or use this information for making decisions. For example, - the Intel Thermal Daemon can use this information to allow the user - to select his laptop to run without turning on the fans. - config INTEL_SOC_DTS_THERMAL tristate "Intel SoCs DTS thermal driver" depends on X86 && IOSF_MBI @@ -234,6 +229,30 @@ config INTEL_SOC_DTS_THERMAL notification methods.The other trip is a critical trip point, which was set by the driver based on the TJ MAX temperature. +config INT340X_THERMAL + tristate "ACPI INT340X thermal drivers" + depends on X86 && ACPI + select THERMAL_GOV_USER_SPACE + select ACPI_THERMAL_REL + select ACPI_FAN + help + Newer laptops and tablets that use ACPI may have thermal sensors and + other devices with thermal control capabilities outside the core + CPU/SOC, for thermal safety reasons. + They are exposed for the OS to use via the INT3400 ACPI device object + as the master, and INT3401~INT340B ACPI device objects as the slaves. + Enable this to expose the temperature information and cooling ability + from these objects to userspace via the normal thermal framework. + This means that a wide range of applications and GUI widgets can show + the information to the user or use this information for making + decisions. For example, the Intel Thermal Daemon can use this + information to allow the user to select his laptop to run without + turning on the fans. + +config ACPI_THERMAL_REL + tristate + depends on ACPI + menu "Texas Instruments thermal drivers" source "drivers/thermal/ti-soc-thermal/Kconfig" endmenu diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 31e232f84b6b..39c4fe87da2f 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -11,6 +11,7 @@ thermal_sys-$(CONFIG_THERMAL_OF) += of-thermal.o # governors thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE) += fair_share.o +thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG) += gov_bang_bang.o thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE) += step_wise.o thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE) += user_space.o @@ -31,5 +32,5 @@ obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o obj-$(CONFIG_INTEL_SOC_DTS_THERMAL) += intel_soc_dts_thermal.o obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/ -obj-$(CONFIG_ACPI_INT3403_THERMAL) += int3403_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_ST_THERMAL) += st/ diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c index 944ba2f340c8..6e0a3fbfae86 100644 --- a/drivers/thermal/fair_share.c +++ b/drivers/thermal/fair_share.c @@ -23,6 +23,7 @@ */ #include <linux/thermal.h> +#include <trace/events/thermal.h> #include "thermal_core.h" @@ -34,6 +35,7 @@ static int get_trip_level(struct thermal_zone_device *tz) { int count = 0; unsigned long trip_temp; + enum thermal_trip_type trip_type; if (tz->trips == 0 || !tz->ops->get_trip_temp) return 0; @@ -43,6 +45,16 @@ static int get_trip_level(struct thermal_zone_device *tz) if (tz->temperature < trip_temp) break; } + + /* + * count > 0 only if temperature is greater than first trip + * point, in which case, trip_point = count - 1 + */ + if (count > 0) { + tz->ops->get_trip_type(tz, count - 1, &trip_type); + trace_thermal_zone_trip(tz, count - 1, trip_type); + } + return count; } diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c new file mode 100644 index 000000000000..c5dd76b2ee74 --- /dev/null +++ b/drivers/thermal/gov_bang_bang.c @@ -0,0 +1,131 @@ +/* + * gov_bang_bang.c - A simple thermal throttling governor using hysteresis + * + * Copyright (C) 2014 Peter Feuerer <peter@piie.net> + * + * Based on step_wise.c with following Copyrights: + * Copyright (C) 2012 Intel Corp + * Copyright (C) 2012 Durgadoss R <durgadoss.r@intel.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + */ + +#include <linux/thermal.h> + +#include "thermal_core.h" + +static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) +{ + long trip_temp; + unsigned long trip_hyst; + struct thermal_instance *instance; + + tz->ops->get_trip_temp(tz, trip, &trip_temp); + tz->ops->get_trip_hyst(tz, trip, &trip_hyst); + + dev_dbg(&tz->device, "Trip%d[temp=%ld]:temp=%d:hyst=%ld\n", + trip, trip_temp, tz->temperature, + trip_hyst); + + mutex_lock(&tz->lock); + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (instance->trip != trip) + continue; + + /* in case fan is in initial state, switch the fan off */ + if (instance->target == THERMAL_NO_TARGET) + instance->target = 0; + + /* in case fan is neither on nor off set the fan to active */ + if (instance->target != 0 && instance->target != 1) { + pr_warn("Thermal instance %s controlled by bang-bang has unexpected state: %ld\n", + instance->name, instance->target); + instance->target = 1; + } + + /* + * enable fan when temperature exceeds trip_temp and disable + * the fan in case it falls below trip_temp minus hysteresis + */ + if (instance->target == 0 && tz->temperature >= trip_temp) + instance->target = 1; + else if (instance->target == 1 && + tz->temperature < trip_temp - trip_hyst) + instance->target = 0; + + dev_dbg(&instance->cdev->device, "target=%d\n", + (int)instance->target); + + instance->cdev->updated = false; /* cdev needs update */ + } + + mutex_unlock(&tz->lock); +} + +/** + * bang_bang_control - controls devices associated with the given zone + * @tz - thermal_zone_device + * @trip - the trip point + * + * Regulation Logic: a two point regulation, deliver cooling state depending + * on the previous state shown in this diagram: + * + * Fan: OFF ON + * + * | + * | + * trip_temp: +---->+ + * | | ^ + * | | | + * | | Temperature + * (trip_temp - hyst): +<----+ + * | + * | + * | + * + * * If the fan is not running and temperature exceeds trip_temp, the fan + * gets turned on. + * * In case the fan is running, temperature must fall below + * (trip_temp - hyst) so that the fan gets turned off again. + * + */ +static int bang_bang_control(struct thermal_zone_device *tz, int trip) +{ + struct thermal_instance *instance; + + thermal_zone_trip_update(tz, trip); + + mutex_lock(&tz->lock); + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) + thermal_cdev_update(instance->cdev); + + mutex_unlock(&tz->lock); + + return 0; +} + +static struct thermal_governor thermal_gov_bang_bang = { + .name = "bang_bang", + .throttle = bang_bang_control, +}; + +int thermal_gov_bang_bang_register(void) +{ + return thermal_register_governor(&thermal_gov_bang_bang); +} + +void thermal_gov_bang_bang_unregister(void) +{ + thermal_unregister_governor(&thermal_gov_bang_bang); +} diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 2c516f2eebed..461bf3d033a0 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -19,6 +19,7 @@ #include <linux/mfd/syscon.h> #include <linux/module.h> #include <linux/of.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/regmap.h> #include <linux/slab.h> @@ -31,6 +32,11 @@ #define MISC0 0x0150 #define MISC0_REFTOP_SELBIASOFF (1 << 3) +#define MISC1 0x0160 +#define MISC1_IRQ_TEMPHIGH (1 << 29) +/* Below LOW and PANIC bits are only for TEMPMON_IMX6SX */ +#define MISC1_IRQ_TEMPLOW (1 << 28) +#define MISC1_IRQ_TEMPPANIC (1 << 27) #define TEMPSENSE0 0x0180 #define TEMPSENSE0_ALARM_VALUE_SHIFT 20 @@ -43,6 +49,12 @@ #define TEMPSENSE1 0x0190 #define TEMPSENSE1_MEASURE_FREQ 0xffff +/* Below TEMPSENSE2 is only for TEMPMON_IMX6SX */ +#define TEMPSENSE2 0x0290 +#define TEMPSENSE2_LOW_VALUE_SHIFT 0 +#define TEMPSENSE2_LOW_VALUE_MASK 0xfff +#define TEMPSENSE2_PANIC_VALUE_SHIFT 16 +#define TEMPSENSE2_PANIC_VALUE_MASK 0xfff0000 #define OCOTP_ANA1 0x04e0 @@ -66,6 +78,21 @@ enum imx_thermal_trip { #define FACTOR1 15976 #define FACTOR2 4297157 +#define TEMPMON_IMX6Q 1 +#define TEMPMON_IMX6SX 2 + +struct thermal_soc_data { + u32 version; +}; + +static struct thermal_soc_data thermal_imx6q_data = { + .version = TEMPMON_IMX6Q, +}; + +static struct thermal_soc_data thermal_imx6sx_data = { + .version = TEMPMON_IMX6SX, +}; + struct imx_thermal_data { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; @@ -79,8 +106,21 @@ struct imx_thermal_data { bool irq_enabled; int irq; struct clk *thermal_clk; + const struct thermal_soc_data *socdata; }; +static void imx_set_panic_temp(struct imx_thermal_data *data, + signed long panic_temp) +{ + struct regmap *map = data->tempmon; + int critical_value; + + critical_value = (data->c2 - panic_temp) / data->c1; + regmap_write(map, TEMPSENSE2 + REG_CLR, TEMPSENSE2_PANIC_VALUE_MASK); + regmap_write(map, TEMPSENSE2 + REG_SET, critical_value << + TEMPSENSE2_PANIC_VALUE_SHIFT); +} + static void imx_set_alarm_temp(struct imx_thermal_data *data, signed long alarm_temp) { @@ -142,13 +182,17 @@ static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp) /* See imx_get_sensor_data() for formula derivation */ *temp = data->c2 - n_meas * data->c1; - /* Update alarm value to next higher trip point */ - if (data->alarm_temp == data->temp_passive && *temp >= data->temp_passive) - imx_set_alarm_temp(data, data->temp_critical); - if (data->alarm_temp == data->temp_critical && *temp < data->temp_passive) { - imx_set_alarm_temp(data, data->temp_passive); - dev_dbg(&tz->device, "thermal alarm off: T < %lu\n", - data->alarm_temp / 1000); + /* Update alarm value to next higher trip point for TEMPMON_IMX6Q */ + if (data->socdata->version == TEMPMON_IMX6Q) { + if (data->alarm_temp == data->temp_passive && + *temp >= data->temp_passive) + imx_set_alarm_temp(data, data->temp_critical); + if (data->alarm_temp == data->temp_critical && + *temp < data->temp_passive) { + imx_set_alarm_temp(data, data->temp_passive); + dev_dbg(&tz->device, "thermal alarm off: T < %lu\n", + data->alarm_temp / 1000); + } } if (*temp != data->last_temp) { @@ -398,8 +442,17 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev) return IRQ_HANDLED; } +static const struct of_device_id of_imx_thermal_match[] = { + { .compatible = "fsl,imx6q-tempmon", .data = &thermal_imx6q_data, }, + { .compatible = "fsl,imx6sx-tempmon", .data = &thermal_imx6sx_data, }, + { /* end */ } +}; +MODULE_DEVICE_TABLE(of, of_imx_thermal_match); + static int imx_thermal_probe(struct platform_device *pdev) { + const struct of_device_id *of_id = + of_match_device(of_imx_thermal_match, &pdev->dev); struct imx_thermal_data *data; struct cpumask clip_cpus; struct regmap *map; @@ -418,6 +471,20 @@ static int imx_thermal_probe(struct platform_device *pdev) } data->tempmon = map; + data->socdata = of_id->data; + + /* make sure the IRQ flag is clear before enabling irq on i.MX6SX */ + if (data->socdata->version == TEMPMON_IMX6SX) { + regmap_write(map, MISC1 + REG_CLR, MISC1_IRQ_TEMPHIGH | + MISC1_IRQ_TEMPLOW | MISC1_IRQ_TEMPPANIC); + /* + * reset value of LOW ALARM is incorrect, set it to lowest + * value to avoid false trigger of low alarm. + */ + regmap_write(map, TEMPSENSE2 + REG_SET, + TEMPSENSE2_LOW_VALUE_MASK); + } + data->irq = platform_get_irq(pdev, 0); if (data->irq < 0) return data->irq; @@ -489,6 +556,10 @@ static int imx_thermal_probe(struct platform_device *pdev) measure_freq = DIV_ROUND_UP(32768, 10); /* 10 Hz */ regmap_write(map, TEMPSENSE1 + REG_SET, measure_freq); imx_set_alarm_temp(data, data->temp_passive); + + if (data->socdata->version == TEMPMON_IMX6SX) + imx_set_panic_temp(data, data->temp_critical); + regmap_write(map, TEMPSENSE0 + REG_CLR, TEMPSENSE0_POWER_DOWN); regmap_write(map, TEMPSENSE0 + REG_SET, TEMPSENSE0_MEASURE_TEMP); @@ -550,12 +621,6 @@ static int imx_thermal_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(imx_thermal_pm_ops, imx_thermal_suspend, imx_thermal_resume); -static const struct of_device_id of_imx_thermal_match[] = { - { .compatible = "fsl,imx6q-tempmon", }, - { /* end */ } -}; -MODULE_DEVICE_TABLE(of, of_imx_thermal_match); - static struct platform_driver imx_thermal = { .driver = { .name = "imx_thermal", diff --git a/drivers/thermal/int3403_thermal.c b/drivers/thermal/int3403_thermal.c deleted file mode 100644 index 17554eeb3953..000000000000 --- a/drivers/thermal/int3403_thermal.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * ACPI INT3403 thermal driver - * Copyright (c) 2013, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/acpi.h> -#include <linux/thermal.h> - -#define INT3403_TYPE_SENSOR 0x03 -#define INT3403_PERF_CHANGED_EVENT 0x80 -#define INT3403_THERMAL_EVENT 0x90 - -#define DECI_KELVIN_TO_MILLI_CELSIUS(t, off) (((t) - (off)) * 100) -#define KELVIN_OFFSET 2732 -#define MILLI_CELSIUS_TO_DECI_KELVIN(t, off) (((t) / 100) + (off)) - -#define ACPI_INT3403_CLASS "int3403" -#define ACPI_INT3403_FILE_STATE "state" - -struct int3403_sensor { - struct thermal_zone_device *tzone; - unsigned long *thresholds; - unsigned long crit_temp; - int crit_trip_id; - unsigned long psv_temp; - int psv_trip_id; -}; - -static int sys_get_curr_temp(struct thermal_zone_device *tzone, - unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - unsigned long long tmp; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_TMP", NULL, &tmp); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(tmp, KELVIN_OFFSET); - - return 0; -} - -static int sys_get_trip_hyst(struct thermal_zone_device *tzone, - int trip, unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - unsigned long long hyst; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "GTSH", NULL, &hyst); - if (ACPI_FAILURE(status)) - return -EIO; - - /* - * Thermal hysteresis represents a temperature difference. - * Kelvin and Celsius have same degree size. So the - * conversion here between tenths of degree Kelvin unit - * and Milli-Celsius unit is just to multiply 100. - */ - *temp = hyst * 100; - - return 0; -} - -static int sys_get_trip_temp(struct thermal_zone_device *tzone, - int trip, unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - struct int3403_sensor *obj = acpi_driver_data(device); - - if (trip == obj->crit_trip_id) - *temp = obj->crit_temp; - else if (trip == obj->psv_trip_id) - *temp = obj->psv_temp; - else { - /* - * get_trip_temp is a mandatory callback but - * PATx method doesn't return any value, so return - * cached value, which was last set from user space. - */ - *temp = obj->thresholds[trip]; - } - - return 0; -} - -static int sys_get_trip_type(struct thermal_zone_device *thermal, - int trip, enum thermal_trip_type *type) -{ - struct acpi_device *device = thermal->devdata; - struct int3403_sensor *obj = acpi_driver_data(device); - - /* Mandatory callback, may not mean much here */ - if (trip == obj->crit_trip_id) - *type = THERMAL_TRIP_CRITICAL; - else - *type = THERMAL_TRIP_PASSIVE; - - return 0; -} - -int sys_set_trip_temp(struct thermal_zone_device *tzone, int trip, - unsigned long temp) -{ - struct acpi_device *device = tzone->devdata; - acpi_status status; - char name[10]; - int ret = 0; - struct int3403_sensor *obj = acpi_driver_data(device); - - snprintf(name, sizeof(name), "PAT%d", trip); - if (acpi_has_method(device->handle, name)) { - status = acpi_execute_simple_method(device->handle, name, - MILLI_CELSIUS_TO_DECI_KELVIN(temp, - KELVIN_OFFSET)); - if (ACPI_FAILURE(status)) - ret = -EIO; - else - obj->thresholds[trip] = temp; - } else { - ret = -EIO; - dev_err(&device->dev, "sys_set_trip_temp: method not found\n"); - } - - return ret; -} - -static struct thermal_zone_device_ops tzone_ops = { - .get_temp = sys_get_curr_temp, - .get_trip_temp = sys_get_trip_temp, - .get_trip_type = sys_get_trip_type, - .set_trip_temp = sys_set_trip_temp, - .get_trip_hyst = sys_get_trip_hyst, -}; - -static void acpi_thermal_notify(struct acpi_device *device, u32 event) -{ - struct int3403_sensor *obj; - - if (!device) - return; - - obj = acpi_driver_data(device); - if (!obj) - return; - - switch (event) { - case INT3403_PERF_CHANGED_EVENT: - break; - case INT3403_THERMAL_EVENT: - thermal_zone_device_update(obj->tzone); - break; - default: - dev_err(&device->dev, "Unsupported event [0x%x]\n", event); - break; - } -} - -static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp) -{ - unsigned long long crt; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET); - - return 0; -} - -static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp) -{ - unsigned long long psv; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET); - - return 0; -} - -static int acpi_int3403_add(struct acpi_device *device) -{ - int result = 0; - unsigned long long ptyp; - acpi_status status; - struct int3403_sensor *obj; - unsigned long long trip_cnt; - int trip_mask = 0; - - if (!device) - return -EINVAL; - - status = acpi_evaluate_integer(device->handle, "PTYP", NULL, &ptyp); - if (ACPI_FAILURE(status)) - return -EINVAL; - - if (ptyp != INT3403_TYPE_SENSOR) - return -EINVAL; - - obj = devm_kzalloc(&device->dev, sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; - - device->driver_data = obj; - - status = acpi_evaluate_integer(device->handle, "PATC", NULL, - &trip_cnt); - if (ACPI_FAILURE(status)) - trip_cnt = 0; - - if (trip_cnt) { - /* We have to cache, thresholds can't be readback */ - obj->thresholds = devm_kzalloc(&device->dev, - sizeof(*obj->thresholds) * trip_cnt, - GFP_KERNEL); - if (!obj->thresholds) - return -ENOMEM; - trip_mask = BIT(trip_cnt) - 1; - } - - obj->psv_trip_id = -1; - if (!sys_get_trip_psv(device, &obj->psv_temp)) - obj->psv_trip_id = trip_cnt++; - - obj->crit_trip_id = -1; - if (!sys_get_trip_crt(device, &obj->crit_temp)) - obj->crit_trip_id = trip_cnt++; - - obj->tzone = thermal_zone_device_register(acpi_device_bid(device), - trip_cnt, trip_mask, device, &tzone_ops, - NULL, 0, 0); - if (IS_ERR(obj->tzone)) { - result = PTR_ERR(obj->tzone); - return result; - } - - strcpy(acpi_device_name(device), "INT3403"); - strcpy(acpi_device_class(device), ACPI_INT3403_CLASS); - - return 0; -} - -static int acpi_int3403_remove(struct acpi_device *device) -{ - struct int3403_sensor *obj; - - obj = acpi_driver_data(device); - thermal_zone_device_unregister(obj->tzone); - - return 0; -} - -ACPI_MODULE_NAME("int3403"); -static const struct acpi_device_id int3403_device_ids[] = { - {"INT3403", 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, int3403_device_ids); - -static struct acpi_driver acpi_int3403_driver = { - .name = "INT3403", - .class = ACPI_INT3403_CLASS, - .ids = int3403_device_ids, - .ops = { - .add = acpi_int3403_add, - .remove = acpi_int3403_remove, - .notify = acpi_thermal_notify, - }, -}; - -module_acpi_driver(acpi_int3403_driver); - -MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("ACPI INT3403 thermal driver"); diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile new file mode 100644 index 000000000000..ffe40bffaf1a --- /dev/null +++ b/drivers/thermal/int340x_thermal/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3403_thermal.o +obj-$(CONFIG_ACPI_THERMAL_REL) += acpi_thermal_rel.o diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c new file mode 100644 index 000000000000..0d8db808f0ae --- /dev/null +++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c @@ -0,0 +1,400 @@ +/* acpi_thermal_rel.c driver for exporting ACPI thermal relationship + * + * Copyright (c) 2014 Intel Corp + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + */ + +/* + * Two functionalities included: + * 1. Export _TRT, _ART, via misc device interface to the userspace. + * 2. Provide parsing result to kernel drivers + * + */ +#include <linux/init.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <linux/miscdevice.h> +#include "acpi_thermal_rel.h" + +static acpi_handle acpi_thermal_rel_handle; +static DEFINE_SPINLOCK(acpi_thermal_rel_chrdev_lock); +static int acpi_thermal_rel_chrdev_count; /* #times opened */ +static int acpi_thermal_rel_chrdev_exclu; /* already open exclusive? */ + +static int acpi_thermal_rel_open(struct inode *inode, struct file *file) +{ + spin_lock(&acpi_thermal_rel_chrdev_lock); + if (acpi_thermal_rel_chrdev_exclu || + (acpi_thermal_rel_chrdev_count && (file->f_flags & O_EXCL))) { + spin_unlock(&acpi_thermal_rel_chrdev_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + acpi_thermal_rel_chrdev_exclu = 1; + acpi_thermal_rel_chrdev_count++; + + spin_unlock(&acpi_thermal_rel_chrdev_lock); + + return nonseekable_open(inode, file); +} + +static int acpi_thermal_rel_release(struct inode *inode, struct file *file) +{ + spin_lock(&acpi_thermal_rel_chrdev_lock); + acpi_thermal_rel_chrdev_count--; + acpi_thermal_rel_chrdev_exclu = 0; + spin_unlock(&acpi_thermal_rel_chrdev_lock); + + return 0; +} + +/** + * acpi_parse_trt - Thermal Relationship Table _TRT for passive cooling + * + * @handle: ACPI handle of the device contains _TRT + * @art_count: the number of valid entries resulted from parsing _TRT + * @artp: pointer to pointer of array of art entries in parsing result + * @create_dev: whether to create platform devices for target and source + * + */ +int acpi_parse_trt(acpi_handle handle, int *trt_count, struct trt **trtp, + bool create_dev) +{ + acpi_status status; + int result = 0; + int i; + int nr_bad_entries = 0; + struct trt *trts; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" }; + + if (!acpi_has_method(handle, "_TRT")) + return 0; + + status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _TRT data\n"); + result = -EFAULT; + goto end; + } + + *trt_count = p->package.count; + trts = kzalloc(*trt_count * sizeof(struct trt), GFP_KERNEL); + if (!trts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < *trt_count; i++) { + struct trt *trt = &trts[i - nr_bad_entries]; + + element.length = sizeof(struct trt); + element.pointer = trt; + + status = acpi_extract_package(&(p->package.elements[i]), + &trt_format, &element); + if (ACPI_FAILURE(status)) { + nr_bad_entries++; + pr_warn("_TRT package %d is invalid, ignored\n", i); + continue; + } + if (!create_dev) + continue; + + result = acpi_bus_get_device(trt->source, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + + result = acpi_bus_get_device(trt->target, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get target ACPI device\n"); + } + + *trtp = trts; + /* don't count bad entries */ + *trt_count -= nr_bad_entries; +end: + kfree(buffer.pointer); + return result; +} +EXPORT_SYMBOL(acpi_parse_trt); + +/** + * acpi_parse_art - Parse Active Relationship Table _ART + * + * @handle: ACPI handle of the device contains _ART + * @art_count: the number of valid entries resulted from parsing _ART + * @artp: pointer to pointer of array of art entries in parsing result + * @create_dev: whether to create platform devices for target and source + * + */ +int acpi_parse_art(acpi_handle handle, int *art_count, struct art **artp, + bool create_dev) +{ + acpi_status status; + int result = 0; + int i; + int nr_bad_entries = 0; + struct art *arts; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer art_format = { + sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" }; + + if (!acpi_has_method(handle, "_ART")) + return 0; + + status = acpi_evaluate_object(handle, "_ART", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _ART data\n"); + result = -EFAULT; + goto end; + } + + /* ignore p->package.elements[0], as this is _ART Revision field */ + *art_count = p->package.count - 1; + arts = kzalloc(*art_count * sizeof(struct art), GFP_KERNEL); + if (!arts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < *art_count; i++) { + struct art *art = &arts[i - nr_bad_entries]; + + element.length = sizeof(struct art); + element.pointer = art; + + status = acpi_extract_package(&(p->package.elements[i + 1]), + &art_format, &element); + if (ACPI_FAILURE(status)) { + pr_warn("_ART package %d is invalid, ignored", i); + nr_bad_entries++; + continue; + } + if (!create_dev) + continue; + + if (art->source) { + result = acpi_bus_get_device(art->source, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + } + if (art->target) { + result = acpi_bus_get_device(art->target, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + } + } + + *artp = arts; + /* don't count bad entries */ + *art_count -= nr_bad_entries; +end: + kfree(buffer.pointer); + return result; +} +EXPORT_SYMBOL(acpi_parse_art); + + +/* get device name from acpi handle */ +static void get_single_name(acpi_handle handle, char *name) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER}; + + if (ACPI_FAILURE(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer))) + pr_warn("Failed get name from handle\n"); + else { + memcpy(name, buffer.pointer, ACPI_NAME_SIZE); + kfree(buffer.pointer); + } +} + +static int fill_art(char __user *ubuf) +{ + int i; + int ret; + int count; + int art_len; + struct art *arts = NULL; + union art_object *art_user; + + ret = acpi_parse_art(acpi_thermal_rel_handle, &count, &arts, false); + if (ret) + goto free_art; + art_len = count * sizeof(union art_object); + art_user = kzalloc(art_len, GFP_KERNEL); + if (!art_user) { + ret = -ENOMEM; + goto free_art; + } + /* now fill in user art data */ + for (i = 0; i < count; i++) { + /* userspace art needs device name instead of acpi reference */ + get_single_name(arts[i].source, art_user[i].source_device); + get_single_name(arts[i].target, art_user[i].target_device); + /* copy the rest int data in addition to source and target */ + memcpy(&art_user[i].weight, &arts[i].weight, + sizeof(u64) * (ACPI_NR_ART_ELEMENTS - 2)); + } + + if (copy_to_user(ubuf, art_user, art_len)) + ret = -EFAULT; + kfree(art_user); +free_art: + kfree(arts); + return ret; +} + +static int fill_trt(char __user *ubuf) +{ + int i; + int ret; + int count; + int trt_len; + struct trt *trts = NULL; + union trt_object *trt_user; + + ret = acpi_parse_trt(acpi_thermal_rel_handle, &count, &trts, false); + if (ret) + goto free_trt; + trt_len = count * sizeof(union trt_object); + trt_user = kzalloc(trt_len, GFP_KERNEL); + if (!trt_user) { + ret = -ENOMEM; + goto free_trt; + } + /* now fill in user trt data */ + for (i = 0; i < count; i++) { + /* userspace trt needs device name instead of acpi reference */ + get_single_name(trts[i].source, trt_user[i].source_device); + get_single_name(trts[i].target, trt_user[i].target_device); + trt_user[i].sample_period = trts[i].sample_period; + trt_user[i].influence = trts[i].influence; + } + + if (copy_to_user(ubuf, trt_user, trt_len)) + ret = -EFAULT; + kfree(trt_user); +free_trt: + kfree(trts); + return ret; +} + +static long acpi_thermal_rel_ioctl(struct file *f, unsigned int cmd, + unsigned long __arg) +{ + int ret = 0; + unsigned long length = 0; + unsigned long count = 0; + char __user *arg = (void __user *)__arg; + struct trt *trts; + struct art *arts; + + switch (cmd) { + case ACPI_THERMAL_GET_TRT_COUNT: + ret = acpi_parse_trt(acpi_thermal_rel_handle, (int *)&count, + &trts, false); + kfree(trts); + if (!ret) + return put_user(count, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_TRT_LEN: + ret = acpi_parse_trt(acpi_thermal_rel_handle, (int *)&count, + &trts, false); + kfree(trts); + length = count * sizeof(union trt_object); + if (!ret) + return put_user(length, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_TRT: + return fill_trt(arg); + case ACPI_THERMAL_GET_ART_COUNT: + ret = acpi_parse_art(acpi_thermal_rel_handle, (int *)&count, + &arts, false); + kfree(arts); + if (!ret) + return put_user(count, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_ART_LEN: + ret = acpi_parse_art(acpi_thermal_rel_handle, (int *)&count, + &arts, false); + kfree(arts); + length = count * sizeof(union art_object); + if (!ret) + return put_user(length, (unsigned long __user *)__arg); + return ret; + + case ACPI_THERMAL_GET_ART: + return fill_art(arg); + + default: + return -ENOTTY; + } +} + +static const struct file_operations acpi_thermal_rel_fops = { + .owner = THIS_MODULE, + .open = acpi_thermal_rel_open, + .release = acpi_thermal_rel_release, + .unlocked_ioctl = acpi_thermal_rel_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice acpi_thermal_rel_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + "acpi_thermal_rel", + &acpi_thermal_rel_fops +}; + +int acpi_thermal_rel_misc_device_add(acpi_handle handle) +{ + acpi_thermal_rel_handle = handle; + + return misc_register(&acpi_thermal_rel_misc_device); +} +EXPORT_SYMBOL(acpi_thermal_rel_misc_device_add); + +int acpi_thermal_rel_misc_device_remove(acpi_handle handle) +{ + misc_deregister(&acpi_thermal_rel_misc_device); + + return 0; +} +EXPORT_SYMBOL(acpi_thermal_rel_misc_device_remove); + +MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>"); +MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com"); +MODULE_DESCRIPTION("Intel acpi thermal rel misc dev driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.h b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h new file mode 100644 index 000000000000..f00700bc9d79 --- /dev/null +++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h @@ -0,0 +1,84 @@ +#ifndef __ACPI_ACPI_THERMAL_H +#define __ACPI_ACPI_THERMAL_H + +#include <asm/ioctl.h> + +#define ACPI_THERMAL_MAGIC 's' + +#define ACPI_THERMAL_GET_TRT_LEN _IOR(ACPI_THERMAL_MAGIC, 1, unsigned long) +#define ACPI_THERMAL_GET_ART_LEN _IOR(ACPI_THERMAL_MAGIC, 2, unsigned long) +#define ACPI_THERMAL_GET_TRT_COUNT _IOR(ACPI_THERMAL_MAGIC, 3, unsigned long) +#define ACPI_THERMAL_GET_ART_COUNT _IOR(ACPI_THERMAL_MAGIC, 4, unsigned long) + +#define ACPI_THERMAL_GET_TRT _IOR(ACPI_THERMAL_MAGIC, 5, unsigned long) +#define ACPI_THERMAL_GET_ART _IOR(ACPI_THERMAL_MAGIC, 6, unsigned long) + +struct art { + acpi_handle source; + acpi_handle target; + u64 weight; + u64 ac0_max; + u64 ac1_max; + u64 ac2_max; + u64 ac3_max; + u64 ac4_max; + u64 ac5_max; + u64 ac6_max; + u64 ac7_max; + u64 ac8_max; + u64 ac9_max; +} __packed; + +struct trt { + acpi_handle source; + acpi_handle target; + u64 influence; + u64 sample_period; + u64 reverved1; + u64 reverved2; + u64 reverved3; + u64 reverved4; +} __packed; + +#define ACPI_NR_ART_ELEMENTS 13 +/* for usrspace */ +union art_object { + struct { + char source_device[8]; /* ACPI single name */ + char target_device[8]; /* ACPI single name */ + u64 weight; + u64 ac0_max_level; + u64 ac1_max_level; + u64 ac2_max_level; + u64 ac3_max_level; + u64 ac4_max_level; + u64 ac5_max_level; + u64 ac6_max_level; + u64 ac7_max_level; + u64 ac8_max_level; + u64 ac9_max_level; + }; + u64 __data[ACPI_NR_ART_ELEMENTS]; +}; + +union trt_object { + struct { + char source_device[8]; /* ACPI single name */ + char target_device[8]; /* ACPI single name */ + u64 influence; + u64 sample_period; + u64 reserved[4]; + }; + u64 __data[8]; +}; + +#ifdef __KERNEL__ +int acpi_thermal_rel_misc_device_add(acpi_handle handle); +int acpi_thermal_rel_misc_device_remove(acpi_handle handle); +int acpi_parse_art(acpi_handle handle, int *art_count, struct art **arts, + bool create_dev); +int acpi_parse_trt(acpi_handle handle, int *trt_count, struct trt **trts, + bool create_dev); +#endif + +#endif /* __ACPI_ACPI_THERMAL_H */ diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c new file mode 100644 index 000000000000..edc1cce117ba --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -0,0 +1,271 @@ +/* + * INT3400 thermal driver + * + * Copyright (C) 2014, Intel Corporation + * Authors: Zhang Rui <rui.zhang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/acpi.h> +#include <linux/thermal.h> +#include "acpi_thermal_rel.h" + +enum int3400_thermal_uuid { + INT3400_THERMAL_PASSIVE_1, + INT3400_THERMAL_PASSIVE_2, + INT3400_THERMAL_ACTIVE, + INT3400_THERMAL_CRITICAL, + INT3400_THERMAL_COOLING_MODE, + INT3400_THERMAL_MAXIMUM_UUID, +}; + +static u8 *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = { + "42A441D6-AE6A-462b-A84B-4A8CE79027D3", + "9E04115A-AE87-4D1C-9500-0F3E340BFE75", + "3A95C389-E4B8-4629-A526-C52C88626BAE", + "97C68AE7-15FA-499c-B8C9-5DA81D606E0A", + "16CAF1B7-DD38-40ed-B1C1-1B8A1913D531", +}; + +struct int3400_thermal_priv { + struct acpi_device *adev; + struct thermal_zone_device *thermal; + int mode; + int art_count; + struct art *arts; + int trt_count; + struct trt *trts; + u8 uuid_bitmap; + int rel_misc_dev_res; +}; + +static int int3400_thermal_get_uuids(struct int3400_thermal_priv *priv) +{ + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *obja, *objb; + int i, j; + int result = 0; + acpi_status status; + + status = acpi_evaluate_object(priv->adev->handle, "IDSP", NULL, &buf); + if (ACPI_FAILURE(status)) + return -ENODEV; + + obja = (union acpi_object *)buf.pointer; + if (obja->type != ACPI_TYPE_PACKAGE) { + result = -EINVAL; + goto end; + } + + for (i = 0; i < obja->package.count; i++) { + objb = &obja->package.elements[i]; + if (objb->type != ACPI_TYPE_BUFFER) { + result = -EINVAL; + goto end; + } + + /* UUID must be 16 bytes */ + if (objb->buffer.length != 16) { + result = -EINVAL; + goto end; + } + + for (j = 0; j < INT3400_THERMAL_MAXIMUM_UUID; j++) { + u8 uuid[16]; + + acpi_str_to_uuid(int3400_thermal_uuids[j], uuid); + if (!strncmp(uuid, objb->buffer.pointer, 16)) { + priv->uuid_bitmap |= (1 << j); + break; + } + } + } + +end: + kfree(buf.pointer); + return result; +} + +static int int3400_thermal_run_osc(acpi_handle handle, + enum int3400_thermal_uuid uuid, bool enable) +{ + u32 ret, buf[2]; + acpi_status status; + int result = 0; + struct acpi_osc_context context = { + .uuid_str = int3400_thermal_uuids[uuid], + .rev = 1, + .cap.length = 8, + }; + + buf[OSC_QUERY_DWORD] = 0; + buf[OSC_SUPPORT_DWORD] = enable; + + context.cap.pointer = buf; + + status = acpi_run_osc(handle, &context); + if (ACPI_SUCCESS(status)) { + ret = *((u32 *)(context.ret.pointer + 4)); + if (ret != enable) + result = -EPERM; + } else + result = -EPERM; + + kfree(context.ret.pointer); + return result; +} + +static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, + unsigned long *temp) +{ + *temp = 20 * 1000; /* faked temp sensor with 20C */ + return 0; +} + +static int int3400_thermal_get_mode(struct thermal_zone_device *thermal, + enum thermal_device_mode *mode) +{ + struct int3400_thermal_priv *priv = thermal->devdata; + + if (!priv) + return -EINVAL; + + *mode = priv->mode; + + return 0; +} + +static int int3400_thermal_set_mode(struct thermal_zone_device *thermal, + enum thermal_device_mode mode) +{ + struct int3400_thermal_priv *priv = thermal->devdata; + bool enable; + int result = 0; + + if (!priv) + return -EINVAL; + + if (mode == THERMAL_DEVICE_ENABLED) + enable = true; + else if (mode == THERMAL_DEVICE_DISABLED) + enable = false; + else + return -EINVAL; + + if (enable != priv->mode) { + priv->mode = enable; + /* currently, only PASSIVE COOLING is supported */ + result = int3400_thermal_run_osc(priv->adev->handle, + INT3400_THERMAL_PASSIVE_1, enable); + } + return result; +} + +static struct thermal_zone_device_ops int3400_thermal_ops = { + .get_temp = int3400_thermal_get_temp, +}; + +static struct thermal_zone_params int3400_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static int int3400_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3400_thermal_priv *priv; + int result; + + if (!adev) + return -ENODEV; + + priv = kzalloc(sizeof(struct int3400_thermal_priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->adev = adev; + + result = int3400_thermal_get_uuids(priv); + if (result) + goto free_priv; + + result = acpi_parse_art(priv->adev->handle, &priv->art_count, + &priv->arts, true); + if (result) + goto free_priv; + + + result = acpi_parse_trt(priv->adev->handle, &priv->trt_count, + &priv->trts, true); + if (result) + goto free_art; + + platform_set_drvdata(pdev, priv); + + if (priv->uuid_bitmap & 1 << INT3400_THERMAL_PASSIVE_1) { + int3400_thermal_ops.get_mode = int3400_thermal_get_mode; + int3400_thermal_ops.set_mode = int3400_thermal_set_mode; + } + priv->thermal = thermal_zone_device_register("INT3400 Thermal", 0, 0, + priv, &int3400_thermal_ops, + &int3400_thermal_params, 0, 0); + if (IS_ERR(priv->thermal)) { + result = PTR_ERR(priv->thermal); + goto free_trt; + } + + priv->rel_misc_dev_res = acpi_thermal_rel_misc_device_add( + priv->adev->handle); + + return 0; +free_trt: + kfree(priv->trts); +free_art: + kfree(priv->arts); +free_priv: + kfree(priv); + return result; +} + +static int int3400_thermal_remove(struct platform_device *pdev) +{ + struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + + if (!priv->rel_misc_dev_res) + acpi_thermal_rel_misc_device_remove(priv->adev->handle); + + thermal_zone_device_unregister(priv->thermal); + kfree(priv->trts); + kfree(priv->arts); + kfree(priv); + return 0; +} + +static const struct acpi_device_id int3400_thermal_match[] = { + {"INT3400", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3400_thermal_match); + +static struct platform_driver int3400_thermal_driver = { + .probe = int3400_thermal_probe, + .remove = int3400_thermal_remove, + .driver = { + .name = "int3400 thermal", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(int3400_thermal_match), + }, +}; + +module_platform_driver(int3400_thermal_driver); + +MODULE_DESCRIPTION("INT3400 Thermal driver"); +MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c new file mode 100644 index 000000000000..a5d08c14ba24 --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3402_thermal.c @@ -0,0 +1,242 @@ +/* + * INT3402 thermal driver for memory temperature reporting + * + * Copyright (C) 2014, Intel Corporation + * Authors: Aaron Lu <aaron.lu@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/acpi.h> +#include <linux/thermal.h> + +#define ACPI_ACTIVE_COOLING_MAX_NR 10 + +struct active_trip { + unsigned long temp; + int id; + bool valid; +}; + +struct int3402_thermal_data { + unsigned long *aux_trips; + int aux_trip_nr; + unsigned long psv_temp; + int psv_trip_id; + unsigned long crt_temp; + int crt_trip_id; + unsigned long hot_temp; + int hot_trip_id; + struct active_trip act_trips[ACPI_ACTIVE_COOLING_MAX_NR]; + acpi_handle *handle; +}; + +static int int3402_thermal_get_zone_temp(struct thermal_zone_device *zone, + unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(d->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -ENODEV; + + /* _TMP returns the temperature in tenths of degrees Kelvin */ + *temp = DECI_KELVIN_TO_MILLICELSIUS(tmp); + + return 0; +} + +static int int3402_thermal_get_trip_temp(struct thermal_zone_device *zone, + int trip, unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *temp = d->aux_trips[trip]; + else if (trip == d->crt_trip_id) + *temp = d->crt_temp; + else if (trip == d->psv_trip_id) + *temp = d->psv_temp; + else if (trip == d->hot_trip_id) + *temp = d->hot_temp; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *temp = d->act_trips[i].temp; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_get_trip_type(struct thermal_zone_device *zone, + int trip, enum thermal_trip_type *type) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *type = THERMAL_TRIP_PASSIVE; + else if (trip == d->crt_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else if (trip == d->hot_trip_id) + *type = THERMAL_TRIP_HOT; + else if (trip == d->psv_trip_id) + *type = THERMAL_TRIP_PASSIVE; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *type = THERMAL_TRIP_ACTIVE; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_set_trip_temp(struct thermal_zone_device *zone, int trip, + unsigned long temp) +{ + struct int3402_thermal_data *d = zone->devdata; + acpi_status status; + char name[10]; + + snprintf(name, sizeof(name), "PAT%d", trip); + status = acpi_execute_simple_method(d->handle, name, + MILLICELSIUS_TO_DECI_KELVIN(temp)); + if (ACPI_FAILURE(status)) + return -EIO; + + d->aux_trips[trip] = temp; + return 0; +} + +static struct thermal_zone_device_ops int3402_thermal_zone_ops = { + .get_temp = int3402_thermal_get_zone_temp, + .get_trip_temp = int3402_thermal_get_trip_temp, + .get_trip_type = int3402_thermal_get_trip_type, + .set_trip_temp = int3402_thermal_set_trip_temp, +}; + +static struct thermal_zone_params int3402_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static int int3402_thermal_get_temp(acpi_handle handle, char *name, + unsigned long *temp) +{ + unsigned long long r; + acpi_status status; + + status = acpi_evaluate_integer(handle, name, NULL, &r); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLICELSIUS(r); + return 0; +} + +static int int3402_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3402_thermal_data *d; + struct thermal_zone_device *zone; + acpi_status status; + unsigned long long trip_cnt; + int trip_mask = 0, i; + + if (!acpi_has_method(adev->handle, "_TMP")) + return -ENODEV; + + d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + status = acpi_evaluate_integer(adev->handle, "PATC", NULL, &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + else { + d->aux_trips = devm_kzalloc(&pdev->dev, + sizeof(*d->aux_trips) * trip_cnt, GFP_KERNEL); + if (!d->aux_trips) + return -ENOMEM; + trip_mask = trip_cnt - 1; + d->handle = adev->handle; + d->aux_trip_nr = trip_cnt; + } + + d->crt_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_CRT", &d->crt_temp)) + d->crt_trip_id = trip_cnt++; + d->hot_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_HOT", &d->hot_temp)) + d->hot_trip_id = trip_cnt++; + d->psv_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_PSV", &d->psv_temp)) + d->psv_trip_id = trip_cnt++; + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + char name[5] = { '_', 'A', 'C', '0' + i, '\0' }; + if (int3402_thermal_get_temp(adev->handle, name, + &d->act_trips[i].temp)) + break; + d->act_trips[i].id = trip_cnt++; + d->act_trips[i].valid = true; + } + + zone = thermal_zone_device_register(acpi_device_bid(adev), trip_cnt, + trip_mask, d, + &int3402_thermal_zone_ops, + &int3402_thermal_params, + 0, 0); + if (IS_ERR(zone)) + return PTR_ERR(zone); + platform_set_drvdata(pdev, zone); + + return 0; +} + +static int int3402_thermal_remove(struct platform_device *pdev) +{ + struct thermal_zone_device *zone = platform_get_drvdata(pdev); + + thermal_zone_device_unregister(zone); + return 0; +} + +static const struct acpi_device_id int3402_thermal_match[] = { + {"INT3402", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3402_thermal_match); + +static struct platform_driver int3402_thermal_driver = { + .probe = int3402_thermal_probe, + .remove = int3402_thermal_remove, + .driver = { + .name = "int3402 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3402_thermal_match, + }, +}; + +module_platform_driver(int3402_thermal_driver); + +MODULE_DESCRIPTION("INT3402 Thermal driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/thermal/int340x_thermal/int3403_thermal.c b/drivers/thermal/int340x_thermal/int3403_thermal.c new file mode 100644 index 000000000000..d20dba986f0f --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3403_thermal.c @@ -0,0 +1,477 @@ +/* + * ACPI INT3403 thermal driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <linux/thermal.h> +#include <linux/platform_device.h> + +#define INT3403_TYPE_SENSOR 0x03 +#define INT3403_TYPE_CHARGER 0x0B +#define INT3403_TYPE_BATTERY 0x0C +#define INT3403_PERF_CHANGED_EVENT 0x80 +#define INT3403_THERMAL_EVENT 0x90 + +#define DECI_KELVIN_TO_MILLI_CELSIUS(t, off) (((t) - (off)) * 100) +#define KELVIN_OFFSET 2732 +#define MILLI_CELSIUS_TO_DECI_KELVIN(t, off) (((t) / 100) + (off)) + +struct int3403_sensor { + struct thermal_zone_device *tzone; + unsigned long *thresholds; + unsigned long crit_temp; + int crit_trip_id; + unsigned long psv_temp; + int psv_trip_id; + +}; + +struct int3403_performance_state { + u64 performance; + u64 power; + u64 latency; + u64 linear; + u64 control; + u64 raw_performace; + char *raw_unit; + int reserved; +}; + +struct int3403_cdev { + struct thermal_cooling_device *cdev; + unsigned long max_state; +}; + +struct int3403_priv { + struct platform_device *pdev; + struct acpi_device *adev; + unsigned long long type; + void *priv; +}; + +static int sys_get_curr_temp(struct thermal_zone_device *tzone, + unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(tmp, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_hyst(struct thermal_zone_device *tzone, + int trip, unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + unsigned long long hyst; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "GTSH", NULL, &hyst); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(hyst, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_temp(struct thermal_zone_device *tzone, + int trip, unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct int3403_sensor *obj = priv->priv; + + if (priv->type != INT3403_TYPE_SENSOR || !obj) + return -EINVAL; + + if (trip == obj->crit_trip_id) + *temp = obj->crit_temp; + else if (trip == obj->psv_trip_id) + *temp = obj->psv_temp; + else { + /* + * get_trip_temp is a mandatory callback but + * PATx method doesn't return any value, so return + * cached value, which was last set from user space + */ + *temp = obj->thresholds[trip]; + } + + return 0; +} + +static int sys_get_trip_type(struct thermal_zone_device *thermal, + int trip, enum thermal_trip_type *type) +{ + struct int3403_priv *priv = thermal->devdata; + struct int3403_sensor *obj = priv->priv; + + /* Mandatory callback, may not mean much here */ + if (trip == obj->crit_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else + *type = THERMAL_TRIP_PASSIVE; + + return 0; +} + +int sys_set_trip_temp(struct thermal_zone_device *tzone, int trip, + unsigned long temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + struct int3403_sensor *obj = priv->priv; + acpi_status status; + char name[10]; + int ret = 0; + + snprintf(name, sizeof(name), "PAT%d", trip); + if (acpi_has_method(device->handle, name)) { + status = acpi_execute_simple_method(device->handle, name, + MILLI_CELSIUS_TO_DECI_KELVIN(temp, + KELVIN_OFFSET)); + if (ACPI_FAILURE(status)) + ret = -EIO; + else + obj->thresholds[trip] = temp; + } else { + ret = -EIO; + dev_err(&device->dev, "sys_set_trip_temp: method not found\n"); + } + + return ret; +} + +static struct thermal_zone_device_ops tzone_ops = { + .get_temp = sys_get_curr_temp, + .get_trip_temp = sys_get_trip_temp, + .get_trip_type = sys_get_trip_type, + .set_trip_temp = sys_set_trip_temp, + .get_trip_hyst = sys_get_trip_hyst, +}; + +static struct thermal_zone_params int3403_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static void int3403_notify(acpi_handle handle, + u32 event, void *data) +{ + struct int3403_priv *priv = data; + struct int3403_sensor *obj; + + if (!priv) + return; + + obj = priv->priv; + if (priv->type != INT3403_TYPE_SENSOR || !obj) + return; + + switch (event) { + case INT3403_PERF_CHANGED_EVENT: + break; + case INT3403_THERMAL_EVENT: + thermal_zone_device_update(obj->tzone); + break; + default: + dev_err(&priv->pdev->dev, "Unsupported event [0x%x]\n", event); + break; + } +} + +static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp) +{ + unsigned long long crt; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp) +{ + unsigned long long psv; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET); + + return 0; +} + +static int int3403_sensor_add(struct int3403_priv *priv) +{ + int result = 0; + acpi_status status; + struct int3403_sensor *obj; + unsigned long long trip_cnt; + int trip_mask = 0; + + obj = devm_kzalloc(&priv->pdev->dev, sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + priv->priv = obj; + + status = acpi_evaluate_integer(priv->adev->handle, "PATC", NULL, + &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + + if (trip_cnt) { + /* We have to cache, thresholds can't be readback */ + obj->thresholds = devm_kzalloc(&priv->pdev->dev, + sizeof(*obj->thresholds) * trip_cnt, + GFP_KERNEL); + if (!obj->thresholds) { + result = -ENOMEM; + goto err_free_obj; + } + trip_mask = BIT(trip_cnt) - 1; + } + + obj->psv_trip_id = -1; + if (!sys_get_trip_psv(priv->adev, &obj->psv_temp)) + obj->psv_trip_id = trip_cnt++; + + obj->crit_trip_id = -1; + if (!sys_get_trip_crt(priv->adev, &obj->crit_temp)) + obj->crit_trip_id = trip_cnt++; + + obj->tzone = thermal_zone_device_register(acpi_device_bid(priv->adev), + trip_cnt, trip_mask, priv, &tzone_ops, + &int3403_thermal_params, 0, 0); + if (IS_ERR(obj->tzone)) { + result = PTR_ERR(obj->tzone); + obj->tzone = NULL; + goto err_free_obj; + } + + result = acpi_install_notify_handler(priv->adev->handle, + ACPI_DEVICE_NOTIFY, int3403_notify, + (void *)priv); + if (result) + goto err_free_obj; + + return 0; + + err_free_obj: + if (obj->tzone) + thermal_zone_device_unregister(obj->tzone); + return result; +} + +static int int3403_sensor_remove(struct int3403_priv *priv) +{ + struct int3403_sensor *obj = priv->priv; + + thermal_zone_device_unregister(obj->tzone); + return 0; +} + +/* INT3403 Cooling devices */ +static int int3403_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + struct int3403_priv *priv = cdev->devdata; + struct int3403_cdev *obj = priv->priv; + + *state = obj->max_state; + return 0; +} + +static int int3403_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + struct int3403_priv *priv = cdev->devdata; + unsigned long long level; + acpi_status status; + + status = acpi_evaluate_integer(priv->adev->handle, "PPPC", NULL, &level); + if (ACPI_SUCCESS(status)) { + *state = level; + return 0; + } else + return -EINVAL; +} + +static int +int3403_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +{ + struct int3403_priv *priv = cdev->devdata; + acpi_status status; + + status = acpi_execute_simple_method(priv->adev->handle, "SPPC", state); + if (ACPI_SUCCESS(status)) + return 0; + else + return -EINVAL; +} + +static const struct thermal_cooling_device_ops int3403_cooling_ops = { + .get_max_state = int3403_get_max_state, + .get_cur_state = int3403_get_cur_state, + .set_cur_state = int3403_set_cur_state, +}; + +static int int3403_cdev_add(struct int3403_priv *priv) +{ + int result = 0; + acpi_status status; + struct int3403_cdev *obj; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *p; + + obj = devm_kzalloc(&priv->pdev->dev, sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + status = acpi_evaluate_object(priv->adev->handle, "PPSS", NULL, &buf); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buf.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + printk(KERN_WARNING "Invalid PPSS data\n"); + return -EFAULT; + } + + obj->max_state = p->package.count - 1; + obj->cdev = + thermal_cooling_device_register(acpi_device_bid(priv->adev), + priv, &int3403_cooling_ops); + if (IS_ERR(obj->cdev)) + result = PTR_ERR(obj->cdev); + + priv->priv = obj; + + /* TODO: add ACPI notification support */ + + return result; +} + +static int int3403_cdev_remove(struct int3403_priv *priv) +{ + struct int3403_cdev *obj = priv->priv; + + thermal_cooling_device_unregister(obj->cdev); + return 0; +} + +static int int3403_add(struct platform_device *pdev) +{ + struct int3403_priv *priv; + int result = 0; + acpi_status status; + + priv = devm_kzalloc(&pdev->dev, sizeof(struct int3403_priv), + GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->pdev = pdev; + priv->adev = ACPI_COMPANION(&(pdev->dev)); + if (!priv->adev) { + result = -EINVAL; + goto err; + } + + status = acpi_evaluate_integer(priv->adev->handle, "PTYP", + NULL, &priv->type); + if (ACPI_FAILURE(status)) { + result = -EINVAL; + goto err; + } + + platform_set_drvdata(pdev, priv); + switch (priv->type) { + case INT3403_TYPE_SENSOR: + result = int3403_sensor_add(priv); + break; + case INT3403_TYPE_CHARGER: + case INT3403_TYPE_BATTERY: + result = int3403_cdev_add(priv); + break; + default: + result = -EINVAL; + } + + if (result) + goto err; + return result; + +err: + return result; +} + +static int int3403_remove(struct platform_device *pdev) +{ + struct int3403_priv *priv = platform_get_drvdata(pdev); + + switch (priv->type) { + case INT3403_TYPE_SENSOR: + int3403_sensor_remove(priv); + break; + case INT3403_TYPE_CHARGER: + case INT3403_TYPE_BATTERY: + int3403_cdev_remove(priv); + break; + default: + break; + } + + return 0; +} + +static const struct acpi_device_id int3403_device_ids[] = { + {"INT3403", 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, int3403_device_ids); + +static struct platform_driver int3403_driver = { + .probe = int3403_add, + .remove = int3403_remove, + .driver = { + .name = "int3403 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3403_device_ids, + }, +}; + +module_platform_driver(int3403_driver); + +MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ACPI INT3403 thermal driver"); diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index 4b2b999b7611..f8eb625b8400 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -401,6 +401,10 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, struct of_phandle_args sensor_specs; int ret, id; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + /* For now, thermal framework supports only 1 sensor per zone */ ret = of_parse_phandle_with_args(child, "thermal-sensors", "#thermal-sensor-cells", @@ -771,6 +775,10 @@ int __init of_parse_thermal_zones(void) struct thermal_zone_device *zone; struct thermal_zone_params *tzp; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + tz = thermal_of_build_thermal_zone(child); if (IS_ERR(tz)) { pr_err("failed to build thermal zone %s: %ld\n", @@ -838,6 +846,10 @@ void of_thermal_destroy_zones(void) for_each_child_of_node(np, child) { struct thermal_zone_device *zone; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + zone = thermal_zone_get_zone_by_name(child->name); if (IS_ERR(zone)) continue; diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index f251521baaa2..fdd1f523a1ed 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -23,6 +23,7 @@ */ #include <linux/thermal.h> +#include <trace/events/thermal.h> #include "thermal_core.h" @@ -76,7 +77,7 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = instance->upper; break; case THERMAL_TREND_DROPPING: - if (cur_state == instance->lower) { + if (cur_state <= instance->lower) { if (!throttle) next_target = THERMAL_NO_TARGET; } else { @@ -129,8 +130,10 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) trend = get_tz_trend(tz, trip); - if (tz->temperature >= trip_temp) + if (tz->temperature >= trip_temp) { throttle = true; + trace_thermal_zone_trip(tz, trip, trip_type); + } dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n", trip, trip_type, trip_temp, trend, throttle); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 1e23f4f8d2c2..9bf10aa6069b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -38,6 +38,9 @@ #include <net/netlink.h> #include <net/genetlink.h> +#define CREATE_TRACE_POINTS +#include <trace/events/thermal.h> + #include "thermal_core.h" #include "thermal_hwmon.h" @@ -368,6 +371,8 @@ static void handle_critical_trips(struct thermal_zone_device *tz, if (tz->temperature < trip_temp) return; + trace_thermal_zone_trip(tz, trip, trip_type); + if (tz->ops->notify) tz->ops->notify(tz, trip, trip_type); @@ -463,6 +468,7 @@ static void update_temperature(struct thermal_zone_device *tz) tz->temperature = temp; mutex_unlock(&tz->lock); + trace_thermal_temperature(tz); dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", tz->last_temperature, tz->temperature); } @@ -1287,6 +1293,7 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev) mutex_unlock(&cdev->lock); cdev->ops->set_cur_state(cdev, target); cdev->updated = true; + trace_cdev_update(cdev, target); dev_dbg(&cdev->device, "set to state %lu\n", target); } EXPORT_SYMBOL(thermal_cdev_update); @@ -1790,6 +1797,10 @@ static int __init thermal_register_governors(void) if (result) return result; + result = thermal_gov_bang_bang_register(); + if (result) + return result; + return thermal_gov_user_space_register(); } @@ -1797,6 +1808,7 @@ static void thermal_unregister_governors(void) { thermal_gov_step_wise_unregister(); thermal_gov_fair_share_unregister(); + thermal_gov_bang_bang_unregister(); thermal_gov_user_space_unregister(); } diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 3db339fb636f..d15d243de27a 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -69,6 +69,14 @@ static inline int thermal_gov_fair_share_register(void) { return 0; } static inline void thermal_gov_fair_share_unregister(void) {} #endif /* CONFIG_THERMAL_GOV_FAIR_SHARE */ +#ifdef CONFIG_THERMAL_GOV_BANG_BANG +int thermal_gov_bang_bang_register(void); +void thermal_gov_bang_bang_unregister(void); +#else +static inline int thermal_gov_bang_bang_register(void) { return 0; } +static inline void thermal_gov_bang_bang_unregister(void) {} +#endif /* CONFIG_THERMAL_GOV_BANG_BANG */ + #ifdef CONFIG_THERMAL_GOV_USER_SPACE int thermal_gov_user_space_register(void); void thermal_gov_user_space_unregister(void); diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a673e5b6a2e0..60fa6278fbce 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -28,18 +28,6 @@ #define UIO_MAX_DEVICES (1U << MINORBITS) -struct uio_device { - struct module *owner; - struct device *dev; - int minor; - atomic_t event; - struct fasync_struct *async_queue; - wait_queue_head_t wait; - struct uio_info *info; - struct kobject *map_dir; - struct kobject *portio_dir; -}; - static int uio_major; static struct cdev *uio_cdev; static DEFINE_IDR(uio_idr); diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index e3d5bf0a5021..d0107d424ee4 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -87,6 +87,15 @@ config DA9055_WATCHDOG This driver can also be built as a module. If so, the module will be called da9055_wdt. +config DA9063_WATCHDOG + tristate "Dialog DA9063 Watchdog" + depends on MFD_DA9063 + select WATCHDOG_CORE + help + Support for the watchdog in the DA9063 PMIC. + + This driver can be built as a module. The module name is da9063_wdt. + config GPIO_WATCHDOG tristate "Watchdog device controlled through GPIO-line" depends on OF_GPIO @@ -123,6 +132,7 @@ config WM8350_WATCHDOG config XILINX_WATCHDOG tristate "Xilinx Watchdog timer" + depends on HAS_IOMEM select WATCHDOG_CORE help Watchdog driver for the xps_timebase_wdt ip core. @@ -157,6 +167,14 @@ config AT91SAM9X_WATCHDOG Watchdog timer embedded into AT91SAM9X and AT91CAP9 chips. This will reboot your system when the timeout is reached. +config CADENCE_WATCHDOG + tristate "Cadence Watchdog Timer" + depends on ARM + select WATCHDOG_CORE + help + Say Y here if you want to include support for the watchdog + timer in the Xilinx Zynq. + config 21285_WATCHDOG tristate "DC21285 watchdog" depends on FOOTBRIDGE @@ -319,6 +337,17 @@ config ORION_WATCHDOG To compile this driver as a module, choose M here: the module will be called orion_wdt. +config RN5T618_WATCHDOG + tristate "Ricoh RN5T618 watchdog" + depends on MFD_RN5T618 + select WATCHDOG_CORE + help + If you say yes here you get support for watchdog on the Ricoh + RN5T618 PMIC. + + This driver can also be built as a module. If so, the module + will be called rn5t618_wdt. + config SUNXI_WATCHDOG tristate "Allwinner SoCs watchdog support" depends on ARCH_SUNXI @@ -444,7 +473,7 @@ config SIRFSOC_WATCHDOG config TEGRA_WATCHDOG tristate "Tegra watchdog" - depends on ARCH_TEGRA || COMPILE_TEST + depends on (ARCH_TEGRA || COMPILE_TEST) && HAS_IOMEM select WATCHDOG_CORE help Say Y here to include support for the watchdog timer @@ -453,6 +482,29 @@ config TEGRA_WATCHDOG To compile this driver as a module, choose M here: the module will be called tegra_wdt. +config QCOM_WDT + tristate "QCOM watchdog" + depends on HAS_IOMEM + depends on ARCH_QCOM + select WATCHDOG_CORE + help + Say Y here to include Watchdog timer support for the watchdog found + on QCOM chipsets. Currently supported targets are the MSM8960, + APQ8064, and IPQ8064. + + To compile this driver as a module, choose M here: the + module will be called qcom_wdt. + +config MESON_WATCHDOG + tristate "Amlogic Meson SoCs watchdog support" + depends on ARCH_MESON + select WATCHDOG_CORE + help + Say Y here to include support for the watchdog timer + in Amlogic Meson SoCs. + To compile this driver as a module, choose M here: the + module will be called meson_wdt. + # AVR32 Architecture config AT32AP700X_WDT diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index de1701470c14..c569ec8f8a76 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_USBPCWATCHDOG) += pcwd_usb.o obj-$(CONFIG_ARM_SP805_WATCHDOG) += sp805_wdt.o obj-$(CONFIG_AT91RM9200_WATCHDOG) += at91rm9200_wdt.o obj-$(CONFIG_AT91SAM9X_WATCHDOG) += at91sam9_wdt.o +obj-$(CONFIG_CADENCE_WATCHDOG) += cadence_wdt.o obj-$(CONFIG_OMAP_WATCHDOG) += omap_wdt.o obj-$(CONFIG_TWL4030_WATCHDOG) += twl4030_wdt.o obj-$(CONFIG_21285_WATCHDOG) += wdt285.o @@ -47,6 +48,7 @@ obj-$(CONFIG_IOP_WATCHDOG) += iop_wdt.o obj-$(CONFIG_DAVINCI_WATCHDOG) += davinci_wdt.o obj-$(CONFIG_ORION_WATCHDOG) += orion_wdt.o obj-$(CONFIG_SUNXI_WATCHDOG) += sunxi_wdt.o +obj-$(CONFIG_RN5T618_WATCHDOG) += rn5t618_wdt.o obj-$(CONFIG_COH901327_WATCHDOG) += coh901327_wdt.o obj-$(CONFIG_STMP3XXX_RTC_WATCHDOG) += stmp3xxx_rtc_wdt.o obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o @@ -57,8 +59,10 @@ obj-$(CONFIG_RETU_WATCHDOG) += retu_wdt.o obj-$(CONFIG_BCM2835_WDT) += bcm2835_wdt.o obj-$(CONFIG_MOXART_WDT) += moxart_wdt.o obj-$(CONFIG_SIRFSOC_WATCHDOG) += sirfsoc_wdt.o +obj-$(CONFIG_QCOM_WDT) += qcom-wdt.o obj-$(CONFIG_BCM_KONA_WDT) += bcm_kona_wdt.o obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o +obj-$(CONFIG_MESON_WATCHDOG) += meson_wdt.o # AVR32 Architecture obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o @@ -173,6 +177,7 @@ obj-$(CONFIG_XEN_WDT) += xen_wdt.o # Architecture Independent obj-$(CONFIG_DA9052_WATCHDOG) += da9052_wdt.o obj-$(CONFIG_DA9055_WATCHDOG) += da9055_wdt.o +obj-$(CONFIG_DA9063_WATCHDOG) += da9063_wdt.o obj-$(CONFIG_GPIO_WATCHDOG) += gpio_wdt.o obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c index 08a785398eac..e96b09b135c8 100644 --- a/drivers/watchdog/booke_wdt.c +++ b/drivers/watchdog/booke_wdt.c @@ -30,8 +30,6 @@ * occur, and the final time the board will reset. */ -u32 booke_wdt_enabled; -u32 booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; #ifdef CONFIG_PPC_FSL_BOOK3E #define WDTP(x) ((((x)&0x3)<<30)|(((x)&0x3c)<<15)) @@ -41,27 +39,10 @@ u32 booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; #define WDTP_MASK (TCR_WP_MASK) #endif -/* Checks wdt=x and wdt_period=xx command-line option */ -notrace int __init early_parse_wdt(char *p) -{ - if (p && strncmp(p, "0", 1) != 0) - booke_wdt_enabled = 1; - - return 0; -} -early_param("wdt", early_parse_wdt); - -int __init early_parse_wdt_period(char *p) -{ - unsigned long ret; - if (p) { - if (!kstrtol(p, 0, &ret)) - booke_wdt_period = ret; - } - - return 0; -} -early_param("wdt_period", early_parse_wdt_period); +static bool booke_wdt_enabled; +module_param(booke_wdt_enabled, bool, 0); +static int booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; +module_param(booke_wdt_period, int, 0); #ifdef CONFIG_PPC_FSL_BOOK3E @@ -259,5 +240,6 @@ static int __init booke_wdt_init(void) module_init(booke_wdt_init); module_exit(booke_wdt_exit); +MODULE_ALIAS("booke_wdt"); MODULE_DESCRIPTION("PowerPC Book-E watchdog driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c new file mode 100644 index 000000000000..5927c0a98a74 --- /dev/null +++ b/drivers/watchdog/cadence_wdt.c @@ -0,0 +1,516 @@ +/* + * Cadence WDT driver - Used by Xilinx Zynq + * + * Copyright (C) 2010 - 2014 Xilinx, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/clk.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/reboot.h> +#include <linux/watchdog.h> + +#define CDNS_WDT_DEFAULT_TIMEOUT 10 +/* Supports 1 - 516 sec */ +#define CDNS_WDT_MIN_TIMEOUT 1 +#define CDNS_WDT_MAX_TIMEOUT 516 + +/* Restart key */ +#define CDNS_WDT_RESTART_KEY 0x00001999 + +/* Counter register access key */ +#define CDNS_WDT_REGISTER_ACCESS_KEY 0x00920000 + +/* Counter value divisor */ +#define CDNS_WDT_COUNTER_VALUE_DIVISOR 0x1000 + +/* Clock prescaler value and selection */ +#define CDNS_WDT_PRESCALE_64 64 +#define CDNS_WDT_PRESCALE_512 512 +#define CDNS_WDT_PRESCALE_4096 4096 +#define CDNS_WDT_PRESCALE_SELECT_64 1 +#define CDNS_WDT_PRESCALE_SELECT_512 2 +#define CDNS_WDT_PRESCALE_SELECT_4096 3 + +/* Input clock frequency */ +#define CDNS_WDT_CLK_10MHZ 10000000 +#define CDNS_WDT_CLK_75MHZ 75000000 + +/* Counter maximum value */ +#define CDNS_WDT_COUNTER_MAX 0xFFF + +static int wdt_timeout = CDNS_WDT_DEFAULT_TIMEOUT; +static int nowayout = WATCHDOG_NOWAYOUT; + +module_param(wdt_timeout, int, 0); +MODULE_PARM_DESC(wdt_timeout, + "Watchdog time in seconds. (default=" + __MODULE_STRING(CDNS_WDT_DEFAULT_TIMEOUT) ")"); + +module_param(nowayout, int, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +/** + * struct cdns_wdt - Watchdog device structure + * @regs: baseaddress of device + * @rst: reset flag + * @clk: struct clk * of a clock source + * @prescaler: for saving prescaler value + * @ctrl_clksel: counter clock prescaler selection + * @io_lock: spinlock for IO register access + * @cdns_wdt_device: watchdog device structure + * @cdns_wdt_notifier: notifier structure + * + * Structure containing parameters specific to cadence watchdog. + */ +struct cdns_wdt { + void __iomem *regs; + bool rst; + struct clk *clk; + u32 prescaler; + u32 ctrl_clksel; + spinlock_t io_lock; + struct watchdog_device cdns_wdt_device; + struct notifier_block cdns_wdt_notifier; +}; + +/* Write access to Registers */ +static inline void cdns_wdt_writereg(struct cdns_wdt *wdt, u32 offset, u32 val) +{ + writel_relaxed(val, wdt->regs + offset); +} + +/*************************Register Map**************************************/ + +/* Register Offsets for the WDT */ +#define CDNS_WDT_ZMR_OFFSET 0x0 /* Zero Mode Register */ +#define CDNS_WDT_CCR_OFFSET 0x4 /* Counter Control Register */ +#define CDNS_WDT_RESTART_OFFSET 0x8 /* Restart Register */ +#define CDNS_WDT_SR_OFFSET 0xC /* Status Register */ + +/* + * Zero Mode Register - This register controls how the time out is indicated + * and also contains the access code to allow writes to the register (0xABC). + */ +#define CDNS_WDT_ZMR_WDEN_MASK 0x00000001 /* Enable the WDT */ +#define CDNS_WDT_ZMR_RSTEN_MASK 0x00000002 /* Enable the reset output */ +#define CDNS_WDT_ZMR_IRQEN_MASK 0x00000004 /* Enable IRQ output */ +#define CDNS_WDT_ZMR_RSTLEN_16 0x00000030 /* Reset pulse of 16 pclk cycles */ +#define CDNS_WDT_ZMR_ZKEY_VAL 0x00ABC000 /* Access key, 0xABC << 12 */ +/* + * Counter Control register - This register controls how fast the timer runs + * and the reset value and also contains the access code to allow writes to + * the register. + */ +#define CDNS_WDT_CCR_CRV_MASK 0x00003FFC /* Counter reset value */ + +/** + * cdns_wdt_stop - Stop the watchdog. + * + * @wdd: watchdog device + * + * Read the contents of the ZMR register, clear the WDEN bit + * in the register and set the access key for successful write. + * + * Return: always 0 + */ +static int cdns_wdt_stop(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, + CDNS_WDT_ZMR_ZKEY_VAL & (~CDNS_WDT_ZMR_WDEN_MASK)); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_reload - Reload the watchdog timer (i.e. pat the watchdog). + * + * @wdd: watchdog device + * + * Write the restart key value (0x00001999) to the restart register. + * + * Return: always 0 + */ +static int cdns_wdt_reload(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_RESTART_OFFSET, + CDNS_WDT_RESTART_KEY); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_start - Enable and start the watchdog. + * + * @wdd: watchdog device + * + * The counter value is calculated according to the formula: + * calculated count = (timeout * clock) / prescaler + 1. + * The calculated count is divided by 0x1000 to obtain the field value + * to write to counter control register. + * Clears the contents of prescaler and counter reset value. Sets the + * prescaler to 4096 and the calculated count and access key + * to write to CCR Register. + * Sets the WDT (WDEN bit) and either the Reset signal(RSTEN bit) + * or Interrupt signal(IRQEN) with a specified cycles and the access + * key to write to ZMR Register. + * + * Return: always 0 + */ +static int cdns_wdt_start(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + unsigned int data = 0; + unsigned short count; + unsigned long clock_f = clk_get_rate(wdt->clk); + + /* + * Counter value divisor to obtain the value of + * counter reset to be written to control register. + */ + count = (wdd->timeout * (clock_f / wdt->prescaler)) / + CDNS_WDT_COUNTER_VALUE_DIVISOR + 1; + + if (count > CDNS_WDT_COUNTER_MAX) + count = CDNS_WDT_COUNTER_MAX; + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, + CDNS_WDT_ZMR_ZKEY_VAL); + + count = (count << 2) & CDNS_WDT_CCR_CRV_MASK; + + /* Write counter access key first to be able write to register */ + data = count | CDNS_WDT_REGISTER_ACCESS_KEY | wdt->ctrl_clksel; + cdns_wdt_writereg(wdt, CDNS_WDT_CCR_OFFSET, data); + data = CDNS_WDT_ZMR_WDEN_MASK | CDNS_WDT_ZMR_RSTLEN_16 | + CDNS_WDT_ZMR_ZKEY_VAL; + + /* Reset on timeout if specified in device tree. */ + if (wdt->rst) { + data |= CDNS_WDT_ZMR_RSTEN_MASK; + data &= ~CDNS_WDT_ZMR_IRQEN_MASK; + } else { + data &= ~CDNS_WDT_ZMR_RSTEN_MASK; + data |= CDNS_WDT_ZMR_IRQEN_MASK; + } + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, data); + cdns_wdt_writereg(wdt, CDNS_WDT_RESTART_OFFSET, + CDNS_WDT_RESTART_KEY); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_settimeout - Set a new timeout value for the watchdog device. + * + * @wdd: watchdog device + * @new_time: new timeout value that needs to be set + * Return: 0 on success + * + * Update the watchdog_device timeout with new value which is used when + * cdns_wdt_start is called. + */ +static int cdns_wdt_settimeout(struct watchdog_device *wdd, + unsigned int new_time) +{ + wdd->timeout = new_time; + + return cdns_wdt_start(wdd); +} + +/** + * cdns_wdt_irq_handler - Notifies of watchdog timeout. + * + * @irq: interrupt number + * @dev_id: pointer to a platform device structure + * Return: IRQ_HANDLED + * + * The handler is invoked when the watchdog times out and a + * reset on timeout has not been enabled. + */ +static irqreturn_t cdns_wdt_irq_handler(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + + dev_info(&pdev->dev, + "Watchdog timed out. Internal reset not enabled\n"); + + return IRQ_HANDLED; +} + +/* + * Info structure used to indicate the features supported by the device + * to the upper layers. This is defined in watchdog.h header file. + */ +static struct watchdog_info cdns_wdt_info = { + .identity = "cdns_wdt watchdog", + .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | + WDIOF_MAGICCLOSE, +}; + +/* Watchdog Core Ops */ +static struct watchdog_ops cdns_wdt_ops = { + .owner = THIS_MODULE, + .start = cdns_wdt_start, + .stop = cdns_wdt_stop, + .ping = cdns_wdt_reload, + .set_timeout = cdns_wdt_settimeout, +}; + +/** + * cdns_wdt_notify_sys - Notifier for reboot or shutdown. + * + * @this: handle to notifier block + * @code: turn off indicator + * @unused: unused + * Return: NOTIFY_DONE + * + * This notifier is invoked whenever the system reboot or shutdown occur + * because we need to disable the WDT before system goes down as WDT might + * reset on the next boot. + */ +static int cdns_wdt_notify_sys(struct notifier_block *this, unsigned long code, + void *unused) +{ + struct cdns_wdt *wdt = container_of(this, struct cdns_wdt, + cdns_wdt_notifier); + if (code == SYS_DOWN || code == SYS_HALT) + cdns_wdt_stop(&wdt->cdns_wdt_device); + + return NOTIFY_DONE; +} + +/************************Platform Operations*****************************/ +/** + * cdns_wdt_probe - Probe call for the device. + * + * @pdev: handle to the platform device structure. + * Return: 0 on success, negative error otherwise. + * + * It does all the memory allocation and registration for the device. + */ +static int cdns_wdt_probe(struct platform_device *pdev) +{ + struct resource *res; + int ret, irq; + unsigned long clock_f; + struct cdns_wdt *wdt; + struct watchdog_device *cdns_wdt_device; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + cdns_wdt_device = &wdt->cdns_wdt_device; + cdns_wdt_device->info = &cdns_wdt_info; + cdns_wdt_device->ops = &cdns_wdt_ops; + cdns_wdt_device->timeout = CDNS_WDT_DEFAULT_TIMEOUT; + cdns_wdt_device->min_timeout = CDNS_WDT_MIN_TIMEOUT; + cdns_wdt_device->max_timeout = CDNS_WDT_MAX_TIMEOUT; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + wdt->regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(wdt->regs)) + return PTR_ERR(wdt->regs); + + /* Register the interrupt */ + wdt->rst = of_property_read_bool(pdev->dev.of_node, "reset-on-timeout"); + irq = platform_get_irq(pdev, 0); + if (!wdt->rst && irq >= 0) { + ret = devm_request_irq(&pdev->dev, irq, cdns_wdt_irq_handler, 0, + pdev->name, pdev); + if (ret) { + dev_err(&pdev->dev, + "cannot register interrupt handler err=%d\n", + ret); + return ret; + } + } + + /* Initialize the members of cdns_wdt structure */ + cdns_wdt_device->parent = &pdev->dev; + + ret = watchdog_init_timeout(cdns_wdt_device, wdt_timeout, &pdev->dev); + if (ret) { + dev_err(&pdev->dev, "unable to set timeout value\n"); + return ret; + } + + watchdog_set_nowayout(cdns_wdt_device, nowayout); + watchdog_set_drvdata(cdns_wdt_device, wdt); + + wdt->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(wdt->clk)) { + dev_err(&pdev->dev, "input clock not found\n"); + ret = PTR_ERR(wdt->clk); + return ret; + } + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(&pdev->dev, "unable to enable clock\n"); + return ret; + } + + clock_f = clk_get_rate(wdt->clk); + if (clock_f <= CDNS_WDT_CLK_75MHZ) { + wdt->prescaler = CDNS_WDT_PRESCALE_512; + wdt->ctrl_clksel = CDNS_WDT_PRESCALE_SELECT_512; + } else { + wdt->prescaler = CDNS_WDT_PRESCALE_4096; + wdt->ctrl_clksel = CDNS_WDT_PRESCALE_SELECT_4096; + } + + spin_lock_init(&wdt->io_lock); + + wdt->cdns_wdt_notifier.notifier_call = &cdns_wdt_notify_sys; + ret = register_reboot_notifier(&wdt->cdns_wdt_notifier); + if (ret != 0) { + dev_err(&pdev->dev, "cannot register reboot notifier err=%d)\n", + ret); + goto err_clk_disable; + } + + ret = watchdog_register_device(cdns_wdt_device); + if (ret) { + dev_err(&pdev->dev, "Failed to register wdt device\n"); + goto err_clk_disable; + } + platform_set_drvdata(pdev, wdt); + + dev_dbg(&pdev->dev, "Xilinx Watchdog Timer at %p with timeout %ds%s\n", + wdt->regs, cdns_wdt_device->timeout, + nowayout ? ", nowayout" : ""); + + return 0; + +err_clk_disable: + clk_disable_unprepare(wdt->clk); + + return ret; +} + +/** + * cdns_wdt_remove - Probe call for the device. + * + * @pdev: handle to the platform device structure. + * Return: 0 on success, otherwise negative error. + * + * Unregister the device after releasing the resources. + */ +static int cdns_wdt_remove(struct platform_device *pdev) +{ + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + watchdog_unregister_device(&wdt->cdns_wdt_device); + unregister_reboot_notifier(&wdt->cdns_wdt_notifier); + clk_disable_unprepare(wdt->clk); + + return 0; +} + +/** + * cdns_wdt_shutdown - Stop the device. + * + * @pdev: handle to the platform structure. + * + */ +static void cdns_wdt_shutdown(struct platform_device *pdev) +{ + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + clk_disable_unprepare(wdt->clk); +} + +/** + * cdns_wdt_suspend - Stop the device. + * + * @dev: handle to the device structure. + * Return: 0 always. + */ +static int __maybe_unused cdns_wdt_suspend(struct device *dev) +{ + struct platform_device *pdev = container_of(dev, + struct platform_device, dev); + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + clk_disable_unprepare(wdt->clk); + + return 0; +} + +/** + * cdns_wdt_resume - Resume the device. + * + * @dev: handle to the device structure. + * Return: 0 on success, errno otherwise. + */ +static int __maybe_unused cdns_wdt_resume(struct device *dev) +{ + int ret; + struct platform_device *pdev = container_of(dev, + struct platform_device, dev); + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(dev, "unable to enable clock\n"); + return ret; + } + cdns_wdt_start(&wdt->cdns_wdt_device); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(cdns_wdt_pm_ops, cdns_wdt_suspend, cdns_wdt_resume); + +static struct of_device_id cdns_wdt_of_match[] = { + { .compatible = "cdns,wdt-r1p2", }, + { /* end of table */ } +}; +MODULE_DEVICE_TABLE(of, cdns_wdt_of_match); + +/* Driver Structure */ +static struct platform_driver cdns_wdt_driver = { + .probe = cdns_wdt_probe, + .remove = cdns_wdt_remove, + .shutdown = cdns_wdt_shutdown, + .driver = { + .name = "cdns-wdt", + .owner = THIS_MODULE, + .of_match_table = cdns_wdt_of_match, + .pm = &cdns_wdt_pm_ops, + }, +}; + +module_platform_driver(cdns_wdt_driver); + +MODULE_AUTHOR("Xilinx, Inc."); +MODULE_DESCRIPTION("Watchdog driver for Cadence WDT"); +MODULE_LICENSE("GPL"); diff --git a/drivers/watchdog/da9063_wdt.c b/drivers/watchdog/da9063_wdt.c new file mode 100644 index 000000000000..2cd6b2c2dd2a --- /dev/null +++ b/drivers/watchdog/da9063_wdt.c @@ -0,0 +1,191 @@ +/* + * Watchdog driver for DA9063 PMICs. + * + * Copyright(c) 2012 Dialog Semiconductor Ltd. + * + * Author: Mariusz Wojtasik <mariusz.wojtasik@diasemi.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/watchdog.h> +#include <linux/platform_device.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/mfd/da9063/registers.h> +#include <linux/mfd/da9063/core.h> +#include <linux/regmap.h> + +/* + * Watchdog selector to timeout in seconds. + * 0: WDT disabled; + * others: timeout = 2048 ms * 2^(TWDSCALE-1). + */ +static const unsigned int wdt_timeout[] = { 0, 2, 4, 8, 16, 32, 65, 131 }; +#define DA9063_TWDSCALE_DISABLE 0 +#define DA9063_TWDSCALE_MIN 1 +#define DA9063_TWDSCALE_MAX (ARRAY_SIZE(wdt_timeout) - 1) +#define DA9063_WDT_MIN_TIMEOUT wdt_timeout[DA9063_TWDSCALE_MIN] +#define DA9063_WDT_MAX_TIMEOUT wdt_timeout[DA9063_TWDSCALE_MAX] +#define DA9063_WDG_TIMEOUT wdt_timeout[3] + +struct da9063_watchdog { + struct da9063 *da9063; + struct watchdog_device wdtdev; +}; + +static unsigned int da9063_wdt_timeout_to_sel(unsigned int secs) +{ + unsigned int i; + + for (i = DA9063_TWDSCALE_MIN; i <= DA9063_TWDSCALE_MAX; i++) { + if (wdt_timeout[i] >= secs) + return i; + } + + return DA9063_TWDSCALE_MAX; +} + +static int _da9063_wdt_set_timeout(struct da9063 *da9063, unsigned int regval) +{ + return regmap_update_bits(da9063->regmap, DA9063_REG_CONTROL_D, + DA9063_TWDSCALE_MASK, regval); +} + +static int da9063_wdt_start(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + unsigned int selector; + int ret; + + selector = da9063_wdt_timeout_to_sel(wdt->wdtdev.timeout); + ret = _da9063_wdt_set_timeout(wdt->da9063, selector); + if (ret) + dev_err(wdt->da9063->dev, "Watchdog failed to start (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_stop(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + int ret; + + ret = regmap_update_bits(wdt->da9063->regmap, DA9063_REG_CONTROL_D, + DA9063_TWDSCALE_MASK, DA9063_TWDSCALE_DISABLE); + if (ret) + dev_alert(wdt->da9063->dev, "Watchdog failed to stop (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_ping(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + int ret; + + ret = regmap_write(wdt->da9063->regmap, DA9063_REG_CONTROL_F, + DA9063_WATCHDOG); + if (ret) + dev_alert(wdt->da9063->dev, "Failed to ping the watchdog (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + unsigned int selector; + int ret; + + selector = da9063_wdt_timeout_to_sel(timeout); + ret = _da9063_wdt_set_timeout(wdt->da9063, selector); + if (ret) + dev_err(wdt->da9063->dev, "Failed to set watchdog timeout (err = %d)\n", + ret); + else + wdd->timeout = wdt_timeout[selector]; + + return ret; +} + +static const struct watchdog_info da9063_watchdog_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING, + .identity = "DA9063 Watchdog", +}; + +static const struct watchdog_ops da9063_watchdog_ops = { + .owner = THIS_MODULE, + .start = da9063_wdt_start, + .stop = da9063_wdt_stop, + .ping = da9063_wdt_ping, + .set_timeout = da9063_wdt_set_timeout, +}; + +static int da9063_wdt_probe(struct platform_device *pdev) +{ + int ret; + struct da9063 *da9063; + struct da9063_watchdog *wdt; + + if (!pdev->dev.parent) + return -EINVAL; + + da9063 = dev_get_drvdata(pdev->dev.parent); + if (!da9063) + return -EINVAL; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + wdt->da9063 = da9063; + + wdt->wdtdev.info = &da9063_watchdog_info; + wdt->wdtdev.ops = &da9063_watchdog_ops; + wdt->wdtdev.min_timeout = DA9063_WDT_MIN_TIMEOUT; + wdt->wdtdev.max_timeout = DA9063_WDT_MAX_TIMEOUT; + wdt->wdtdev.timeout = DA9063_WDG_TIMEOUT; + + wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS; + + watchdog_set_drvdata(&wdt->wdtdev, wdt); + dev_set_drvdata(&pdev->dev, wdt); + + ret = watchdog_register_device(&wdt->wdtdev); + + return ret; +} + +static int da9063_wdt_remove(struct platform_device *pdev) +{ + struct da9063_watchdog *wdt = dev_get_drvdata(&pdev->dev); + + watchdog_unregister_device(&wdt->wdtdev); + + return 0; +} + +static struct platform_driver da9063_wdt_driver = { + .probe = da9063_wdt_probe, + .remove = da9063_wdt_remove, + .driver = { + .name = DA9063_DRVNAME_WATCHDOG, + }, +}; +module_platform_driver(da9063_wdt_driver); + +MODULE_AUTHOR("Mariusz Wojtasik <mariusz.wojtasik@diasemi.com>"); +MODULE_DESCRIPTION("Watchdog driver for Dialog DA9063"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" DA9063_DRVNAME_WATCHDOG); diff --git a/drivers/watchdog/dw_wdt.c b/drivers/watchdog/dw_wdt.c index 9f210299de24..9e577a64ec9e 100644 --- a/drivers/watchdog/dw_wdt.c +++ b/drivers/watchdog/dw_wdt.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/clk.h> +#include <linux/delay.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> @@ -29,9 +30,11 @@ #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/notifier.h> #include <linux/of.h> #include <linux/pm.h> #include <linux/platform_device.h> +#include <linux/reboot.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/uaccess.h> @@ -40,6 +43,7 @@ #define WDOG_CONTROL_REG_OFFSET 0x00 #define WDOG_CONTROL_REG_WDT_EN_MASK 0x01 #define WDOG_TIMEOUT_RANGE_REG_OFFSET 0x04 +#define WDOG_TIMEOUT_RANGE_TOPINIT_SHIFT 4 #define WDOG_CURRENT_COUNT_REG_OFFSET 0x08 #define WDOG_COUNTER_RESTART_REG_OFFSET 0x0c #define WDOG_COUNTER_RESTART_KICK_VALUE 0x76 @@ -62,6 +66,7 @@ static struct { unsigned long next_heartbeat; struct timer_list timer; int expect_close; + struct notifier_block restart_handler; } dw_wdt; static inline int dw_wdt_is_enabled(void) @@ -106,7 +111,8 @@ static int dw_wdt_set_top(unsigned top_s) } /* Set the new value in the watchdog. */ - writel(top_val, dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); + writel(top_val | top_val << WDOG_TIMEOUT_RANGE_TOPINIT_SHIFT, + dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); dw_wdt_set_next_heartbeat(); @@ -119,6 +125,26 @@ static void dw_wdt_keepalive(void) WDOG_COUNTER_RESTART_REG_OFFSET); } +static int dw_wdt_restart_handle(struct notifier_block *this, + unsigned long mode, void *cmd) +{ + u32 val; + + writel(0, dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); + val = readl(dw_wdt.regs + WDOG_CONTROL_REG_OFFSET); + if (val & WDOG_CONTROL_REG_WDT_EN_MASK) + writel(WDOG_COUNTER_RESTART_KICK_VALUE, dw_wdt.regs + + WDOG_COUNTER_RESTART_REG_OFFSET); + else + writel(WDOG_CONTROL_REG_WDT_EN_MASK, + dw_wdt.regs + WDOG_CONTROL_REG_OFFSET); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static void dw_wdt_ping(unsigned long data) { if (time_before(jiffies, dw_wdt.next_heartbeat) || @@ -314,6 +340,12 @@ static int dw_wdt_drv_probe(struct platform_device *pdev) if (ret) goto out_disable_clk; + dw_wdt.restart_handler.notifier_call = dw_wdt_restart_handle; + dw_wdt.restart_handler.priority = 128; + ret = register_restart_handler(&dw_wdt.restart_handler); + if (ret) + pr_warn("cannot register restart handler\n"); + dw_wdt_set_next_heartbeat(); setup_timer(&dw_wdt.timer, dw_wdt_ping, 0); mod_timer(&dw_wdt.timer, jiffies + WDT_TIMEOUT); @@ -328,6 +360,8 @@ out_disable_clk: static int dw_wdt_drv_remove(struct platform_device *pdev) { + unregister_restart_handler(&dw_wdt.restart_handler); + misc_deregister(&dw_wdt_miscdev); clk_disable_unprepare(dw_wdt.clk); diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index 68c3d379ffa8..7e12f88bb4a6 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -22,14 +22,17 @@ */ #include <linux/clk.h> +#include <linux/delay.h> #include <linux/init.h> #include <linux/io.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/notifier.h> #include <linux/of_address.h> #include <linux/platform_device.h> +#include <linux/reboot.h> #include <linux/regmap.h> #include <linux/timer.h> #include <linux/watchdog.h> @@ -59,6 +62,7 @@ struct imx2_wdt_device { struct regmap *regmap; struct timer_list timer; /* Pings the watchdog when closed */ struct watchdog_device wdog; + struct notifier_block restart_handler; }; static bool nowayout = WATCHDOG_NOWAYOUT; @@ -77,6 +81,31 @@ static const struct watchdog_info imx2_wdt_info = { .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE, }; +static int imx2_restart_handler(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + unsigned int wcr_enable = IMX2_WDT_WCR_WDE; + struct imx2_wdt_device *wdev = container_of(this, + struct imx2_wdt_device, + restart_handler); + /* Assert SRS signal */ + regmap_write(wdev->regmap, 0, wcr_enable); + /* + * Due to imx6q errata ERR004346 (WDOG: WDOG SRS bit requires to be + * written twice), we add another two writes to ensure there must be at + * least two writes happen in the same one 32kHz clock period. We save + * the target check here, since the writes shouldn't be a huge burden + * for other platforms. + */ + regmap_write(wdev->regmap, 0, wcr_enable); + regmap_write(wdev->regmap, 0, wcr_enable); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static inline void imx2_wdt_setup(struct watchdog_device *wdog) { struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog); @@ -191,12 +220,10 @@ static struct regmap_config imx2_wdt_regmap_config = { static int __init imx2_wdt_probe(struct platform_device *pdev) { - struct device_node *np = pdev->dev.of_node; struct imx2_wdt_device *wdev; struct watchdog_device *wdog; struct resource *res; void __iomem *base; - bool big_endian; int ret; u32 val; @@ -204,10 +231,6 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) if (!wdev) return -ENOMEM; - big_endian = of_property_read_bool(np, "big-endian"); - if (big_endian) - imx2_wdt_regmap_config.val_format_endian = REGMAP_ENDIAN_BIG; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) @@ -257,6 +280,12 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) return ret; } + wdev->restart_handler.notifier_call = imx2_restart_handler; + wdev->restart_handler.priority = 128; + ret = register_restart_handler(&wdev->restart_handler); + if (ret) + dev_err(&pdev->dev, "cannot register restart handler\n"); + dev_info(&pdev->dev, "timeout %d sec (nowayout=%d)\n", wdog->timeout, nowayout); @@ -268,6 +297,8 @@ static int __exit imx2_wdt_remove(struct platform_device *pdev) struct watchdog_device *wdog = platform_get_drvdata(pdev); struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog); + unregister_restart_handler(&wdev->restart_handler); + watchdog_unregister_device(wdog); if (imx2_wdt_is_running(wdev)) { diff --git a/drivers/watchdog/meson_wdt.c b/drivers/watchdog/meson_wdt.c new file mode 100644 index 000000000000..ef6a298e8c45 --- /dev/null +++ b/drivers/watchdog/meson_wdt.c @@ -0,0 +1,236 @@ +/* + * Meson Watchdog Driver + * + * Copyright (c) 2014 Carlo Caione + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/reboot.h> +#include <linux/types.h> +#include <linux/watchdog.h> + +#define DRV_NAME "meson_wdt" + +#define MESON_WDT_TC 0x00 +#define MESON_WDT_TC_EN BIT(22) +#define MESON_WDT_TC_TM_MASK 0x3fffff +#define MESON_WDT_DC_RESET (3 << 24) + +#define MESON_WDT_RESET 0x04 + +#define MESON_WDT_TIMEOUT 30 +#define MESON_WDT_MIN_TIMEOUT 1 +#define MESON_WDT_MAX_TIMEOUT (MESON_WDT_TC_TM_MASK / 100000) + +#define MESON_SEC_TO_TC(s) ((s) * 100000) + +static bool nowayout = WATCHDOG_NOWAYOUT; +static unsigned int timeout = MESON_WDT_TIMEOUT; + +struct meson_wdt_dev { + struct watchdog_device wdt_dev; + void __iomem *wdt_base; + struct notifier_block restart_handler; +}; + +static int meson_restart_handle(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + u32 tc_reboot = MESON_WDT_DC_RESET | MESON_WDT_TC_EN; + struct meson_wdt_dev *meson_wdt = container_of(this, + struct meson_wdt_dev, + restart_handler); + + while (1) { + writel(tc_reboot, meson_wdt->wdt_base + MESON_WDT_TC); + mdelay(5); + } + + return NOTIFY_DONE; +} + +static int meson_wdt_ping(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + + writel(0, meson_wdt->wdt_base + MESON_WDT_RESET); + + return 0; +} + +static void meson_wdt_change_timeout(struct watchdog_device *wdt_dev, + unsigned int timeout) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg &= ~MESON_WDT_TC_TM_MASK; + reg |= MESON_SEC_TO_TC(timeout); + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); +} + +static int meson_wdt_set_timeout(struct watchdog_device *wdt_dev, + unsigned int timeout) +{ + wdt_dev->timeout = timeout; + + meson_wdt_change_timeout(wdt_dev, timeout); + meson_wdt_ping(wdt_dev); + + return 0; +} + +static int meson_wdt_stop(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg &= ~MESON_WDT_TC_EN; + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); + + return 0; +} + +static int meson_wdt_start(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + meson_wdt_change_timeout(wdt_dev, meson_wdt->wdt_dev.timeout); + meson_wdt_ping(wdt_dev); + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg |= MESON_WDT_TC_EN; + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); + + return 0; +} + +static const struct watchdog_info meson_wdt_info = { + .identity = DRV_NAME, + .options = WDIOF_SETTIMEOUT | + WDIOF_KEEPALIVEPING | + WDIOF_MAGICCLOSE, +}; + +static const struct watchdog_ops meson_wdt_ops = { + .owner = THIS_MODULE, + .start = meson_wdt_start, + .stop = meson_wdt_stop, + .ping = meson_wdt_ping, + .set_timeout = meson_wdt_set_timeout, +}; + +static int meson_wdt_probe(struct platform_device *pdev) +{ + struct resource *res; + struct meson_wdt_dev *meson_wdt; + int err; + + meson_wdt = devm_kzalloc(&pdev->dev, sizeof(*meson_wdt), GFP_KERNEL); + if (!meson_wdt) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + meson_wdt->wdt_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(meson_wdt->wdt_base)) + return PTR_ERR(meson_wdt->wdt_base); + + meson_wdt->wdt_dev.parent = &pdev->dev; + meson_wdt->wdt_dev.info = &meson_wdt_info; + meson_wdt->wdt_dev.ops = &meson_wdt_ops; + meson_wdt->wdt_dev.timeout = MESON_WDT_TIMEOUT; + meson_wdt->wdt_dev.max_timeout = MESON_WDT_MAX_TIMEOUT; + meson_wdt->wdt_dev.min_timeout = MESON_WDT_MIN_TIMEOUT; + + watchdog_set_drvdata(&meson_wdt->wdt_dev, meson_wdt); + + watchdog_init_timeout(&meson_wdt->wdt_dev, timeout, &pdev->dev); + watchdog_set_nowayout(&meson_wdt->wdt_dev, nowayout); + + meson_wdt_stop(&meson_wdt->wdt_dev); + + err = watchdog_register_device(&meson_wdt->wdt_dev); + if (err) + return err; + + platform_set_drvdata(pdev, meson_wdt); + + meson_wdt->restart_handler.notifier_call = meson_restart_handle; + meson_wdt->restart_handler.priority = 128; + err = register_restart_handler(&meson_wdt->restart_handler); + if (err) + dev_err(&pdev->dev, + "cannot register restart handler (err=%d)\n", err); + + dev_info(&pdev->dev, "Watchdog enabled (timeout=%d sec, nowayout=%d)", + meson_wdt->wdt_dev.timeout, nowayout); + + return 0; +} + +static int meson_wdt_remove(struct platform_device *pdev) +{ + struct meson_wdt_dev *meson_wdt = platform_get_drvdata(pdev); + + unregister_restart_handler(&meson_wdt->restart_handler); + + watchdog_unregister_device(&meson_wdt->wdt_dev); + + return 0; +} + +static void meson_wdt_shutdown(struct platform_device *pdev) +{ + struct meson_wdt_dev *meson_wdt = platform_get_drvdata(pdev); + + meson_wdt_stop(&meson_wdt->wdt_dev); +} + +static const struct of_device_id meson_wdt_dt_ids[] = { + { .compatible = "amlogic,meson6-wdt" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, meson_wdt_dt_ids); + +static struct platform_driver meson_wdt_driver = { + .probe = meson_wdt_probe, + .remove = meson_wdt_remove, + .shutdown = meson_wdt_shutdown, + .driver = { + .owner = THIS_MODULE, + .name = DRV_NAME, + .of_match_table = meson_wdt_dt_ids, + }, +}; + +module_platform_driver(meson_wdt_driver); + +module_param(timeout, uint, 0); +MODULE_PARM_DESC(timeout, "Watchdog heartbeat in seconds"); + +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carlo Caione <carlo@caione.org>"); +MODULE_DESCRIPTION("Meson Watchdog Timer Driver"); diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c index 1e6e28df5d7b..b2e1b4cbbdc1 100644 --- a/drivers/watchdog/of_xilinx_wdt.c +++ b/drivers/watchdog/of_xilinx_wdt.c @@ -236,7 +236,6 @@ static struct platform_driver xwdt_driver = { .probe = xwdt_probe, .remove = xwdt_remove, .driver = { - .owner = THIS_MODULE, .name = WATCHDOG_NAME, .of_match_table = xwdt_of_match, }, diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c new file mode 100644 index 000000000000..aa85618c4d03 --- /dev/null +++ b/drivers/watchdog/qcom-wdt.c @@ -0,0 +1,224 @@ +/* Copyright (c) 2014, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/reboot.h> +#include <linux/watchdog.h> + +#define WDT_RST 0x0 +#define WDT_EN 0x8 +#define WDT_BITE_TIME 0x24 + +struct qcom_wdt { + struct watchdog_device wdd; + struct clk *clk; + unsigned long rate; + struct notifier_block restart_nb; + void __iomem *base; +}; + +static inline +struct qcom_wdt *to_qcom_wdt(struct watchdog_device *wdd) +{ + return container_of(wdd, struct qcom_wdt, wdd); +} + +static int qcom_wdt_start(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(0, wdt->base + WDT_EN); + writel(1, wdt->base + WDT_RST); + writel(wdd->timeout * wdt->rate, wdt->base + WDT_BITE_TIME); + writel(1, wdt->base + WDT_EN); + return 0; +} + +static int qcom_wdt_stop(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(0, wdt->base + WDT_EN); + return 0; +} + +static int qcom_wdt_ping(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(1, wdt->base + WDT_RST); + return 0; +} + +static int qcom_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + wdd->timeout = timeout; + return qcom_wdt_start(wdd); +} + +static const struct watchdog_ops qcom_wdt_ops = { + .start = qcom_wdt_start, + .stop = qcom_wdt_stop, + .ping = qcom_wdt_ping, + .set_timeout = qcom_wdt_set_timeout, + .owner = THIS_MODULE, +}; + +static const struct watchdog_info qcom_wdt_info = { + .options = WDIOF_KEEPALIVEPING + | WDIOF_MAGICCLOSE + | WDIOF_SETTIMEOUT, + .identity = KBUILD_MODNAME, +}; + +static int qcom_wdt_restart(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct qcom_wdt *wdt = container_of(nb, struct qcom_wdt, restart_nb); + u32 timeout; + + /* + * Trigger watchdog bite: + * Setup BITE_TIME to be 128ms, and enable WDT. + */ + timeout = 128 * wdt->rate / 1000; + + writel(0, wdt->base + WDT_EN); + writel(1, wdt->base + WDT_RST); + writel(timeout, wdt->base + WDT_BITE_TIME); + writel(1, wdt->base + WDT_EN); + + /* + * Actually make sure the above sequence hits hardware before sleeping. + */ + wmb(); + + msleep(150); + return NOTIFY_DONE; +} + +static int qcom_wdt_probe(struct platform_device *pdev) +{ + struct qcom_wdt *wdt; + struct resource *res; + int ret; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + wdt->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(wdt->base)) + return PTR_ERR(wdt->base); + + wdt->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(wdt->clk)) { + dev_err(&pdev->dev, "failed to get input clock\n"); + return PTR_ERR(wdt->clk); + } + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(&pdev->dev, "failed to setup clock\n"); + return ret; + } + + /* + * We use the clock rate to calculate the max timeout, so ensure it's + * not zero to avoid a divide-by-zero exception. + * + * WATCHDOG_CORE assumes units of seconds, if the WDT is clocked such + * that it would bite before a second elapses it's usefulness is + * limited. Bail if this is the case. + */ + wdt->rate = clk_get_rate(wdt->clk); + if (wdt->rate == 0 || + wdt->rate > 0x10000000U) { + dev_err(&pdev->dev, "invalid clock rate\n"); + ret = -EINVAL; + goto err_clk_unprepare; + } + + wdt->wdd.dev = &pdev->dev; + wdt->wdd.info = &qcom_wdt_info; + wdt->wdd.ops = &qcom_wdt_ops; + wdt->wdd.min_timeout = 1; + wdt->wdd.max_timeout = 0x10000000U / wdt->rate; + + /* + * If 'timeout-sec' unspecified in devicetree, assume a 30 second + * default, unless the max timeout is less than 30 seconds, then use + * the max instead. + */ + wdt->wdd.timeout = min(wdt->wdd.max_timeout, 30U); + watchdog_init_timeout(&wdt->wdd, 0, &pdev->dev); + + ret = watchdog_register_device(&wdt->wdd); + if (ret) { + dev_err(&pdev->dev, "failed to register watchdog\n"); + goto err_clk_unprepare; + } + + /* + * WDT restart notifier has priority 0 (use as a last resort) + */ + wdt->restart_nb.notifier_call = qcom_wdt_restart; + ret = register_restart_handler(&wdt->restart_nb); + if (ret) + dev_err(&pdev->dev, "failed to setup restart handler\n"); + + platform_set_drvdata(pdev, wdt); + return 0; + +err_clk_unprepare: + clk_disable_unprepare(wdt->clk); + return ret; +} + +static int qcom_wdt_remove(struct platform_device *pdev) +{ + struct qcom_wdt *wdt = platform_get_drvdata(pdev); + + unregister_restart_handler(&wdt->restart_nb); + watchdog_unregister_device(&wdt->wdd); + clk_disable_unprepare(wdt->clk); + return 0; +} + +static const struct of_device_id qcom_wdt_of_table[] = { + { .compatible = "qcom,kpss-wdt-msm8960", }, + { .compatible = "qcom,kpss-wdt-apq8064", }, + { .compatible = "qcom,kpss-wdt-ipq8064", }, + { }, +}; +MODULE_DEVICE_TABLE(of, qcom_wdt_of_table); + +static struct platform_driver qcom_watchdog_driver = { + .probe = qcom_wdt_probe, + .remove = qcom_wdt_remove, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = qcom_wdt_of_table, + }, +}; +module_platform_driver(qcom_watchdog_driver); + +MODULE_DESCRIPTION("QCOM KPSS Watchdog Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/watchdog/rn5t618_wdt.c b/drivers/watchdog/rn5t618_wdt.c new file mode 100644 index 000000000000..d1c12278cb6a --- /dev/null +++ b/drivers/watchdog/rn5t618_wdt.c @@ -0,0 +1,198 @@ +/* + * Watchdog driver for Ricoh RN5T618 PMIC + * + * Copyright (C) 2014 Beniamino Galvani <b.galvani@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/device.h> +#include <linux/mfd/rn5t618.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/watchdog.h> + +#define DRIVER_NAME "rn5t618-wdt" + +static bool nowayout = WATCHDOG_NOWAYOUT; +static unsigned int timeout; + +module_param(timeout, uint, 0); +MODULE_PARM_DESC(timeout, "Initial watchdog timeout in seconds"); + +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +struct rn5t618_wdt { + struct watchdog_device wdt_dev; + struct rn5t618 *rn5t618; +}; + +/* + * This array encodes the values of WDOGTIM field for the supported + * watchdog expiration times. If the watchdog is not accessed before + * the timer expiration, the PMU generates an interrupt and if the CPU + * doesn't clear it within one second the system is restarted. + */ +static const struct { + u8 reg_val; + unsigned int time; +} rn5t618_wdt_map[] = { + { 0, 1 }, + { 1, 8 }, + { 2, 32 }, + { 3, 128 }, +}; + +static int rn5t618_wdt_set_timeout(struct watchdog_device *wdt_dev, + unsigned int t) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + int ret, i; + + for (i = 0; i < ARRAY_SIZE(rn5t618_wdt_map); i++) { + if (rn5t618_wdt_map[i].time + 1 >= t) + break; + } + + if (i == ARRAY_SIZE(rn5t618_wdt_map)) + return -EINVAL; + + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGTIM_M, + rn5t618_wdt_map[i].reg_val); + if (!ret) + wdt_dev->timeout = rn5t618_wdt_map[i].time; + + return ret; +} + +static int rn5t618_wdt_start(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + int ret; + + ret = rn5t618_wdt_set_timeout(wdt_dev, wdt_dev->timeout); + if (ret) + return ret; + + /* enable repower-on */ + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_REPCNT, + RN5T618_REPCNT_REPWRON, + RN5T618_REPCNT_REPWRON); + if (ret) + return ret; + + /* enable watchdog */ + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGEN, + RN5T618_WATCHDOG_WDOGEN); + if (ret) + return ret; + + /* enable watchdog interrupt */ + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_PWRIREN, + RN5T618_PWRIRQ_IR_WDOG, + RN5T618_PWRIRQ_IR_WDOG); +} + +static int rn5t618_wdt_stop(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGEN, 0); +} + +static int rn5t618_wdt_ping(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + unsigned int val; + int ret; + + /* The counter is restarted after a R/W access to watchdog register */ + ret = regmap_read(wdt->rn5t618->regmap, RN5T618_WATCHDOG, &val); + if (ret) + return ret; + + ret = regmap_write(wdt->rn5t618->regmap, RN5T618_WATCHDOG, val); + if (ret) + return ret; + + /* Clear pending watchdog interrupt */ + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_PWRIRQ, + RN5T618_PWRIRQ_IR_WDOG, 0); +} + +static struct watchdog_info rn5t618_wdt_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | + WDIOF_KEEPALIVEPING, + .identity = DRIVER_NAME, +}; + +static struct watchdog_ops rn5t618_wdt_ops = { + .owner = THIS_MODULE, + .start = rn5t618_wdt_start, + .stop = rn5t618_wdt_stop, + .ping = rn5t618_wdt_ping, + .set_timeout = rn5t618_wdt_set_timeout, +}; + +static int rn5t618_wdt_probe(struct platform_device *pdev) +{ + struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent); + struct rn5t618_wdt *wdt; + int min_timeout, max_timeout; + + wdt = devm_kzalloc(&pdev->dev, sizeof(struct rn5t618_wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + min_timeout = rn5t618_wdt_map[0].time; + max_timeout = rn5t618_wdt_map[ARRAY_SIZE(rn5t618_wdt_map) - 1].time; + + wdt->rn5t618 = rn5t618; + wdt->wdt_dev.info = &rn5t618_wdt_info; + wdt->wdt_dev.ops = &rn5t618_wdt_ops; + wdt->wdt_dev.min_timeout = min_timeout; + wdt->wdt_dev.max_timeout = max_timeout; + wdt->wdt_dev.timeout = max_timeout; + wdt->wdt_dev.parent = &pdev->dev; + + watchdog_set_drvdata(&wdt->wdt_dev, wdt); + watchdog_init_timeout(&wdt->wdt_dev, timeout, &pdev->dev); + watchdog_set_nowayout(&wdt->wdt_dev, nowayout); + + platform_set_drvdata(pdev, wdt); + + return watchdog_register_device(&wdt->wdt_dev); +} + +static int rn5t618_wdt_remove(struct platform_device *pdev) +{ + struct rn5t618_wdt *wdt = platform_get_drvdata(pdev); + + watchdog_unregister_device(&wdt->wdt_dev); + + return 0; +} + +static struct platform_driver rn5t618_wdt_driver = { + .probe = rn5t618_wdt_probe, + .remove = rn5t618_wdt_remove, + .driver = { + .name = DRIVER_NAME, + }, +}; + +module_platform_driver(rn5t618_wdt_driver); + +MODULE_AUTHOR("Beniamino Galvani <b.galvani@gmail.com>"); +MODULE_DESCRIPTION("RN5T618 watchdog driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c index 7c6ccd071baf..8532c3e2aea7 100644 --- a/drivers/watchdog/s3c2410_wdt.c +++ b/drivers/watchdog/s3c2410_wdt.c @@ -41,6 +41,8 @@ #include <linux/of.h> #include <linux/mfd/syscon.h> #include <linux/regmap.h> +#include <linux/reboot.h> +#include <linux/delay.h> #define S3C2410_WTCON 0x00 #define S3C2410_WTDAT 0x04 @@ -128,6 +130,7 @@ struct s3c2410_wdt { unsigned long wtdat_save; struct watchdog_device wdt_device; struct notifier_block freq_transition; + struct notifier_block restart_handler; struct s3c2410_wdt_variant *drv_data; struct regmap *pmureg; }; @@ -155,6 +158,15 @@ static const struct s3c2410_wdt_variant drv_data_exynos5420 = { .quirks = QUIRK_HAS_PMU_CONFIG | QUIRK_HAS_RST_STAT, }; +static const struct s3c2410_wdt_variant drv_data_exynos7 = { + .disable_reg = EXYNOS5_WDT_DISABLE_REG_OFFSET, + .mask_reset_reg = EXYNOS5_WDT_MASK_RESET_REG_OFFSET, + .mask_bit = 0, + .rst_stat_reg = EXYNOS5_RST_STAT_REG_OFFSET, + .rst_stat_bit = 23, /* A57 WDTRESET */ + .quirks = QUIRK_HAS_PMU_CONFIG | QUIRK_HAS_RST_STAT, +}; + static const struct of_device_id s3c2410_wdt_match[] = { { .compatible = "samsung,s3c2410-wdt", .data = &drv_data_s3c2410 }, @@ -162,6 +174,8 @@ static const struct of_device_id s3c2410_wdt_match[] = { .data = &drv_data_exynos5250 }, { .compatible = "samsung,exynos5420-wdt", .data = &drv_data_exynos5420 }, + { .compatible = "samsung,exynos7-wdt", + .data = &drv_data_exynos7 }, {}, }; MODULE_DEVICE_TABLE(of, s3c2410_wdt_match); @@ -438,6 +452,31 @@ static inline void s3c2410wdt_cpufreq_deregister(struct s3c2410_wdt *wdt) } #endif +static int s3c2410wdt_restart(struct notifier_block *this, + unsigned long mode, void *cmd) +{ + struct s3c2410_wdt *wdt = container_of(this, struct s3c2410_wdt, + restart_handler); + void __iomem *wdt_base = wdt->reg_base; + + /* disable watchdog, to be safe */ + writel(0, wdt_base + S3C2410_WTCON); + + /* put initial values into count and data */ + writel(0x80, wdt_base + S3C2410_WTCNT); + writel(0x80, wdt_base + S3C2410_WTDAT); + + /* set the watchdog to go and reset... */ + writel(S3C2410_WTCON_ENABLE | S3C2410_WTCON_DIV16 | + S3C2410_WTCON_RSTEN | S3C2410_WTCON_PRESCALE(0x20), + wdt_base + S3C2410_WTCON); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static inline unsigned int s3c2410wdt_get_bootstatus(struct s3c2410_wdt *wdt) { unsigned int rst_stat; @@ -592,6 +631,12 @@ static int s3c2410wdt_probe(struct platform_device *pdev) platform_set_drvdata(pdev, wdt); + wdt->restart_handler.notifier_call = s3c2410wdt_restart; + wdt->restart_handler.priority = 128; + ret = register_restart_handler(&wdt->restart_handler); + if (ret) + pr_err("cannot register restart handler, %d\n", ret); + /* print out a statement of readiness */ wtcon = readl(wdt->reg_base + S3C2410_WTCON); @@ -621,6 +666,8 @@ static int s3c2410wdt_remove(struct platform_device *dev) int ret; struct s3c2410_wdt *wdt = platform_get_drvdata(dev); + unregister_restart_handler(&wdt->restart_handler); + ret = s3c2410wdt_mask_and_disable_reset(wdt, true); if (ret < 0) return ret; diff --git a/drivers/watchdog/stmp3xxx_rtc_wdt.c b/drivers/watchdog/stmp3xxx_rtc_wdt.c index 3804d5e9baea..a62b1b6decf4 100644 --- a/drivers/watchdog/stmp3xxx_rtc_wdt.c +++ b/drivers/watchdog/stmp3xxx_rtc_wdt.c @@ -94,9 +94,33 @@ static int stmp3xxx_wdt_remove(struct platform_device *pdev) return 0; } +static int __maybe_unused stmp3xxx_wdt_suspend(struct device *dev) +{ + struct watchdog_device *wdd = &stmp3xxx_wdd; + + if (watchdog_active(wdd)) + return wdt_stop(wdd); + + return 0; +} + +static int __maybe_unused stmp3xxx_wdt_resume(struct device *dev) +{ + struct watchdog_device *wdd = &stmp3xxx_wdd; + + if (watchdog_active(wdd)) + return wdt_start(wdd); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(stmp3xxx_wdt_pm_ops, + stmp3xxx_wdt_suspend, stmp3xxx_wdt_resume); + static struct platform_driver stmp3xxx_wdt_driver = { .driver = { .name = "stmp3xxx_rtc_wdt", + .pm = &stmp3xxx_wdt_pm_ops, }, .probe = stmp3xxx_wdt_probe, .remove = stmp3xxx_wdt_remove, diff --git a/drivers/watchdog/sunxi_wdt.c b/drivers/watchdog/sunxi_wdt.c index 480bb557f353..b62301e74e5f 100644 --- a/drivers/watchdog/sunxi_wdt.c +++ b/drivers/watchdog/sunxi_wdt.c @@ -23,6 +23,7 @@ #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/of.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/reboot.h> #include <linux/types.h> @@ -30,15 +31,11 @@ #define WDT_MAX_TIMEOUT 16 #define WDT_MIN_TIMEOUT 1 -#define WDT_MODE_TIMEOUT(n) ((n) << 3) -#define WDT_TIMEOUT_MASK WDT_MODE_TIMEOUT(0x0F) +#define WDT_TIMEOUT_MASK 0x0F -#define WDT_CTRL 0x00 #define WDT_CTRL_RELOAD ((1 << 0) | (0x0a57 << 1)) -#define WDT_MODE 0x04 #define WDT_MODE_EN (1 << 0) -#define WDT_MODE_RST_EN (1 << 1) #define DRV_NAME "sunxi-wdt" #define DRV_VERSION "1.0" @@ -46,15 +43,29 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static unsigned int timeout = WDT_MAX_TIMEOUT; +/* + * This structure stores the register offsets for different variants + * of Allwinner's watchdog hardware. + */ +struct sunxi_wdt_reg { + u8 wdt_ctrl; + u8 wdt_cfg; + u8 wdt_mode; + u8 wdt_timeout_shift; + u8 wdt_reset_mask; + u8 wdt_reset_val; +}; + struct sunxi_wdt_dev { struct watchdog_device wdt_dev; void __iomem *wdt_base; + const struct sunxi_wdt_reg *wdt_regs; struct notifier_block restart_handler; }; /* * wdt_timeout_map maps the watchdog timer interval value in seconds to - * the value of the register WDT_MODE bit 3:6 + * the value of the register WDT_MODE at bits .wdt_timeout_shift ~ +3 * * [timeout seconds] = register value * @@ -82,19 +93,32 @@ static int sunxi_restart_handle(struct notifier_block *this, unsigned long mode, struct sunxi_wdt_dev, restart_handler); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; + u32 val; + + /* Set system reset function */ + val = readl(wdt_base + regs->wdt_cfg); + val &= ~(regs->wdt_reset_mask); + val |= regs->wdt_reset_val; + writel(val, wdt_base + regs->wdt_cfg); - /* Enable timer and set reset bit in the watchdog */ - writel(WDT_MODE_EN | WDT_MODE_RST_EN, wdt_base + WDT_MODE); + /* Set lowest timeout and enable watchdog */ + val = readl(wdt_base + regs->wdt_mode); + val &= ~(WDT_TIMEOUT_MASK << regs->wdt_timeout_shift); + val |= WDT_MODE_EN; + writel(val, wdt_base + regs->wdt_mode); /* * Restart the watchdog. The default (and lowest) interval * value for the watchdog is 0.5s. */ - writel(WDT_CTRL_RELOAD, wdt_base + WDT_CTRL); + writel(WDT_CTRL_RELOAD, wdt_base + regs->wdt_ctrl); while (1) { mdelay(5); - writel(WDT_MODE_EN | WDT_MODE_RST_EN, wdt_base + WDT_MODE); + val = readl(wdt_base + regs->wdt_mode); + val |= WDT_MODE_EN; + writel(val, wdt_base + regs->wdt_mode); } return NOTIFY_DONE; } @@ -103,8 +127,9 @@ static int sunxi_wdt_ping(struct watchdog_device *wdt_dev) { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; - iowrite32(WDT_CTRL_RELOAD, wdt_base + WDT_CTRL); + writel(WDT_CTRL_RELOAD, wdt_base + regs->wdt_ctrl); return 0; } @@ -114,6 +139,7 @@ static int sunxi_wdt_set_timeout(struct watchdog_device *wdt_dev, { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; u32 reg; if (wdt_timeout_map[timeout] == 0) @@ -121,10 +147,10 @@ static int sunxi_wdt_set_timeout(struct watchdog_device *wdt_dev, sunxi_wdt->wdt_dev.timeout = timeout; - reg = ioread32(wdt_base + WDT_MODE); - reg &= ~WDT_TIMEOUT_MASK; - reg |= WDT_MODE_TIMEOUT(wdt_timeout_map[timeout]); - iowrite32(reg, wdt_base + WDT_MODE); + reg = readl(wdt_base + regs->wdt_mode); + reg &= ~(WDT_TIMEOUT_MASK << regs->wdt_timeout_shift); + reg |= wdt_timeout_map[timeout] << regs->wdt_timeout_shift; + writel(reg, wdt_base + regs->wdt_mode); sunxi_wdt_ping(wdt_dev); @@ -135,8 +161,9 @@ static int sunxi_wdt_stop(struct watchdog_device *wdt_dev) { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; - iowrite32(0, wdt_base + WDT_MODE); + writel(0, wdt_base + regs->wdt_mode); return 0; } @@ -146,6 +173,7 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev) u32 reg; struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; int ret; ret = sunxi_wdt_set_timeout(&sunxi_wdt->wdt_dev, @@ -153,9 +181,16 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev) if (ret < 0) return ret; - reg = ioread32(wdt_base + WDT_MODE); - reg |= (WDT_MODE_RST_EN | WDT_MODE_EN); - iowrite32(reg, wdt_base + WDT_MODE); + /* Set system reset function */ + reg = readl(wdt_base + regs->wdt_cfg); + reg &= ~(regs->wdt_reset_mask); + reg |= ~(regs->wdt_reset_val); + writel(reg, wdt_base + regs->wdt_cfg); + + /* Enable watchdog */ + reg = readl(wdt_base + regs->wdt_mode); + reg |= WDT_MODE_EN; + writel(reg, wdt_base + regs->wdt_mode); return 0; } @@ -175,9 +210,35 @@ static const struct watchdog_ops sunxi_wdt_ops = { .set_timeout = sunxi_wdt_set_timeout, }; +static const struct sunxi_wdt_reg sun4i_wdt_reg = { + .wdt_ctrl = 0x00, + .wdt_cfg = 0x04, + .wdt_mode = 0x04, + .wdt_timeout_shift = 3, + .wdt_reset_mask = 0x02, + .wdt_reset_val = 0x02, +}; + +static const struct sunxi_wdt_reg sun6i_wdt_reg = { + .wdt_ctrl = 0x10, + .wdt_cfg = 0x14, + .wdt_mode = 0x18, + .wdt_timeout_shift = 4, + .wdt_reset_mask = 0x03, + .wdt_reset_val = 0x01, +}; + +static const struct of_device_id sunxi_wdt_dt_ids[] = { + { .compatible = "allwinner,sun4i-a10-wdt", .data = &sun4i_wdt_reg }, + { .compatible = "allwinner,sun6i-a31-wdt", .data = &sun6i_wdt_reg }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sunxi_wdt_dt_ids); + static int sunxi_wdt_probe(struct platform_device *pdev) { struct sunxi_wdt_dev *sunxi_wdt; + const struct of_device_id *device; struct resource *res; int err; @@ -187,6 +248,12 @@ static int sunxi_wdt_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sunxi_wdt); + device = of_match_device(sunxi_wdt_dt_ids, &pdev->dev); + if (!device) + return -ENODEV; + + sunxi_wdt->wdt_regs = device->data; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); sunxi_wdt->wdt_base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(sunxi_wdt->wdt_base)) @@ -242,12 +309,6 @@ static void sunxi_wdt_shutdown(struct platform_device *pdev) sunxi_wdt_stop(&sunxi_wdt->wdt_dev); } -static const struct of_device_id sunxi_wdt_dt_ids[] = { - { .compatible = "allwinner,sun4i-a10-wdt" }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, sunxi_wdt_dt_ids); - static struct platform_driver sunxi_wdt_driver = { .probe = sunxi_wdt_probe, .remove = sunxi_wdt_remove, diff --git a/drivers/watchdog/ts72xx_wdt.c b/drivers/watchdog/ts72xx_wdt.c index afa9d6ef353a..dee9c6cbe6df 100644 --- a/drivers/watchdog/ts72xx_wdt.c +++ b/drivers/watchdog/ts72xx_wdt.c @@ -428,11 +428,7 @@ static int ts72xx_wdt_probe(struct platform_device *pdev) static int ts72xx_wdt_remove(struct platform_device *pdev) { - int error; - - error = misc_deregister(&ts72xx_wdt_miscdev); - - return error; + return misc_deregister(&ts72xx_wdt_miscdev); } static struct platform_driver ts72xx_wdt_driver = { diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1e0a317d3dcd..3860d02729dc 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -167,6 +167,9 @@ static struct page *balloon_next_page(struct page *page) static enum bp_state update_schedule(enum bp_state state) { + if (state == BP_ECANCELED) + return BP_ECANCELED; + if (state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index dd9c249ea311..95ee4302ffb8 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -41,24 +41,29 @@ static int xen_add_device(struct device *dev) #endif if (pci_seg_supported) { - struct physdev_pci_device_add add = { - .seg = pci_domain_nr(pci_dev->bus), - .bus = pci_dev->bus->number, - .devfn = pci_dev->devfn + struct { + struct physdev_pci_device_add add; + uint32_t pxm; + } add_ext = { + .add.seg = pci_domain_nr(pci_dev->bus), + .add.bus = pci_dev->bus->number, + .add.devfn = pci_dev->devfn }; + struct physdev_pci_device_add *add = &add_ext.add; + #ifdef CONFIG_ACPI acpi_handle handle; #endif #ifdef CONFIG_PCI_IOV if (pci_dev->is_virtfn) { - add.flags = XEN_PCI_DEV_VIRTFN; - add.physfn.bus = physfn->bus->number; - add.physfn.devfn = physfn->devfn; + add->flags = XEN_PCI_DEV_VIRTFN; + add->physfn.bus = physfn->bus->number; + add->physfn.devfn = physfn->devfn; } else #endif if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) - add.flags = XEN_PCI_DEV_EXTFN; + add->flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI handle = ACPI_HANDLE(&pci_dev->dev); @@ -77,8 +82,8 @@ static int xen_add_device(struct device *dev) status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); if (ACPI_SUCCESS(status)) { - add.optarr[0] = pxm; - add.flags |= XEN_PCI_DEV_PXM; + add->optarr[0] = pxm; + add->flags |= XEN_PCI_DEV_PXM; break; } status = acpi_get_parent(handle, &handle); @@ -86,7 +91,7 @@ static int xen_add_device(struct device *dev) } #endif /* CONFIG_ACPI */ - r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add); if (r != -ENOSYS) return r; pci_seg_supported = false; diff --git a/fs/Kconfig b/fs/Kconfig index db5dc1598716..664991afe0c0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" menu "Caches" diff --git a/fs/Makefile b/fs/Makefile index 90c88529892b..34a1b9dea6dd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d2b76e29d3b..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,23 +765,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { diff --git a/fs/buffer.c b/fs/buffer.c index 9614adc7e754..6c48f20eddd4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1060,7 +1060,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1087,11 +1087,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1113,13 +1114,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1373,24 +1375,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1406,24 +1409,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. @@ -2082,6 +2089,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2101,6 +2109,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock diff --git a/fs/dcache.c b/fs/dcache.c index d5a23fd0da90..3ffef7f4e5cd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2673,11 +2673,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (!IS_ROOT(new)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } if (d_ancestor(new, dentry)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } write_seqlock(&rename_lock); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh <bharrosh@panasas.com> +# Boaz Harrosh <ooo@electrozaur.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -4,7 +4,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of exofs. * @@ -29,7 +29,7 @@ #include "ore_raid.h" -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -1,6 +1,6 @@ /* * Copyright (C) from 2011 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2012 * Sachin Bhamare <sbhamare@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of exofs. * diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 581ef40fbe90..83a6f497c4e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -static void ext4_init_block_bitmap(struct super_block *sb, +static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) @@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EIO; } memset(bh->b_data, 0, sb->s_blocksize); @@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, sb->s_blocksize * 8, bh->b_data); ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); ext4_group_desc_csum_set(sb, block_group, gdp); + return 0; } /* Return the number of free blocks in a block group. It is used when @@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh, block_group, desc); + int err; + + err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); + if (err) + ext4_error(sb, "Checksum bad for grp %u", block_group); return bh; } ext4_unlock_group(sb, block_group); @@ -636,8 +640,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ - if (!(*errp) && - ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 3285aa5a706a..b610779a958c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); @@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); @@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); @@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0bb3f9ea0832..c24143ea9c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); } - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ if (!bh) { if (!dir_has_error) { EXT4_ERROR_FILE(file, 0, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0c225cdb52c..c55a1faaed58 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -572,15 +572,15 @@ enum { /* * The bit position of these flags must not overlap with any of the - * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ -#define EXT4_EX_NOCACHE 0x0400 -#define EXT4_EX_FORCE_CACHE 0x0800 +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 /* * Flags used by ext4_free_blocks @@ -890,6 +890,7 @@ struct ext4_inode_info { struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_lru; + unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_lru_nr; /* protected by i_es_lock */ unsigned long i_touch_when; /* jiffies of last accessing */ @@ -1174,6 +1175,9 @@ struct ext4_super_block { #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 2 + /* * fourth extended-fs super-block data in memory */ @@ -1237,7 +1241,7 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ @@ -1330,8 +1334,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; - unsigned long s_es_last_sorted; - struct percpu_counter s_extent_cache_cnt; + struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; @@ -1399,7 +1402,6 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ @@ -2086,10 +2088,8 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ -struct buffer_head *ext4_getblk(handle_t *, struct inode *, - ext4_lblk_t, int, int *); -struct buffer_head *ext4_bread(handle_t *, struct inode *, - ext4_lblk_t, int, int *); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, @@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle, #define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -2332,10 +2333,18 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_group_desc_csum(struct super_block *sb) { return EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM | - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || + (EXT4_SB(sb)->s_chksum_driver != NULL); } +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + !EXT4_SB(sb)->s_chksum_driver); + + return (EXT4_SB(sb)->s_chksum_driver != NULL); +} static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | @@ -2731,21 +2740,26 @@ extern int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path *, + struct ext4_ext_path **, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *, - int flags); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, ext4_lblk_t lblk_end); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2755,8 +2769,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); -extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a867f5ca9991..3c9381547094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh) struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; + __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0074e0d23d6e..3445035c7e01 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - /* Errors can only happen if there is a bug */ - if (WARN_ON_ONCE(err)) { + /* Errors can only happen due to aborted journal or a nasty bug */ + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 17c00ff202f2..9c5b49fb281e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -102,9 +102,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif -#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) static inline int ext4_jbd2_credits_xattr(struct inode *inode) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74292a71b384..37043d0b2be8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -98,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode, static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags); static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); @@ -291,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } +static inline int +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path **ppath, ext4_lblk_t lblk, + int nofail) +{ + struct ext4_ext_path *path = *ppath; + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | + (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); +} + /* * Calculate the number of metadata blocks needed * to allocate @blocks @@ -695,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, void ext4_ext_drop_refs(struct ext4_ext_path *path) { - int depth = path->p_depth; - int i; + int depth, i; + if (!path) + return; + depth = path->p_depth; for (i = 0; i <= depth; i++, path++) if (path->p_bh) { brelse(path->p_bh); @@ -841,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) } struct ext4_ext_path * -ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path, int flags) +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - short int depth, i, ppos = 0, alloc = 0; + struct ext4_ext_path *path = orig_path ? *orig_path : NULL; + short int depth, i, ppos = 0; int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); - /* account possible depth increase */ + if (path) { + ext4_ext_drop_refs(path); + if (depth > path[0].p_maxdepth) { + kfree(path); + *orig_path = path = NULL; + } + } if (!path) { + /* account possible depth increase */ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); - if (!path) + if (unlikely(!path)) return ERR_PTR(-ENOMEM); - alloc = 1; + path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -876,7 +898,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); - if (IS_ERR(bh)) { + if (unlikely(IS_ERR(bh))) { ret = PTR_ERR(bh); goto err; } @@ -910,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, err: ext4_ext_drop_refs(path); - if (alloc) - kfree(path); + kfree(path); + if (orig_path) + *orig_path = NULL; return ERR_PTR(ret); } @@ -1238,16 +1261,24 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_extent *newext) + unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, NULL, - newext, &err, flags); + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); if (newblock == 0) return err; @@ -1314,9 +1345,10 @@ out: static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext) { + struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1340,23 +1372,21 @@ repeat: goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1559,7 +1589,7 @@ found_extent: * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1802,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); + path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + @@ -1896,9 +1927,10 @@ out: * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { + struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -1907,6 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; int mb_flags = 0, unwritten; + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EIO; @@ -1925,7 +1959,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because - * ext4_ext_find_extent() can return either extent on the + * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ @@ -2008,7 +2042,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -2028,9 +2062,9 @@ prepend: * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) - mb_flags = EXT4_MB_USE_RESERVED; + mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - path, newext); + ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2108,10 +2142,8 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - if (npath) { - ext4_ext_drop_refs(npath); - kfree(npath); - } + ext4_ext_drop_refs(npath); + kfree(npath); return err; } @@ -2133,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - if (path && ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; - } - - path = ext4_ext_find_extent(inode, block, path, 0); + path = ext4_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2156,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); flags = 0; exists = 0; @@ -2266,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, block = es.es_lblk + es.es_len; } - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - + ext4_ext_drop_refs(path); + kfree(path); return err; } @@ -2826,7 +2848,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2854,24 +2876,14 @@ again: */ if (end >= ee_block && end < ee_block + ext4_ext_get_actual_len(ex) - 1) { - int split_flag = 0; - - if (ext4_ext_is_unwritten(ex)) - split_flag = EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; - /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_METADATA_NOFAIL); - + err = ext4_force_split_extent_at(handle, inode, &path, + end + 1, 1); if (err < 0) goto out; } @@ -2893,7 +2905,7 @@ again: ext4_journal_stop(handle); return -ENOMEM; } - path[0].p_depth = depth; + path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; @@ -3013,10 +3025,9 @@ again: out: ext4_ext_drop_refs(path); kfree(path); - if (err == -EAGAIN) { - path = NULL; + path = NULL; + if (err == -EAGAIN) goto again; - } ext4_journal_stop(handle); return err; @@ -3130,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3205,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3271,11 +3283,12 @@ fix_extent_len: */ static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; @@ -3298,7 +3311,7 @@ static int ext4_split_extent(handle_t *handle, EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; @@ -3309,8 +3322,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3330,7 +3342,7 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; @@ -3364,9 +3376,10 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3590,7 +3603,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } - allocated = ext4_split_extent(handle, inode, path, + allocated = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3629,9 +3642,10 @@ out: static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; @@ -3665,74 +3679,15 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); -} - -static int ext4_convert_initialized_extents(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path) -{ - struct ext4_extent *ex; - ext4_lblk_t ee_block; - unsigned int ee_len; - int depth; - int err = 0; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, - (unsigned long long)ee_block, ee_len); - - if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - err = -EIO; - goto out; - } - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* first mark the extent as unwritten */ - ext4_ext_mark_unwritten(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: - ext4_ext_show_leaf(inode, path); - return err; + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } - static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path **ppath) { + struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3761,16 +3716,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, path, + err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } @@ -3963,12 +3915,16 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, - unsigned int allocated, ext4_fsblk_t newblock) +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, int flags, + unsigned int allocated, ext4_fsblk_t newblock) { - int ret = 0; + struct ext4_ext_path *path = *ppath; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; int err = 0; /* @@ -3978,28 +3934,67 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; - ret = ext4_convert_initialized_extents(handle, inode, map, - path); - if (ret >= 0) { - ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else - err = ret; + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); + if (err) + return err; map->m_flags |= EXT4_MAP_UNWRITTEN; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; - - return err ? err : allocated; + return allocated; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, + struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { + struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); @@ -4021,8 +4016,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, - path, flags | EXT4_GET_BLOCKS_CONVERT); + ret = ext4_split_convert_extents(handle, inode, map, ppath, + flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -4040,7 +4035,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, - path); + ppath); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, @@ -4078,7 +4073,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -4279,7 +4274,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4291,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; - * this is why assert can't be put in ext4_ext_find_extent() + * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " @@ -4331,15 +4326,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - allocated = ext4_ext_convert_initialized_extent( - handle, inode, map, path, flags, - allocated, newblock); + allocated = convert_initialized_extent( + handle, inode, map, &path, + flags, allocated, newblock); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, path, flags, + handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; @@ -4376,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * If we are doing bigalloc, check to see if the extent returned - * by ext4_ext_find_extent() implies a cluster we can use. + * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { @@ -4451,6 +4446,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4486,7 +4483,7 @@ got_allocated_blocks: err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (!err) - err = ext4_ext_insert_extent(handle, inode, path, + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (!err && set_unwritten) { @@ -4619,10 +4616,8 @@ out: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); @@ -4799,7 +4794,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, max_blocks -= lblk; flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4837,15 +4833,21 @@ static long ext4_zero_range(struct file *file, loff_t offset, ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out_dio; /* * Remove entire range from the extent status tree. + * + * ext4_es_remove_extent(inode, lblk, max_blocks) is + * NOT sufficient. I'm not sure why this is the case, + * but let's be conservative and remove the extent + * status tree for the entire inode. There should be + * no outstanding delalloc extents thanks to the + * filemap_write_and_wait_range() call above. */ - ret = ext4_es_remove_extent(inode, lblk, max_blocks); - if (ret) - goto out_dio; - - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (ret) goto out_dio; } @@ -5304,36 +5306,31 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block, current_block; + ext4_lblk_t stop_block; ext4_lblk_t ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - if (!extent) { - ext4_ext_drop_refs(path); - kfree(path); - return ret; - } + if (!extent) + goto out; stop_block = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - ext4_ext_drop_refs(path); - kfree(path); /* Nothing to shift, if hole is at the end of file */ if (start >= stop_block) - return ret; + goto out; /* * Don't start shifting extents until we make sure the hole is big * enough to accomodate the shift. */ - path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + path = ext4_find_extent(inode, start - 1, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5346,8 +5343,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ex_start = 0; ex_end = 0; } - ext4_ext_drop_refs(path); - kfree(path); if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) @@ -5355,7 +5350,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, /* Its safe to start updating extents */ while (start < stop_block) { - path = ext4_ext_find_extent(inode, start, NULL, 0); + path = ext4_find_extent(inode, start, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5365,27 +5360,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, (unsigned long) start); return -EIO; } - - current_block = le32_to_cpu(extent->ee_block); - if (start > current_block) { + if (start > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ - ret = mext_next_extent(inode, path, &extent); - if (ret != 0) { - ext4_ext_drop_refs(path); - kfree(path); - if (ret == 1) - ret = 0; - break; + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + start = ext4_ext_next_allocated_block(path); + continue; } } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, &start); - ext4_ext_drop_refs(path); - kfree(path); if (ret) break; } - +out: + ext4_ext_drop_refs(path); + kfree(path); return ret; } @@ -5508,3 +5499,199 @@ out_mutex: mutex_unlock(&inode->i_mutex); return ret; } + +/** + * ext4_swap_extents - Swap extents between two inodes + * + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what is promise "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by caller. + * Locking: + * i_mutex is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + + *erp = ext4_es_remove_extent(inode1, lblk1, count); + if (unlikely(*erp)) + return 0; + *erp = ext4_es_remove_extent(inode2, lblk2, count); + if (unlikely(*erp)) + return 0; + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path1))) { + *erp = PTR_ERR(path1); + path1 = NULL; + finish: + count = 0; + goto repeat; + } + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path2))) { + *erp = PTR_ERR(path2); + path2 = NULL; + goto finish; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have somthing to swap ? */ + if (unlikely(!ex2 || !ex1)) + goto finish; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e1_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + goto finish; + /* Move to the rightest boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + goto repeat; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1, 0); + if (unlikely(*erp)) + goto finish; + } + if (e2_blk < lblk2) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2, 0); + if (unlikely(*erp)) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1 + len, 0); + if (unlikely(*erp)) + goto finish; + } + if (len != e2_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2 + len, 0); + if (*erp) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (unlikely(*erp)) + goto finish; + + /* Both extents are fully inside boundaries. Swap it now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scarry ah..? second inode already points to new blocks, + * and it was successfully dirtied. But luckily error may happen + * only due to journal error, so full transaction will be + * aborted anyway. + */ + if (unlikely(*erp)) + goto finish; + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + + repeat: + ext4_ext_drop_refs(path1); + kfree(path1); + ext4_ext_drop_refs(path2); + kfree(path2); + path1 = path2 = NULL; + } + return replaced_count; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0b7e28e7eaa4..94e7855ae71b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -11,6 +11,8 @@ */ #include <linux/rbtree.h> #include <linux/list_sort.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include "ext4.h" #include "extents_status.h" @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, */ if (!ext4_es_is_delayed(es)) { EXT4_I(inode)->i_es_lru_nr++; - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + return es; } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + /* Decrease the lru counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); EXT4_I(inode)->i_es_lru_nr--; - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -426,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; @@ -499,10 +509,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, @@ -731,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es) { struct ext4_es_tree *tree; + struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; @@ -767,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, } out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -933,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; struct list_head *cur, *tmp; LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); spin_lock(&sbi->s_es_lru_lock); retry: @@ -948,7 +966,8 @@ retry: * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. */ - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + if (percpu_counter_read_positive( + &es_stats->es_stats_lru_cnt) == 0) break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); @@ -958,7 +977,7 @@ retry: * time. Normally we try hard to avoid shrinking * precached inodes, but we will as a last resort. */ - if ((sbi->s_es_last_sorted < ei->i_touch_when) || + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || (skip_precached && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED))) { nr_skipped++; @@ -992,7 +1011,7 @@ retry: if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; + es_stats->es_stats_last_sorted = jiffies; ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); /* @@ -1010,6 +1029,22 @@ retry: if (locked_ei && nr_shrunk == 0) nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + nr_skipped, retried); return nr_shrunk; } @@ -1020,8 +1055,8 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1033,31 +1068,160 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } -void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) { + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void * +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = seq->private; + struct ext4_es_stats *es_stats = &sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lru_lock); + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lru_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); + if (es_stats->es_stats_last_sorted != 0) + seq_printf(seq, " %u ms last sorted interval\n", + jiffies_to_msecs(jiffies - + es_stats->es_stats_last_sorted)); + if (inode_cnt) + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_es_seq_shrinker_info_ops = { + .start = ext4_es_seq_shrinker_info_start, + .next = ext4_es_seq_shrinker_info_next, + .stop = ext4_es_seq_shrinker_info_stop, + .show = ext4_es_seq_shrinker_info_show, +}; + +static int +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE_DATA(inode); + } + + return ret; +} + +static int +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static const struct file_operations ext4_es_seq_shrinker_info_fops = { + .owner = THIS_MODULE, + .open = ext4_es_seq_shrinker_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ext4_es_seq_shrinker_info_release, +}; + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + int err; + INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_last_sorted = 0; + sbi->s_es_stats.es_stats_last_sorted = 0; + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); + if (err) + goto err1; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker); + if (err) + goto err2; + + if (sbi->s_proc) + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, + &ext4_es_seq_shrinker_info_fops, sbi); + + return 0; + +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); unregister_shrinker(&sbi->s_es_shrinker); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f1b62a419920..efd5f970b501 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -64,6 +64,17 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +struct ext4_es_stats { + unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_lru_cnt; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, (pb & ~ES_MASK)); } -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5b87fc36aab8..8012a5daf401 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1011,8 +1011,7 @@ got: spin_unlock(&sbi->s_next_gen_lock); /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index e75f840000a0..36b369697a13 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -318,34 +318,24 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) +static int ext4_alloc_branch(handle_t *handle, + struct ext4_allocation_request *ar, + int indirect_blks, ext4_lblk_t *offsets, + Indirect *branch) { - struct ext4_allocation_request ar; struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; - /* - * Set up for the direct block allocation - */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.len = *blks; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - ar.flags = EXT4_MB_HINT_DATA; - for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { - ar.goal = goal; - new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else - goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, - goal, 0, NULL, &err); + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, + ar->inode, ar->goal, + ar->flags & EXT4_MB_DELALLOC_RESERVED, + NULL, &err); if (err) { i--; goto failed; @@ -354,7 +344,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (i == 0) continue; - bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; @@ -372,7 +362,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, b = new_blocks[i]; if (i == indirect_blks) - len = ar.len; + len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); @@ -381,11 +371,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } - *blks = ar.len; return 0; failed: for (; i >= 0; i--) { @@ -396,10 +385,10 @@ failed: * existing before ext4_alloc_branch() was called. */ if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, inode, branch[i].bh, + ext4_forget(handle, 1, ar->inode, branch[i].bh, branch[i].bh->b_blocknr); - ext4_free_blocks(handle, inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar.len : 1, 0); + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar->len : 1, 0); } return err; } @@ -419,9 +408,9 @@ failed: * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) +static int ext4_splice_branch(handle_t *handle, + struct ext4_allocation_request *ar, + Indirect *where, int num) { int i; int err = 0; @@ -446,9 +435,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ - if (num == 0 && blks > 1) { + if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) + for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } @@ -465,14 +454,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, */ jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, ar->inode); jbd_debug(5, "splicing direct\n"); } return err; @@ -484,11 +473,11 @@ err_out: * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), + ar->len, 0); return err; } @@ -525,11 +514,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; - ext4_fsblk_t goal; int indirect_blks; int blocks_to_boundary = 0; int depth; @@ -579,7 +568,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, return -ENOSPC; } - goal = ext4_find_goal(inode, map->m_lblk, partial); + /* Set up for the direct block allocation */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; @@ -588,13 +586,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); + ar.len = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, + err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* @@ -605,14 +603,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); + count = ar.len; got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bea662bd0ca6..3ea62695abce 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -594,6 +594,7 @@ retry: if (ret) { unlock_page(page); page_cache_release(page); + page = NULL; ext4_orphan_add(handle, inode); up_write(&EXT4_I(inode)->xattr_sem); sem_held = 0; @@ -613,7 +614,8 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - block_commit_write(page, from, to); + if (page) + block_commit_write(page, from, to); out: if (page) { unlock_page(page); @@ -1126,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa26e9117c4..e9777f93cf05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -224,16 +222,15 @@ void ext4_evict_inode(struct inode *inode) goto no_delete; } - if (!is_bad_inode(inode)) - dquot_initialize(inode); + if (is_bad_inode(inode)) + goto no_delete; + dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - if (is_bad_inode(inode)) - goto no_delete; /* * Protect us against freezing - iput() caller didn't have to have any @@ -590,20 +587,12 @@ found: /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take - * the write lock of i_data_sem, and call get_blocks() + * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); /* - * if the caller is from delayed allocation writeout path - * we have already reserved fs blocks for allocation - * let the underlying get_block() function know to - * avoid double accounting - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ @@ -631,8 +620,6 @@ found: (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { unsigned int status; @@ -734,11 +721,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *errp) + ext4_lblk_t block, int create) { struct ext4_map_blocks map; struct buffer_head *bh; - int fatal = 0, err; + int err; J_ASSERT(handle != NULL || create == 0); @@ -747,21 +734,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); - /* ensure we send some value back into *errp */ - *errp = 0; - - if (create && err == 0) - err = -ENOSPC; /* should never happen */ + if (err == 0) + return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) - *errp = err; - if (err <= 0) - return NULL; + return ERR_PTR(err); bh = sb_getblk(inode->i_sb, map.m_pblk); - if (unlikely(!bh)) { - *errp = -ENOMEM; - return NULL; - } + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { J_ASSERT(create != 0); J_ASSERT(handle != NULL); @@ -775,44 +755,44 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { + err = ext4_journal_get_create_access(handle, bh); + if (unlikely(err)) { + unlock_buffer(bh); + goto errout; + } + if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { + if (unlikely(err)) + goto errout; + } else BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } return bh; +errout: + brelse(bh); + return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *err) + ext4_lblk_t block, int create) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create, err); - if (!bh) + bh = ext4_getblk(handle, inode, block, create); + if (IS_ERR(bh)) return bh; - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return bh; ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; put_bh(bh); - *err = -EIO; - return NULL; + return ERR_PTR(-EIO); } int ext4_walk_page_buffers(handle_t *handle, @@ -1536,7 +1516,7 @@ out_unlock: } /* - * This is a special get_blocks_t callback which is used by + * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * @@ -2011,12 +1991,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * in data loss. So use reserved blocks to allocate metadata if * possible. * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks - * in question are delalloc blocks. This affects functions in many - * different parts of the allocation call path. This flag exists - * primarily because we don't want to change *many* call functions, so - * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag - * once the inode's allocation semaphore is taken. + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if + * the blocks in question are delalloc blocks. This indicates + * that the blocks and quotas has already been checked when + * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; @@ -2515,6 +2493,20 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } +/* We always reserve for an inode update; the superblock could be there too */ +static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) +{ + if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + return 1; + + if (pos + len <= 0x7fffffffULL) + return 1; + + /* We might need to update the superblock to set LARGE_FILE */ + return 2; +} + static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2565,7 +2557,8 @@ retry_grab: * of file which has an already mapped buffer. */ retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_da_write_credits(inode, pos, len)); if (IS_ERR(handle)) { page_cache_release(page); return PTR_ERR(handle); @@ -2658,10 +2651,7 @@ static int ext4_da_write_end(struct file *file, if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_has_inline_data(inode) || ext4_da_should_update_i_disksize(page, end)) { - down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - up_write(&EXT4_I(inode)->i_data_sem); + ext4_update_i_disksize(inode, new_i_size); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) @@ -3936,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); @@ -4127,6 +4116,13 @@ bad_inode: return ERR_PTR(ret); } +struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) +{ + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-EIO); + return ext4_iget(sb, ino); +} + static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -4226,7 +4222,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + err = ext4_inode_blocks_set(handle, raw_inode, ei); + if (err) { spin_unlock(&ei->i_raw_lock); goto out_brelse; } @@ -4536,8 +4533,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); goto err_out; } - } else + } else { + loff_t oldsize = inode->i_size; + i_size_write(inode, attr->ia_size); + pagecache_isize_extended(inode, oldsize, inode->i_size); + } /* * Blocks are going to be removed from the inode. Wait diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f2252ec274d..bfda18a15592 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,8 +331,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -532,9 +531,17 @@ group_add_out: } case EXT4_IOC_SWAP_BOOT: + { + int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; - return swap_inode_boot_loader(sb, inode); + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, inode); + mnt_drop_write_file(filp); + return err; + } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 748c9136a60a..dbfe15c2533c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); } - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical); BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -4410,14 +4409,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (IS_NOQUOTA(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; - /* - * For delayed allocation, we could skip the ENOSPC and - * EDQUOT check, as blocks and quotas have been already - * reserved when data being copied into pagecache. - */ - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) - ar->flags |= EXT4_MB_DELALLOC_RESERVED; - else { + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. @@ -4528,8 +4520,7 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { - if (!ext4_test_inode_state(ar->inode, - EXT4_STATE_DELALLOC_RESERVED)) + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d3567f27bae7..a432634f2e6a 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode, ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convinience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); - + path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); path = NULL; @@ -81,13 +80,11 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); lb->first_pblock = 0; return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 32bce844c2e1..8313ca3324ec 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 671a74b14fd7..9f2311bc9c4f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -27,120 +27,26 @@ * @lblock: logical block number to find an extent path * @path: pointer to an extent path pointer (for output) * - * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * ext4_find_extent wrapper. Return 0 on success, or a negative error value * on failure. */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **orig_path) + struct ext4_ext_path **ppath) { - int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) - ret = PTR_ERR(path); - else if (path[ext_depth(inode)].p_ext == NULL) - ret = -ENODATA; - else - *orig_path = path; - - return ret; -} - -/** - * copy_extent_status - Copy the extent's initialization status - * - * @src: an extent for getting initialize status - * @dest: an extent to be set the status - */ -static void -copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) -{ - if (ext4_ext_is_unwritten(src)) - ext4_ext_mark_unwritten(dest); - else - dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); -} - -/** - * mext_next_extent - Search for the next extent and set it to "extent" - * - * @inode: inode which is searched - * @path: this will obtain data for the next extent - * @extent: pointer to the next extent we have just gotten - * - * Search the next extent in the array of ext4_ext_path structure (@path) - * and set it to ext4_extent structure (@extent). In addition, the member of - * @path (->p_ext) also points the next extent. Return 0 on success, 1 if - * ext4_ext_path structure refers to the last extent, or a negative error - * value on failure. - */ -int -mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent) -{ - struct ext4_extent_header *eh; - int ppos, leaf_ppos = path->p_depth; - - ppos = leaf_ppos; - if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { - /* leaf block */ - *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); - return 0; - } - - while (--ppos >= 0) { - if (EXT_LAST_INDEX(path[ppos].p_hdr) > - path[ppos].p_idx) { - int cur_ppos = ppos; - - /* index block */ - path[ppos].p_idx++; - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); - if (path[ppos+1].p_bh) - brelse(path[ppos+1].p_bh); - path[ppos+1].p_bh = - sb_bread(inode->i_sb, path[ppos].p_block); - if (!path[ppos+1].p_bh) - return -EIO; - path[ppos+1].p_hdr = - ext_block_hdr(path[ppos+1].p_bh); - - /* Halfway index block */ - while (++cur_ppos < leaf_ppos) { - path[cur_ppos].p_idx = - EXT_FIRST_INDEX(path[cur_ppos].p_hdr); - path[cur_ppos].p_block = - ext4_idx_pblock(path[cur_ppos].p_idx); - if (path[cur_ppos+1].p_bh) - brelse(path[cur_ppos+1].p_bh); - path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, - path[cur_ppos].p_block); - if (!path[cur_ppos+1].p_bh) - return -EIO; - path[cur_ppos+1].p_hdr = - ext_block_hdr(path[cur_ppos+1].p_bh); - } - - path[leaf_ppos].p_ext = *extent = NULL; - - eh = path[leaf_ppos].p_hdr; - if (le16_to_cpu(eh->eh_entries) == 0) - /* empty leaf is found */ - return -ENODATA; - - /* leaf block */ - path[leaf_ppos].p_ext = *extent = - EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); - path[leaf_ppos].p_block = - ext4_ext_pblock(path[leaf_ppos].p_ext); - return 0; - } + return PTR_ERR(path); + if (path[ext_depth(inode)].p_ext == NULL) { + ext4_ext_drop_refs(path); + kfree(path); + *ppath = NULL; + return -ENODATA; } - /* We found the last extent */ - return 1; + *ppath = path; + return 0; } /** @@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, } /** - * mext_insert_across_blocks - Insert extents across leaf block - * - * @handle: journal handle - * @orig_inode: original inode - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Allocate a new leaf block and insert extents into it. Return 0 on success, - * or a negative error value on failure. - */ -static int -mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, - struct ext4_extent *o_start, struct ext4_extent *o_end, - struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_ext_path *orig_path = NULL; - ext4_lblk_t eblock = 0; - int new_flag = 0; - int end_flag = 0; - int err = 0; - - if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { - if (o_start == o_end) { - - /* start_ext new_ext end_ext - * donor |---------|-----------|--------| - * orig |------------------------------| - */ - end_flag = 1; - } else { - - /* start_ext new_ext end_ext - * donor |---------|----------|---------| - * orig |---------------|--------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - } - - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (start_ext->ee_len && new_ext->ee_len && - !end_ext->ee_len && o_start == o_end) { - - /* start_ext new_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (!start_ext->ee_len && new_ext->ee_len && - end_ext->ee_len && o_start == o_end) { - - /* new_ext end_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - - /* - * Set 0 to the extent block if new_ext was - * the first block. - */ - if (new_ext->ee_block) - eblock = le32_to_cpu(new_ext->ee_block); - - new_flag = 1; - } else { - ext4_debug("ext4 move extent: Unexpected insert case\n"); - return -EIO; - } - - if (new_flag) { - err = get_ext_path(orig_inode, eblock, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext, 0)) - goto out; - } - - if (end_flag) { - err = get_ext_path(orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext, 0)) - goto out; - } -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - - return err; - -} - -/** - * mext_insert_inside_block - Insert new extent to the extent block - * - * @o_start: first original extent to be moved - * @o_end: last original extent to be moved - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * @eh: extent header of target leaf block - * @range_to_move: used to decide how to insert extent - * - * Insert extents into the leaf block. The extent (@o_start) is overwritten - * by inserted extents. - */ -static void -mext_insert_inside_block(struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext, - struct ext4_extent_header *eh, - int range_to_move) -{ - int i = 0; - unsigned long len; - - /* Move the existing extents */ - if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { - len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - - (unsigned long)(o_end + 1); - memmove(o_end + 1 + range_to_move, o_end + 1, len); - } - - /* Insert start entry */ - if (start_ext->ee_len) - o_start[i++].ee_len = start_ext->ee_len; - - /* Insert new entry */ - if (new_ext->ee_len) { - o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); - } - - /* Insert end entry */ - if (end_ext->ee_len) - o_start[i] = *end_ext; - - /* Increment the total entries counter on the extent block */ - le16_add_cpu(&eh->eh_entries, range_to_move); -} - -/** - * mext_insert_extents - Insert new extent - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Call the function to insert extents. If we cannot add more extents into - * the leaf block, we call mext_insert_across_blocks() to create a - * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 - * on success, or a negative error value on failure. - */ -static int -mext_insert_extents(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, - struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_extent_header *eh; - unsigned long need_slots, slots_range; - int range_to_move, depth, ret; - - /* - * The extents need to be inserted - * start_extent + new_extent + end_extent. - */ - need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + - (new_ext->ee_len ? 1 : 0); - - /* The number of slots between start and end */ - slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) - / sizeof(struct ext4_extent); - - /* Range to move the end of extent */ - range_to_move = need_slots - slots_range; - depth = orig_path->p_depth; - orig_path += depth; - eh = orig_path->p_hdr; - - if (depth) { - /* Register to journal */ - BUFFER_TRACE(orig_path->p_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, orig_path->p_bh); - if (ret) - return ret; - } - - /* Expansion */ - if (range_to_move > 0 && - (range_to_move > le16_to_cpu(eh->eh_max) - - le16_to_cpu(eh->eh_entries))) { - - ret = mext_insert_across_blocks(handle, orig_inode, o_start, - o_end, start_ext, new_ext, end_ext); - if (ret < 0) - return ret; - } else - mext_insert_inside_block(o_start, o_end, start_ext, new_ext, - end_ext, eh, range_to_move); - - return ext4_ext_dirty(handle, orig_inode, orig_path); -} - -/** - * mext_leaf_block - Move one leaf extent block into the inode. - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @dext: donor extent - * @from: start offset on the target file - * - * In order to insert extents into the leaf block, we must divide the extent - * in the leaf block into three extents. The one is located to be inserted - * extents, and the others are located around it. - * - * Therefore, this function creates structures to save extents of the leaf - * block, and inserts extents by calling mext_insert_extents() with - * created extents. Return 0 on success, or a negative error value on failure. - */ -static int -mext_leaf_block(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, struct ext4_extent *dext, - ext4_lblk_t *from) -{ - struct ext4_extent *oext, *o_start, *o_end, *prev_ext; - struct ext4_extent new_ext, start_ext, end_ext; - ext4_lblk_t new_ext_end; - int oext_alen, new_ext_alen, end_ext_alen; - int depth = ext_depth(orig_inode); - int ret; - - start_ext.ee_block = end_ext.ee_block = 0; - o_start = o_end = oext = orig_path[depth].p_ext; - oext_alen = ext4_ext_get_actual_len(oext); - start_ext.ee_len = end_ext.ee_len = 0; - - new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); - new_ext.ee_len = dext->ee_len; - new_ext_alen = ext4_ext_get_actual_len(&new_ext); - new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - - /* - * Case: original extent is first - * oext |--------| - * new_ext |--| - * start_ext |--| - */ - if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && - le32_to_cpu(new_ext.ee_block) < - le32_to_cpu(oext->ee_block) + oext_alen) { - start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - - le32_to_cpu(oext->ee_block)); - start_ext.ee_block = oext->ee_block; - copy_extent_status(oext, &start_ext); - } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { - prev_ext = oext - 1; - /* - * We can merge new_ext into previous extent, - * if these are contiguous and same extent type. - */ - if (ext4_can_extents_be_merged(orig_inode, prev_ext, - &new_ext)) { - o_start = prev_ext; - start_ext.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(prev_ext) + - new_ext_alen); - start_ext.ee_block = oext->ee_block; - copy_extent_status(prev_ext, &start_ext); - new_ext.ee_len = 0; - } - } - - /* - * Case: new_ext_end must be less than oext - * oext |-----------| - * new_ext |-------| - */ - if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - EXT4_ERROR_INODE(orig_inode, - "new_ext_end(%u) should be less than or equal to " - "oext->ee_block(%u) + oext_alen(%d) - 1", - new_ext_end, le32_to_cpu(oext->ee_block), - oext_alen); - ret = -EIO; - goto out; - } - - /* - * Case: new_ext is smaller than original extent - * oext |---------------| - * new_ext |-----------| - * end_ext |---| - */ - if (le32_to_cpu(oext->ee_block) <= new_ext_end && - new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { - end_ext.ee_len = - cpu_to_le16(le32_to_cpu(oext->ee_block) + - oext_alen - 1 - new_ext_end); - copy_extent_status(oext, &end_ext); - end_ext_alen = ext4_ext_get_actual_len(&end_ext); - ext4_ext_store_pblock(&end_ext, - (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); - end_ext.ee_block = - cpu_to_le32(le32_to_cpu(o_end->ee_block) + - oext_alen - end_ext_alen); - } - - ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, - o_end, &start_ext, &new_ext, &end_ext); -out: - return ret; -} - -/** - * mext_calc_swap_extents - Calculate extents for extent swapping. - * - * @tmp_dext: the extent that will belong to the original inode - * @tmp_oext: the extent that will belong to the donor inode - * @orig_off: block offset of original inode - * @donor_off: block offset of donor inode - * @max_count: the maximum length of extents - * - * Return 0 on success, or a negative error value on failure. - */ -static int -mext_calc_swap_extents(struct ext4_extent *tmp_dext, - struct ext4_extent *tmp_oext, - ext4_lblk_t orig_off, ext4_lblk_t donor_off, - ext4_lblk_t max_count) -{ - ext4_lblk_t diff, orig_diff; - struct ext4_extent dext_old, oext_old; - - BUG_ON(orig_off != donor_off); - - /* original and donor extents have to cover the same block offset */ - if (orig_off < le32_to_cpu(tmp_oext->ee_block) || - le32_to_cpu(tmp_oext->ee_block) + - ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) - return -ENODATA; - - if (orig_off < le32_to_cpu(tmp_dext->ee_block) || - le32_to_cpu(tmp_dext->ee_block) + - ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) - return -ENODATA; - - dext_old = *tmp_dext; - oext_old = *tmp_oext; - - /* When tmp_dext is too large, pick up the target range. */ - diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - - ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - le32_add_cpu(&tmp_dext->ee_block, diff); - le16_add_cpu(&tmp_dext->ee_len, -diff); - - if (max_count < ext4_ext_get_actual_len(tmp_dext)) - tmp_dext->ee_len = cpu_to_le16(max_count); - - orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); - - /* Adjust extent length if donor extent is larger than orig */ - if (ext4_ext_get_actual_len(tmp_dext) > - ext4_ext_get_actual_len(tmp_oext) - orig_diff) - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - - orig_diff); - - tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); - - copy_extent_status(&oext_old, tmp_dext); - copy_extent_status(&dext_old, tmp_oext); - - return 0; -} - -/** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question @@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, } ret = 1; out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); return ret; } /** - * mext_replace_branches - Replace original extents with new extents - * - * @handle: journal handle - * @orig_inode: original inode - * @donor_inode: donor inode - * @from: block offset of orig_inode - * @count: block count to be replaced - * @err: pointer to save return value - * - * Replace original inode extents and donor inode extents page by page. - * We implement this replacement in the following three steps: - * 1. Save the block information of original and donor inodes into - * dummy extents. - * 2. Change the block information of original inode to point at the - * donor inode blocks. - * 3. Change the block information of donor inode to point at the saved - * original inode blocks in the dummy extents. - * - * Return replaced block count. - */ -static int -mext_replace_branches(handle_t *handle, struct inode *orig_inode, - struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count, int *err) -{ - struct ext4_ext_path *orig_path = NULL; - struct ext4_ext_path *donor_path = NULL; - struct ext4_extent *oext, *dext; - struct ext4_extent tmp_dext, tmp_oext; - ext4_lblk_t orig_off = from, donor_off = from; - int depth; - int replaced_count = 0; - int dext_alen; - - *err = ext4_es_remove_extent(orig_inode, from, count); - if (*err) - goto out; - - *err = ext4_es_remove_extent(donor_inode, from, count); - if (*err) - goto out; - - /* Get the original extent for the block "orig_off" */ - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - - /* Get the donor extent for the head */ - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - if (unlikely(!dext)) - goto missing_donor_extent; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count); - if (*err) - goto out; - - /* Loop for the donor extents */ - while (1) { - /* The extent for donor must be found. */ - if (unlikely(!dext)) { - missing_donor_extent: - EXT4_ERROR_INODE(donor_inode, - "The extent for donor must be found"); - *err = -EIO; - goto out; - } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - EXT4_ERROR_INODE(donor_inode, - "Donor offset(%u) and the first block of donor " - "extent(%u) should be equal", - donor_off, - le32_to_cpu(tmp_dext.ee_block)); - *err = -EIO; - goto out; - } - - /* Set donor extent to orig extent */ - *err = mext_leaf_block(handle, orig_inode, - orig_path, &tmp_dext, &orig_off); - if (*err) - goto out; - - /* Set orig extent to donor extent */ - *err = mext_leaf_block(handle, donor_inode, - donor_path, &tmp_oext, &donor_off); - if (*err) - goto out; - - dext_alen = ext4_ext_get_actual_len(&tmp_dext); - replaced_count += dext_alen; - donor_off += dext_alen; - orig_off += dext_alen; - - BUG_ON(replaced_count > count); - /* Already moved the expected blocks */ - if (replaced_count >= count) - break; - - if (orig_path) - ext4_ext_drop_refs(orig_path); - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - if (donor_path) - ext4_ext_drop_refs(donor_path); - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count - replaced_count); - if (*err) - goto out; - } - -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (donor_path) { - ext4_ext_drop_refs(donor_path); - kfree(donor_path); - } - - return replaced_count; -} - -/** * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index: page index + * @index1: page index + * @index2: page index * @page: result page vector * * Grab two locked pages for inode's by inode order */ static int mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index, struct page *page[2]) + pgoff_t index1, pgoff_t index2, struct page *page[2]) { struct address_space *mapping[2]; unsigned fl = AOP_FLAG_NOFS; @@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { + pgoff_t tmp = index1; + index1 = index2; + index2 = tmp; mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } - page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); if (!page[0]) return -ENOMEM; - page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); if (!page[1]) { unlock_page(page[0]); page_cache_release(page[0]); @@ -893,25 +245,27 @@ out: * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file + * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling mext_replace_branches(). + * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int unwritten, int *err) + pgoff_t orig_page_offset, pgoff_t donor_page_offset, + int data_offset_in_page, + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; handle_t *handle; - ext4_lblk_t orig_blk_offset; + ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; @@ -939,6 +293,9 @@ again: orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; + donor_blk_offset = donor_page_offset * blocks_per_page + + data_offset_in_page; + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { @@ -959,7 +316,7 @@ again: replaced_size = data_size; *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - pagep); + donor_page_offset, pagep); if (unlikely(*err < 0)) goto stop_journal; /* @@ -978,7 +335,7 @@ again: if (*err) goto drop_data_sem; - unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; @@ -994,9 +351,10 @@ again: *err = -EBUSY; goto drop_data_sem; } - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, + donor_inode, orig_blk_offset, + donor_blk_offset, + block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; @@ -1014,9 +372,9 @@ data_copy: goto unlock_pages; } ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { @@ -1061,9 +419,9 @@ repair_branches: * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, - orig_blk_offset, - block_len_in_page, &err2); + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), @@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { - ext4_lblk_t orig_blocks, donor_blocks; + __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode, ext4_debug("ext4 move extent: The argument files should " "not be swapfile [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; + return -EBUSY; } /* Ext4 move extent supports only extent based file */ @@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode, } /* Start offset should be same */ - if (orig_start != donor_start) { + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " - "offset are not same [ino:orig %lu, donor %lu]\n", + "offset are not alligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || + (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - - if (orig_inode->i_size > donor_inode->i_size) { - donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; - /* TODO: eliminate this artificial restriction */ - if (orig_start >= donor_blocks) { - ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, donor_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: eliminate this artificial restriction */ - if (orig_start + *len > donor_blocks) { - ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file blocks [%u]." - "So adjust length from %llu to %llu " - "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_blocks, - *len, donor_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = donor_blocks - orig_start; - } - } else { - orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; - if (orig_start >= orig_blocks) { - ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, orig_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (orig_start + *len > orig_blocks) { - ext4_debug("ext4 move extent: Adjust length " - "from %llu to %llu. Because it should be " - "less than original file blocks " - "[ino:orig %lu, donor %lu]\n", - *len, orig_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = orig_blocks - orig_start; - } - } - + if (orig_eof < orig_start + *len - 1) + *len = orig_eof - orig_start; + if (donor_eof < donor_start + *len - 1) + *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, @@ -1208,60 +531,26 @@ mext_check_arguments(struct inode *orig_inode, * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file - * @orig_start: start offset in block for orig - * @donor_start: start offset in block for donor + * @orig_blk: start offset in block for orig + * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * - * Note: ext4_move_extents() proceeds the following order. - * 1:ext4_move_extents() calculates the last block number of moving extent - * function by the start block number (orig_start) and the number of blocks - * to be moved (len) specified as arguments. - * If the {orig, donor}_start points a hole, the extent's start offset - * pointed by ext_cur (current extent), holecheck_path, orig_path are set - * after hole behind. - * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent - * or the ext_cur exceeds the block_end which is last logical block number. - * 3:To get the length of continues area, call mext_next_extent() - * specified with the ext_cur (initial value is holecheck_path) re-cursive, - * until find un-continuous extent, the start logical block number exceeds - * the block_end or the extent points to the last extent. - * 4:Exchange the original inode data with donor inode data - * from orig_page_offset to seq_end_page. - * The start indexes of data are specified as arguments. - * That of the original inode is orig_page_offset, - * and the donor inode is also orig_page_offset - * (To easily handle blocksize != pagesize case, the offset for the - * donor inode is block unit). - * 5:Update holecheck_path and orig_path to points a next proceeding extent, - * then returns to step 2. - * 6:Release holecheck_path, orig_path and set the len to moved_len - * which shows the number of moved blocks. - * The moved_len is useful for the command to calculate the file offset - * for starting next move extent ioctl. - * 7:Return 0 on success, or a negative error value on failure. */ int -ext4_move_extents(struct file *o_filp, struct file *d_filp, - __u64 orig_start, __u64 donor_start, __u64 len, - __u64 *moved_len) +ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); - struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; - struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; - ext4_lblk_t block_start = orig_start; - ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; - ext4_lblk_t rest_blocks; - pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; - int data_offset_in_page; - int block_len_in_page; - int unwritten; + ext4_lblk_t o_end, o_start = orig_blk; + ext4_lblk_t d_start = donor_blk; + int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1303,121 +592,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len); + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, + donor_blk, &len); if (ret) goto out; + o_end = o_start + len; - file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; - block_end = block_start + len - 1; - if (file_end < block_end) - len -= block_end - file_end; + while (o_start < o_end) { + struct ext4_extent *ex; + ext4_lblk_t cur_blk, next_blk; + pgoff_t orig_page_index, donor_page_index; + int offset_in_page; + int unwritten, cur_len; - ret = get_ext_path(orig_inode, block_start, &orig_path); - if (ret) - goto out; - - /* Get path structure to check the hole */ - ret = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret) - goto out; - - depth = ext_depth(orig_inode); - ext_cur = holecheck_path[depth].p_ext; - - /* - * Get proper starting location of block replacement if block_start was - * within the hole. - */ - if (le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { - /* - * The hole exists between extents or the tail of - * original file. - */ - last_extent = mext_next_extent(orig_inode, - holecheck_path, &ext_cur); - if (last_extent < 0) { - ret = last_extent; - goto out; - } - last_extent = mext_next_extent(orig_inode, orig_path, - &ext_dummy); - if (last_extent < 0) { - ret = last_extent; + ret = get_ext_path(orig_inode, o_start, &path); + if (ret) goto out; - } - seq_start = le32_to_cpu(ext_cur->ee_block); - } else if (le32_to_cpu(ext_cur->ee_block) > block_start) - /* The hole exists at the beginning of original file. */ - seq_start = le32_to_cpu(ext_cur->ee_block); - else - seq_start = block_start; - - /* No blocks within the specified range. */ - if (le32_to_cpu(ext_cur->ee_block) > block_end) { - ext4_debug("ext4 move extent: The specified range of file " - "may be the hole\n"); - ret = -EINVAL; - goto out; - } - - /* Adjust start blocks */ - add_blocks = min(le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur), block_end + 1) - - max(le32_to_cpu(ext_cur->ee_block), block_start); - - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { - seq_blocks += add_blocks; - - /* Adjust tail blocks */ - if (seq_start + seq_blocks - 1 > block_end) - seq_blocks = block_end - seq_start + 1; - - ext_prev = ext_cur; - last_extent = mext_next_extent(orig_inode, holecheck_path, - &ext_cur); - if (last_extent < 0) { - ret = last_extent; - break; - } - add_blocks = ext4_ext_get_actual_len(ext_cur); - - /* - * Extend the length of contiguous block (seq_blocks) - * if extents are contiguous. - */ - if (ext4_can_extents_be_merged(orig_inode, - ext_prev, ext_cur) && - block_end >= le32_to_cpu(ext_cur->ee_block) && - !last_extent) + ex = path[path->p_depth].p_ext; + next_blk = ext4_ext_next_allocated_block(path); + cur_blk = le32_to_cpu(ex->ee_block); + cur_len = ext4_ext_get_actual_len(ex); + /* Check hole before the start pos */ + if (cur_blk + cur_len - 1 < o_start) { + if (next_blk == EXT_MAX_BLOCKS) { + o_start = o_end; + ret = -ENODATA; + goto out; + } + d_start += next_blk - o_start; + o_start = next_blk; continue; - - /* Is original extent is unwritten */ - unwritten = ext4_ext_is_unwritten(ext_prev); - - data_offset_in_page = seq_start % blocks_per_page; - - /* - * Calculate data blocks count that should be swapped - * at the first page. - */ - if (data_offset_in_page + seq_blocks > blocks_per_page) { - /* Swapped blocks are across pages */ - block_len_in_page = - blocks_per_page - data_offset_in_page; - } else { - /* Swapped blocks are in a page */ - block_len_in_page = seq_blocks; + /* Check hole after the start pos */ + } else if (cur_blk > o_start) { + /* Skip hole */ + d_start += cur_blk - o_start; + o_start = cur_blk; + /* Extent inside requested range ?*/ + if (cur_blk >= o_end) + goto out; + } else { /* in_range(o_start, o_blk, o_len) */ + cur_len += cur_blk - o_start; } - - orig_page_offset = seq_start >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_end_page = (seq_start + seq_blocks - 1) >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_start = le32_to_cpu(ext_cur->ee_block); - rest_blocks = seq_blocks; - + unwritten = ext4_ext_is_unwritten(ex); + if (o_end - o_start < cur_len) + cur_len = o_end - o_start; + + orig_page_index = o_start >> (PAGE_CACHE_SHIFT - + orig_inode->i_blkbits); + donor_page_index = d_start >> (PAGE_CACHE_SHIFT - + donor_inode->i_blkbits); + offset_in_page = o_start % blocks_per_page; + if (cur_len > blocks_per_page- offset_in_page) + cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, @@ -1426,77 +652,29 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); - - while (orig_page_offset <= seq_end_page) { - - /* Swap original branches with new branches */ - block_len_in_page = move_extent_per_page( - o_filp, donor_inode, - orig_page_offset, - data_offset_in_page, - block_len_in_page, - unwritten, &ret); - - /* Count how many blocks we have exchanged */ - *moved_len += block_len_in_page; - if (ret < 0) - break; - if (*moved_len > len) { - EXT4_ERROR_INODE(orig_inode, - "We replaced blocks too much! " - "sum of replaced: %llu requested: %llu", - *moved_len, len); - ret = -EIO; - break; - } - - orig_page_offset++; - data_offset_in_page = 0; - rest_blocks -= block_len_in_page; - if (rest_blocks > blocks_per_page) - block_len_in_page = blocks_per_page; - else - block_len_in_page = rest_blocks; - } - + /* Swap original branches with new branches */ + move_extent_per_page(o_filp, donor_inode, + orig_page_index, donor_page_index, + offset_in_page, cur_len, + unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; - - /* Decrease buffer counter */ - if (holecheck_path) - ext4_ext_drop_refs(holecheck_path); - ret = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret) - break; - depth = holecheck_path->p_depth; - - /* Decrease buffer counter */ - if (orig_path) - ext4_ext_drop_refs(orig_path); - ret = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret) - break; - - ext_cur = holecheck_path[depth].p_ext; - add_blocks = ext4_ext_get_actual_len(ext_cur); - seq_blocks = 0; - + o_start += cur_len; + d_start += cur_len; } + *moved_len = o_start - orig_blk; + if (*moved_len > len) + *moved_len = len; + out: if (*moved_len) { ext4_discard_preallocations(orig_inode); ext4_discard_preallocations(donor_inode); } - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (holecheck_path) { - ext4_ext_drop_refs(holecheck_path); - kfree(holecheck_path); - } + ext4_ext_drop_refs(path); + kfree(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 603e4ebbd0ac..123798c5ac31 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle, ext4_lblk_t *block) { struct buffer_head *bh; - int err = 0; + int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= @@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, &err); - if (!bh) - return ERR_PTR(err); + bh = ext4_bread(handle, inode, *block, 1); + if (IS_ERR(bh)) + return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); @@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, { struct buffer_head *bh; struct ext4_dir_entry *dirent; - int err = 0, is_dx_block = 0; + int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0, &err); - if (!bh) { - if (err == 0) { - ext4_error_inode(inode, __func__, line, block, - "Directory hole found"); - return ERR_PTR(-EIO); - } + bh = ext4_bread(NULL, inode, block, 0); + if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, __func__, line, - "error reading directory block " - "(ino %lu, block %lu)", inode->i_ino, + "error %ld reading directory block " + "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, (unsigned long) block); - return ERR_PTR(err); + + return bh; + } + if (!bh) { + ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ @@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -253,8 +252,7 @@ static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); + struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); @@ -270,8 +268,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *err); + struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -340,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, dirent); @@ -362,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, dirent); @@ -438,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -468,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -557,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -567,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -641,7 +632,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + bh = ext4_bread(NULL,dir, block, 0); + if (!bh || IS_ERR(bh)) + continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); @@ -669,29 +662,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, */ static struct dx_frame * dx_probe(const struct qstr *d_name, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; - struct buffer_head *bh; struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; - frame->bh = NULL; - bh = ext4_read_dirblock(dir, 0, INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail; - } - root = (struct dx_root *) bh->b_data; + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + + root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } hinfo->hash_version = root->info.hash_version; @@ -705,16 +694,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.unused_flags & 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } if ((indirect = root->info.indirect_levels) > 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } @@ -724,27 +709,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } dxtrace(printk("Look up %x", hash)); - while (1) - { + while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } p = entries + 1; q = entries + count - 1; - while (p <= q) - { + while (p <= q) { m = p + (q - p)/2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) @@ -753,8 +732,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, p = m + 1; } - if (0) // linear search cross check - { + if (0) { // linear search cross check unsigned n = count - 1; at = entries; while (n--) @@ -771,38 +749,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir, at = p - 1; dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; frame->entries = entries; frame->at = at; - if (!indirect--) return frame; - bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail2; + if (!indirect--) + return frame; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { + ret_err = (struct dx_frame *) frame->bh; + frame->bh = NULL; + goto fail; } - entries = ((struct dx_node *) bh->b_data)->entries; + entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } - frame++; - frame->bh = NULL; } -fail2: +fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } -fail: - if (*err == ERR_BAD_DX_DIR) + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning(dir->i_sb, "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); - return NULL; + return ret_err; } static void dx_release (struct dx_frame *frames) @@ -988,9 +963,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(NULL, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { @@ -1227,8 +1202,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err = 0; - int namelen; + int i, namelen; *res_dir = NULL; sb = dir->i_sb; @@ -1258,17 +1232,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (err == -ENOENT) - return NULL; - if (err && err != ERR_BAD_DX_DIR) - return ERR_PTR(err); - if (bh) + if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1298,10 +1268,10 @@ restart: break; } num++; - bh = ext4_getblk(NULL, dir, b++, 0, &err); - if (unlikely(err)) { + bh = ext4_getblk(NULL, dir, b++, 0); + if (unlikely(IS_ERR(bh))) { if (ra_max == 0) - return ERR_PTR(err); + return bh; break; } bh_use[ra_max] = bh; @@ -1366,7 +1336,7 @@ cleanup_and_exit: } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; @@ -1375,25 +1345,23 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q ext4_lblk_t block; int retval; - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; + frame = dx_probe(d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); + if (IS_ERR(bh)) goto errout; - } + retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); - if (retval == 1) { /* Success! */ - dx_release(frames); - return bh; - } + if (retval == 1) + goto success; brelse(bh); if (retval == -1) { - *err = ERR_BAD_DX_DIR; + bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } @@ -1402,18 +1370,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q frames, NULL); if (retval < 0) { ext4_warning(sb, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; + "error %d reading index page in directory #%lu", + retval, dir->i_ino); + bh = ERR_PTR(retval); goto errout; } } while (retval == 1); - *err = -ENOENT; + bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); - dx_release (frames); - return NULL; +success: + dx_release(frames); + return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -1441,7 +1410,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EIO); } - inode = ext4_iget(dir->i_sb, ino); + inode = ext4_iget_normal(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1474,7 +1443,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); } /* @@ -1533,7 +1502,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) + struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; @@ -1548,16 +1517,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - *error = PTR_ERR(bh2); - return NULL; + return (struct ext4_dir_entry_2 *) bh2; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1617,8 +1584,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ - if (hinfo->hash >= hash2) - { + if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } @@ -1638,8 +1604,7 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); - *error = err; - return NULL; + return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, @@ -1718,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { @@ -1786,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -1862,8 +1825,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ext4_handle_dirty_dx_node(handle, dir, frame->bh); ext4_handle_dirty_dirent_node(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { + de = do_split(handle,dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up @@ -1871,7 +1834,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, */ ext4_mark_inode_dirty(handle, dir); dx_release(frames); - return retval; + return PTR_ERR(de); } dx_release(frames); @@ -1904,8 +1867,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -1982,9 +1944,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); @@ -2095,9 +2057,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) + de = do_split(handle, dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { + err = PTR_ERR(de); goto cleanup; + } err = add_dirent_to_buf(handle, dentry, inode, de, bh); goto cleanup; @@ -2167,8 +2131,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2387,8 +2350,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -2403,10 +2365,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out; de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); @@ -2573,7 +2531,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) int err = 0, rc; bool dirty = false; - if (!sbi->s_journal) + if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && @@ -3190,6 +3148,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } +static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, + int credits, handle_t **h) +{ + struct inode *wh; + handle_t *handle; + int retries = 0; + + /* + * for inode block, sb block, group summaries, + * and inode bitmap + */ + credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS + 4); +retry: + wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + &ent->dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + + handle = ext4_journal_current_handle(); + if (IS_ERR(wh)) { + if (handle) + ext4_journal_stop(handle); + if (PTR_ERR(wh) == -ENOSPC && + ext4_should_retry_alloc(ent->dir->i_sb, &retries)) + goto retry; + } else { + *h = handle; + init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); + wh->i_op = &ext4_special_inode_operations; + } + return wh; +} + /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. @@ -3199,7 +3190,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { @@ -3214,6 +3206,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, }; int force_reread; int retval; + struct inode *whiteout = NULL; + int credits; + u8 old_file_type; dquot_initialize(old.dir); dquot_initialize(new.dir); @@ -3252,11 +3247,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); - handle = ext4_journal_start(old.dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (!(flags & RENAME_WHITEOUT)) { + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } else { + whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + if (IS_ERR(whiteout)) + return PTR_ERR(whiteout); + } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); @@ -3284,13 +3285,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); + + old_file_type = old.de->file_type; + if (whiteout) { + /* + * Do this before adding a new entry, so the old entry is sure + * to be still pointing to the valid old entry. + */ + retval = ext4_setent(handle, &old, whiteout->i_ino, + EXT4_FT_CHRDEV); + if (retval) + goto end_rename; + ext4_mark_inode_dirty(handle, whiteout); + } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, - old.inode->i_ino, old.de->file_type); + old.inode->i_ino, old_file_type); if (retval) goto end_rename; } @@ -3305,10 +3319,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old.inode->i_ctime = ext4_current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); - /* - * ok, that's it - */ - ext4_rename_delete(handle, &old, force_reread); + if (!whiteout) { + /* + * ok, that's it + */ + ext4_rename_delete(handle, &old, force_reread); + } if (new.inode) { ext4_dec_count(handle, new.inode); @@ -3344,6 +3360,12 @@ end_rename: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); + if (whiteout) { + if (retval) + drop_nlink(whiteout); + unlock_new_inode(whiteout); + iput(whiteout); + } if (handle) ext4_journal_stop(handle); return retval; @@ -3476,18 +3498,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } - /* - * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" - * is equivalent to regular rename. - */ - return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); + + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1e43b905ff98..f298c60f907d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1212,8 +1212,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05c159218bc2..1eda6ab0ef9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -141,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); @@ -152,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); @@ -820,10 +817,9 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif @@ -885,6 +881,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_all_nr = 0; ei->i_es_lru_nr = 0; ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; @@ -1002,7 +999,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget(sb, ino); + inode = ext4_iget_normal(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -1124,25 +1121,6 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; -static const struct super_operations ext4_nojournal_sops = { - .alloc_inode = ext4_alloc_inode, - .destroy_inode = ext4_destroy_inode, - .write_inode = ext4_write_inode, - .dirty_inode = ext4_dirty_inode, - .drop_inode = ext4_drop_inode, - .evict_inode = ext4_evict_inode, - .sync_fs = ext4_sync_fs_nojournal, - .put_super = ext4_put_super, - .statfs = ext4_statfs, - .remount_fs = ext4_remount, - .show_options = ext4_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext4_quota_read, - .quota_write = ext4_quota_write, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -1712,13 +1690,6 @@ static int parse_options(char *options, struct super_block *sb, "not specified"); return 0; } - } else { - if (sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "specified with no journaling " - "enabled"); - return 0; - } } #endif if (test_opt(sb, DIOREAD_NOLOCK)) { @@ -2016,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __le16 save_csum; __u32 csum32; @@ -2035,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, } /* old crc16 code */ + if (!(sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + return 0; + offset = offsetof(struct ext4_group_desc, bg_checksum); crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); @@ -2191,7 +2165,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { - jbd_debug(1, "Errors on filesystem, " + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; } @@ -2207,7 +2181,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) @@ -2263,7 +2237,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } @@ -2548,6 +2522,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t es_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + + unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + static ssize_t reserved_clusters_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { @@ -2601,14 +2585,29 @@ static struct ext4_attr ext4_attr_##_name = { \ .offset = offsetof(struct ext4_sb_info, _elname),\ }, \ } + +#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_super_block, _elname), \ + }, \ +} + #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) + +#define EXT4_RO_ATTR_ES_UI(name, elname) \ + EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) + #define ATTR_LIST(name) &ext4_attr_##name.attr #define EXT4_DEPRECATED_ATTR(_name, _val) \ static struct ext4_attr ext4_attr_##_name = { \ @@ -2641,6 +2640,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2664,6 +2666,9 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), NULL, }; @@ -2723,9 +2728,25 @@ static void ext4_feat_release(struct kobject *kobj) complete(&ext4_feat->f_kobj_unregister); } +static ssize_t ext4_feat_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "supported\n"); +} + +/* + * We can not use ext4_attr_show/store because it relies on the kobject + * being embedded in the ext4_sb_info structure which is definitely not + * true in this case. + */ +static const struct sysfs_ops ext4_feat_ops = { + .show = ext4_feat_show, + .store = NULL, +}; + static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_attr_ops, + .sysfs_ops = &ext4_feat_ops, .release = ext4_feat_release, }; @@ -3179,8 +3200,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -3190,6 +3210,10 @@ static int set_journal_csum_feature_set(struct super_block *sb) incompat = 0; } + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_CSUM_V3 | + JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, @@ -3202,11 +3226,8 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | - JBD2_FEATURE_INCOMPAT_CSUM_V3 | - JBD2_FEATURE_INCOMPAT_CSUM_V2); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; @@ -3436,7 +3457,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) logical_sb_block = sb_block; } - if (!(bh = sb_bread(sb, logical_sb_block))) { + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } @@ -3487,8 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Precompute checksum seed for all metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3519,8 +3539,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); - if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sb, BLOCK_VALIDITY); + /* block_validity enabled by default; disable with noblock_validity */ + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); @@ -3646,7 +3666,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); - bh = sb_bread(sb, logical_sb_block); + bh = sb_bread_unmovable(sb, logical_sb_block); if (!bh) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); @@ -3868,7 +3888,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); if (!sbi->s_group_desc[i]) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); @@ -3890,13 +3910,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sbi); - - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + if (ext4_es_register_shrinker(sbi)) goto failed_mount3; - } sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; @@ -3904,11 +3919,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; + sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4229,10 +4240,9 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } -failed_mount3: ext4_es_unregister_shrinker(sbi); +failed_mount3: del_timer_sync(&sbi->s_err_report); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4247,7 +4257,7 @@ failed_mount: remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif ext4_blkdev_remove(sbi); @@ -4375,6 +4385,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } + if ((le32_to_cpu(es->s_feature_ro_compat) & + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "corrupt superblock"); + brelse(bh); + goto out_bdev; + } + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); @@ -4677,15 +4696,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ - target = jbd2_get_latest_transaction(sbi->s_journal); - if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + if (sbi->s_journal) { + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + ret = jbd2_log_wait_commit(sbi->s_journal, + target); + } + } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; - - if (jbd2_journal_start_commit(sbi->s_journal, &target)) { - if (wait) - ret = jbd2_log_wait_commit(sbi->s_journal, target); - } if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); @@ -4696,19 +4719,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) return ret; } -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) -{ - int ret = 0; - - trace_ext4_sync_fs(sb, wait); - flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - dquot_writeback_dquots(sb, -1); - if (wait && test_opt(sb, BARRIER)) - ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); - - return ret; -} - /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. @@ -4727,23 +4737,26 @@ static int ext4_freeze(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * Don't clear the needs_recovery flag if we failed to flush - * the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to + * flush the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on upper layer to stop further updates */ - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (journal) + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(journal); return error; } @@ -4774,7 +4787,7 @@ struct ext4_mount_options { u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; @@ -4804,7 +4817,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], GFP_KERNEL); @@ -4965,7 +4978,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) @@ -4994,7 +5007,7 @@ restore_opts: sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } @@ -5197,7 +5210,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, { int err; struct inode *qf_inode; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; @@ -5225,13 +5238,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, static int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED); @@ -5309,7 +5322,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; @@ -5324,9 +5336,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; - bh = ext4_bread(NULL, inode, blk, 0, &err); - if (err) - return err; + bh = ext4_bread(NULL, inode, blk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else @@ -5347,8 +5359,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); + int err, offset = off & (sb->s_blocksize - 1); struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5369,14 +5380,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1, &err); + bh = ext4_bread(handle, inode, blk, 1); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); - goto out; + return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); @@ -5385,8 +5398,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) - return err; if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e7387337060c..1e09fc77395c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + if (ext4_has_metadata_csum(inode->i_sb) && (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) return 0; return 1; @@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); @@ -190,14 +188,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, + void *value_start) { - while (!IS_LAST_ENTRY(entry)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + struct ext4_xattr_entry *e = entry; + + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) return -EIO; - entry = next; + e = next; } + + while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_size != 0 && + (value_start + le16_to_cpu(entry->e_value_offs) < + (void *)e + sizeof(__u32) || + value_start + le16_to_cpu(entry->e_value_offs) + + le32_to_cpu(entry->e_value_size) > end)) + return -EIO; + entry = EXT4_XATTR_NEXT(entry); + } + return 0; } @@ -214,7 +226,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EIO; if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -331,7 +344,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end); + error = ext4_xattr_check_names(entry, end, entry); if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, @@ -463,7 +476,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), @@ -899,14 +912,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - /* - * take i_data_sem because we will test - * i_delalloc_reserved_flag in ext4_mb_new_blocks - */ - down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); - up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; @@ -986,7 +993,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end); + error = ext4_xattr_check_names(IFIRST(header), is->s.end, + IFIRST(header)); if (error) return error; /* Find the named attribute. */ diff --git a/fs/internal.h b/fs/internal.h index 9477f8f6aefc..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -47,7 +47,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -139,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - -/* * pipe.c */ extern const struct file_operations pipefifo_fops; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 06fe11e0abfa..aab8549591e7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7f34f4716165..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - /* - * Get our reference so that bh cannot be freed before - * we unlock it - */ - get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - BUFFER_TRACE(bh, "release"); - __brelse(bh); } return ret; } @@ -122,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { - if (journal->j_flags & JBD2_ABORT) - return; write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); @@ -139,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal) * trace for forensic evidence. */ write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + mutex_unlock(&journal->j_checkpoint_mutex); + return; + } spin_lock(&journal->j_list_lock); nblocks = jbd2_space_needed(journal); space_left = jbd2_log_space_left(journal); @@ -183,58 +178,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } } -/* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. - * - * Return 0 on success, and return <0 if some buffers have failed - * to be written out. - * - * Called with j_list_lock held. - */ -static int __wait_cp_io(journal_t *journal, transaction_t *transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - tid_t this_tid; - int released = 0; - int ret = 0; - - this_tid = transaction->t_tid; -restart: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return ret; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - - /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list - */ - released = __jbd2_journal_remove_checkpoint(jh); - __brelse(bh); - } - - return ret; -} - static void __flush_batch(journal_t *journal, int *batch_count) { @@ -255,81 +198,6 @@ __flush_batch(journal_t *journal, int *batch_count) } /* - * Try to flush one buffer from the checkpoint list to disk. - * - * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. Return <0 if the buffer has failed to - * be written out. - * - * Called with j_list_lock held and drops it if 1 is returned - */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - int *batch_count, transaction_t *transaction) -{ - struct buffer_head *bh = jh2bh(jh); - int ret = 0; - - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - ret = 1; - } else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - transaction->t_chp_stats.cs_forced_to_close++; - spin_unlock(&journal->j_list_lock); - if (unlikely(journal->j_flags & JBD2_UNMOUNT)) - /* - * The journal thread is dead; so starting and - * waiting for a commit to finish will cause - * us to wait for a _very_ long time. - */ - printk(KERN_ERR "JBD2: %s: " - "Waiting for Godot: block %llu\n", - journal->j_devname, - (unsigned long long) bh->b_blocknr); - jbd2_log_start_commit(journal, tid); - jbd2_log_wait_commit(journal, tid); - ret = 1; - } else if (!buffer_dirty(bh)) { - ret = 1; - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - get_bh(bh); - BUFFER_TRACE(bh, "remove from checkpoint"); - __jbd2_journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - __brelse(bh); - } else { - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal lock. - * We cannot afford to let the transaction logic start - * messing around with this buffer before we write it to - * disk, as that would break recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - journal->j_chkpt_bhs[*batch_count] = bh; - __buffer_relink_io(jh); - transaction->t_chp_stats.cs_written++; - (*batch_count)++; - if (*batch_count == JBD2_NR_BATCH) { - spin_unlock(&journal->j_list_lock); - __flush_batch(journal, batch_count); - ret = 1; - } - } - return ret; -} - -/* * Perform an actual checkpoint. We take the first transaction on the * list of transactions to be checkpointed and send all its buffers * to disk. We submit larger chunks of data at once. @@ -339,9 +207,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, */ int jbd2_log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; - int result; + struct journal_head *jh; + struct buffer_head *bh; + transaction_t *transaction; + tid_t this_tid; + int result, batch_count = 0; jbd_debug(1, "Start checkpoint\n"); @@ -374,45 +244,117 @@ restart: * done (maybe it's a new transaction, but it fell at the same * address). */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct journal_head *jh; - int retry = 0, err; - - while (!retry && transaction->t_checkpoint_list) { - jh = transaction->t_checkpoint_list; - retry = __process_buffer(journal, jh, &batch_count, - transaction); - if (retry < 0 && !result) - result = retry; - if (!retry && (need_resched() || - spin_needbreak(&journal->j_list_lock))) { - spin_unlock(&journal->j_list_lock); - retry = 1; - break; - } + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + /* checkpoint all of the transaction's buffers */ + while (transaction->t_checkpoint_list) { + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; } + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; - if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } - __flush_batch(journal, &batch_count); + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so + * starting and waiting for a commit + * to finish will cause us to wait for + * a _very_ long time. + */ + printk(KERN_ERR + "JBD2: %s: Waiting for Godot: block %llu\n", + journal->j_devname, (unsigned long long) bh->b_blocknr); + + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + if (!buffer_dirty(bh)) { + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + BUFFER_TRACE(bh, "remove from checkpoint"); + if (__jbd2_journal_remove_checkpoint(jh)) + /* The transaction was released; we're done */ + goto out; + continue; } + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal + * lock. We cannot afford to let the transaction + * logic start messing around with this buffer before + * we write it to disk, as that would break + * recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + __buffer_relink_io(jh); + transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || + need_resched() || + spin_needbreak(&journal->j_list_lock)) + goto unlock_and_flush; + } - if (retry) { + if (batch_count) { + unlock_and_flush: + spin_unlock(&journal->j_list_lock); + retry: + if (batch_count) + __flush_batch(journal, &batch_count); spin_lock(&journal->j_list_lock); goto restart; + } + + /* + * Now we issued all of the transaction's buffers, let's deal + * with the buffers that are out for I/O. + */ +restart2: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + while (transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart2; } + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one + * Now in whatever state the buffer currently is, we + * know that it has been written out and so we can + * drop it from the list */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; + if (__jbd2_journal_remove_checkpoint(jh)) + break; } out: spin_unlock(&journal->j_list_lock); @@ -478,18 +420,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Find all the written-back checkpoint buffers in the given list and * release them. * - * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns 1 if we freed the transaction, 0 otherwise. */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +static int journal_clean_one_cp_list(struct journal_head *jh) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret, freed = 0; + int ret; + int freed = 0; - *released = 0; if (!jh) return 0; @@ -498,13 +438,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) jh = next_jh; next_jh = jh->b_cpnext; ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } + if (!ret) + return freed; + if (ret == 2) + return 1; + freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -523,49 +461,49 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Find all the written-back checkpoint buffers in the journal and release them. * - * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) */ - -int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; - int released; + int ret; transaction = journal->j_checkpoint_transactions; if (!transaction) - goto out; + return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ if (need_resched()) - goto out; - if (released) + return; + if (ret) continue; /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as * we can possibly see not yet submitted buffers on io_list */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); if (need_resched()) - goto out; + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; } while (transaction != last_transaction); -out: - return ret; } /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 19d74d86d99c..e4dc74713a43 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", @@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (jbd2_journal_has_csum_v2or3(journal) && - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " - "at the same time!\n"); - goto out; - } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { /* Can't have checksum v2 and v3 at the same time! */ @@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + goto out; + } + if (!jbd2_verify_csum_type(journal, sb)) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9b329b55ffe3..bcbef08a4d8f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal, !jbd2_descr_block_csum_verify(journal, bh->b_data)) { err = -EIO; + brelse(bh); goto failed; } diff --git a/fs/namei.c b/fs/namei.c index 43927d14db67..42df664e95e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions @@ -2383,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check @@ -3064,9 +3060,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; @@ -4210,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { @@ -4347,6 +4350,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330d..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6e4bda63000..9e5bc42180e4 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index c89357c7a914..919efd4a1a23 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 3a0828d57339..2641dbad345c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -6,7 +6,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..de92c13b58be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..e60125976873 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,10 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..8f91889480d0 --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o + +overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..ea10a8719107 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,414 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/splice.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; + +} + +static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, + struct dentry *dentry, struct path *lowerpath, + struct kstat *stat, struct iattr *attr, + const char *link) +{ + struct inode *wdir = workdir->d_inode; + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry = NULL; + struct dentry *upper = NULL; + umode_t mode = stat->mode; + int err; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out1; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); + stat->mode = mode; + if (err) + goto out2; + + if (S_ISREG(stat->mode)) { + struct path upperpath; + ovl_path_upper(dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = newdentry; + + err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); + if (err) + goto out_cleanup; + } + + err = ovl_copy_xattr(lowerpath->dentry, newdentry); + if (err) + goto out_cleanup; + + mutex_lock(&newdentry->d_inode->i_mutex); + err = ovl_set_attr(newdentry, stat); + if (!err && attr) + err = notify_change(newdentry, attr, NULL); + mutex_unlock(&newdentry->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + + ovl_dentry_update(dentry, newdentry); + newdentry = NULL; + + /* + * Non-directores become opaque when copied up. + */ + if (!S_ISDIR(stat->mode)) + ovl_dentry_set_opaque(dentry, true); +out2: + dput(upper); +out1: + dput(newdentry); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr) +{ + struct dentry *workdir = ovl_workdir(dentry); + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + struct dentry *upperdentry; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(&parentpath, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_CHOWN for chown + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + err = -EIO; + if (lock_rename(workdir, upperdir) != NULL) { + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + goto out_unlock; + } + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + unlock_rename(workdir, upperdir); + err = 0; + /* Raced with another copy-up? Do the setattr here */ + if (attr) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } + goto out_put_cred; + } + + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, attr, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } +out_unlock: + unlock_rename(workdir, upperdir); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + + dput(parent); + dput(next); + } + + return err; +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..15cd91ad9940 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,921 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/cred.h> +#include "overlayfs.h" + +void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (S_ISDIR(wdentry->d_inode->i_mode)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } +} + +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +{ + struct dentry *temp; + char name[20]; + + snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir, + struct dentry *dentry) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir, dentry); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug) +{ + int err; + + if (newdentry->d_inode) + return -ESTALE; + + if (hardlink) { + err = ovl_do_link(hardlink, dir, newdentry, debug); + } else { + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, stat->mode, debug); + break; + + case S_IFDIR: + err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, + stat->mode, stat->rdev, debug); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, link, debug); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -ENOENT; + } + return err; +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); +} + +static void ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + + err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + if (err) { + pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", + upperdentry->d_name.name, err); + } +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(&realpath, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&udir->i_mutex); + return err; +} + +static int ovl_lock_rename_workdir(struct dentry *workdir, + struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + if (err) + goto out_dput; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(opaquedir); + if (err) + goto out_cleanup; + + mutex_lock(&opaquedir->d_inode->i_mutex); + err = ovl_set_attr(opaquedir, &stat); + mutex_unlock(&opaquedir->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); +out_dput: + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, + enum ovl_path_type type) +{ + int err; + struct dentry *ret = NULL; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (err) + ret = ERR_PTR(err); + else if (type == OVL_PATH_MERGE) + ret = ovl_clear_empty(dentry, &list); + + ovl_cache_free(&list); + + return ret; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_dput; + + err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + if (err) + goto out_dput2; + + if (S_ISDIR(stat->mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput2: + dput(upper); +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out_dput2; +} + +static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, + const char *link, struct dentry *hardlink) +{ + int err; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + if (!ovl_dentry_is_opaque(dentry)) { + err = ovl_create_upper(dentry, inode, &stat, link, hardlink); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_iput; + + /* + * CAP_SYS_ADMIN for setting opaque xattr + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, link, + hardlink); + + revert_creds(old_cred); + put_cred(override_cred); + } + + if (!err) + inode = NULL; +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_create_or_link(dentry, mode, rdev, link, NULL); + ovl_drop_write(dentry); + } + + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *upper; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + upper = ovl_dentry_upper(old); + err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + enum ovl_path_type type, bool is_dir) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *whiteout; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (is_dir) { + opaquedir = ovl_check_empty_and_clear(dentry, type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_unlock; + + if (type == OVL_PATH_LOWER) { + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto kill_whiteout; + + err = ovl_do_rename(wdir, whiteout, udir, upper, 0); + dput(upper); + if (err) + goto kill_whiteout; + } else { + int flags = 0; + + upper = ovl_dentry_upper(dentry); + if (opaquedir) + upper = opaquedir; + err = -ESTALE; + if (upper->d_parent != upperdir) + goto kill_whiteout; + + if (is_dir) + flags |= RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + + if (is_dir) + ovl_cleanup(wdir, upper); + } + ovl_dentry_version_inc(dentry->d_parent); +out_d_drop: + d_drop(dentry); + dput(whiteout); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out_d_drop; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper = ovl_dentry_upper(dentry); + int err; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = -ESTALE; + if (upper->d_parent == upperdir) { + /* Don't let d_delete() think it can reset d_inode */ + dget(upper); + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + dput(upper); + ovl_dentry_version_inc(dentry->d_parent); + } + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + d_drop(dentry); + mutex_unlock(&dir->i_mutex); + + return err; +} + +static inline int ovl_check_sticky(struct dentry *dentry) +{ + struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; + struct inode *inode = ovl_dentry_real(dentry)->d_inode; + + if (check_sticky(dir, inode)) + return -EPERM; + + return 0; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + enum ovl_path_type type; + int err; + + err = ovl_check_sticky(dentry); + if (err) + goto out; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + type = ovl_path_type(dentry); + if (type == OVL_PATH_PURE_UPPER) { + err = ovl_remove_upper(dentry, is_dir); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + + err = ovl_remove_and_whiteout(dentry, type, is_dir); + + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static int ovl_rename2(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool new_is_dir = false; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct cred *override_cred = NULL; + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + err = ovl_check_sticky(old); + if (err) + goto out; + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + err = -EXDEV; + if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + goto out; + + if (new->d_inode) { + err = ovl_check_sticky(new); + if (err) + goto out; + + if (S_ISDIR(new->d_inode->i_mode)) + new_is_dir = true; + + new_type = ovl_path_type(new); + err = -EXDEV; + if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + goto out; + + err = 0; + if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + goto out; + } + if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + goto out; + } + } else { + if (ovl_dentry_is_opaque(new)) + new_type = OVL_PATH_UPPER; + else + new_type = OVL_PATH_PURE_UPPER; + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } + + old_opaque = old_type != OVL_PATH_PURE_UPPER; + new_opaque = new_type != OVL_PATH_PURE_UPPER; + + if (old_opaque || new_opaque) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + } + + if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new, new_type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + if (overwrite) { + if (old_opaque) { + if (new->d_inode || !new_opaque) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && !new->d_inode && new_opaque) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + if (opaquedir) { + newdentry = opaquedir; + opaquedir = NULL; + } else { + dget(newdentry); + } + } else { + new_create = true; + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + if (!overwrite && new_is_dir && old_opaque && !new_opaque) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + + if (old_opaque || new_opaque) { + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + flags); + } else { + /* No debug for the plain case */ + BUG_ON(flags & ~RENAME_EXCHANGE); + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + NULL, flags); + } + + if (err) { + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(newdentry); + goto out_dput; + } + + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(newdentry); + + if (old_opaque != new_opaque) { + ovl_dentry_set_opaque(old, new_opaque); + if (!overwrite) + ovl_dentry_set_opaque(new, old_opaque); + } + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + if (old_opaque || new_opaque) { + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename2 = ovl_rename2, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..af2d18c9fcee --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,425 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include "overlayfs.h" + +static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, + bool no_data) +{ + int err; + struct dentry *parent; + struct kstat stat; + struct path lowerpath; + + parent = dget_parent(dentry); + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (err) + goto out_dput_parent; + + if (no_data) + stat.size = 0; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + +out_dput_parent: + dput(parent); + return err; +} + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } else { + err = ovl_copy_up_last(dentry, attr, false); + } + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(&realpath, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + alias = d_find_any_alias(inode); + if (WARN_ON(!alias)) + return -ENOENT; + + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + } + + err = __inode_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -EPERM; + if (ovl_is_private_xattr(name)) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + upperdentry = ovl_dentry_upper(dentry); + err = vfs_setxattr(upperdentry, name, value, size, flags); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + goto out_drop_write; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_removexattr(realpath.dentry, name); +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static int ovl_dentry_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + bool want_write = false; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + want_write = true; + err = ovl_want_write(dentry); + if (err) + goto out; + + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_last(dentry, NULL, true); + else + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_open(&realpath, file, cred); +out_drop_write: + if (want_write) + ovl_drop_write(dentry); +out: + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .dentry_open = ovl_dentry_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + iput(inode); + inode = NULL; + } + + return inode; + +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..814bed33dd07 --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,191 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> + +struct ovl_entry; + +enum ovl_path_type { + OVL_PATH_PURE_UPPER, + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +extern const char *ovl_opaque_xattr; + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool debug) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + if (debug) { + pr_debug("link(%pd2, %pd2) = %i\n", + old_dentry, new_dentry, err); + } + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_create(dir, dentry, mode, true); + if (debug) + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_mkdir(dir, dentry, mode); + if (debug) + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev, bool debug) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + if (debug) { + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", + dentry, mode, dev, err); + } + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname, bool debug) +{ + int err = vfs_symlink(dir, dentry, oldname); + if (debug) + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", + dentry, name, (int) size, (char *) value, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + olddentry, newdentry, flags); + + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + + if (err) { + pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct dentry *ovl_workdir(struct dentry *dentry); +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug); +void ovl_cleanup(struct inode *dir, struct dentry *dentry); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..910553f37aca --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,590 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/xattr.h> +#include <linux/rbtree.h> +#include <linux/security.h> +#include <linux/cred.h> +#include "overlayfs.h" + +struct ovl_cache_entry { + unsigned int len; + unsigned int type; + u64 ino; + bool is_whiteout; + struct list_head l_node; + struct rb_node node; + char name[]; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + bool is_merge; + struct rb_root root; + struct list_head *list; + struct list_head middle; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct ovl_cache_entry cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); + + p = kmalloc(size, GFP_KERNEL); + if (p) { + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + } + + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root.rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, &rdd->root); + + return 0; +} + +static int ovl_fill_lower(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, &rdd->middle); + } else { + p = ovl_cache_entry_new(name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, &rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + list_del(&od->cursor.l_node); + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(dentry) == cache) + ovl_set_dir_cache(dentry, NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + if (!rdd->is_merge) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + enum ovl_path_type type = ovl_path_type(dentry); + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + } + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) + od->is_real = false; +} + +static int ovl_dir_mark_whiteouts(struct dentry *dir, + struct ovl_readdir_data *rdd) +{ + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + ovl_cache_free(rdd->list); + return -ENOMEM; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(p, rdd->list, l_node) { + if (!p->name) + continue; + + if (p->type != DT_CHR) + continue; + + dentry = lookup_one_len(p->name, dir, p->len); + if (IS_ERR(dentry)) + continue; + + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + mutex_unlock(&dir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + + return 0; +} + +static inline int ovl_dir_read_merged(struct path *upperpath, + struct path *lowerpath, + struct list_head *list) +{ + int err; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .list = list, + .root = RB_ROOT, + .is_merge = false, + }; + + if (upperpath->dentry) { + err = ovl_dir_read(upperpath, &rdd); + if (err) + goto out; + + if (lowerpath->dentry) { + err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (err) + goto out; + } + } + if (lowerpath->dentry) { + /* + * Insert lowerpath entries before upperpath ones, this allows + * offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(lowerpath, &rdd); + list_del(&rdd.middle); + } +out: + return err; + +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct ovl_cache_entry *p; + loff_t off = 0; + + list_for_each_entry(p, &od->cache->entries, l_node) { + if (!p->name) + continue; + if (off >= pos) + break; + off++; + } + list_move_tail(&od->cursor.l_node, &p->l_node); +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct path lowerpath; + struct path upperpath; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(dentry); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache->refcount++; + return cache; + } + ovl_set_dir_cache(dentry, NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(dentry, cache); + + return cache; +} + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) + return iterate_dir(od->realfile, ctx); + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor.l_node.next != &od->cache->entries) { + struct ovl_cache_entry *p; + + p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); + /* Skip cursors */ + if (p->name) { + if (!p->is_whiteout) { + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + ctx->pos++; + } + list_move(&od->cursor.l_node, &p->l_node); + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file_inode(file)->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file_inode(file)->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + struct inode *inode = file_inode(file); + + realfile = od->upperfile; + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + mutex_lock(&inode->i_mutex); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; + } + mutex_unlock(&inode->i_mutex); + } + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + mutex_lock(&inode->i_mutex); + ovl_cache_put(od, file->f_path.dentry); + mutex_unlock(&inode->i_mutex); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + INIT_LIST_HEAD(&od->cursor.l_node); + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + od->is_upper = (type != OVL_PATH_LOWER); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path lowerpath; + struct path upperpath; + struct ovl_cache_entry *p; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + mutex_unlock(&upper->d_inode->i_mutex); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..08b704cebfc4 --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,796 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +#define OVERLAYFS_SUPER_MAGIC 0x794c764f + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; + struct dentry *workdir; + long lower_namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; +}; + +struct ovl_dir_cache; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct dentry *lowerdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; +}; + +const char *ovl_opaque_xattr = "trusted.overlay.opaque"; + + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry) { + if (S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + if (oe->opaque) + return OVL_PATH_UPPER; + else + return OVL_PATH_PURE_UPPER; + } + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + /* + * Make sure to order reads to upperdentry wrt ovl_dentry_update() + */ + smp_read_barrier_depends(); + return upperdentry; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = oe->lowerdentry; + *is_upper = false; + } + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + struct inode *inode = dentry->d_inode; + + if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + return false; + + res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->lowerdentry); + kfree_rcu(oe, rcu); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry) { + if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } else if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_dput; + ovl_copyattr(realdentry->d_inode, inode); + } + + oe->__upperdentry = upperdentry; + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return ERR_PTR(err); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + dput(ufs->workdir); + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +} + +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the same filesystem. + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_upper(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + return 0; +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, + .statfs = ovl_statfs, + .show_options = ovl_show_options, +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_ERR, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +#define OVL_WORKDIR_NAME "work" + +static struct dentry *ovl_workdir_create(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct dentry *work; + int err; + bool retried = false; + + err = mnt_want_write(mnt); + if (err) + return ERR_PTR(err); + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); +retry: + work = lookup_one_len(OVL_WORKDIR_NAME, dentry, + strlen(OVL_WORKDIR_NAME)); + + if (!IS_ERR(work)) { + struct kstat stat = { + .mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + retried = true; + ovl_cleanup(dir, work); + dput(work); + goto retry; + } + + err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + if (err) + goto out_dput; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(mnt); + + return work; + +out_dput: + dput(work); + work = ERR_PTR(err); + goto out_unlock; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err; + + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + err = -EINVAL; + } + return err; +} + +static bool ovl_is_allowed_fs_type(struct dentry *root) +{ + const struct dentry_operations *dop = root->d_op; + + /* + * We don't support: + * - automount filesystems + * - filesystems with revalidate (FIXME for lower layer) + * - filesystems with case insensitive names + */ + if (dop && + (dop->d_manage || dop->d_automount || + dop->d_revalidate || dop->d_weak_revalidate || + dop->d_compare || dop->d_hash)) { + return false; + } + return true; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct path workpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct kstatfs statfs; + int err; + + err = -ENOMEM; + ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out; + + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_config; + + /* FIXME: workdir is not needed for a R/O mount */ + err = -EINVAL; + if (!ufs->config.upperdir || !ufs->config.lowerdir || + !ufs->config.workdir) { + pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + goto out_free_config; + } + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_free_config; + + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_oe; + + err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); + if (err) + goto out_put_upperpath; + + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_lowerpath; + + err = -EINVAL; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || + !S_ISDIR(workpath.dentry->d_inode->i_mode)) { + pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); + goto out_put_workpath; + } + + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(upperpath.dentry)) { + pr_err("overlayfs: filesystem of upperdir is not supported\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { + pr_err("overlayfs: filesystem of lowerdir is not supported\n"); + goto out_put_workpath; + } + + err = vfs_statfs(&lowerpath, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on lowerpath\n"); + goto out_put_workpath; + } + ufs->lower_namelen = statfs.f_namelen; + + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_workpath; + } + + ufs->lower_mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(ufs->lower_mnt); + if (IS_ERR(ufs->lower_mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_upper_mnt; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_lower_mnt; + } + + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + ufs->lower_mnt->mnt_flags |= MNT_READONLY; + + /* If the upper fs is r/o, we mark overlayfs r/o too */ + if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + sb->s_flags |= MS_RDONLY; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + root_inode = ovl_new_inode(sb, S_IFDIR, oe); + if (!root_inode) + goto out_put_workdir; + + root_dentry = d_make_root(root_inode); + if (!root_dentry) + goto out_put_workdir; + + mntput(upperpath.mnt); + mntput(lowerpath.mnt); + path_put(&workpath); + + oe->__upperdentry = upperpath.dentry; + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_put_workdir: + dput(ufs->workdir); +out_put_lower_mnt: + mntput(ufs->lower_mnt); +out_put_upper_mnt: + mntput(ufs->upper_mnt); +out_put_workpath: + path_put(&workpath); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_free_config: + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlayfs"); + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index f97804bdf1ff..7461327e14e4 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -52,6 +52,7 @@ #define METHOD_NAME__CBA "_CBA" #define METHOD_NAME__CID "_CID" #define METHOD_NAME__CRS "_CRS" +#define METHOD_NAME__DDN "_DDN" #define METHOD_NAME__HID "_HID" #define METHOD_NAME__INI "_INI" #define METHOD_NAME__PLD "_PLD" diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 57ee0528aacb..f34a0835aa4f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -433,6 +433,7 @@ int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); int acpi_bus_update_power(acpi_handle handle, int *state_p); +int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); #ifdef CONFIG_PM diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9fc1d71c82bc..ab2acf629a64 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20140828 +#define ACPI_CA_VERSION 0x20140926 #include <acpi/acconfig.h> #include <acpi/actypes.h> diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index ac03ec81d342..7000e66f768e 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -721,7 +721,7 @@ typedef u32 acpi_event_type; * | | | +--- Enabled for wake? * | | +----- Set? * | +------- Has a handler? - * +----------- <Reserved> + * +------------- <Reserved> */ typedef u32 acpi_event_status; @@ -729,7 +729,7 @@ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 -#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 +#define ACPI_EVENT_FLAG_HAS_HANDLER (acpi_event_status) 0x08 /* Actions for acpi_set_gpe, acpi_gpe_wakeup, acpi_hw_low_set_gpe */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b7926bb9b444..407a12f663eb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -432,6 +432,7 @@ static inline bool acpi_driver_match_device(struct device *dev, int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); +struct platform_device *acpi_create_platform_device(struct acpi_device *); #define ACPI_PTR(_ptr) (_ptr) #else /* !CONFIG_ACPI */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 36dffeccebdb..e58fe7df8b9c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -90,7 +90,7 @@ extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; -extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall); +extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 324329ceea1e..73b45225a7ca 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -175,12 +175,13 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); -struct buffer_head *__getblk(struct block_device *bdev, sector_t block, - unsigned size); +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); +struct buffer_head *__bread_gfp(struct block_device *, + sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -295,7 +296,13 @@ static inline void bforget(struct buffer_head *bh) static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { - return __bread(sb->s_bdev, block, sb->s_blocksize); + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); +} + +static inline struct buffer_head * +sb_bread_unmovable(struct super_block *sb, sector_t block) +{ + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void @@ -307,7 +314,7 @@ sb_breadahead(struct super_block *sb, sector_t block) static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { - return __getblk(sb->s_bdev, block, sb->s_blocksize); + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * @@ -344,6 +351,36 @@ static inline void lock_buffer(struct buffer_head *bh) __lock_buffer(bh); } +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, 0); +} + +static inline struct buffer_head *__getblk(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); +} + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * The page cache is allocated from movable area so that it can be migrated. + * It returns NULL if the block was unreadable. + */ +static inline struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + return __bread_gfp(bdev, block, size, __GFP_MOVABLE); +} + extern int __set_page_dirty_buffers(struct page *page); #else /* CONFIG_BLOCK */ diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 653f0e2b6ca9..abcafaa20b86 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -287,7 +287,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -extern struct clocksource * __init __weak clocksource_default_clock(void); +extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 diff --git a/include/linux/cpufreq-dt.h b/include/linux/cpufreq-dt.h new file mode 100644 index 000000000000..0414009e2c30 --- /dev/null +++ b/include/linux/cpufreq-dt.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Marvell + * Thomas Petazzoni <thomas.petazzoni@free-electrons.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __CPUFREQ_DT_H__ +#define __CPUFREQ_DT_H__ + +struct cpufreq_dt_platform_data { + /* + * True when each CPU has its own clock to control its + * frequency, false when all CPUs are controlled by a single + * clock. + */ + bool independent_clocks; +}; + +#endif /* __CPUFREQ_DT_H__ */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 138336b6bb04..503b085b7832 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -219,6 +219,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; u8 flags; + void *driver_data; /* needed by all drivers */ int (*init) (struct cpufreq_policy *policy); @@ -312,6 +313,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); const char *cpufreq_get_current_driver(void); +void *cpufreq_get_driver_data(void); static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 72ab536ad3de..3849fce7ecfe 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,14 +14,13 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; -extern int __weak elfcorehdr_alloc(unsigned long long *addr, - unsigned long long *size); -extern void __weak elfcorehdr_free(unsigned long long addr); -extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); -extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); -extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot); +extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); +extern void elfcorehdr_free(unsigned long long addr); +extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); diff --git a/include/linux/efi.h b/include/linux/efi.h index 45cb4ffdea62..0949f9c7e872 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -92,6 +92,7 @@ typedef struct { #define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ #define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ #define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ +#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ @@ -502,6 +503,10 @@ typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); +typedef efi_status_t +efi_set_variable_nonblocking_t(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, void *data); + typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); typedef void efi_reset_system_t (int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data); @@ -821,6 +826,7 @@ extern struct efi { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_info_t *query_variable_info; efi_update_capsule_t *update_capsule; efi_query_capsule_caps_t *query_capsule_caps; @@ -886,6 +892,13 @@ extern bool efi_poweroff_required(void); (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/* + * Format an EFI memory descriptor's type and attributes to a user-provided + * character buffer, as per snprintf(), and return the buffer. + */ +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md); + /** * efi_range_is_wc - check the WC bit on an address range * @start: starting kvirt address @@ -1034,6 +1047,7 @@ struct efivar_operations { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_store_t *query_variable_store; }; @@ -1227,4 +1241,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); +efi_status_t efi_parse_options(char *cmdline); + +bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index a957d4366c24..4e41a4a331bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -223,6 +223,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_TIMES_SET (1 << 16) /* + * Whiteout is represented by a char device. The following constants define the + * mode and device number to use. + */ +#define WHITEOUT_MODE 0 +#define WHITEOUT_DEV 0 + +/* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares @@ -254,6 +261,12 @@ struct iattr { */ #include <linux/quota.h> +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1266,6 +1279,11 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; extern struct timespec current_fs_time(struct super_block *sb); @@ -1398,6 +1416,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. @@ -1528,6 +1547,9 @@ struct inode_operations { umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); + + /* WARNING: probably going away soon, do not use! */ + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -1625,6 +1647,9 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ + (inode)->i_rdev == WHITEOUT_DEV) + /* * Inode state bits. Protected by inode->i_lock * @@ -2040,6 +2065,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); @@ -2253,7 +2279,9 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { @@ -2452,6 +2480,9 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); @@ -2737,6 +2768,14 @@ static inline int is_sxid(umode_t mode) return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } +static inline int check_sticky(struct inode *dir, struct inode *inode) +{ + if (!(dir->i_mode & S_ISVTX)) + return 0; + + return __check_sticky(dir, inode); +} + static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0dae71e9971c..704b9a599b26 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1042,7 +1042,7 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -int __jbd2_journal_clean_checkpoint_list(journal_t *journal); +void __jbd2_journal_clean_checkpoint_list(journal_t *journal); int __jbd2_journal_remove_checkpoint(struct journal_head *); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 40728cf1c452..3d770f5564b8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -403,6 +403,7 @@ int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6b06d378f3df..e465bb15912d 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -283,7 +283,7 @@ struct kgdb_io { extern struct kgdb_arch arch_kgdb_ops; -extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); +extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 28be31f49250..ea53b04993f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1080,6 +1080,7 @@ void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); +void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; diff --git a/include/linux/leds.h b/include/linux/leds.h index e43686472197..a57611d0c94e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -13,8 +13,8 @@ #define __LINUX_LEDS_H_INCLUDED #include <linux/list.h> -#include <linux/spinlock.h> #include <linux/rwsem.h> +#include <linux/spinlock.h> #include <linux/timer.h> #include <linux/workqueue.h> @@ -31,8 +31,8 @@ enum led_brightness { struct led_classdev { const char *name; - int brightness; - int max_brightness; + enum led_brightness brightness; + enum led_brightness max_brightness; int flags; /* Lower 16 bits reflect status */ @@ -140,6 +140,16 @@ extern void led_blink_set_oneshot(struct led_classdev *led_cdev, */ extern void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness brightness); +/** + * led_update_brightness - update LED brightness + * @led_cdev: the LED to query + * + * Get an LED's current brightness and update led_cdev->brightness + * member with the obtained value. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_update_brightness(struct led_classdev *led_cdev); /* * LED Triggers diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h new file mode 100644 index 000000000000..307d9cab2026 --- /dev/null +++ b/include/linux/mailbox_client.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar <jassisinghbrar@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CLIENT_H +#define __MAILBOX_CLIENT_H + +#include <linux/of.h> +#include <linux/device.h> + +struct mbox_chan; + +/** + * struct mbox_client - User of a mailbox + * @dev: The client device + * @tx_block: If the mbox_send_message should block until data is + * transmitted. + * @tx_tout: Max block period in ms before TX is assumed failure + * @knows_txdone: If the client could run the TX state machine. Usually + * if the client receives some ACK packet for transmission. + * Unused if the controller already has TX_Done/RTR IRQ. + * @rx_callback: Atomic callback to provide client the data received + * @tx_done: Atomic callback to tell client of data transmission + */ +struct mbox_client { + struct device *dev; + bool tx_block; + unsigned long tx_tout; + bool knows_txdone; + + void (*rx_callback)(struct mbox_client *cl, void *mssg); + void (*tx_done)(struct mbox_client *cl, void *mssg, int r); +}; + +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); +int mbox_send_message(struct mbox_chan *chan, void *mssg); +void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ +bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ + +#endif /* __MAILBOX_CLIENT_H */ diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h new file mode 100644 index 000000000000..d4cf96f07cfc --- /dev/null +++ b/include/linux/mailbox_controller.h @@ -0,0 +1,133 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CONTROLLER_H +#define __MAILBOX_CONTROLLER_H + +#include <linux/of.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/device.h> +#include <linux/completion.h> + +struct mbox_chan; + +/** + * struct mbox_chan_ops - methods to control mailbox channels + * @send_data: The API asks the MBOX controller driver, in atomic + * context try to transmit a message on the bus. Returns 0 if + * data is accepted for transmission, -EBUSY while rejecting + * if the remote hasn't yet read the last data sent. Actual + * transmission of data is reported by the controller via + * mbox_chan_txdone (if it has some TX ACK irq). It must not + * sleep. + * @startup: Called when a client requests the chan. The controller + * could ask clients for additional parameters of communication + * to be provided via client's chan_data. This call may + * block. After this call the Controller must forward any + * data received on the chan by calling mbox_chan_received_data. + * The controller may do stuff that need to sleep. + * @shutdown: Called when a client relinquishes control of a chan. + * This call may block too. The controller must not forward + * any received data anymore. + * The controller may do stuff that need to sleep. + * @last_tx_done: If the controller sets 'txdone_poll', the API calls + * this to poll status of last TX. The controller must + * give priority to IRQ method over polling and never + * set both txdone_poll and txdone_irq. Only in polling + * mode 'send_data' is expected to return -EBUSY. + * The controller may do stuff that need to sleep/block. + * Used only if txdone_poll:=true && txdone_irq:=false + * @peek_data: Atomic check for any received data. Return true if controller + * has some data to push to the client. False otherwise. + */ +struct mbox_chan_ops { + int (*send_data)(struct mbox_chan *chan, void *data); + int (*startup)(struct mbox_chan *chan); + void (*shutdown)(struct mbox_chan *chan); + bool (*last_tx_done)(struct mbox_chan *chan); + bool (*peek_data)(struct mbox_chan *chan); +}; + +/** + * struct mbox_controller - Controller of a class of communication channels + * @dev: Device backing this controller + * @ops: Operators that work on each communication chan + * @chans: Array of channels + * @num_chans: Number of channels in the 'chans' array. + * @txdone_irq: Indicates if the controller can report to API when + * the last transmitted data was read by the remote. + * Eg, if it has some TX ACK irq. + * @txdone_poll: If the controller can read but not report the TX + * done. Ex, some register shows the TX status but + * no interrupt rises. Ignored if 'txdone_irq' is set. + * @txpoll_period: If 'txdone_poll' is in effect, the API polls for + * last TX's status after these many millisecs + * @of_xlate: Controller driver specific mapping of channel via DT + * @poll: API private. Used to poll for TXDONE on all channels. + * @node: API private. To hook into list of controllers. + */ +struct mbox_controller { + struct device *dev; + struct mbox_chan_ops *ops; + struct mbox_chan *chans; + int num_chans; + bool txdone_irq; + bool txdone_poll; + unsigned txpoll_period; + struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, + const struct of_phandle_args *sp); + /* Internal to API */ + struct timer_list poll; + struct list_head node; +}; + +/* + * The length of circular buffer for queuing messages from a client. + * 'msg_count' tracks the number of buffered messages while 'msg_free' + * is the index where the next message would be buffered. + * We shouldn't need it too big because every transfer is interrupt + * triggered and if we have lots of data to transfer, the interrupt + * latencies are going to be the bottleneck, not the buffer length. + * Besides, mbox_send_message could be called from atomic context and + * the client could also queue another message from the notifier 'tx_done' + * of the last transfer done. + * REVISIT: If too many platforms see the "Try increasing MBOX_TX_QUEUE_LEN" + * print, it needs to be taken from config option or somesuch. + */ +#define MBOX_TX_QUEUE_LEN 20 + +/** + * struct mbox_chan - s/w representation of a communication chan + * @mbox: Pointer to the parent/provider of this channel + * @txdone_method: Way to detect TXDone chosen by the API + * @cl: Pointer to the current owner of this channel + * @tx_complete: Transmission completion + * @active_req: Currently active request hook + * @msg_count: No. of mssg currently queued + * @msg_free: Index of next available mssg slot + * @msg_data: Hook for data packet + * @lock: Serialise access to the channel + * @con_priv: Hook for controller driver to attach private data + */ +struct mbox_chan { + struct mbox_controller *mbox; + unsigned txdone_method; + struct mbox_client *cl; + struct completion tx_complete; + void *active_req; + unsigned msg_count, msg_free; + void *msg_data[MBOX_TX_QUEUE_LEN]; + spinlock_t lock; /* Serialise access to the channel */ + void *con_priv; +}; + +int mbox_controller_register(struct mbox_controller *mbox); /* can sleep */ +void mbox_controller_unregister(struct mbox_controller *mbox); /* can sleep */ +void mbox_chan_received_data(struct mbox_chan *chan, void *data); /* atomic */ +void mbox_chan_txdone(struct mbox_chan *chan, int r); /* atomic */ + +#endif /* __MAILBOX_CONTROLLER_H */ diff --git a/include/linux/memory.h b/include/linux/memory.h index bb7384e3c3d8..8b8d8d12348e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -35,7 +35,7 @@ struct memory_block { }; int arch_get_memory_phys_device(unsigned long start_pfn); -unsigned long __weak memory_block_size_bytes(void); +unsigned long memory_block_size_bytes(void); /* These states are exposed to userspace as text strings in sysfs */ #define MEM_ONLINE (1<<0) /* exposed to userspace */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 02d11ee7f19d..27eb1bfbe704 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1176,6 +1176,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/include/linux/mount.h b/include/linux/mount.h index 9262e4bf0cc3..c2c561dc0114 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/include/linux/mailbox.h b/include/linux/pl320-ipc.h index 5161f63ec1c8..5161f63ec1c8 100644 --- a/include/linux/mailbox.h +++ b/include/linux/pl320-ipc.h diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h index fe25876c1a5d..17d7d0d20eca 100644 --- a/include/linux/pnfs_osd_xdr.h +++ b/include/linux/pnfs_osd_xdr.h @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/linux/string.h b/include/linux/string.h index e6edfe51575a..2e22a2e58f3a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available); + const void *from, size_t available); /** * strstarts - does @str start with @prefix? @@ -144,7 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } -extern size_t memweight(const void *ptr, size_t bytes); +size_t memweight(const void *ptr, size_t bytes); +void memzero_explicit(void *s, size_t count); /** * kbasename - return the last part of a pathname. diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0305cde21a74..ef90838b36a0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,6 +44,10 @@ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) +#define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) +#define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) +#define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) +#define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 1ad4724458de..baa81718d985 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -63,7 +63,17 @@ struct uio_port { #define MAX_UIO_PORT_REGIONS 5 -struct uio_device; +struct uio_device { + struct module *owner; + struct device *dev; + int minor; + atomic_t event; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + struct uio_info *info; + struct kobject *map_dir; + struct kobject *portio_dir; +}; /** * struct uio_info - UIO device capabilities diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f844c6b03ee..60beb5dc7977 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -98,11 +98,11 @@ struct uprobes_state { struct xol_area *xol_area; }; -extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); -extern bool __weak is_trap_insn(uprobe_opcode_t *insn); -extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern bool is_swbp_insn(uprobe_opcode_t *insn); +extern bool is_trap_insn(uprobe_opcode_t *insn); +extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); @@ -128,8 +128,8 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); -extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, +extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 2a3038ee17a3..395b70e0eccf 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -97,13 +97,8 @@ struct watchdog_device { #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ }; -#ifdef CONFIG_WATCHDOG_NOWAYOUT -#define WATCHDOG_NOWAYOUT 1 -#define WATCHDOG_NOWAYOUT_INIT_STATUS (1 << WDOG_NO_WAY_OUT) -#else -#define WATCHDOG_NOWAYOUT 0 -#define WATCHDOG_NOWAYOUT_INIT_STATUS 0 -#endif +#define WATCHDOG_NOWAYOUT IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT) +#define WATCHDOG_NOWAYOUT_INIT_STATUS (WATCHDOG_NOWAYOUT << WDOG_NO_WAY_OUT) /* Use the following function to check whether or not the watchdog is active */ static inline bool watchdog_active(struct watchdog_device *wdd) diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index b2e85fdd2ae0..a09cca829082 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 6ca3265a4dca..7a8d2cd30328 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Public Declarations of the ORE API * diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h index a2594afe05c7..e0ca835e7bf7 100644 --- a/include/scsi/osd_protocol.h +++ b/include/scsi/osd_protocol.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify @@ -496,7 +496,7 @@ struct osd_timestamp { */ struct osd_key_identifier { - u8 id[7]; /* if you know why 7 please email bharrosh@panasas.com */ + u8 id[7]; /* if you know why 7 please email ooo@electrozaur.com */ } __packed; /* for osd_capability.format */ diff --git a/include/scsi/osd_sec.h b/include/scsi/osd_sec.h index f96151c9c9e8..7abeb0f0db30 100644 --- a/include/scsi/osd_sec.h +++ b/include/scsi/osd_sec.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_sense.h b/include/scsi/osd_sense.h index 91db543a5502..d52aa93a0b2d 100644 --- a/include/scsi/osd_sense.h +++ b/include/scsi/osd_sense.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_types.h b/include/scsi/osd_types.h index bd0be7ed4bcf..48e8a165e136 100644 --- a/include/scsi/osd_types.h +++ b/include/scsi/osd_types.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * Benny Halevy <bhalevy@panasas.com> * * This program is free software; you can redistribute it and/or modify diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 9ec9864ecf38..23c518a0340c 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -108,6 +108,8 @@ #define DA_EMULATE_ALUA 0 /* Enforce SCSI Initiator Port TransportID with 'ISID' for PR */ #define DA_ENFORCE_PR_ISIDS 1 +/* Force SPC-3 PR Activate Persistence across Target Power Loss */ +#define DA_FORCE_PR_APTPL 0 #define DA_STATUS_MAX_SECTORS_MIN 16 #define DA_STATUS_MAX_SECTORS_MAX 8192 /* By default don't report non-rotating (solid state) medium */ @@ -680,6 +682,7 @@ struct se_dev_attrib { enum target_prot_type pi_prot_type; enum target_prot_type hw_pi_prot_type; int enforce_pr_isids; + int force_pr_aptpl; int is_nonrot; int emulate_rest_reord; u32 hw_block_size; @@ -903,4 +906,18 @@ struct se_wwn { struct config_group fabric_stat_group; }; +static inline void atomic_inc_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_inc(v); + smp_mb__after_atomic(); +} + +static inline void atomic_dec_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_dec(v); + smp_mb__after_atomic(); +} + #endif /* TARGET_CORE_BASE_H */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d4f70a7fe876..ff4bd1b35246 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2369,7 +2369,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, show_extent_status(__entry->found ? __entry->status : 0)) ); -TRACE_EVENT(ext4_es_shrink_enter, +DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), @@ -2391,26 +2391,38 @@ TRACE_EVENT(ext4_es_shrink_enter, __entry->nr_to_scan, __entry->cache_cnt) ); -TRACE_EVENT(ext4_es_shrink_exit, - TP_PROTO(struct super_block *sb, int shrunk_nr, int cache_cnt), +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), - TP_ARGS(sb, shrunk_nr, cache_cnt), + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), + + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +TRACE_EVENT(ext4_es_shrink_scan_exit, + TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), + + TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) - __field( int, shrunk_nr ) + __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->shrunk_nr = shrunk_nr; + __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), - TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d", + TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->shrunk_nr, __entry->cache_cnt) + __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, @@ -2438,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range, __entry->offset, __entry->len) ); +TRACE_EVENT(ext4_es_shrink, + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, + int skip_precached, int nr_skipped, int retried), + + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_shrunk ) + __field( unsigned long long, scan_time ) + __field( int, skip_precached ) + __field( int, nr_skipped ) + __field( int, retried ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nr_shrunk = nr_shrunk; + __entry->scan_time = div_u64(scan_time, 1000); + __entry->skip_precached = skip_precached; + __entry->nr_skipped = nr_skipped; + __entry->retried = retried; + ), + + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " + "nr_skipped %d retried %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, + __entry->scan_time, __entry->skip_precached, + __entry->nr_skipped, __entry->retried) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h new file mode 100644 index 000000000000..0f4f95d63c03 --- /dev/null +++ b/include/trace/events/thermal.h @@ -0,0 +1,83 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM thermal + +#if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_THERMAL_H + +#include <linux/thermal.h> +#include <linux/tracepoint.h> + +TRACE_EVENT(thermal_temperature, + + TP_PROTO(struct thermal_zone_device *tz), + + TP_ARGS(tz), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, temp_prev) + __field(int, temp) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->temp_prev = tz->last_temperature; + __entry->temp = tz->temperature; + ), + + TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d", + __get_str(thermal_zone), __entry->id, __entry->temp_prev, + __entry->temp) +); + +TRACE_EVENT(cdev_update, + + TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target), + + TP_ARGS(cdev, target), + + TP_STRUCT__entry( + __string(type, cdev->type) + __field(unsigned long, target) + ), + + TP_fast_assign( + __assign_str(type, cdev->type); + __entry->target = target; + ), + + TP_printk("type=%s target=%lu", __get_str(type), __entry->target) +); + +TRACE_EVENT(thermal_zone_trip, + + TP_PROTO(struct thermal_zone_device *tz, int trip, + enum thermal_trip_type trip_type), + + TP_ARGS(tz, trip, trip_type), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, trip) + __field(enum thermal_trip_type, trip_type) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->trip = trip; + __entry->trip_type = trip_type; + ), + + TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%d", + __get_str(thermal_zone), __entry->id, __entry->trip, + __entry->trip_type) +); + +#endif /* _TRACE_THERMAL_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6cad97485bad..b70237e8bc37 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -374,6 +374,7 @@ header-y += swab.h header-y += synclink.h header-y += sysctl.h header-y += sysinfo.h +header-y += target_core_user.h header-y += taskstats.h header-y += tcp.h header-y += tcp_metrics.h diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ca1a11bb4443..3735fa0a6784 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -37,6 +37,7 @@ #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ struct fstrim_range { __u64 start; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h @@ -0,0 +1,142 @@ +#ifndef __TARGET_CORE_USER_H +#define __TARGET_CORE_USER_H + +/* This header will be used by application too */ + +#include <linux/types.h> +#include <linux/uio.h> + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define TCMU_VERSION "1.0" + +/* + * Ring Design + * ----------- + * + * The mmaped area is divided into three parts: + * 1) The mailbox (struct tcmu_mailbox, below) + * 2) The command ring + * 3) Everything beyond the command ring (data) + * + * The mailbox tells userspace the offset of the command ring from the + * start of the shared memory region, and how big the command ring is. + * + * The kernel passes SCSI commands to userspace by putting a struct + * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking + * userspace via uio's interrupt mechanism. + * + * tcmu_cmd_entry contains a header. If the header type is PAD, + * userspace should skip hdr->length bytes (mod cmdr_size) to find the + * next cmd_entry. + * + * Otherwise, the entry will contain offsets into the mmaped area that + * contain the cdb and data buffers -- the latter accessible via the + * iov array. iov addresses are also offsets into the shared area. + * + * When userspace is completed handling the command, set + * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, + * and also set mailbox->cmd_tail equal to the old cmd_tail plus + * hdr->length, mod cmdr_size. If cmd_tail doesn't equal cmd_head, it + * should process the next packet the same way, and so on. + */ + +#define TCMU_MAILBOX_VERSION 1 +#define ALIGN_SIZE 64 /* Should be enough for most CPUs */ + +struct tcmu_mailbox { + __u16 version; + __u16 flags; + __u32 cmdr_off; + __u32 cmdr_size; + + __u32 cmd_head; + + /* Updated by user. On its own cacheline */ + __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); + +} __packed; + +enum tcmu_opcode { + TCMU_OP_PAD = 0, + TCMU_OP_CMD, +}; + +/* + * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode. + */ +struct tcmu_cmd_entry_hdr { + __u32 len_op; +} __packed; + +#define TCMU_OP_MASK 0x7 + +static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +{ + hdr->len_op &= ~TCMU_OP_MASK; + hdr->len_op |= (op & TCMU_OP_MASK); +} + +static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & ~TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +{ + hdr->len_op &= TCMU_OP_MASK; + hdr->len_op |= len; +} + +/* Currently the same as SCSI_SENSE_BUFFERSIZE */ +#define TCMU_SENSE_BUFFERSIZE 96 + +struct tcmu_cmd_entry { + struct tcmu_cmd_entry_hdr hdr; + + uint16_t cmd_id; + uint16_t __pad1; + + union { + struct { + uint64_t cdb_off; + uint64_t iov_cnt; + struct iovec iov[0]; + } req; + struct { + uint8_t scsi_status; + uint8_t __pad1; + uint16_t __pad2; + uint32_t __pad3; + char sense_buffer[TCMU_SENSE_BUFFERSIZE]; + } rsp; + }; + +} __packed; + +#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) + +enum tcmu_genl_cmd { + TCMU_CMD_UNSPEC, + TCMU_CMD_ADDED_DEVICE, + TCMU_CMD_REMOVED_DEVICE, + __TCMU_CMD_MAX, +}; +#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) + +enum tcmu_genl_attr { + TCMU_ATTR_UNSPEC, + TCMU_ATTR_DEVICE, + TCMU_ATTR_MINOR, + __TCMU_ATTR_MAX, +}; +#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) + +#endif diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + bool ret; + + read_lock(&tasklist_lock); + ret = __check_frozen_processes(); + read_unlock(&tasklist_lock); + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,11 +157,25 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + } else { + printk("done."); + } } printk("\n"); BUG_ON(in_atomic()); @@ -191,11 +230,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -218,10 +257,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); diff --git a/lib/cmdline.c b/lib/cmdline.c index 76a712e6e20e..8f13cf73c2ec 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -160,3 +160,32 @@ unsigned long long memparse(const char *ptr, char **retptr) return ret; } EXPORT_SYMBOL(memparse); + +/** + * parse_option_str - Parse a string and check an option is set or not + * @str: String to be parsed + * @option: option name + * + * This function parses a string containing a comma-separated list of + * strings like a=b,c. + * + * Return true if there's such option in the string, or return false. + */ +bool parse_option_str(const char *str, const char *option) +{ + while (*str) { + if (!strncmp(str, option, strlen(option))) { + str += strlen(option); + if (!*str || *str == ',') + return true; + } + + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return false; +} diff --git a/lib/string.c b/lib/string.c index 2fc20aa06f84..10063300b830 100644 --- a/lib/string.c +++ b/lib/string.c @@ -598,6 +598,22 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + OPTIMIZER_HIDE_VAR(s); +} +EXPORT_SYMBOL(memzero_explicit); + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2252,6 +2252,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, } /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. diff --git a/mm/shmem.c b/mm/shmem.c index cd6fc7590e54..185836ba53ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2345,6 +2345,32 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru return 0; } +static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) +{ + struct dentry *whiteout; + int error; + + whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); + if (!whiteout) + return -ENOMEM; + + error = shmem_mknod(old_dir, whiteout, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); + dput(whiteout); + if (error) + return error; + + /* + * Cheat and hash the whiteout while the old dentry is still in + * place, instead of playing games with FS_RENAME_DOES_D_MOVE. + * + * d_lookup() will consistently find one of them at this point, + * not sure which one, but that isn't even important. + */ + d_rehash(whiteout); + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if @@ -2356,7 +2382,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) @@ -2365,6 +2391,14 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (!simple_empty(new_dentry)) return -ENOTEMPTY; + if (flags & RENAME_WHITEOUT) { + int error; + + error = shmem_whiteout(old_dir, old_dentry); + if (error) + return error; + } + if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + +/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index bfe1cf6b492f..166d59cdc86b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -781,16 +781,15 @@ static int snd_pcm_action_group(struct action_ops *ops, { struct snd_pcm_substream *s = NULL; struct snd_pcm_substream *s1; - int res = 0; + int res = 0, depth = 1; snd_pcm_group_for_each_entry(s, substream) { if (do_lock && s != substream) { if (s->pcm->nonatomic) - mutex_lock_nested(&s->self_group.mutex, - SINGLE_DEPTH_NESTING); + mutex_lock_nested(&s->self_group.mutex, depth); else - spin_lock_nested(&s->self_group.lock, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&s->self_group.lock, depth); + depth++; } res = ops->pre_action(s, state); if (res < 0) @@ -906,8 +905,7 @@ static int snd_pcm_action_lock_mutex(struct action_ops *ops, down_read(&snd_pcm_link_rwsem); if (snd_pcm_stream_linked(substream)) { mutex_lock(&substream->group->mutex); - mutex_lock_nested(&substream->self_group.mutex, - SINGLE_DEPTH_NESTING); + mutex_lock(&substream->self_group.mutex); res = snd_pcm_action_group(ops, substream, state, 1); mutex_unlock(&substream->self_group.mutex); mutex_unlock(&substream->group->mutex); @@ -3311,7 +3309,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { #ifndef ARCH_HAS_DMA_MMAP_COHERENT /* This should be defined / handled globally! */ -#ifdef CONFIG_ARM +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #define ARCH_HAS_DMA_MMAP_COHERENT #endif #endif diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h index 7eb44e78e141..62658f2f8c9f 100644 --- a/sound/pci/hda/hda_local.h +++ b/sound/pci/hda/hda_local.h @@ -419,7 +419,7 @@ struct snd_hda_pin_quirk { .subvendor = _subvendor,\ .name = _name,\ .value = _value,\ - .pins = (const struct hda_pintbl[]) { _pins } \ + .pins = (const struct hda_pintbl[]) { _pins, {0, 0}} \ } #else @@ -427,7 +427,7 @@ struct snd_hda_pin_quirk { { .codec = _codec,\ .subvendor = _subvendor,\ .value = _value,\ - .pins = (const struct hda_pintbl[]) { _pins } \ + .pins = (const struct hda_pintbl[]) { _pins, {0, 0}} \ } #endif diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 39862e98551c..9dc9cf8c90e9 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1583,19 +1583,22 @@ static bool hdmi_present_sense(struct hdmi_spec_per_pin *per_pin, int repoll) } } - if (pin_eld->eld_valid && !eld->eld_valid) { - update_eld = true; + if (pin_eld->eld_valid != eld->eld_valid) eld_changed = true; - } + + if (pin_eld->eld_valid && !eld->eld_valid) + update_eld = true; + if (update_eld) { bool old_eld_valid = pin_eld->eld_valid; pin_eld->eld_valid = eld->eld_valid; - eld_changed = pin_eld->eld_size != eld->eld_size || + if (pin_eld->eld_size != eld->eld_size || memcmp(pin_eld->eld_buffer, eld->eld_buffer, - eld->eld_size) != 0; - if (eld_changed) + eld->eld_size) != 0) { memcpy(pin_eld->eld_buffer, eld->eld_buffer, eld->eld_size); + eld_changed = true; + } pin_eld->eld_size = eld->eld_size; pin_eld->info = eld->info; diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bc86c36b4bfa..34b7bdb510c7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2884,6 +2884,9 @@ static void alc283_shutup(struct hda_codec *codec) alc_write_coef_idx(codec, 0x43, 0x9004); + /*depop hp during suspend*/ + alc_write_coef_idx(codec, 0x06, 0x2100); + snd_hda_codec_write(codec, hp_pin, 0, AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); @@ -5610,9 +5613,9 @@ static void alc662_led_gpio1_mute_hook(void *private_data, int enabled) unsigned int oldval = spec->gpio_led; if (enabled) - spec->gpio_led &= ~0x01; - else spec->gpio_led |= 0x01; + else + spec->gpio_led &= ~0x01; if (spec->gpio_led != oldval) snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DATA, spec->gpio_led); diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 223c47b33ba3..c657752a420c 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -385,6 +385,36 @@ YAMAHA_DEVICE(0x105d, NULL), } }, { + USB_DEVICE(0x0499, 0x1509), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { + /* .vendor_name = "Yamaha", */ + /* .product_name = "Steinberg UR22", */ + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = (const struct snd_usb_audio_quirk[]) { + { + .ifnum = 1, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 2, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 3, + .type = QUIRK_MIDI_YAMAHA + }, + { + .ifnum = 4, + .type = QUIRK_IGNORE_INTERFACE + }, + { + .ifnum = -1 + } + } + } +}, +{ USB_DEVICE(0x0499, 0x150a), .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { /* .vendor_name = "Yamaha", */ diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c index 60b58cd18410..7ccb073f8316 100644 --- a/tools/power/acpi/os_specific/service_layers/osunixxf.c +++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c @@ -122,6 +122,14 @@ static void os_enter_line_edit_mode(void) { struct termios local_term_attributes; + term_attributes_were_set = 0; + + /* STDIN must be a terminal */ + + if (!isatty(STDIN_FILENO)) { + return; + } + /* Get and keep the original attributes */ if (tcgetattr(STDIN_FILENO, &original_term_attributes)) { diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index 53cee781e24e..24d32968802d 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -146,7 +146,7 @@ u32 ap_get_table_length(struct acpi_table_header *table) if (ACPI_VALIDATE_RSDP_SIG(table->signature)) { rsdp = ACPI_CAST_PTR(struct acpi_table_rsdp, table); - return (rsdp->length); + return (acpi_tb_get_rsdp_length(rsdp)); } /* Normal ACPI table */ diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e51d9f9b995f..c1e6ae989a43 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -43,13 +43,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long size) + unsigned long npages) { gfn_t end_gfn; pfn_t pfn; pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + (size >> PAGE_SHIFT); + end_gfn = gfn + npages; gfn += 1; if (is_error_noslot_pfn(pfn)) @@ -119,7 +119,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. */ - pfn = kvm_pin_pages(slot, gfn, page_size); + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); if (is_error_noslot_pfn(pfn)) { gfn += 1; continue; @@ -131,7 +131,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size); + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); goto unmap_pages; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 384eaa7b02fa..25ffac9e947d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2354,6 +2354,12 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) return 0; } +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -3328,5 +3334,6 @@ void kvm_exit(void) kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 281e7cf2b8e5..620e37f741b8 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -283,3 +283,8 @@ int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } + +void kvm_vfio_ops_exit(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); +} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h index 92eac75d6b62..ab88c7dc0514 100644 --- a/virt/kvm/vfio.h +++ b/virt/kvm/vfio.h @@ -3,11 +3,15 @@ #ifdef CONFIG_KVM_VFIO int kvm_vfio_ops_init(void); +void kvm_vfio_ops_exit(void); #else static inline int kvm_vfio_ops_init(void) { return 0; } +static inline void kvm_vfio_ops_exit(void) +{ +} #endif #endif |