summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory14
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu14
-rw-r--r--Documentation/device-mapper/snapshot.txt60
-rw-r--r--Documentation/feature-removal-schedule.txt16
-rw-r--r--Documentation/filesystems/proc.txt9
-rw-r--r--Documentation/hwmon/lis3lv02d55
-rw-r--r--Documentation/hwmon/w83627ehf10
-rw-r--r--Documentation/i2c/writing-clients2
-rw-r--r--Documentation/md.txt72
-rw-r--r--Documentation/memory-hotplug.txt11
-rw-r--r--Documentation/misc-devices/ad525x_dpot.txt57
-rw-r--r--Documentation/nommu-mmap.txt26
-rw-r--r--Documentation/spinlocks.txt184
-rw-r--r--Documentation/sysctl/kernel.txt31
-rw-r--r--Documentation/vm/hugetlbpage.txt262
-rw-r--r--Documentation/vm/ksm.txt22
-rw-r--r--Documentation/vm/page-types.c68
17 files changed, 638 insertions, 275 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 9fe91c02ee40..bf1627b02a03 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -60,6 +60,19 @@ Description:
Users: hotplug memory remove tools
https://w3.opensource.ibm.com/projects/powerpc-utils/
+
+What: /sys/devices/system/memoryX/nodeY
+Date: October 2009
+Contact: Linux Memory Management list <linux-mm@kvack.org>
+Description:
+ When CONFIG_NUMA is enabled, a symbolic link that
+ points to the corresponding NUMA node directory.
+
+ For example, the following symbolic link is created for
+ memory section 9 on node0:
+ /sys/devices/system/memory/memory9/node0 -> ../../node/node0
+
+
What: /sys/devices/system/node/nodeX/memoryY
Date: September 2008
Contact: Gary Hade <garyhade@us.ibm.com>
@@ -70,4 +83,3 @@ Description:
memory section directory. For example, the following symbolic
link is created for memory section 9 on node0.
/sys/devices/system/node/node0/memory9 -> ../../memory/memory9
-
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2aae06fcbed7..84a710f87c64 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -92,6 +92,20 @@ Description: Discover NUMA node a CPU belongs to
/sys/devices/system/cpu/cpu42/node2 -> ../../node/node2
+What: /sys/devices/system/cpu/cpu#/node
+Date: October 2009
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Discover NUMA node a CPU belongs to
+
+ When CONFIG_NUMA is enabled, a symbolic link that points
+ to the corresponding NUMA node directory.
+
+ For example, the following symlink is created for cpu42
+ in NUMA node 2:
+
+ /sys/devices/system/cpu/cpu42/node2 -> ../../node/node2
+
+
What: /sys/devices/system/cpu/cpu#/topology/core_id
/sys/devices/system/cpu/cpu#/topology/core_siblings
/sys/devices/system/cpu/cpu#/topology/core_siblings_list
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
index a5009c8300f3..e3a77b215135 100644
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
*) To create device "forks", i.e. multiple different versions of the
same data stream.
+*) To merge a snapshot of a block device back into the snapshot's origin
+device.
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
-In both cases, dm copies only the chunks of data that get changed and
-uses a separate copy-on-write (COW) block device for storage.
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
-There are two dm targets available: snapshot and snapshot-origin.
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
*) snapshot-origin <origin>
@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.
-How this is used by LVM2
-========================
+* snapshot-merge <origin> <COW device> <persistent> <chunksize>
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots. This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>. Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it. Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged. Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:
1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging. As such the "snapshot-origin" is replaced with
+"snapshot-merge". The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes. The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command:
+
+lvconvert --merge volumeGroup/snap
+
+we'll now have this situation:
+
+# dmsetup table|grep volumeGroup
+
+volumeGroup-base-real: 0 2097152 linear 8:19 384
+volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+# ls -lL /dev/mapper/volumeGroup-*
+brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 2a4d77946c7d..21ab9357326d 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -291,22 +291,6 @@ Who: Michael Buesch <mb@bu3sch.de>
---------------------------
-What: usedac i386 kernel parameter
-When: 2.6.27
-Why: replaced by allowdac and no dac combination
-Who: Glauber Costa <gcosta@redhat.com>
-
----------------------------
-
-What: print_fn_descriptor_symbol()
-When: October 2009
-Why: The %pF vsprintf format provides the same functionality in a
- simpler way. print_fn_descriptor_symbol() is deprecated but
- still present to give out-of-tree modules time to change.
-Who: Bjorn Helgaas <bjorn.helgaas@hp.com>
-
----------------------------
-
What: /sys/o2cb symlink
When: January 2010
Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 94b9f2056f4c..220cc6376ef8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
3.3 /proc/<pid>/io - Display the IO accounting fields
3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
3.5 /proc/<pid>/mountinfo - Information about mounts
+ 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
------------------------------------------------------------------------------
@@ -1409,3 +1410,11 @@ For more information on mount propagation see:
Documentation/filesystems/sharedsubtree.txt
+
+3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
+--------------------------------------------------------
+These files provide a method to access a tasks comm value. It also allows for
+a task to set its own or one of its thread siblings comm value. The comm value
+is limited in size compared to the cmdline value, so writing anything longer
+then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
+comm value.
diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d
index effe949a7282..06534f25e643 100644
--- a/Documentation/hwmon/lis3lv02d
+++ b/Documentation/hwmon/lis3lv02d
@@ -3,7 +3,8 @@ Kernel driver lis3lv02d
Supported chips:
- * STMicroelectronics LIS3LV02DL and LIS3LV02DQ
+ * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision)
+ * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits)
Authors:
Yan Burman <burman.yan@gmail.com>
@@ -13,32 +14,52 @@ Authors:
Description
-----------
-This driver provides support for the accelerometer found in various HP
-laptops sporting the feature officially called "HP Mobile Data
-Protection System 3D" or "HP 3D DriveGuard". It detects automatically
-laptops with this sensor. Known models (for now the HP 2133, nc6420,
-nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis
-automatically oriented on standard way (eg: you can directly play
-neverball). The accelerometer data is readable via
-/sys/devices/platform/lis3lv02d.
+This driver provides support for the accelerometer found in various HP laptops
+sporting the feature officially called "HP Mobile Data Protection System 3D" or
+"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known
+models (full list can be found in drivers/hwmon/hp_accel.c) will have their
+axis automatically oriented on standard way (eg: you can directly play
+neverball). The accelerometer data is readable via
+/sys/devices/platform/lis3lv02d. Reported values are scaled
+to mg values (1/1000th of earth gravity).
Sysfs attributes under /sys/devices/platform/lis3lv02d/:
position - 3D position that the accelerometer reports. Format: "(x,y,z)"
-calibrate - read: values (x, y, z) that are used as the base for input
- class device operation.
- write: forces the base to be recalibrated with the current
- position.
-rate - reports the sampling rate of the accelerometer device in HZ
+rate - read reports the sampling rate of the accelerometer device in HZ.
+ write changes sampling rate of the accelerometer device.
+ Only values which are supported by HW are accepted.
+selftest - performs selftest for the chip as specified by chip manufacturer.
This driver also provides an absolute input class device, allowing
-the laptop to act as a pinball machine-esque joystick.
+the laptop to act as a pinball machine-esque joystick. Joystick device can be
+calibrated. Joystick device can be in two different modes.
+By default output values are scaled between -32768 .. 32767. In joystick raw
+mode, joystick and sysfs position entry have the same scale. There can be
+small difference due to input system fuzziness feature.
+Events are also available as input event device.
+
+Selftest is meant only for hardware diagnostic purposes. It is not meant to be
+used during normal operations. Position data is not corrupted during selftest
+but interrupt behaviour is not guaranteed to work reliably. In test mode, the
+sensing element is internally moved little bit. Selftest measures difference
+between normal mode and test mode. Chip specifications tell the acceptance
+limit for each type of the chip. Limits are provided via platform data
+to allow adjustment of the limits without a change to the actual driver.
+Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are
+measured difference between modes. Axes are not remapped in selftest mode.
+Measurement values are provided to help HW diagnostic applications to make
+final decision.
+
+On HP laptops, if the led infrastructure is activated, support for a led
+indicating disk protection will be provided as /sys/class/leds/hp::hddprotect.
Another feature of the driver is misc device called "freefall" that
acts similar to /dev/rtc and reacts on free-fall interrupts received
from the device. It supports blocking operations, poll/select and
fasync operation modes. You must read 1 bytes from the device. The
result is number of free-fall interrupts since the last successful
-read (or 255 if number of interrupts would not fit).
+read (or 255 if number of interrupts would not fit). See the hpfall.c
+file for an example on using the device.
Axes orientation
@@ -55,7 +76,7 @@ the accelerometer are converted into a "standard" organisation of the axes
* If the laptop is put upside-down, Z becomes negative
If your laptop model is not recognized (cf "dmesg"), you can send an
-email to the authors to add it to the database. When reporting a new
+email to the maintainer to add it to the database. When reporting a new
laptop, please include the output of "dmidecode" plus the value of
/sys/devices/platform/lis3lv02d/position in these four cases.
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index 02b74899edaf..b7e42ec4b26b 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -81,8 +81,14 @@ pwm[1-4] - this file stores PWM duty cycle or DC value (fan speed) in range:
0 (stop) to 255 (full)
pwm[1-4]_enable - this file controls mode of fan/temperature control:
- * 1 Manual Mode, write to pwm file any value 0-255 (full speed)
- * 2 Thermal Cruise
+ * 1 Manual mode, write to pwm file any value 0-255 (full speed)
+ * 2 "Thermal Cruise" mode
+ * 3 "Fan Speed Cruise" mode
+ * 4 "Smart Fan III" mode
+
+pwm[1-4]_mode - controls if output is PWM or DC level
+ * 0 DC output (0 - 12v)
+ * 1 PWM output
Thermal Cruise mode
-------------------
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index 7860aafb483d..0a74603eb671 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -44,7 +44,7 @@ static struct i2c_driver foo_driver = {
/* if device autodetection is needed: */
.class = I2C_CLASS_SOMETHING,
.detect = foo_detect,
- .address_data = &addr_data,
+ .address_list = normal_i2c,
.shutdown = foo_shutdown, /* optional */
.suspend = foo_suspend, /* optional */
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 4edd39ec7db9..188f4768f1d5 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -233,9 +233,9 @@ All md devices contain:
resync_start
The point at which resync should start. If no resync is needed,
- this will be a very large number. At array creation it will
- default to 0, though starting the array as 'clean' will
- set it much larger.
+ this will be a very large number (or 'none' since 2.6.30-rc1). At
+ array creation it will default to 0, though starting the array as
+ 'clean' will set it much larger.
new_dev
This file can be written but not read. The value written should
@@ -296,6 +296,51 @@ All md devices contain:
active-idle
like active, but no writes have been seen for a while (safe_mode_delay).
+ bitmap/location
+ This indicates where the write-intent bitmap for the array is
+ stored.
+ It can be one of "none", "file" or "[+-]N".
+ "file" may later be extended to "file:/file/name"
+ "[+-]N" means that many sectors from the start of the metadata.
+ This is replicated on all devices. For arrays with externally
+ managed metadata, the offset is from the beginning of the
+ device.
+ bitmap/chunksize
+ The size, in bytes, of the chunk which will be represented by a
+ single bit. For RAID456, it is a portion of an individual
+ device. For RAID10, it is a portion of the array. For RAID1, it
+ is both (they come to the same thing).
+ bitmap/time_base
+ The time, in seconds, between looking for bits in the bitmap to
+ be cleared. In the current implementation, a bit will be cleared
+ between 2 and 3 times "time_base" after all the covered blocks
+ are known to be in-sync.
+ bitmap/backlog
+ When write-mostly devices are active in a RAID1, write requests
+ to those devices proceed in the background - the filesystem (or
+ other user of the device) does not have to wait for them.
+ 'backlog' sets a limit on the number of concurrent background
+ writes. If there are more than this, new writes will by
+ synchronous.
+ bitmap/metadata
+ This can be either 'internal' or 'external'.
+ 'internal' is the default and means the metadata for the bitmap
+ is stored in the first 256 bytes of the allocated space and is
+ managed by the md module.
+ 'external' means that bitmap metadata is managed externally to
+ the kernel (i.e. by some userspace program)
+ bitmap/can_clear
+ This is either 'true' or 'false'. If 'true', then bits in the
+ bitmap will be cleared when the corresponding blocks are thought
+ to be in-sync. If 'false', bits will never be cleared.
+ This is automatically set to 'false' if a write happens on a
+ degraded array, or if the array becomes degraded during a write.
+ When metadata is managed externally, it should be set to true
+ once the array becomes non-degraded, and this fact has been
+ recorded in the metadata.
+
+
+
As component devices are added to an md array, they appear in the 'md'
directory as new directories named
@@ -334,8 +379,9 @@ Each directory contains:
Writing "writemostly" sets the writemostly flag.
Writing "-writemostly" clears the writemostly flag.
Writing "blocked" sets the "blocked" flag.
- Writing "-blocked" clear the "blocked" flag and allows writes
+ Writing "-blocked" clears the "blocked" flag and allows writes
to complete.
+ Writing "in_sync" sets the in_sync flag.
This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event.
@@ -372,6 +418,24 @@ Each directory contains:
array. If a value less than the current component_size is
written, it will be rejected.
+ recovery_start
+
+ When the device is not 'in_sync', this records the number of
+ sectors from the start of the device which are known to be
+ correct. This is normally zero, but during a recovery
+ operation is will steadily increase, and if the recovery is
+ interrupted, restoring this value can cause recovery to
+ avoid repeating the earlier blocks. With v1.x metadata, this
+ value is saved and restored automatically.
+
+ This can be set whenever the device is not an active member of
+ the array, either before the array is activated, or before
+ the 'slot' is set.
+
+ Setting this to 'none' is equivalent to setting 'in_sync'.
+ Setting to any other value also clears the 'in_sync' flag.
+
+
An active md device will also contain and entry for each active device
in the array. These are named
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index bbc8a6a36921..57e7e9cc1870 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -160,12 +160,15 @@ Under each section, you can see 4 files.
NOTE:
These directories/files appear after physical memory hotplug phase.
-If CONFIG_NUMA is enabled the
-/sys/devices/system/memory/memoryXXX memory section
-directories can also be accessed via symbolic links located in
-the /sys/devices/system/node/node* directories. For example:
+If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
+via symbolic links located in the /sys/devices/system/node/node* directories.
+
+For example:
/sys/devices/system/node/node0/memory9 -> ../../memory/memory9
+A backlink will also be created:
+/sys/devices/system/memory/memory9/node0 -> ../../node/node0
+
--------------------------------
4. Physical memory hot-add phase
--------------------------------
diff --git a/Documentation/misc-devices/ad525x_dpot.txt b/Documentation/misc-devices/ad525x_dpot.txt
new file mode 100644
index 000000000000..0c9413b1cbf3
--- /dev/null
+++ b/Documentation/misc-devices/ad525x_dpot.txt
@@ -0,0 +1,57 @@
+---------------------------------
+ AD525x Digital Potentiometers
+---------------------------------
+
+The ad525x_dpot driver exports a simple sysfs interface. This allows you to
+work with the immediate resistance settings as well as update the saved startup
+settings. Access to the factory programmed tolerance is also provided, but
+interpretation of this settings is required by the end application according to
+the specific part in use.
+
+---------
+ Files
+---------
+
+Each dpot device will have a set of eeprom, rdac, and tolerance files. How
+many depends on the actual part you have, as will the range of allowed values.
+
+The eeprom files are used to program the startup value of the device.
+
+The rdac files are used to program the immediate value of the device.
+
+The tolerance files are the read-only factory programmed tolerance settings
+and may vary greatly on a part-by-part basis. For exact interpretation of
+this field, please consult the datasheet for your part. This is presented
+as a hex file for easier parsing.
+
+-----------
+ Example
+-----------
+
+Locate the device in your sysfs tree. This is probably easiest by going into
+the common i2c directory and locating the device by the i2c slave address.
+
+ # ls /sys/bus/i2c/devices/
+ 0-0022 0-0027 0-002f
+
+So assuming the device in question is on the first i2c bus and has the slave
+address of 0x2f, we descend (unrelated sysfs entries have been trimmed).
+
+ # ls /sys/bus/i2c/devices/0-002f/
+ eeprom0 rdac0 tolerance0
+
+You can use simple reads/writes to access these files:
+
+ # cd /sys/bus/i2c/devices/0-002f/
+
+ # cat eeprom0
+ 0
+ # echo 10 > eeprom0
+ # cat eeprom0
+ 10
+
+ # cat rdac0
+ 5
+ # echo 3 > rdac0
+ # cat rdac0
+ 3
diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index b565e8279d13..8e1ddec2c78a 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -119,6 +119,32 @@ FURTHER NOTES ON NO-MMU MMAP
granule but will only discard the excess if appropriately configured as
this has an effect on fragmentation.
+ (*) The memory allocated by a request for an anonymous mapping will normally
+ be cleared by the kernel before being returned in accordance with the
+ Linux man pages (ver 2.22 or later).
+
+ In the MMU case this can be achieved with reasonable performance as
+ regions are backed by virtual pages, with the contents only being mapped
+ to cleared physical pages when a write happens on that specific page
+ (prior to which, the pages are effectively mapped to the global zero page
+ from which reads can take place). This spreads out the time it takes to
+ initialize the contents of a page - depending on the write-usage of the
+ mapping.
+
+ In the no-MMU case, however, anonymous mappings are backed by physical
+ pages, and the entire map is cleared at allocation time. This can cause
+ significant delays during a userspace malloc() as the C library does an
+ anonymous mapping and the kernel then does a memset for the entire map.
+
+ However, for memory that isn't required to be precleared - such as that
+ returned by malloc() - mmap() can take a MAP_UNINITIALIZED flag to
+ indicate to the kernel that it shouldn't bother clearing the memory before
+ returning it. Note that CONFIG_MMAP_ALLOW_UNINITIALIZED must be enabled
+ to permit this, otherwise the flag will be ignored.
+
+ uClibc uses this to speed up malloc(), and the ELF-FDPIC binfmt uses this
+ to allocate the brk and stack region.
+
(*) A list of all the private copy and anonymous mappings on the system is
visible through /proc/maps in no-MMU mode.
diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 619699dde593..178c831b907d 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -1,73 +1,8 @@
-SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and
-are hence deprecated.
+Lesson 1: Spin locks
-Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or
-__SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate for static
-initialization.
-
-Most of the time, you can simply turn:
-
- static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
-
-into:
-
- static DEFINE_SPINLOCK(xxx_lock);
-
-Static structure member variables go from:
-
- struct foo bar {
- .lock = SPIN_LOCK_UNLOCKED;
- };
-
-to:
-
- struct foo bar {
- .lock = __SPIN_LOCK_UNLOCKED(bar.lock);
- };
-
-Declaration of static rw_locks undergo a similar transformation.
-
-Dynamic initialization, when necessary, may be performed as
-demonstrated below.
-
- spinlock_t xxx_lock;
- rwlock_t xxx_rw_lock;
-
- static int __init xxx_init(void)
- {
- spin_lock_init(&xxx_lock);
- rwlock_init(&xxx_rw_lock);
- ...
- }
-
- module_init(xxx_init);
-
-The following discussion is still valid, however, with the dynamic
-initialization of spinlocks or with DEFINE_SPINLOCK, etc., used
-instead of SPIN_LOCK_UNLOCKED.
-
------------------------
-
-On Fri, 2 Jan 1998, Doug Ledford wrote:
->
-> I'm working on making the aic7xxx driver more SMP friendly (as well as
-> importing the latest FreeBSD sequencer code to have 7895 support) and wanted
-> to get some info from you. The goal here is to make the various routines
-> SMP safe as well as UP safe during interrupts and other manipulating
-> routines. So far, I've added a spin_lock variable to things like my queue
-> structs. Now, from what I recall, there are some spin lock functions I can
-> use to lock these spin locks from other use as opposed to a (nasty)
-> save_flags(); cli(); stuff; restore_flags(); construct. Where do I find
-> these routines and go about making use of them? Do they only lock on a
-> per-processor basis or can they also lock say an interrupt routine from
-> mucking with a queue if the queue routine was manipulating it when the
-> interrupt occurred, or should I still use a cli(); based construct on that
-> one?
-
-See <asm/spinlock.h>. The basic version is:
-
- spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
+The most basic primitive for locking is spinlock.
+static DEFINE_SPINLOCK(xxx_lock);
unsigned long flags;
@@ -75,13 +10,11 @@ See <asm/spinlock.h>. The basic version is:
... critical section here ..
spin_unlock_irqrestore(&xxx_lock, flags);
-and the above is always safe. It will disable interrupts _locally_, but the
+The above is always safe. It will disable interrupts _locally_, but the
spinlock itself will guarantee the global lock, so it will guarantee that
there is only one thread-of-control within the region(s) protected by that
-lock.
-
-Note that it works well even under UP - the above sequence under UP
-essentially is just the same as doing a
+lock. This works well even under UP. The above sequence under UP
+essentially is just the same as doing
unsigned long flags;
@@ -91,15 +24,13 @@ essentially is just the same as doing a
so the code does _not_ need to worry about UP vs SMP issues: the spinlocks
work correctly under both (and spinlocks are actually more efficient on
-architectures that allow doing the "save_flags + cli" in one go because I
-don't export that interface normally).
+architectures that allow doing the "save_flags + cli" in one operation).
+
+ NOTE! Implications of spin_locks for memory are further described in:
-NOTE NOTE NOTE! The reason the spinlock is so much faster than a global
-interrupt lock under SMP is exactly because it disables interrupts only on
-the local CPU. The spin-lock is safe only when you _also_ use the lock
-itself to do locking across CPU's, which implies that EVERYTHING that
-touches a shared variable has to agree about the spinlock they want to
-use.
+ Documentation/memory-barriers.txt
+ (5) LOCK operations.
+ (6) UNLOCK operations.
The above is usually pretty simple (you usually need and want only one
spinlock for most things - using more than one spinlock can make things a
@@ -120,20 +51,24 @@ and another sequence that does
then they are NOT mutually exclusive, and the critical regions can happen
at the same time on two different CPU's. That's fine per se, but the
critical regions had better be critical for different things (ie they
-can't stomp on each other).
+can't stomp on each other).
The above is a problem mainly if you end up mixing code - for example the
routines in ll_rw_block() tend to use cli/sti to protect the atomicity of
their actions, and if a driver uses spinlocks instead then you should
-think about issues like the above..
+think about issues like the above.
This is really the only really hard part about spinlocks: once you start
using spinlocks they tend to expand to areas you might not have noticed
before, because you have to make sure the spinlocks correctly protect the
shared data structures _everywhere_ they are used. The spinlocks are most
-easily added to places that are completely independent of other code (ie
-internal driver data structures that nobody else ever touches, for
-example).
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+ NOTE! The spin-lock is safe only when you _also_ use the lock itself
+ to do locking across CPU's, which implies that EVERYTHING that
+ touches a shared variable has to agree about the spinlock they want
+ to use.
----
@@ -141,13 +76,17 @@ Lesson 2: reader-writer spinlocks.
If your data accesses have a very natural pattern where you usually tend
to mostly read from the shared variables, the reader-writer locks
-(rw_lock) versions of the spinlocks are often nicer. They allow multiple
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
readers to be in the same critical region at once, but if somebody wants
-to change the variables it has to get an exclusive write lock. The
-routines look the same as above:
+to change the variables it has to get an exclusive write lock.
- rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
+ NOTE! reader-writer locks require more atomic memory operations than
+ simple spinlocks. Unless the reader critical section is long, you
+ are better off just using spinlocks.
+The routines look the same as above:
+
+ rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
unsigned long flags;
@@ -159,18 +98,21 @@ routines look the same as above:
.. read and write exclusive access to the info ...
write_unlock_irqrestore(&xxx_lock, flags);
-The above kind of lock is useful for complex data structures like linked
-lists etc, especially when you know that most of the work is to just
-traverse the list searching for entries without changing the list itself,
-for example. Then you can use the read lock for that kind of list
-traversal, which allows many concurrent readers. Anything that _changes_
-the list will have to get the write lock.
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself. The read lock allows many concurrent readers. Anything that
+_changes_ the list will have to get the write lock.
+
+ NOTE! RCU is better for list traversal, but requires careful
+ attention to design detail (see Documentation/RCU/listRCU.txt).
-Note: you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
time need to do any changes (even if you don't do it every time), you have
-to get the write-lock at the very beginning. I could fairly easily add a
-primitive to create a "upgradeable" read-lock, but it hasn't been an issue
-yet. Tell me if you'd want one.
+to get the write-lock at the very beginning.
+
+ NOTE! We are working hard to remove reader-writer spinlocks in most
+ cases, so please don't add a new one without consensus. (Instead, see
+ Documentation/RCU/rcu.txt for complete information.)
----
@@ -233,4 +175,46 @@ indeed), while write-locks need to protect themselves against interrupts.
Linus
+----
+
+Reference information:
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate:
+
+ spinlock_t xxx_lock;
+ rwlock_t xxx_rw_lock;
+
+ static int __init xxx_init(void)
+ {
+ spin_lock_init(&xxx_lock);
+ rwlock_init(&xxx_rw_lock);
+ ...
+ }
+
+ module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
+
+SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. These interfere
+with lockdep state tracking.
+
+Most of the time, you can simply turn:
+ static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
+into:
+ static DEFINE_SPINLOCK(xxx_lock);
+
+Static structure member variables go from:
+
+ struct foo bar {
+ .lock = SPIN_LOCK_UNLOCKED;
+ };
+
+to:
+ struct foo bar {
+ .lock = __SPIN_LOCK_UNLOCKED(bar.lock);
+ };
+
+Declaration of static rw_locks undergo a similar transformation.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 8f7a0e73ef44..3894eaa23486 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -19,6 +19,8 @@ Currently, these files might (depending on your configuration)
show up in /proc/sys/kernel:
- acpi_video_flags
- acct
+- bootloader_type [ X86 only ]
+- bootloader_version [ X86 only ]
- callhome [ S390 only ]
- auto_msgmni
- core_pattern
@@ -93,6 +95,35 @@ valid for 30 seconds.
==============================================================
+bootloader_type:
+
+x86 bootloader identification
+
+This gives the bootloader type number as indicated by the bootloader,
+shifted left by 4, and OR'd with the low four bits of the bootloader
+version. The reason for this encoding is that this used to match the
+type_of_loader field in the kernel header; the encoding is kept for
+backwards compatibility. That is, if the full bootloader type number
+is 0x15 and the full version number is 0x234, this file will contain
+the value 340 = 0x154.
+
+See the type_of_loader and ext_loader_type fields in
+Documentation/x86/boot.txt for additional information.
+
+==============================================================
+
+bootloader_version:
+
+x86 bootloader version
+
+The complete bootloader version number. In the example above, this
+file will contain the value 564 = 0x234.
+
+See the type_of_loader and ext_loader_ver fields in
+Documentation/x86/boot.txt for additional information.
+
+==============================================================
+
callhome:
Controls the kernel's callhome behavior in case of a kernel panic.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 82a7bd1800b2..bc31636973e3 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -11,23 +11,21 @@ This optimization is more critical now as bigger and bigger physical memories
(several GBs) are more readily available.
Users can use the huge page support in Linux kernel by either using the mmap
-system call or standard SYSv shared memory system calls (shmget, shmat).
+system call or standard SYSV shared memory system calls (shmget, shmat).
First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
(present under "File systems") and CONFIG_HUGETLB_PAGE (selected
automatically when CONFIG_HUGETLBFS is selected) configuration
options.
-The kernel built with huge page support should show the number of configured
-huge pages in the system by running the "cat /proc/meminfo" command.
+The /proc/meminfo file provides information about the total number of
+persistent hugetlb pages in the kernel's huge page pool. It also displays
+information about the number of free, reserved and surplus huge pages and the
+default huge page size. The huge page size is needed for generating the
+proper alignment and size of the arguments to system calls that map huge page
+regions.
-/proc/meminfo also provides information about the total number of hugetlb
-pages configured in the kernel. It also displays information about the
-number of free hugetlb pages at any time. It also displays information about
-the configured huge page size - this is needed for generating the proper
-alignment and size of the arguments to the above system calls.
-
-The output of "cat /proc/meminfo" will have lines like:
+The output of "cat /proc/meminfo" will include lines like:
.....
HugePages_Total: vvv
@@ -53,59 +51,63 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in
/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
in the kernel.
-/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb
-pages in the kernel. Super user can dynamically request more (or free some
-pre-configured) huge pages.
-The allocation (or deallocation) of hugetlb pages is possible only if there are
-enough physically contiguous free pages in system (freeing of huge pages is
-possible only if there are enough hugetlb pages free that can be transferred
-back to regular memory pool).
+/proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge
+pages in the kernel's huge page pool. "Persistent" huge pages will be
+returned to the huge page pool when freed by a task. A user with root
+privileges can dynamically allocate more or free some persistent huge pages
+by increasing or decreasing the value of 'nr_hugepages'.
-Pages that are used as hugetlb pages are reserved inside the kernel and cannot
-be used for other purposes.
+Pages that are used as huge pages are reserved inside the kernel and cannot
+be used for other purposes. Huge pages cannot be swapped out under
+memory pressure.
-Once the kernel with Hugetlb page support is built and running, a user can
-use either the mmap system call or shared memory system calls to start using
-the huge pages. It is required that the system administrator preallocate
-enough memory for huge page purposes.
+Once a number of huge pages have been pre-allocated to the kernel huge page
+pool, a user with appropriate privilege can use either the mmap system call
+or shared memory system calls to use the huge pages. See the discussion of
+Using Huge Pages, below.
-The administrator can preallocate huge pages on the kernel boot command line by
-specifying the "hugepages=N" parameter, where 'N' = the number of huge pages
-requested. This is the most reliable method for preallocating huge pages as
-memory has not yet become fragmented.
+The administrator can allocate persistent huge pages on the kernel boot
+command line by specifying the "hugepages=N" parameter, where 'N' = the
+number of huge pages requested. This is the most reliable method of
+allocating huge pages as memory has not yet become fragmented.
-Some platforms support multiple huge page sizes. To preallocate huge pages
+Some platforms support multiple huge page sizes. To allocate huge pages
of a specific size, one must preceed the huge pages boot command parameters
with a huge page size selection parameter "hugepagesz=<size>". <size> must
be specified in bytes with optional scale suffix [kKmMgG]. The default huge
page size may be selected with the "default_hugepagesz=<size>" boot parameter.
-/proc/sys/vm/nr_hugepages indicates the current number of configured [default
-size] hugetlb pages in the kernel. Super user can dynamically request more
-(or free some pre-configured) huge pages.
-
-Use the following command to dynamically allocate/deallocate default sized
-huge pages:
+When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages
+indicates the current number of pre-allocated huge pages of the default size.
+Thus, one can use the following command to dynamically allocate/deallocate
+default sized persistent huge pages:
echo 20 > /proc/sys/vm/nr_hugepages
-This command will try to configure 20 default sized huge pages in the system.
+This command will try to adjust the number of default sized huge pages in the
+huge page pool to 20, allocating or freeing huge pages, as required.
+
On a NUMA platform, the kernel will attempt to distribute the huge page pool
-over the all on-line nodes. These huge pages, allocated when nr_hugepages
-is increased, are called "persistent huge pages".
+over all the set of allowed nodes specified by the NUMA memory policy of the
+task that modifies nr_hugepages. The default for the allowed nodes--when the
+task has default memory policy--is all on-line nodes with memory. Allowed
+nodes with insufficient available, contiguous memory for a huge page will be
+silently skipped when allocating persistent huge pages. See the discussion
+below of the interaction of task memory policy, cpusets and per node attributes
+with the allocation and freeing of persistent huge pages.
The success or failure of huge page allocation depends on the amount of
-physically contiguous memory that is preset in system at the time of the
+physically contiguous memory that is present in system at the time of the
allocation attempt. If the kernel is unable to allocate huge pages from
some nodes in a NUMA system, it will attempt to make up the difference by
allocating extra pages on other nodes with sufficient available contiguous
memory, if any.
-System administrators may want to put this command in one of the local rc init
-files. This will enable the kernel to request huge pages early in the boot
-process when the possibility of getting physical contiguous pages is still
-very high. Administrators can verify the number of huge pages actually
-allocated by checking the sysctl or meminfo. To check the per node
+System administrators may want to put this command in one of the local rc
+init files. This will enable the kernel to allocate huge pages early in
+the boot process when the possibility of getting physical contiguous pages
+is still very high. Administrators can verify the number of huge pages
+actually allocated by checking the sysctl or meminfo. To check the per node
distribution of huge pages in a NUMA system, use:
cat /sys/devices/system/node/node*/meminfo | fgrep Huge
@@ -113,45 +115,47 @@ distribution of huge pages in a NUMA system, use:
/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of
huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are
requested by applications. Writing any non-zero value into this file
-indicates that the hugetlb subsystem is allowed to try to obtain "surplus"
-huge pages from the buddy allocator, when the normal pool is exhausted. As
-these surplus huge pages go out of use, they are freed back to the buddy
-allocator.
+indicates that the hugetlb subsystem is allowed to try to obtain that
+number of "surplus" huge pages from the kernel's normal page pool, when the
+persistent huge page pool is exhausted. As these surplus huge pages become
+unused, they are freed back to the kernel's normal page pool.
-When increasing the huge page pool size via nr_hugepages, any surplus
+When increasing the huge page pool size via nr_hugepages, any existing surplus
pages will first be promoted to persistent huge pages. Then, additional
huge pages will be allocated, if necessary and if possible, to fulfill
-the new huge page pool size.
+the new persistent huge page pool size.
-The administrator may shrink the pool of preallocated huge pages for
+The administrator may shrink the pool of persistent huge pages for
the default huge page size by setting the nr_hugepages sysctl to a
smaller value. The kernel will attempt to balance the freeing of huge pages
-across all on-line nodes. Any free huge pages on the selected nodes will
-be freed back to the buddy allocator.
-
-Caveat: Shrinking the pool via nr_hugepages such that it becomes less
-than the number of huge pages in use will convert the balance to surplus
-huge pages even if it would exceed the overcommit value. As long as
-this condition holds, however, no more surplus huge pages will be
-allowed on the system until one of the two sysctls are increased
-sufficiently, or the surplus huge pages go out of use and are freed.
+across all nodes in the memory policy of the task modifying nr_hugepages.
+Any free huge pages on the selected nodes will be freed back to the kernel's
+normal page pool.
+
+Caveat: Shrinking the persistent huge page pool via nr_hugepages such that
+it becomes less than the number of huge pages in use will convert the balance
+of the in-use huge pages to surplus huge pages. This will occur even if
+the number of surplus pages it would exceed the overcommit value. As long as
+this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is
+increased sufficiently, or the surplus huge pages go out of use and are freed--
+no more surplus huge pages will be allowed to be allocated.
With support for multiple huge page pools at run-time available, much of
-the huge page userspace interface has been duplicated in sysfs. The above
-information applies to the default huge page size which will be
-controlled by the /proc interfaces for backwards compatibility. The root
-huge page control directory in sysfs is:
+the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs.
+The /proc interfaces discussed above have been retained for backwards
+compatibility. The root huge page control directory in sysfs is:
/sys/kernel/mm/hugepages
For each huge page size supported by the running kernel, a subdirectory
-will exist, of the form
+will exist, of the form:
hugepages-${size}kB
Inside each of these directories, the same set of files will exist:
nr_hugepages
+ nr_hugepages_mempolicy
nr_overcommit_hugepages
free_hugepages
resv_hugepages
@@ -159,6 +163,102 @@ Inside each of these directories, the same set of files will exist:
which function as described above for the default huge page-sized case.
+
+Interaction of Task Memory Policy with Huge Page Allocation/Freeing
+
+Whether huge pages are allocated and freed via the /proc interface or
+the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA
+nodes from which huge pages are allocated or freed are controlled by the
+NUMA memory policy of the task that modifies the nr_hugepages_mempolicy
+sysctl or attribute. When the nr_hugepages attribute is used, mempolicy
+is ignored.
+
+The recommended method to allocate or free huge pages to/from the kernel
+huge page pool, using the nr_hugepages example above, is:
+
+ numactl --interleave <node-list> echo 20 \
+ >/proc/sys/vm/nr_hugepages_mempolicy
+
+or, more succinctly:
+
+ numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
+
+This will allocate or free abs(20 - nr_hugepages) to or from the nodes
+specified in <node-list>, depending on whether number of persistent huge pages
+is initially less than or greater than 20, respectively. No huge pages will be
+allocated nor freed on any node not included in the specified <node-list>.
+
+When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any
+memory policy mode--bind, preferred, local or interleave--may be used. The
+resulting effect on persistent huge page allocation is as follows:
+
+1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt],
+ persistent huge pages will be distributed across the node or nodes
+ specified in the mempolicy as if "interleave" had been specified.
+ However, if a node in the policy does not contain sufficient contiguous
+ memory for a huge page, the allocation will not "fallback" to the nearest
+ neighbor node with sufficient contiguous memory. To do this would cause
+ undesirable imbalance in the distribution of the huge page pool, or
+ possibly, allocation of persistent huge pages on nodes not allowed by
+ the task's memory policy.
+
+2) One or more nodes may be specified with the bind or interleave policy.
+ If more than one node is specified with the preferred policy, only the
+ lowest numeric id will be used. Local policy will select the node where
+ the task is running at the time the nodes_allowed mask is constructed.
+ For local policy to be deterministic, the task must be bound to a cpu or
+ cpus in a single node. Otherwise, the task could be migrated to some
+ other node at any time after launch and the resulting node will be
+ indeterminate. Thus, local policy is not very useful for this purpose.
+ Any of the other mempolicy modes may be used to specify a single node.
+
+3) The nodes allowed mask will be derived from any non-default task mempolicy,
+ whether this policy was set explicitly by the task itself or one of its
+ ancestors, such as numactl. This means that if the task is invoked from a
+ shell with non-default policy, that policy will be used. One can specify a
+ node list of "all" with numactl --interleave or --membind [-m] to achieve
+ interleaving over all nodes in the system or cpuset.
+
+4) Any task mempolicy specifed--e.g., using numactl--will be constrained by
+ the resource limits of any cpuset in which the task runs. Thus, there will
+ be no way for a task with non-default policy running in a cpuset with a
+ subset of the system nodes to allocate huge pages outside the cpuset
+ without first moving to a cpuset that contains all of the desired nodes.
+
+5) Boot-time huge page allocation attempts to distribute the requested number
+ of huge pages over all on-lines nodes with memory.
+
+Per Node Hugepages Attributes
+
+A subset of the contents of the root huge page control directory in sysfs,
+described above, will be replicated under each the system device of each
+NUMA node with memory in:
+
+ /sys/devices/system/node/node[0-9]*/hugepages/
+
+Under this directory, the subdirectory for each supported huge page size
+contains the following attribute files:
+
+ nr_hugepages
+ free_hugepages
+ surplus_hugepages
+
+The free_' and surplus_' attribute files are read-only. They return the number
+of free and surplus [overcommitted] huge pages, respectively, on the parent
+node.
+
+The nr_hugepages attribute returns the total number of huge pages on the
+specified node. When this attribute is written, the number of persistent huge
+pages on the parent node will be adjusted to the specified value, if sufficient
+resources exist, regardless of the task's mempolicy or cpuset constraints.
+
+Note that the number of overcommit and reserve pages remain global quantities,
+as we don't know until fault time, when the faulting task's mempolicy is
+applied, from which node the huge page allocation will be attempted.
+
+
+Using Huge Pages
+
If the user applications are going to request huge pages using mmap system
call, then it is required that system administrator mount a file system of
type hugetlbfs:
@@ -206,9 +306,11 @@ map_hugetlb.c.
* requesting huge pages.
*
* For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages. That means the addresses starting with 0x800000... will need
- * to be specified. Specifying a fixed address is not required on ppc64,
- * i386 or x86_64.
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
*
* Note: The default shared memory limit is quite low on many kernels,
* you may need to increase it via:
@@ -237,14 +339,8 @@ map_hugetlb.c.
#define dprintf(x) printf(x)
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define SHMAT_FLAGS (SHM_RND)
-#else
-#define ADDR (void *)(0x0UL)
+#define ADDR (void *)(0x0UL) /* let kernel choose address */
#define SHMAT_FLAGS (0)
-#endif
int main(void)
{
@@ -302,10 +398,12 @@ int main(void)
* example, the app is requesting memory of size 256MB that is backed by
* huge pages.
*
- * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages.
- * That means the addresses starting with 0x800000... will need to be
- * specified. Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
*/
#include <stdlib.h>
#include <stdio.h>
@@ -317,14 +415,8 @@ int main(void)
#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_SHARED | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
+#define ADDR (void *)(0x0UL) /* let kernel choose address */
#define FLAGS (MAP_SHARED)
-#endif
void check_bytes(char *addr)
{
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index 262d8e6793a3..b392e496f816 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -16,9 +16,9 @@ by sharing the data common between them. But it can be useful to any
application which generates many instances of the same data.
KSM only merges anonymous (private) pages, never pagecache (file) pages.
-KSM's merged pages are at present locked into kernel memory for as long
-as they are shared: so cannot be swapped out like the user pages they
-replace (but swapping KSM pages should follow soon in a later release).
+KSM's merged pages were originally locked into kernel memory, but can now
+be swapped out just like other user pages (but sharing is broken when they
+are swapped back in: ksmd must rediscover their identity and merge again).
KSM only operates on those areas of address space which an application
has advised to be likely candidates for merging, by using the madvise(2)
@@ -44,20 +44,12 @@ includes unmapped gaps (though working on the intervening mapped areas),
and might fail with EAGAIN if not enough memory for internal structures.
Applications should be considerate in their use of MADV_MERGEABLE,
-restricting its use to areas likely to benefit. KSM's scans may use
-a lot of processing power, and its kernel-resident pages are a limited
-resource. Some installations will disable KSM for these reasons.
+restricting its use to areas likely to benefit. KSM's scans may use a lot
+of processing power: some installations will disable KSM for that reason.
The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/,
readable by all but writable only by root:
-max_kernel_pages - set to maximum number of kernel pages that KSM may use
- e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages"
- Value 0 imposes no limit on the kernel pages KSM may use;
- but note that any process using MADV_MERGEABLE can cause
- KSM to allocate these pages, unswappable until it exits.
- Default: quarter of memory (chosen to not pin too much)
-
pages_to_scan - how many present pages to scan before ksmd goes to sleep
e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan"
Default: 100 (chosen for demonstration purposes)
@@ -75,7 +67,7 @@ run - set 0 to stop ksmd from running but keep merged pages,
The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
-pages_shared - how many shared unswappable kernel pages KSM is using
+pages_shared - how many shared pages are being used
pages_sharing - how many more sites are sharing them i.e. how much saved
pages_unshared - how many pages unique but repeatedly checked for merging
pages_volatile - how many pages changing too fast to be placed in a tree
@@ -87,4 +79,4 @@ pages_volatile embraces several different kinds of activity, but a high
proportion there would also indicate poor use of madvise MADV_MERGEABLE.
Izik Eidus,
-Hugh Dickins, 24 Sept 2009
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index ea44ea502da1..7a7d9bab32ef 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -100,7 +100,7 @@
#define BIT(name) (1ULL << KPF_##name)
#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
-static char *page_flag_names[] = {
+static const char *page_flag_names[] = {
[KPF_LOCKED] = "L:locked",
[KPF_ERROR] = "E:error",
[KPF_REFERENCED] = "R:referenced",
@@ -173,7 +173,7 @@ static int kpageflags_fd;
static int opt_hwpoison;
static int opt_unpoison;
-static char *hwpoison_debug_fs = "/debug/hwpoison";
+static const char hwpoison_debug_fs[] = "/debug/hwpoison";
static int hwpoison_inject_fd;
static int hwpoison_forget_fd;
@@ -560,7 +560,7 @@ static void walk_pfn(unsigned long voffset,
{
uint64_t buf[KPAGEFLAGS_BATCH];
unsigned long batch;
- unsigned long pages;
+ long pages;
unsigned long i;
while (count) {
@@ -673,30 +673,35 @@ static void usage(void)
printf(
"page-types [options]\n"
-" -r|--raw Raw mode, for kernel developers\n"
-" -a|--addr addr-spec Walk a range of pages\n"
-" -b|--bits bits-spec Walk pages with specified bits\n"
-" -p|--pid pid Walk process address space\n"
+" -r|--raw Raw mode, for kernel developers\n"
+" -d|--describe flags Describe flags\n"
+" -a|--addr addr-spec Walk a range of pages\n"
+" -b|--bits bits-spec Walk pages with specified bits\n"
+" -p|--pid pid Walk process address space\n"
#if 0 /* planned features */
-" -f|--file filename Walk file address space\n"
+" -f|--file filename Walk file address space\n"
#endif
-" -l|--list Show page details in ranges\n"
-" -L|--list-each Show page details one by one\n"
-" -N|--no-summary Don't show summay info\n"
-" -X|--hwpoison hwpoison pages\n"
-" -x|--unpoison unpoison pages\n"
-" -h|--help Show this usage message\n"
+" -l|--list Show page details in ranges\n"
+" -L|--list-each Show page details one by one\n"
+" -N|--no-summary Don't show summay info\n"
+" -X|--hwpoison hwpoison pages\n"
+" -x|--unpoison unpoison pages\n"
+" -h|--help Show this usage message\n"
+"flags:\n"
+" 0x10 bitfield format, e.g.\n"
+" anon bit-name, e.g.\n"
+" 0x10,anon comma-separated list, e.g.\n"
"addr-spec:\n"
-" N one page at offset N (unit: pages)\n"
-" N+M pages range from N to N+M-1\n"
-" N,M pages range from N to M-1\n"
-" N, pages range from N to end\n"
-" ,M pages range from 0 to M-1\n"
+" N one page at offset N (unit: pages)\n"
+" N+M pages range from N to N+M-1\n"
+" N,M pages range from N to M-1\n"
+" N, pages range from N to end\n"
+" ,M pages range from 0 to M-1\n"
"bits-spec:\n"
-" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
-" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
-" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
-" =bit1,bit2 flags == (bit1|bit2)\n"
+" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
+" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
+" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
+" =bit1,bit2 flags == (bit1|bit2)\n"
"bit-names:\n"
);
@@ -884,13 +889,23 @@ static void parse_bits_mask(const char *optarg)
add_bits_filter(mask, bits);
}
+static void describe_flags(const char *optarg)
+{
+ uint64_t flags = parse_flag_names(optarg, 0);
-static struct option opts[] = {
+ printf("0x%016llx\t%s\t%s\n",
+ (unsigned long long)flags,
+ page_flag_name(flags),
+ page_flag_longname(flags));
+}
+
+static const struct option opts[] = {
{ "raw" , 0, NULL, 'r' },
{ "pid" , 1, NULL, 'p' },
{ "file" , 1, NULL, 'f' },
{ "addr" , 1, NULL, 'a' },
{ "bits" , 1, NULL, 'b' },
+ { "describe" , 1, NULL, 'd' },
{ "list" , 0, NULL, 'l' },
{ "list-each" , 0, NULL, 'L' },
{ "no-summary", 0, NULL, 'N' },
@@ -907,7 +922,7 @@ int main(int argc, char *argv[])
page_size = getpagesize();
while ((c = getopt_long(argc, argv,
- "rp:f:a:b:lLNXxh", opts, NULL)) != -1) {
+ "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) {
switch (c) {
case 'r':
opt_raw = 1;
@@ -924,6 +939,9 @@ int main(int argc, char *argv[])
case 'b':
parse_bits_mask(optarg);
break;
+ case 'd':
+ describe_flags(optarg);
+ exit(0);
case 'l':
opt_list = 1;
break;
OpenPOWER on IntegriCloud