summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl2
-rw-r--r--Documentation/RCU/00-INDEX4
-rw-r--r--Documentation/RCU/Design/Data-Structures/Data-Structures.html233
-rw-r--r--Documentation/RCU/Design/Data-Structures/nxtlist.svg34
-rw-r--r--Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html47
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.html229
-rw-r--r--Documentation/RCU/checklist.txt8
-rw-r--r--Documentation/RCU/rcu_dereference.txt9
-rw-r--r--Documentation/RCU/rculist_nulls.txt6
-rw-r--r--Documentation/RCU/stallwarn.txt190
-rw-r--r--Documentation/RCU/trace.txt535
-rw-r--r--Documentation/RCU/whatisRCU.txt32
-rw-r--r--Documentation/acpi/acpi-lid.txt16
-rw-r--r--Documentation/admin-guide/README.rst2
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt82
-rw-r--r--Documentation/admin-guide/pm/cpufreq.rst19
-rw-r--r--Documentation/admin-guide/pm/index.rst1
-rw-r--r--Documentation/admin-guide/pm/intel_pstate.rst755
-rw-r--r--Documentation/arm64/tagged-pointers.txt62
-rw-r--r--Documentation/block/bfq-iosched.txt17
-rw-r--r--Documentation/block/biodoc.txt2
-rw-r--r--Documentation/cgroup-v2.txt12
-rw-r--r--Documentation/core-api/atomic_ops.rst5
-rw-r--r--Documentation/cpu-freq/intel-pstate.txt281
-rw-r--r--Documentation/dev-tools/sparse.rst6
-rw-r--r--Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt31
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt3
-rw-r--r--Documentation/devicetree/bindings/clock/idt,versaclock5.txt16
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt (renamed from Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt)12
-rw-r--r--Documentation/devicetree/bindings/clock/sunxi-ccu.txt19
-rw-r--r--Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt2
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-mvebu.txt6
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt2
-rw-r--r--Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt6
-rw-r--r--Documentation/devicetree/bindings/mfd/stm32-timers.txt2
-rw-r--r--Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt2
-rw-r--r--Documentation/devicetree/bindings/mtd/atmel-nand.txt107
-rw-r--r--Documentation/devicetree/bindings/mtd/denali-nand.txt7
-rw-r--r--Documentation/devicetree/bindings/mtd/gpio-control-nand.txt4
-rw-r--r--Documentation/devicetree/bindings/mtd/stm32-quadspi.txt43
-rw-r--r--Documentation/devicetree/bindings/net/dsa/b53.txt2
-rw-r--r--Documentation/devicetree/bindings/net/dsa/marvell.txt4
-rw-r--r--Documentation/devicetree/bindings/net/fsl-fec.txt4
-rw-r--r--Documentation/devicetree/bindings/net/smsc911x.txt1
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt2
-rw-r--r--Documentation/devicetree/bindings/power/power_domain.txt2
-rw-r--r--Documentation/devicetree/bindings/power/supply/axp20x_battery.txt20
-rw-r--r--Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt248
-rw-r--r--Documentation/devicetree/bindings/rtc/cpcap-rtc.txt18
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-sh.txt28
-rw-r--r--Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt21
-rw-r--r--Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt31
-rw-r--r--Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt32
-rw-r--r--Documentation/devicetree/bindings/thermal/brcm,ns-thermal37
-rw-r--r--Documentation/devicetree/bindings/thermal/da9062-thermal.txt36
-rw-r--r--Documentation/devicetree/bindings/trivial-devices.txt1
-rw-r--r--Documentation/devicetree/bindings/usb/dwc2.txt1
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.txt2
-rw-r--r--Documentation/filesystems/autofs4.txt12
-rw-r--r--Documentation/filesystems/bfs.txt2
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt37
-rw-r--r--Documentation/filesystems/overlayfs.txt9
-rw-r--r--Documentation/index.rst1
-rw-r--r--Documentation/input/devices/edt-ft5x06.rst2
-rw-r--r--Documentation/input/ff.rst4
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/kbuild/makefiles.txt74
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt31
-rw-r--r--Documentation/memory-barriers.txt8
-rw-r--r--Documentation/networking/dpaa.txt194
-rw-r--r--Documentation/networking/scaling.txt2
-rw-r--r--Documentation/networking/tcp.txt31
-rw-r--r--Documentation/scheduler/sched-deadline.txt168
-rw-r--r--Documentation/sound/hd-audio/models.rst114
-rwxr-xr-xDocumentation/target/target-export-device80
-rw-r--r--Documentation/tee.txt118
-rw-r--r--Documentation/thermal/sysfs-api.txt21
-rw-r--r--Documentation/timers/NO_HZ.txt29
-rw-r--r--Documentation/trace/ftrace.txt2
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt4
-rw-r--r--Documentation/usb/typec.rst6
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic-its.txt121
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic-v3.txt6
-rw-r--r--Documentation/watchdog/watchdog-parameters.txt2
-rw-r--r--Documentation/x86/intel_rdt_ui.txt2
92 files changed, 3008 insertions, 1422 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 793acf999e9e..ed3e5e949fce 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -412,6 +412,8 @@ sysctl/
- directory with info on the /proc/sys/* files.
target/
- directory with info on generating TCM v4 fabric .ko modules
+tee.txt
+ - info on the TEE subsystem and drivers
this_cpu_ops.txt
- List rationale behind and the way to use this_cpu operations.
thermal/
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index da5c087462b1..c3c705591532 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress);
certain condition is true. They must be used carefully to ensure
there is no race condition. You declare a
<type>wait_queue_head_t</type>, and then processes which want to
- wait for that condition declare a <type>wait_queue_t</type>
+ wait for that condition declare a <type>wait_queue_entry_t</type>
referring to themselves, and place that in the queue.
</para>
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index f773a264ae02..f46980c060aa 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt
rcubarrier.txt
- RCU and Unloadable Modules
rculist_nulls.txt
- - RCU list primitives for use with SLAB_DESTROY_BY_RCU
+ - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
rcuref.txt
- Reference-count design for elements of lists/arrays protected by RCU
rcu.txt
@@ -28,8 +28,6 @@ stallwarn.txt
- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
torture.txt
- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
-trace.txt
- - CONFIG_RCU_TRACE debugfs files and formats
UP.txt
- RCU on Uniprocessor Systems
whatisRCU.txt
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index d583c653a703..38d6d800761f 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other.
The <tt>rcu_state</tt> Structure</a>
<li> <a href="#The rcu_node Structure">
The <tt>rcu_node</tt> Structure</a>
+<li> <a href="#The rcu_segcblist Structure">
+ The <tt>rcu_segcblist</tt> Structure</a>
<li> <a href="#The rcu_data Structure">
The <tt>rcu_data</tt> Structure</a>
<li> <a href="#The rcu_dynticks Structure">
@@ -841,6 +843,134 @@ for lockdep lock-class names.
Finally, lines&nbsp;64-66 produce an error if the maximum number of
CPUs is too large for the specified fanout.
+<h3><a name="The rcu_segcblist Structure">
+The <tt>rcu_segcblist</tt> Structure</a></h3>
+
+The <tt>rcu_segcblist</tt> structure maintains a segmented list of
+callbacks as follows:
+
+<pre>
+ 1 #define RCU_DONE_TAIL 0
+ 2 #define RCU_WAIT_TAIL 1
+ 3 #define RCU_NEXT_READY_TAIL 2
+ 4 #define RCU_NEXT_TAIL 3
+ 5 #define RCU_CBLIST_NSEGS 4
+ 6
+ 7 struct rcu_segcblist {
+ 8 struct rcu_head *head;
+ 9 struct rcu_head **tails[RCU_CBLIST_NSEGS];
+10 unsigned long gp_seq[RCU_CBLIST_NSEGS];
+11 long len;
+12 long len_lazy;
+13 };
+</pre>
+
+<p>
+The segments are as follows:
+
+<ol>
+<li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
+ These callbacks are ready to be invoked.
+<li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
+ current grace period.
+ Note that different CPUs can have different ideas about which
+ grace period is current, hence the <tt>-&gt;gp_seq</tt> field.
+<li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
+ grace period to start.
+<li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
+ associated with a grace period.
+</ol>
+
+<p>
+The <tt>-&gt;head</tt> pointer references the first callback or
+is <tt>NULL</tt> if the list contains no callbacks (which is
+<i>not</i> the same as being empty).
+Each element of the <tt>-&gt;tails[]</tt> array references the
+<tt>-&gt;next</tt> pointer of the last callback in the corresponding
+segment of the list, or the list's <tt>-&gt;head</tt> pointer if
+that segment and all previous segments are empty.
+If the corresponding segment is empty but some previous segment is
+not empty, then the array element is identical to its predecessor.
+Older callbacks are closer to the head of the list, and new callbacks
+are added at the tail.
+This relationship between the <tt>-&gt;head</tt> pointer, the
+<tt>-&gt;tails[]</tt> array, and the callbacks is shown in this
+diagram:
+
+</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
+
+</p><p>In this figure, the <tt>-&gt;head</tt> pointer references the
+first
+RCU callback in the list.
+The <tt>-&gt;tails[RCU_DONE_TAIL]</tt> array element references
+the <tt>-&gt;head</tt> pointer itself, indicating that none
+of the callbacks is ready to invoke.
+The <tt>-&gt;tails[RCU_WAIT_TAIL]</tt> array element references callback
+CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
+CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period,
+give or take possible disagreements about exactly which grace period
+is the current one.
+The <tt>-&gt;tails[RCU_NEXT_READY_TAIL]</tt> array element
+references the same RCU callback that <tt>-&gt;tails[RCU_WAIT_TAIL]</tt>
+does, which indicates that there are no callbacks waiting on the next
+RCU grace period.
+The <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element references
+CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
+remaining RCU callbacks have not yet been assigned to an RCU grace
+period.
+Note that the <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element
+always references the last RCU callback's <tt>-&gt;next</tt> pointer
+unless the callback list is empty, in which case it references
+the <tt>-&gt;head</tt> pointer.
+
+<p>
+There is one additional important special case for the
+<tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
+when this list is <i>disabled</i>.
+Lists are disabled when the corresponding CPU is offline or when
+the corresponding CPU's callbacks are offloaded to a kthread,
+both of which are described elsewhere.
+
+</p><p>CPUs advance their callbacks from the
+<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
+<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
+as grace periods advance.
+
+</p><p>The <tt>-&gt;gp_seq[]</tt> array records grace-period
+numbers corresponding to the list segments.
+This is what allows different CPUs to have different ideas as to
+which is the current grace period while still avoiding premature
+invocation of their callbacks.
+In particular, this allows CPUs that go idle for extended periods
+to determine which of their callbacks are ready to be invoked after
+reawakening.
+
+</p><p>The <tt>-&gt;len</tt> counter contains the number of
+callbacks in <tt>-&gt;head</tt>, and the
+<tt>-&gt;len_lazy</tt> contains the number of those callbacks that
+are known to only free memory, and whose invocation can therefore
+be safely deferred.
+
+<p><b>Important note</b>: It is the <tt>-&gt;len</tt> field that
+determines whether or not there are callbacks associated with
+this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
+pointer.
+The reason for this is that all the ready-to-invoke callbacks
+(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
+all at once at callback-invocation time.
+If callback invocation must be postponed, for example, because a
+high-priority process just woke up on this CPU, then the remaining
+callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
+Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
+are adjusted after the corresponding callbacks have been invoked, and so
+again it is the <tt>-&gt;len</tt> count that accurately reflects whether
+or not there are callbacks associated with this <tt>rcu_segcblist</tt>
+structure.
+Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
+the use of appropriate synchronization, for example, memory barriers.
+This synchronization can be a bit subtle, particularly in the case
+of <tt>rcu_barrier()</tt>.
+
<h3><a name="The rcu_data Structure">
The <tt>rcu_data</tt> Structure</a></h3>
@@ -983,62 +1113,18 @@ choice.
as follows:
<pre>
- 1 struct rcu_head *nxtlist;
- 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
- 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
- 4 long qlen_lazy;
- 5 long qlen;
- 6 long qlen_last_fqs_check;
+ 1 struct rcu_segcblist cblist;
+ 2 long qlen_last_fqs_check;
+ 3 unsigned long n_cbs_invoked;
+ 4 unsigned long n_nocbs_invoked;
+ 5 unsigned long n_cbs_orphaned;
+ 6 unsigned long n_cbs_adopted;
7 unsigned long n_force_qs_snap;
- 8 unsigned long n_cbs_invoked;
- 9 unsigned long n_cbs_orphaned;
-10 unsigned long n_cbs_adopted;
-11 long blimit;
+ 8 long blimit;
</pre>
-<p>The <tt>-&gt;nxtlist</tt> pointer and the
-<tt>-&gt;nxttail[]</tt> array form a four-segment list with
-older callbacks near the head and newer ones near the tail.
-Each segment contains callbacks with the corresponding relationship
-to the current grace period.
-The pointer out of the end of each of the four segments is referenced
-by the element of the <tt>-&gt;nxttail[]</tt> array indexed by
-<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
-<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
-<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
-grace period), and
-<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
-with a specific grace period)
-respectively, as shown in the following figure.
-
-</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
-
-</p><p>In this figure, the <tt>-&gt;nxtlist</tt> pointer references the
-first
-RCU callback in the list.
-The <tt>-&gt;nxttail[RCU_DONE_TAIL]</tt> array element references
-the <tt>-&gt;nxtlist</tt> pointer itself, indicating that none
-of the callbacks is ready to invoke.
-The <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt> array element references callback
-CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
-CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period.
-The <tt>-&gt;nxttail[RCU_NEXT_READY_TAIL]</tt> array element
-references the same RCU callback that <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt>
-does, which indicates that there are no callbacks waiting on the next
-RCU grace period.
-The <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element references
-CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
-remaining RCU callbacks have not yet been assigned to an RCU grace
-period.
-Note that the <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element
-always references the last RCU callback's <tt>-&gt;next</tt> pointer
-unless the callback list is empty, in which case it references
-the <tt>-&gt;nxtlist</tt> pointer.
-
-</p><p>CPUs advance their callbacks from the
-<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
-<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
-as grace periods advance.
+<p>The <tt>-&gt;cblist</tt> structure is the segmented callback list
+described earlier.
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
whenever it notices that another RCU grace period has completed.
The CPU detects the completion of an RCU grace period by noticing
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
<tt>-&gt;completed</tt> field is updated at the end of each
grace period.
-</p><p>The <tt>-&gt;nxtcompleted[]</tt> array records grace-period
-numbers corresponding to the list segments.
-This allows CPUs that go idle for extended periods to determine
-which of their callbacks are ready to be invoked after reawakening.
-
-</p><p>The <tt>-&gt;qlen</tt> counter contains the number of
-callbacks in <tt>-&gt;nxtlist</tt>, and the
-<tt>-&gt;qlen_lazy</tt> contains the number of those callbacks that
-are known to only free memory, and whose invocation can therefore
-be safely deferred.
+<p>
The <tt>-&gt;qlen_last_fqs_check</tt> and
<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent
states from <tt>call_rcu()</tt> and friends when callback
@@ -1069,6 +1146,10 @@ lists grow excessively long.
fields count the number of callbacks invoked,
sent to other CPUs when this CPU goes offline,
and received from other CPUs when those other CPUs go offline.
+The <tt>-&gt;n_nocbs_invoked</tt> is used when the CPU's callbacks
+are offloaded to a kthread.
+
+<p>
Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of
RCU callbacks that may be invoked at a given time.
@@ -1104,6 +1185,9 @@ Its fields are as follows:
1 int dynticks_nesting;
2 int dynticks_nmi_nesting;
3 atomic_t dynticks;
+ 4 bool rcu_need_heavy_qs;
+ 5 unsigned long rcu_qs_ctr;
+ 6 bool rcu_urgent_qs;
</pre>
<p>The <tt>-&gt;dynticks_nesting</tt> field counts the
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>-&gt;dynticks_nmi_nesting</tt>
field, except that NMIs that interrupt non-dyntick-idle execution
are not counted.
-</p><p>Finally, the <tt>-&gt;dynticks</tt> field counts the corresponding
+</p><p>The <tt>-&gt;dynticks</tt> field counts the corresponding
CPU's transitions to and from dyntick-idle mode, so that this counter
has an even value when the CPU is in dyntick-idle mode and an odd
value otherwise.
+</p><p>The <tt>-&gt;rcu_need_heavy_qs</tt> field is used
+to record the fact that the RCU core code would really like to
+see a quiescent state from the corresponding CPU, so much so that
+it is willing to call for heavy-weight dyntick-counter operations.
+This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
+code, which provide a momentary idle sojourn in response.
+
+</p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
+quiescent states from <tt>cond_resched()</tt>.
+Because <tt>cond_resched()</tt> can execute quite frequently, this
+must be quite lightweight, as in a non-atomic increment of this
+per-CPU field.
+
+</p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
+the fact that the RCU core code would really like to see a quiescent
+state from the corresponding CPU, with the various other fields indicating
+just how badly RCU wants this quiescent state.
+This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
+code, which, if nothing else, non-atomically increment <tt>-&gt;rcu_qs_ctr</tt>
+in response.
+
<table>
<tr><th>&nbsp;</th></tr>
<tr><th align="left">Quick Quiz:</th></tr>
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
index abc4cc73a097..0223e79c38e0 100644
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@
id="svg2"
version="1.1"
inkscape:version="0.48.4 r9939"
- sodipodi:docname="nxtlist.fig">
+ sodipodi:docname="segcblist.svg">
<metadata
id="metadata94">
<rdf:RDF>
@@ -28,7 +28,7 @@
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:title></dc:title>
+ <dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
@@ -241,61 +241,51 @@
xml:space="preserve"
x="225"
y="675"
- fill="#000000"
- font-family="Courier"
font-style="normal"
font-weight="bold"
font-size="324"
- text-anchor="start"
- id="text64">nxtlist</text>
+ id="text64"
+ style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;head</text>
<!-- Text -->
<text
xml:space="preserve"
x="225"
y="1800"
- fill="#000000"
- font-family="Courier"
font-style="normal"
font-weight="bold"
font-size="324"
- text-anchor="start"
- id="text66">nxttail[RCU_DONE_TAIL]</text>
+ id="text66"
+ style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_DONE_TAIL]</text>
<!-- Text -->
<text
xml:space="preserve"
x="225"
y="2925"
- fill="#000000"
- font-family="Courier"
font-style="normal"
font-weight="bold"
font-size="324"
- text-anchor="start"
- id="text68">nxttail[RCU_WAIT_TAIL]</text>
+ id="text68"
+ style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_WAIT_TAIL]</text>
<!-- Text -->
<text
xml:space="preserve"
x="225"
y="4050"
- fill="#000000"
- font-family="Courier"
font-style="normal"
font-weight="bold"
font-size="324"
- text-anchor="start"
- id="text70">nxttail[RCU_NEXT_READY_TAIL]</text>
+ id="text70"
+ style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_READY_TAIL]</text>
<!-- Text -->
<text
xml:space="preserve"
x="225"
y="5175"
- fill="#000000"
- font-family="Courier"
font-style="normal"
font-weight="bold"
font-size="324"
- text-anchor="start"
- id="text72">nxttail[RCU_NEXT_TAIL]</text>
+ id="text72"
+ style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_TAIL]</text>
<!-- Text -->
<text
xml:space="preserve"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
index 7a3194c5559a..e5d0bbd0230b 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
Funnel locking and wait/wakeup</a>.
<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
<li> <a href="#Stall Warnings">Stall warnings</a>.
+<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
</ol>
<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
In earlier implementations, the task requesting the expedited
grace period also drove it to completion.
This straightforward approach had the disadvantage of needing to
-account for signals sent to user tasks,
+account for POSIX signals sent to user tasks,
so more recent implemementations use the Linux kernel's
<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
processing, but the task reaching the top of the funnel lock
does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>
so that a workqueue kthread does the actual grace-period processing.
-Because workqueue kthreads do not accept signals, grace-period-wait
-processing need not allow for signals.
+Because workqueue kthreads do not accept POSIX signals, grace-period-wait
+processing need not allow for POSIX signals.
In addition, this approach allows wakeups for the previous expedited
grace period to be overlapped with processing for the next expedited
@@ -586,6 +587,46 @@ blocking the current grace period are printed.
Each stall warning results in another pass through the loop, but the
second and subsequent passes use longer stall times.
+<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
+
+<p>
+The use of workqueues has the advantage that the expedited
+grace-period code need not worry about POSIX signals.
+Unfortunately, it has the
+corresponding disadvantage that workqueues cannot be used until
+they are initialized, which does not happen until some time after
+the scheduler spawns the first task.
+Given that there are parts of the kernel that really do want to
+execute grace periods during this mid-boot &ldquo;dead zone&rdquo;,
+expedited grace periods must do something else during thie time.
+
+<p>
+What they do is to fall back to the old practice of requiring that the
+requesting task drive the expedited grace period, as was the case
+before the use of workqueues.
+However, the requesting task is only required to drive the grace period
+during the mid-boot dead zone.
+Before mid-boot, a synchronous grace period is a no-op.
+Some time after mid-boot, workqueues are used.
+
+<p>
+Non-expedited non-SRCU synchronous grace periods must also operate
+normally during mid-boot.
+This is handled by causing non-expedited grace periods to take the
+expedited code path during mid-boot.
+
+<p>
+The current code assumes that there are no POSIX signals during
+the mid-boot dead zone.
+However, if an overwhelming need for POSIX signals somehow arises,
+appropriate adjustments can be made to the expedited stall-warning code.
+One such adjustment would reinstate the pre-workqueue stall-warning
+checks, but only during the mid-boot dead zone.
+
+<p>
+With this refinement, synchronous grace periods can now be used from
+task context pretty much any time during the life of the kernel.
+
<h3><a name="Summary">
Summary</a></h3>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 21593496aca6..95b30fa25d56 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -559,9 +559,7 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
For <tt>remove_gp_synchronous()</tt>, as long as all modifications
to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
the above optimizations are harmless.
- However,
- with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
- <tt>sparse</tt> will complain if you
+ However, <tt>sparse</tt> will complain if you
define <tt>gp</tt> with <tt>__rcu</tt> and then
access it without using
either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
@@ -659,8 +657,9 @@ systems with more than one CPU:
In other words, a given instance of <tt>synchronize_rcu()</tt>
can avoid waiting on a given RCU read-side critical section only
if it can prove that <tt>synchronize_rcu()</tt> started first.
+ </font>
- <p>
+ <p><font color="ffffff">
A related question is &ldquo;When <tt>rcu_read_lock()</tt>
doesn't generate any code, why does it matter how it relates
to a grace period?&rdquo;
@@ -675,8 +674,9 @@ systems with more than one CPU:
within the critical section, in which case none of the accesses
within the critical section may observe the effects of any
access following the grace period.
+ </font>
- <p>
+ <p><font color="ffffff">
As of late 2016, mathematical models of RCU take this
viewpoint, for example, see slides&nbsp;62 and&nbsp;63
of the
@@ -1616,8 +1616,8 @@ CPUs should at least make reasonable forward progress.
In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
is permitted to impose modest degradation of real-time latency
on non-idle online CPUs.
-That said, it will likely be necessary to take further steps to reduce this
-degradation, hopefully to roughly that of a scheduling-clock interrupt.
+Here, &ldquo;modest&rdquo; means roughly the same latency
+degradation as a scheduling-clock interrupt.
<p>
There are a number of situations where even
@@ -1847,7 +1847,8 @@ mass storage, or user patience, whichever comes first.
If the nesting is not visible to the compiler, as is the case with
mutually recursive functions each in its own translation unit,
stack overflow will result.
-If the nesting takes the form of loops, either the control variable
+If the nesting takes the form of loops, perhaps in the guise of tail
+recursion, either the control variable
will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
Nevertheless, this class of RCU implementations is one
of the most composable constructs in existence.
@@ -1913,12 +1914,9 @@ This requirement is another factor driving batching of grace periods,
but it is also the driving force behind the checks for large numbers
of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
Finally, high update rates should not delay RCU read-side critical
-sections, although some read-side delays can occur when using
+sections, although some small read-side delays can occur when using
<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
-of <tt>try_stop_cpus()</tt>.
-(In the future, <tt>synchronize_rcu_expedited()</tt> will be
-converted to use lighter-weight inter-processor interrupts (IPIs),
-but this will still disturb readers, though to a much smaller degree.)
+of <tt>smp_call_function_single()</tt>.
<p>
Although all three of these corner cases were understood in the early
@@ -1978,9 +1976,8 @@ guard against mishaps and misuse:
and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
substituting a simple assignment.
To catch this sort of error, a given RCU-protected pointer may be
- tagged with <tt>__rcu</tt>, after which running sparse
- with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
- about simple-assignment accesses to that pointer.
+ tagged with <tt>__rcu</tt>, after which sparse
+ will complain about simple-assignment accesses to that pointer.
Arnd Bergmann made me aware of this requirement, and also
supplied the needed
<a href="https://lwn.net/Articles/376011/">patch series</a>.
@@ -2037,7 +2034,7 @@ guard against mishaps and misuse:
some other synchronization mechanism, for example, reference
counting.
<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
- information is provided via both debugfs and event tracing.
+ information is provided via event tracing.
<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
<tt>rcu_dereference()</tt> to create typical linked
data structures can be surprisingly error-prone.
@@ -2154,7 +2151,8 @@ as will <tt>rcu_assign_pointer()</tt>.
<p>
Although <tt>call_rcu()</tt> may be invoked at any
time during boot, callbacks are not guaranteed to be invoked until after
-the scheduler is fully up and running.
+all of RCU's kthreads have been spawned, which occurs at
+<tt>early_initcall()</tt> time.
This delay in callback invocation is due to the fact that RCU does not
invoke callbacks until it is fully initialized, and this full initialization
cannot occur until after the scheduler has initialized itself to the
@@ -2167,8 +2165,10 @@ on what operations those callbacks could invoke.
Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
(<a href="#Bottom-Half Flavor">discussed below</a>),
-and
-<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
+<tt>synchronize_rcu_expedited()</tt>,
+<tt>synchronize_rcu_bh_expedited()</tt>, and
+<tt>synchronize_sched_expedited()</tt>
will all operate normally
during very early boot, the reason being that there is only one CPU
and preemption is disabled.
@@ -2178,45 +2178,59 @@ state and thus a grace period, so the early-boot implementation can
be a no-op.
<p>
-Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
-continue to operate normally through the remainder of boot, courtesy
-of the fact that preemption is disabled across their RCU read-side
-critical sections and also courtesy of the fact that there is still
-only one CPU.
-However, once the scheduler starts initializing, preemption is enabled.
-There is still only a single CPU, but the fact that preemption is enabled
-means that the no-op implementation of <tt>synchronize_rcu()</tt> no
-longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
-Therefore, as soon as the scheduler starts initializing, the early-boot
-fastpath is disabled.
-This means that <tt>synchronize_rcu()</tt> switches to its runtime
-mode of operation where it posts callbacks, which in turn means that
-any call to <tt>synchronize_rcu()</tt> will block until the corresponding
-callback is invoked.
-Unfortunately, the callback cannot be invoked until RCU's runtime
-grace-period machinery is up and running, which cannot happen until
-the scheduler has initialized itself sufficiently to allow RCU's
-kthreads to be spawned.
-Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
-initialization can result in deadlock.
+However, once the scheduler has spawned its first kthread, this early
+boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
+<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
+kernels.
+The reason is that an RCU read-side critical section might be preempted,
+which means that a subsequent <tt>synchronize_rcu()</tt> really does have
+to wait for something, as opposed to simply returning immediately.
+Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
+its kthreads are spawned, which doesn't happen until some time during
+<tt>early_initcalls()</tt> time.
+But this is no excuse: RCU is nevertheless required to correctly handle
+synchronous grace periods during this time period.
+Once all of its kthreads are up and running, RCU starts running
+normally.
<table>
<tr><th>&nbsp;</th></tr>
<tr><th align="left">Quick Quiz:</th></tr>
<tr><td>
- So what happens with <tt>synchronize_rcu()</tt> during
- scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
- kernels?
+ How can RCU possibly handle grace periods before all of its
+ kthreads have been spawned???
</td></tr>
<tr><th align="left">Answer:</th></tr>
<tr><td bgcolor="#ffffff"><font color="ffffff">
- In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt>
- maps directly to <tt>synchronize_sched()</tt>.
- Therefore, <tt>synchronize_rcu()</tt> works normally throughout
- boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
- However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
- so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
- during scheduler initialization.
+ Very carefully!
+ </font>
+
+ <p><font color="ffffff">
+ During the &ldquo;dead zone&rdquo; between the time that the
+ scheduler spawns the first task and the time that all of RCU's
+ kthreads have been spawned, all synchronous grace periods are
+ handled by the expedited grace-period mechanism.
+ At runtime, this expedited mechanism relies on workqueues, but
+ during the dead zone the requesting task itself drives the
+ desired expedited grace period.
+ Because dead-zone execution takes place within task context,
+ everything works.
+ Once the dead zone ends, expedited grace periods go back to
+ using workqueues, as is required to avoid problems that would
+ otherwise occur when a user task received a POSIX signal while
+ driving an expedited grace period.
+ </font>
+
+ <p><font color="ffffff">
+ And yes, this does mean that it is unhelpful to send POSIX
+ signals to random tasks between the time that the scheduler
+ spawns its first kthread and the time that RCU's kthreads
+ have all been spawned.
+ If there ever turns out to be a good reason for sending POSIX
+ signals during that time, appropriate adjustments will be made.
+ (If it turns out that POSIX signals are sent during this time for
+ no good reason, other adjustments will be made, appropriate
+ or otherwise.)
</font></td></tr>
<tr><td>&nbsp;</td></tr>
</table>
@@ -2295,12 +2309,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
The need for <tt>rcu_barrier()</tt> for module unloading became
apparent later.
+<p>
+<b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
+repeat, <i>not</i>, obligated to wait for a grace period.
+It is instead only required to wait for RCU callbacks that have
+already been posted.
+Therefore, if there are no RCU callbacks posted anywhere in the system,
+<tt>rcu_barrier()</tt> is within its rights to return immediately.
+Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
+necessarily need to wait for a grace period.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Wait a minute!
+ Each RCU callbacks must wait for a grace period to complete,
+ and <tt>rcu_barrier()</tt> must wait for each pre-existing
+ callback to be invoked.
+ Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
+ a full grace period if there is even one callback posted anywhere
+ in the system?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Absolutely not!!!
+ </font>
+
+ <p><font color="ffffff">
+ Yes, each RCU callbacks must wait for a grace period to complete,
+ but it might well be partly (or even completely) finished waiting
+ by the time <tt>rcu_barrier()</tt> is invoked.
+ In that case, <tt>rcu_barrier()</tt> need only wait for the
+ remaining portion of the grace period to elapse.
+ So even if there are quite a few callbacks posted,
+ <tt>rcu_barrier()</tt> might well return quite quickly.
+ </font>
+
+ <p><font color="ffffff">
+ So if you need to wait for a grace period as well as for all
+ pre-existing callbacks, you will need to invoke both
+ <tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
+ If latency is a concern, you can always use workqueues
+ to invoke them concurrently.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
<p>
The Linux kernel supports CPU hotplug, which means that CPUs
can come and go.
-It is of course illegal to use any RCU API member from an offline CPU.
+It is of course illegal to use any RCU API member from an offline CPU,
+with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
+critical sections.
This requirement was present from day one in DYNIX/ptx, but
on the other hand, the Linux kernel's CPU-hotplug implementation
is &ldquo;interesting.&rdquo;
@@ -2310,19 +2373,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
are used to allow the various kernel subsystems (including RCU)
to respond appropriately to a given CPU-hotplug operation.
Most RCU operations may be invoked from CPU-hotplug notifiers,
-including even normal synchronous grace-period operations
-such as <tt>synchronize_rcu()</tt>.
-However, expedited grace-period operations such as
-<tt>synchronize_rcu_expedited()</tt> are not supported,
-due to the fact that current implementations block CPU-hotplug
-operations, which could result in deadlock.
+including even synchronous grace-period operations such as
+<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
<p>
-In addition, all-callback-wait operations such as
+However, all-callback-wait operations such as
<tt>rcu_barrier()</tt> are also not supported, due to the
fact that there are phases of CPU-hotplug operations where
the outgoing CPU's callbacks will not be invoked until after
the CPU-hotplug operation ends, which could also result in deadlock.
+Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
+during its execution, which results in another type of deadlock
+when invoked from a CPU-hotplug notifier.
<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
@@ -2455,11 +2517,7 @@ It is similarly socially unacceptable to interrupt an
<tt>nohz_full</tt> CPU running in userspace.
RCU must therefore track <tt>nohz_full</tt> userspace
execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
+RCU must therefore be able to sample state at two points in
time, and be able to determine whether or not some other CPU spent
any time idle and/or executing in userspace.
@@ -2864,6 +2922,41 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
guarantees a full memory barrier.
<p>
+Also unlike other RCU flavors, SRCU's callbacks-wait function
+<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
+though this is not necessarily a good idea.
+The reason that this is possible is that SRCU is insensitive
+to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
+need not exclude CPU-hotplug operations.
+
+<p>
+SRCU also differs from other RCU flavors in that SRCU's expedited and
+non-expedited grace periods are implemented by the same mechanism.
+This means that in the current SRCU implementation, expediting a
+future grace period has the side effect of expediting all prior
+grace periods that have not yet completed.
+(But please note that this is a property of the current implementation,
+not necessarily of future implementations.)
+In addition, if SRCU has been idle for longer than the interval
+specified by the <tt>srcutree.exp_holdoff</tt> kernel boot parameter
+(25&nbsp;microseconds by default),
+and if a <tt>synchronize_srcu()</tt> invocation ends this idle period,
+that invocation will be automatically expedited.
+
+<p>
+As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
+a locking bottleneck present in prior kernel versions.
+Although this will allow users to put much heavier stress on
+<tt>call_srcu()</tt>, it is important to note that SRCU does not
+yet take any special steps to deal with callback flooding.
+So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
+you are probably totally OK, but if you intend to post (say) 1,000,000
+SRCU callbacks per second per CPU, please run some tests first.
+SRCU just might need a few adjustment to deal with that sort of load.
+Of course, your mileage may vary based on the speed of your CPUs and
+the size of your memory.
+
+<p>
The
<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
includes
@@ -3021,8 +3114,8 @@ to do some redesign to avoid this scalability problem.
<p>
RCU disables CPU hotplug in a few places, perhaps most notably in the
-expedited grace-period and <tt>rcu_barrier()</tt> operations.
-If there is a strong reason to use expedited grace periods in CPU-hotplug
+<tt>rcu_barrier()</tt> operations.
+If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
notifiers, it will be necessary to avoid disabling CPU hotplug.
This would introduce some complexity, so there had better be a <i>very</i>
good reason.
@@ -3096,9 +3189,5 @@ Andy Lutomirski for their help in rendering
this article human readable, and to Michelle Rankin for her support
of this effort.
Other contributions are acknowledged in the Linux kernel's git archive.
-The cartoon is copyright (c) 2013 by Melissa Broussard,
-and is provided
-under the terms of the Creative Commons Attribution-Share Alike 3.0
-United States license.
</body></html>
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 877947130ebe..6beda556faf3 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -413,11 +413,11 @@ over a rather long period of time, but improvements are always welcome!
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
-17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
- __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
- validate your RCU code. These can help find problems as follows:
+17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
+ __rcu sparse checks to validate your RCU code. These can help
+ find problems as follows:
- CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+ CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
structures are carried out under the proper RCU
read-side critical section, while holding the right
combination of locks, or whatever other conditions
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index c0bf2441a2ba..b2a613f16d74 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from
This sort of comparison occurs frequently when scanning
RCU-protected circular linked lists.
+ Note that if checks for being within an RCU read-side
+ critical section are not required and the pointer is never
+ dereferenced, rcu_access_pointer() should be used in place
+ of rcu_dereference(). The rcu_access_pointer() primitive
+ does not require an enclosing read-side critical section,
+ and also omits the smp_read_barrier_depends() included in
+ rcu_dereference(), which in turn should provide a small
+ performance gain in some CPUs (e.g., the DEC Alpha).
+
o The comparison is against a pointer that references memory
that was initialized "a long time ago." The reason
this is safe is that even if misordering occurs, the
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 18f9651ff23d..8151f0195f76 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@
Using hlist_nulls to protect read-mostly linked lists and
-objects using SLAB_DESTROY_BY_RCU allocations.
+objects using SLAB_TYPESAFE_BY_RCU allocations.
Please read the basics in Documentation/RCU/listRCU.txt
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way
to solve following problem :
A typical RCU linked list managing objects which are
-allocated with SLAB_DESTROY_BY_RCU kmem_cache can
+allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
use following algos :
1) Lookup algo
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
3) Remove algo
--------------
Nothing special here, we can use a standard RCU hlist deletion.
-But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused
+But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
very very fast (before the end of RCU grace period)
if (put_last_reference_on(obj) {
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e93d04133fe7..96a3d81837e1 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@
Using RCU's CPU Stall Detector
-The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall
-detector, which detects conditions that unduly delay RCU grace periods.
-This module parameter enables CPU stall detection by default, but
-may be overridden via boot-time parameter or at runtime via sysfs.
+This document first discusses what sorts of issues RCU's CPU stall
+detector can locate, and then discusses kernel parameters and Kconfig
+options that can be used to fine-tune the detector's operation. Finally,
+this document explains the stall detector's "splat" format.
+
+
+What Causes RCU CPU Stall Warnings?
+
+So your kernel printed an RCU CPU stall warning. The next question is
+"What caused it?" The following problems can result in RCU CPU stall
+warnings:
+
+o A CPU looping in an RCU read-side critical section.
+
+o A CPU looping with interrupts disabled.
+
+o A CPU looping with preemption disabled. This condition can
+ result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
+ stalls.
+
+o A CPU looping with bottom halves disabled. This condition can
+ result in RCU-sched and RCU-bh stalls.
+
+o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
+ kernel without invoking schedule(). Note that cond_resched()
+ does not necessarily prevent RCU CPU stall warnings. Therefore,
+ if the looping in the kernel is really expected and desirable
+ behavior, you might need to replace some of the cond_resched()
+ calls with calls to cond_resched_rcu_qs().
+
+o Booting Linux using a console connection that is too slow to
+ keep up with the boot-time console-message rate. For example,
+ a 115Kbaud serial console can be -way- too slow to keep up
+ with boot-time message rates, and will frequently result in
+ RCU CPU stall warning messages. Especially if you have added
+ debug printk()s.
+
+o Anything that prevents RCU's grace-period kthreads from running.
+ This can result in the "All QSes seen" console-log message.
+ This message will include information on when the kthread last
+ ran and how often it should be expected to run.
+
+o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+ happen to preempt a low-priority task in the middle of an RCU
+ read-side critical section. This is especially damaging if
+ that low-priority task is not permitted to run on any other CPU,
+ in which case the next RCU grace period can never complete, which
+ will eventually cause the system to run out of memory and hang.
+ While the system is in the process of running itself out of
+ memory, you might see stall-warning messages.
+
+o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+ is running at a higher priority than the RCU softirq threads.
+ This will prevent RCU callbacks from ever being invoked,
+ and in a CONFIG_PREEMPT_RCU kernel will further prevent
+ RCU grace periods from ever completing. Either way, the
+ system will eventually run out of memory and hang. In the
+ CONFIG_PREEMPT_RCU case, you might see stall-warning
+ messages.
+
+o A hardware or software issue shuts off the scheduler-clock
+ interrupt on a CPU that is not in dyntick-idle mode. This
+ problem really has happened, and seems to be most likely to
+ result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
+
+o A bug in the RCU implementation.
+
+o A hardware failure. This is quite unlikely, but has occurred
+ at least once in real life. A CPU failed in a running system,
+ becoming unresponsive, but not causing an immediate crash.
+ This resulted in a series of RCU CPU stall warnings, eventually
+ leading the realization that the CPU had failed.
+
+The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
+warning. Note that SRCU does -not- have CPU stall warnings. Please note
+that RCU only detects CPU stalls when there is a grace period in progress.
+No grace period, no CPU stall warnings.
+
+To diagnose the cause of the stall, inspect the stack traces.
+The offending function will usually be near the top of the stack.
+If you have a series of stall warnings from a single extended stall,
+comparing the stack traces can often help determine where the stall
+is occurring, which will usually be in the function nearest the top of
+that portion of the stack which remains the same from trace to trace.
+If you can reliably trigger the stall, ftrace can be quite helpful.
+
+RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
+and with RCU's event tracing. For information on RCU's event tracing,
+see include/trace/events/rcu.h.
+
+
+Fine-Tuning the RCU CPU Stall Detector
+
+The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
+CPU stall detector, which detects conditions that unduly delay RCU grace
+periods. This module parameter enables CPU stall detection by default,
+but may be overridden via boot-time parameter or at runtime via sysfs.
The stall detector's idea of what constitutes "unduly delayed" is
controlled by a set of kernel configuration variables and cpp macros:
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
And continues with the output of sched_show_task() for each
task stalling the current RCU-tasks grace period.
+
+Interpreting RCU's CPU Stall-Detector "Splats"
+
For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
it will print a message similar to the following:
@@ -178,89 +274,3 @@ grace period is in flight.
It is entirely possible to see stall warnings from normal and from
expedited grace periods at about the same time from the same run.
-
-
-What Causes RCU CPU Stall Warnings?
-
-So your kernel printed an RCU CPU stall warning. The next question is
-"What caused it?" The following problems can result in RCU CPU stall
-warnings:
-
-o A CPU looping in an RCU read-side critical section.
-
-o A CPU looping with interrupts disabled. This condition can
- result in RCU-sched and RCU-bh stalls.
-
-o A CPU looping with preemption disabled. This condition can
- result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
- stalls.
-
-o A CPU looping with bottom halves disabled. This condition can
- result in RCU-sched and RCU-bh stalls.
-
-o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
- kernel without invoking schedule(). Note that cond_resched()
- does not necessarily prevent RCU CPU stall warnings. Therefore,
- if the looping in the kernel is really expected and desirable
- behavior, you might need to replace some of the cond_resched()
- calls with calls to cond_resched_rcu_qs().
-
-o Booting Linux using a console connection that is too slow to
- keep up with the boot-time console-message rate. For example,
- a 115Kbaud serial console can be -way- too slow to keep up
- with boot-time message rates, and will frequently result in
- RCU CPU stall warning messages. Especially if you have added
- debug printk()s.
-
-o Anything that prevents RCU's grace-period kthreads from running.
- This can result in the "All QSes seen" console-log message.
- This message will include information on when the kthread last
- ran and how often it should be expected to run.
-
-o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
- happen to preempt a low-priority task in the middle of an RCU
- read-side critical section. This is especially damaging if
- that low-priority task is not permitted to run on any other CPU,
- in which case the next RCU grace period can never complete, which
- will eventually cause the system to run out of memory and hang.
- While the system is in the process of running itself out of
- memory, you might see stall-warning messages.
-
-o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
- is running at a higher priority than the RCU softirq threads.
- This will prevent RCU callbacks from ever being invoked,
- and in a CONFIG_PREEMPT_RCU kernel will further prevent
- RCU grace periods from ever completing. Either way, the
- system will eventually run out of memory and hang. In the
- CONFIG_PREEMPT_RCU case, you might see stall-warning
- messages.
-
-o A hardware or software issue shuts off the scheduler-clock
- interrupt on a CPU that is not in dyntick-idle mode. This
- problem really has happened, and seems to be most likely to
- result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
-
-o A bug in the RCU implementation.
-
-o A hardware failure. This is quite unlikely, but has occurred
- at least once in real life. A CPU failed in a running system,
- becoming unresponsive, but not causing an immediate crash.
- This resulted in a series of RCU CPU stall warnings, eventually
- leading the realization that the CPU had failed.
-
-The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
-warning. Note that SRCU does -not- have CPU stall warnings. Please note
-that RCU only detects CPU stalls when there is a grace period in progress.
-No grace period, no CPU stall warnings.
-
-To diagnose the cause of the stall, inspect the stack traces.
-The offending function will usually be near the top of the stack.
-If you have a series of stall warnings from a single extended stall,
-comparing the stack traces can often help determine where the stall
-is occurring, which will usually be in the function nearest the top of
-that portion of the stack which remains the same from trace to trace.
-If you can reliably trigger the stall, ftrace can be quite helpful.
-
-RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
-and with RCU's event tracing. For information on RCU's event tracing,
-see include/trace/events/rcu.h.
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
deleted file mode 100644
index 6549012033f9..000000000000
--- a/Documentation/RCU/trace.txt
+++ /dev/null
@@ -1,535 +0,0 @@
-CONFIG_RCU_TRACE debugfs Files and Formats
-
-
-The rcutree and rcutiny implementations of RCU provide debugfs trace
-output that summarizes counters and state. This information is useful for
-debugging RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats, first
-for rcutree and next for rcutiny.
-
-
-CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats
-
-These implementations of RCU provide several debugfs directories under the
-top-level directory "rcu":
-
-rcu/rcu_bh
-rcu/rcu_preempt
-rcu/rcu_sched
-
-Each directory contains files for the corresponding flavor of RCU.
-Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU.
-For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
-so that activity for both appears in rcu/rcu_sched.
-
-In addition, the following file appears in the top-level directory:
-rcu/rcutorture. This file displays rcutorture test progress. The output
-of "cat rcu/rcutorture" looks as follows:
-
-rcutorture test sequence: 0 (test in progress)
-rcutorture update version number: 615
-
-The first line shows the number of rcutorture tests that have completed
-since boot. If a test is currently running, the "(test in progress)"
-string will appear as shown above. The second line shows the number of
-update cycles that the current test has started, or zero if there is
-no test in progress.
-
-
-Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
-also rcu/rcu_preempt) the following files will be present:
-
-rcudata:
- Displays fields in struct rcu_data.
-rcuexp:
- Displays statistics for expedited grace periods.
-rcugp:
- Displays grace-period counters.
-rcuhier:
- Displays the struct rcu_node hierarchy.
-rcu_pending:
- Displays counts of the reasons rcu_pending() decided that RCU had
- work to do.
-rcuboost:
- Displays RCU boosting statistics. Only present if
- CONFIG_RCU_BOOST=y.
-
-The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
-
- 0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
- 1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
- 2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
- 3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
- 4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
- 5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
- 6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
- 7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
-
-This file has one line per CPU, or eight for this 8-CPU system.
-The fields are as follows:
-
-o The number at the beginning of each line is the CPU number.
- CPUs numbers followed by an exclamation mark are offline,
- but have been online at least once since boot. There will be
- no output for CPUs that have never been online, which can be
- a good thing in the surprisingly common case where NR_CPUS is
- substantially larger than the number of actual CPUs.
-
-o "c" is the count of grace periods that this CPU believes have
- completed. Offlined CPUs and CPUs in dynticks idle mode may lag
- quite a ways behind, for example, CPU 4 under "rcu_sched" above,
- which has been offline through 16 RCU grace periods. It is not
- unusual to see offline CPUs lagging by thousands of grace periods.
- Note that although the grace-period number is an unsigned long,
- it is printed out as a signed long to allow more human-friendly
- representation near boot time.
-
-o "g" is the count of grace periods that this CPU believes have
- started. Again, offlined CPUs and CPUs in dynticks idle mode
- may lag behind. If the "c" and "g" values are equal, this CPU
- has already reported a quiescent state for the last RCU grace
- period that it is aware of, otherwise, the CPU believes that it
- owes RCU a quiescent state.
-
-o "pq" indicates that this CPU has passed through a quiescent state
- for the current grace period. It is possible for "pq" to be
- "1" and "c" different than "g", which indicates that although
- the CPU has passed through a quiescent state, either (1) this
- CPU has not yet reported that fact, (2) some other CPU has not
- yet reported for this grace period, or (3) both.
-
-o "qp" indicates that RCU still expects a quiescent state from
- this CPU. Offlined CPUs and CPUs in dyntick idle mode might
- well have qp=1, which is OK: RCU is still ignoring them.
-
-o "dt" is the current value of the dyntick counter that is incremented
- when entering or leaving idle, either due to a context switch or
- due to an interrupt. This number is even if the CPU is in idle
- from RCU's viewpoint and odd otherwise. The number after the
- first "/" is the interrupt nesting depth when in idle state,
- or a large number added to the interrupt-nesting depth when
- running a non-idle task. Some architectures do not accurately
- count interrupt nesting when running in non-idle kernel context,
- which can result in interesting anomalies such as negative
- interrupt-nesting levels. The number after the second "/"
- is the NMI nesting depth.
-
-o "df" is the number of times that some other CPU has forced a
- quiescent state on behalf of this CPU due to this CPU being in
- idle state.
-
-o "of" is the number of times that some other CPU has forced a
- quiescent state on behalf of this CPU due to this CPU being
- offline. In a perfect world, this might never happen, but it
- turns out that offlining and onlining a CPU can take several grace
- periods, and so there is likely to be an extended period of time
- when RCU believes that the CPU is online when it really is not.
- Please note that erring in the other direction (RCU believing a
- CPU is offline when it is really alive and kicking) is a fatal
- error, so it makes sense to err conservatively.
-
-o "ql" is the number of RCU callbacks currently residing on
- this CPU. The first number is the number of "lazy" callbacks
- that are known to RCU to only be freeing memory, and the number
- after the "/" is the total number of callbacks, lazy or not.
- These counters count callbacks regardless of what phase of
- grace-period processing that they are in (new, waiting for
- grace period to start, waiting for grace period to end, ready
- to invoke).
-
-o "qs" gives an indication of the state of the callback queue
- with four characters:
-
- "N" Indicates that there are callbacks queued that are not
- ready to be handled by the next grace period, and thus
- will be handled by the grace period following the next
- one.
-
- "R" Indicates that there are callbacks queued that are
- ready to be handled by the next grace period.
-
- "W" Indicates that there are callbacks queued that are
- waiting on the current grace period.
-
- "D" Indicates that there are callbacks queued that have
- already been handled by a prior grace period, and are
- thus waiting to be invoked. Note that callbacks in
- the process of being invoked are not counted here.
- Callbacks in the process of being invoked are those
- that have been removed from the rcu_data structures
- queues by rcu_do_batch(), but which have not yet been
- invoked.
-
- If there are no callbacks in a given one of the above states,
- the corresponding character is replaced by ".".
-
-o "b" is the batch limit for this CPU. If more than this number
- of RCU callbacks is ready to invoke, then the remainder will
- be deferred.
-
-o "ci" is the number of RCU callbacks that have been invoked for
- this CPU. Note that ci+nci+ql is the number of callbacks that have
- been registered in absence of CPU-hotplug activity.
-
-o "nci" is the number of RCU callbacks that have been offloaded from
- this CPU. This will always be zero unless the kernel was built
- with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
- parameter was specified.
-
-o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline. These orphaned callbacks have been moved
- to an arbitrarily chosen online CPU.
-
-o "ca" is the number of RCU callbacks that have been adopted by this
- CPU due to other CPUs going offline. Note that ci+co-ca+ql is
- the number of RCU callbacks registered on this CPU.
-
-
-Kernels compiled with CONFIG_RCU_BOOST=y display the following from
-/debug/rcu/rcu_preempt/rcudata:
-
- 0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
- 1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
- 2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
- 3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
- 4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
- 5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
- 6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
- 7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
-
-This is similar to the output discussed above, but contains the following
-additional fields:
-
-o "kt" is the per-CPU kernel-thread state. The digit preceding
- the first slash is zero if there is no work pending and 1
- otherwise. The character between the first pair of slashes is
- as follows:
-
- "S" The kernel thread is stopped, in other words, all
- CPUs corresponding to this rcu_node structure are
- offline.
-
- "R" The kernel thread is running.
-
- "W" The kernel thread is waiting because there is no work
- for it to do.
-
- "O" The kernel thread is waiting because it has been
- forced off of its designated CPU or because its
- ->cpus_allowed mask permits it to run on other than
- its designated CPU.
-
- "Y" The kernel thread is yielding to avoid hogging CPU.
-
- "?" Unknown value, indicates a bug.
-
- The number after the final slash is the CPU that the kthread
- is actually running on.
-
- This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
- the number of times that this CPU's per-CPU kthread has gone
- through its loop servicing invoke_rcu_cpu_kthread() requests.
-
- This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-
-The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
-
-s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872
-
-These fields are as follows:
-
-o "s" is the sequence number, with an odd number indicating that
- an expedited grace period is in progress.
-
-o "wd1", "wd2", and "wd3" are the number of times that an attempt
- to start an expedited grace period found that someone else had
- completed an expedited grace period that satisfies the attempted
- request. "Our work is done."
-
-o "enq" is the number of quiescent states still outstanding.
-
-o "sc" is the number of times that the attempt to start a
- new expedited grace period succeeded.
-
-
-The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
-
-completed=31249 gpnum=31250 age=1 max=18
-
-These fields are taken from the rcu_state structure, and are as follows:
-
-o "completed" is the number of grace periods that have completed.
- It is comparable to the "c" field from rcu/rcudata in that a
- CPU whose "c" field matches the value of "completed" is aware
- that the corresponding RCU grace period has completed.
-
-o "gpnum" is the number of grace periods that have started. It is
- similarly comparable to the "g" field from rcu/rcudata in that
- a CPU whose "g" field matches the value of "gpnum" is aware that
- the corresponding RCU grace period has started.
-
- If these two fields are equal, then there is no grace period
- in progress, in other words, RCU is idle. On the other hand,
- if the two fields differ (as they are above), then an RCU grace
- period is in progress.
-
-o "age" is the number of jiffies that the current grace period
- has extended for, or zero if there is no grace period currently
- in effect.
-
-o "max" is the age in jiffies of the longest-duration grace period
- thus far.
-
-The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
-
-c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
-3/3 ..>. 0:7 ^0
-e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
-
-The fields are as follows:
-
-o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
-
-o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
-
-o "s" is the current state of the force_quiescent_state()
- state machine.
-
-o "jfq" is the number of jiffies remaining for this grace period
- before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in idle mode throughout the grace period
- will not report on their own, but rather must be check by some
- other CPU via force_quiescent_state().
-
-o "j" is the low-order four hex digits of the jiffies counter.
- Yes, Paul did run into a number of problems that turned out to
- be due to the jiffies counter no longer counting. Why do you ask?
-
-o "nfqs" is the number of calls to force_quiescent_state() since
- boot.
-
-o "nfqsng" is the number of useless calls to force_quiescent_state(),
- where there wasn't actually a grace period active. This can
- no longer happen due to grace-period processing being pushed
- into a kthread. The number in parentheses is the difference
- between "nfqs" and "nfqsng", or the number of times that
- force_quiescent_state() actually did some real work.
-
-o "fqlh" is the number of calls to force_quiescent_state() that
- exited immediately (without even being counted in nfqs above)
- due to contention on ->fqslock.
-
-o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
- structure. Each line represents one level of the hierarchy,
- from root to leaves. It is best to think of the rcu_data
- structures as forming yet another level after the leaves.
- Note that there might be either one, two, three, or even four
- levels of rcu_node structures, depending on the relationship
- between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
- adjusted using the rcu_fanout_leaf kernel boot parameter), and
- CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
- possible CPUs for the booting hardware).
-
- o The numbers separated by the "/" are the qsmask followed
- by the qsmaskinit. The qsmask will have one bit
- set for each entity in the next lower level that has
- not yet checked in for the current grace period ("e"
- indicating CPUs 5, 6, and 7 in the example above).
- The qsmaskinit will have one bit for each entity that is
- currently expected to check in during each grace period.
- The value of qsmaskinit is assigned to that of qsmask
- at the beginning of each grace period.
-
- o The characters separated by the ">" indicate the state
- of the blocked-tasks lists. A "G" preceding the ">"
- indicates that at least one task blocked in an RCU
- read-side critical section blocks the current grace
- period, while a "E" preceding the ">" indicates that
- at least one task blocked in an RCU read-side critical
- section blocks the current expedited grace period.
- A "T" character following the ">" indicates that at
- least one task is blocked within an RCU read-side
- critical section, regardless of whether any current
- grace period (expedited or normal) is inconvenienced.
- A "." character appears if the corresponding condition
- does not hold, so that "..>." indicates that no tasks
- are blocked. In contrast, "GE>T" indicates maximal
- inconvenience from blocked tasks. CONFIG_TREE_RCU
- builds of the kernel will always show "..>.".
-
- o The numbers separated by the ":" are the range of CPUs
- served by this struct rcu_node. This can be helpful
- in working out how the hierarchy is wired together.
-
- For example, the example rcu_node structure shown above
- has "0:7", indicating that it covers CPUs 0 through 7.
-
- o The number after the "^" indicates the bit in the
- next higher level rcu_node structure that this rcu_node
- structure corresponds to. For example, the "d/d ..>. 4:7
- ^1" has a "1" in this position, indicating that it
- corresponds to the "1" bit in the "3" shown in the
- "3/3 ..>. 0:7 ^0" entry on the next level up.
-
-
-The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
-
- 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
- 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
- 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
- 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
- 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
- 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
- 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
- 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
-
-The fields are as follows:
-
-o The leading number is the CPU number, with "!" indicating
- an offline CPU.
-
-o "np" is the number of times that __rcu_pending() has been invoked
- for the corresponding flavor of RCU.
-
-o "qsp" is the number of times that the RCU was waiting for a
- quiescent state from this CPU.
-
-o "rpq" is the number of times that the CPU had passed through
- a quiescent state, but not yet reported it to RCU.
-
-o "cbr" is the number of times that this CPU had RCU callbacks
- that had passed through a grace period, and were thus ready
- to be invoked.
-
-o "cng" is the number of times that this CPU needed another
- grace period while RCU was idle.
-
-o "gpc" is the number of times that an old grace period had
- completed, but this CPU was not yet aware of it.
-
-o "gps" is the number of times that a new grace period had started,
- but this CPU was not yet aware of it.
-
-o "ndw" is the number of times that a wakeup of an rcuo
- callback-offload kthread had to be deferred in order to avoid
- deadlock.
-
-o "nn" is the number of times that this CPU needed nothing.
-
-
-The output of "cat rcu/rcuboost" looks as follows:
-
-0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
- balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
-4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
- balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
-
-This information is output only for rcu_preempt. Each two-line entry
-corresponds to a leaf rcu_node structure. The fields are as follows:
-
-o "n:m" is the CPU-number range for the corresponding two-line
- entry. In the sample output above, the first entry covers
- CPUs zero through three and the second entry covers CPUs four
- through seven.
-
-o "tasks=TNEB" gives the state of the various segments of the
- rnp->blocked_tasks list:
-
- "T" This indicates that there are some tasks that blocked
- while running on one of the corresponding CPUs while
- in an RCU read-side critical section.
-
- "N" This indicates that some of the blocked tasks are preventing
- the current normal (non-expedited) grace period from
- completing.
-
- "E" This indicates that some of the blocked tasks are preventing
- the current expedited grace period from completing.
-
- "B" This indicates that some of the blocked tasks are in
- need of RCU priority boosting.
-
- Each character is replaced with "." if the corresponding
- condition does not hold.
-
-o "kt" is the state of the RCU priority-boosting kernel
- thread associated with the corresponding rcu_node structure.
- The state can be one of the following:
-
- "S" The kernel thread is stopped, in other words, all
- CPUs corresponding to this rcu_node structure are
- offline.
-
- "R" The kernel thread is running.
-
- "W" The kernel thread is waiting because there is no work
- for it to do.
-
- "Y" The kernel thread is yielding to avoid hogging CPU.
-
- "?" Unknown value, indicates a bug.
-
-o "ntb" is the number of tasks boosted.
-
-o "neb" is the number of tasks boosted in order to complete an
- expedited grace period.
-
-o "nnb" is the number of tasks boosted in order to complete a
- normal (non-expedited) grace period. When boosting a task
- that was blocking both an expedited and a normal grace period,
- it is counted against the expedited total above.
-
-o "j" is the low-order 16 bits of the jiffies counter in
- hexadecimal.
-
-o "bt" is the low-order 16 bits of the value that the jiffies
- counter will have when we next start boosting, assuming that
- the current grace period does not end beforehand. This is
- also in hexadecimal.
-
-o "balk: nt" counts the number of times we didn't boost (in
- other words, we balked) even though it was time to boost because
- there were no blocked tasks to boost. This situation occurs
- when there is one blocked task on one rcu_node structure and
- none on some other rcu_node structure.
-
-o "egt" counts the number of times we balked because although
- there were blocked tasks, none of them were blocking the
- current grace period, whether expedited or otherwise.
-
-o "bt" counts the number of times we balked because boosting
- had already been initiated for the current grace period.
-
-o "nb" counts the number of times we balked because there
- was at least one task blocking the current non-expedited grace
- period that never had blocked. If it is already running, it
- just won't help to boost its priority!
-
-o "ny" counts the number of times we balked because it was
- not yet time to start boosting.
-
-o "nos" counts the number of times we balked for other
- reasons, e.g., the grace period ended first.
-
-
-CONFIG_TINY_RCU debugfs Files and Formats
-
-These implementations of RCU provides a single debugfs file under the
-top-level directory RCU, namely rcu/rcudata, which displays fields in
-rcu_bh_ctrlblk and rcu_sched_ctrlblk.
-
-The output of "cat rcu/rcudata" is as follows:
-
-rcu_sched: qlen: 0
-rcu_bh: qlen: 0
-
-This is split into rcu_sched and rcu_bh sections. The field is as
-follows:
-
-o "qlen" is the number of RCU callbacks currently waiting either
- for an RCU grace period or waiting to be invoked. This is the
- only field present for rcu_sched and rcu_bh, due to the
- short-circuiting of grace period in those two cases.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 5cbd8b2395b8..8ed6c9f6133c 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
familiar locking primitives. Its overhead makes it a non-starter for
real-life use, as does its lack of scalability. It is also unsuitable
for realtime use, since it allows scheduling latency to "bleed" from
-one read-side critical section to another.
+one read-side critical section to another. It also assumes recursive
+reader-writer locks: If you try this with non-recursive locks, and
+you allow nested rcu_read_lock() calls, you can deadlock.
However, it is probably the easiest implementation to relate to, so is
a good starting point.
@@ -587,20 +589,21 @@ It is extremely simple:
write_unlock(&rcu_gp_mutex);
}
-[You can ignore rcu_assign_pointer() and rcu_dereference() without
-missing much. But here they are anyway. And whatever you do, don't
-forget about them when submitting patches making use of RCU!]
+[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
+much. But here are simplified versions anyway. And whatever you do,
+don't forget about them when submitting patches making use of RCU!]
- #define rcu_assign_pointer(p, v) ({ \
- smp_wmb(); \
- (p) = (v); \
- })
+ #define rcu_assign_pointer(p, v) \
+ ({ \
+ smp_store_release(&(p), (v)); \
+ })
- #define rcu_dereference(p) ({ \
- typeof(p) _________p1 = p; \
- smp_read_barrier_depends(); \
- (_________p1); \
- })
+ #define rcu_dereference(p) \
+ ({ \
+ typeof(p) _________p1 = p; \
+ smp_read_barrier_depends(); \
+ (_________p1); \
+ })
The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face
e. Is your workload too update-intensive for normal use of
RCU, but inappropriate for other synchronization mechanisms?
- If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
+ If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
+ named SLAB_DESTROY_BY_RCU). But please be careful!
f. Do you need read-side critical sections that are respected
even though they are in the middle of the idle loop, during
diff --git a/Documentation/acpi/acpi-lid.txt b/Documentation/acpi/acpi-lid.txt
index 22cb3091f297..effe7af3a5af 100644
--- a/Documentation/acpi/acpi-lid.txt
+++ b/Documentation/acpi/acpi-lid.txt
@@ -59,20 +59,28 @@ button driver uses the following 3 modes in order not to trigger issues.
If the userspace hasn't been prepared to ignore the unreliable "opened"
events and the unreliable initial state notification, Linux users can use
the following kernel parameters to handle the possible issues:
-A. button.lid_init_state=open:
+A. button.lid_init_state=method:
+ When this option is specified, the ACPI button driver reports the
+ initial lid state using the returning value of the _LID control method
+ and whether the "opened"/"closed" events are paired fully relies on the
+ firmware implementation.
+ This option can be used to fix some platforms where the returning value
+ of the _LID control method is reliable but the initial lid state
+ notification is missing.
+ This option is the default behavior during the period the userspace
+ isn't ready to handle the buggy AML tables.
+B. button.lid_init_state=open:
When this option is specified, the ACPI button driver always reports the
initial lid state as "opened" and whether the "opened"/"closed" events
are paired fully relies on the firmware implementation.
This may fix some platforms where the returning value of the _LID
control method is not reliable and the initial lid state notification is
missing.
- This option is the default behavior during the period the userspace
- isn't ready to handle the buggy AML tables.
If the userspace has been prepared to ignore the unreliable "opened" events
and the unreliable initial state notification, Linux users should always
use the following kernel parameter:
-B. button.lid_init_state=ignore:
+C. button.lid_init_state=ignore:
When this option is specified, the ACPI button driver never reports the
initial lid state and there is a compensation mechanism implemented to
ensure that the reliable "closed" notifications can always be delievered
diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index 02f639aab06e..b96e80f79e85 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -362,7 +362,7 @@ If something goes wrong
as is, otherwise you will have to use the ``ksymoops`` program to make
sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
This utility can be downloaded from
- ftp://ftp.<country>.kernel.org/pub/linux/utils/kernel/ksymoops/ .
+ https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
Alternatively, you can do the dump lookup by hand:
- In debugging dumps like the above, it helps enormously if you can
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 130e7ecaf9a6..9b0b3dea6326 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -866,6 +866,15 @@
dscc4.setup= [NET]
+ dt_cpu_ftrs= [PPC]
+ Format: {"off" | "known"}
+ Control how the dt_cpu_ftrs device-tree binding is
+ used for CPU feature discovery and setup (if it
+ exists).
+ off: Do not use it, fall back to legacy cpu table.
+ known: Do not pass through unknown features to guests
+ or userspace, only those that the kernel is aware of.
+
dump_apple_properties [X86]
Dump name and content of EFI device properties on
x86 Macs. Useful for driver authors to determine
@@ -972,7 +981,7 @@
A valid base address must be provided, and the serial
port must already be setup and configured.
- armada3700_uart,<addr>
+ ar3700_uart,<addr>
Start an early, polled-mode console on the
Armada 3700 serial port at the specified
address. The serial port must already be setup
@@ -2127,6 +2136,12 @@
memmap=nn[KMG]@ss[KMG]
[KNL] Force usage of a specific region of memory.
Region of memory to be used is from ss to ss+nn.
+ If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
+ which limits max address to nn[KMG].
+ Multiple different regions can be specified,
+ comma delimited.
+ Example:
+ memmap=100M@2G,100M#3G,1G!1024G
memmap=nn[KMG]#ss[KMG]
[KNL,ACPI] Mark specific memory as ACPI data.
@@ -2139,6 +2154,9 @@
memmap=64K$0x18690000
or
memmap=0x10000$0x18690000
+ Some bootloaders may need an escape character before '$',
+ like Grub2, otherwise '$' and the following number
+ will be eaten.
memmap=nn[KMG]!ss[KMG]
[KNL,X86] Mark specific memory as protected.
@@ -2434,12 +2452,6 @@
and gids from such clients. This is intended to ease
migration from NFSv2/v3.
- objlayoutdriver.osd_login_prog=
- [NFS] [OBJLAYOUT] sets the pathname to the program which
- is used to automatically discover and login into new
- osd-targets. Please see:
- Documentation/filesystems/pnfs.txt for more explanations
-
nmi_debug= [KNL,SH] Specify one or more actions to take
when a NMI is triggered.
Format: [state][,regs][,debounce][,die]
@@ -3235,21 +3247,17 @@
rcutree.gp_cleanup_delay= [KNL]
Set the number of jiffies to delay each step of
- RCU grace-period cleanup. This only has effect
- when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
+ RCU grace-period cleanup.
rcutree.gp_init_delay= [KNL]
Set the number of jiffies to delay each step of
- RCU grace-period initialization. This only has
- effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
- is set.
+ RCU grace-period initialization.
rcutree.gp_preinit_delay= [KNL]
Set the number of jiffies to delay each step of
RCU grace-period pre-initialization, that is,
the propagation of recent CPU-hotplug changes up
- the rcu_node combining tree. This only has effect
- when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
+ the rcu_node combining tree.
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
@@ -3325,6 +3333,17 @@
This wake_up() will be accompanied by a
WARN_ONCE() splat and an ftrace_dump().
+ rcuperf.gp_async= [KNL]
+ Measure performance of asynchronous
+ grace-period primitives such as call_rcu().
+
+ rcuperf.gp_async_max= [KNL]
+ Specify the maximum number of outstanding
+ callbacks per writer thread. When a writer
+ thread exceeds this limit, it invokes the
+ corresponding flavor of rcu_barrier() to allow
+ previously posted callbacks to drain.
+
rcuperf.gp_exp= [KNL]
Measure performance of expedited synchronous
grace-period primitives.
@@ -3352,17 +3371,22 @@
rcuperf.perf_runnable= [BOOT]
Start rcuperf running at boot time.
+ rcuperf.perf_type= [KNL]
+ Specify the RCU implementation to test.
+
rcuperf.shutdown= [KNL]
Shut the system down after performance tests
complete. This is useful for hands-off automated
testing.
- rcuperf.perf_type= [KNL]
- Specify the RCU implementation to test.
-
rcuperf.verbose= [KNL]
Enable additional printk() statements.
+ rcuperf.writer_holdoff= [KNL]
+ Write-side holdoff between grace periods,
+ in microseconds. The default of zero says
+ no holdoff.
+
rcutorture.cbflood_inter_holdoff= [KNL]
Set holdoff time (jiffies) between successive
callback-flood tests.
@@ -3800,6 +3824,30 @@
spia_pedr=
spia_peddr=
+ srcutree.counter_wrap_check [KNL]
+ Specifies how frequently to check for
+ grace-period sequence counter wrap for the
+ srcu_data structure's ->srcu_gp_seq_needed field.
+ The greater the number of bits set in this kernel
+ parameter, the less frequently counter wrap will
+ be checked for. Note that the bottom two bits
+ are ignored.
+
+ srcutree.exp_holdoff [KNL]
+ Specifies how many nanoseconds must elapse
+ since the end of the last SRCU grace period for
+ a given srcu_struct until the next normal SRCU
+ grace period will be considered for automatic
+ expediting. Set to zero to disable automatic
+ expediting.
+
+ stack_guard_gap= [MM]
+ override the default stack gap protection. The value
+ is in page units and it defines how many pages prior
+ to (for stacks growing down) resp. after (for stacks
+ growing up) the main stack are reserved for no other
+ mapping. Default value is 256 pages.
+
stacktrace [FTRACE]
Enabled the stack tracer on boot up.
diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst
index 289c80f7760e..09aa2e949787 100644
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@@ -1,4 +1,5 @@
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
+.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
=======================
CPU Performance Scaling
@@ -75,7 +76,7 @@ feedback registers, as that information is typically specific to the hardware
interface it comes from and may not be easily represented in an abstract,
platform-independent way. For this reason, ``CPUFreq`` allows scaling drivers
to bypass the governor layer and implement their own performance scaling
-algorithms. That is done by the ``intel_pstate`` scaling driver.
+algorithms. That is done by the |intel_pstate| scaling driver.
``CPUFreq`` Policy Objects
@@ -174,13 +175,13 @@ necessary to restart the scaling governor so that it can take the new online CPU
into account. That is achieved by invoking the governor's ``->stop`` and
``->start()`` callbacks, in this order, for the entire policy.
-As mentioned before, the ``intel_pstate`` scaling driver bypasses the scaling
+As mentioned before, the |intel_pstate| scaling driver bypasses the scaling
governor layer of ``CPUFreq`` and provides its own P-state selection algorithms.
-Consequently, if ``intel_pstate`` is used, scaling governors are not attached to
+Consequently, if |intel_pstate| is used, scaling governors are not attached to
new policy objects. Instead, the driver's ``->setpolicy()`` callback is invoked
to register per-CPU utilization update callbacks for each policy. These
callbacks are invoked by the CPU scheduler in the same way as for scaling
-governors, but in the ``intel_pstate`` case they both determine the P-state to
+governors, but in the |intel_pstate| case they both determine the P-state to
use and change the hardware configuration accordingly in one go from scheduler
context.
@@ -257,7 +258,7 @@ are the following:
``scaling_available_governors``
List of ``CPUFreq`` scaling governors present in the kernel that can
- be attached to this policy or (if the ``intel_pstate`` scaling driver is
+ be attached to this policy or (if the |intel_pstate| scaling driver is
in use) list of scaling algorithms provided by the driver that can be
applied to this policy.
@@ -274,7 +275,7 @@ are the following:
the CPU is actually running at (due to hardware design and other
limitations).
- Some scaling drivers (e.g. ``intel_pstate``) attempt to provide
+ Some scaling drivers (e.g. |intel_pstate|) attempt to provide
information more precisely reflecting the current CPU frequency through
this attribute, but that still may not be the exact current CPU
frequency as seen by the hardware at the moment.
@@ -284,13 +285,13 @@ are the following:
``scaling_governor``
The scaling governor currently attached to this policy or (if the
- ``intel_pstate`` scaling driver is in use) the scaling algorithm
+ |intel_pstate| scaling driver is in use) the scaling algorithm
provided by the driver that is currently applied to this policy.
This attribute is read-write and writing to it will cause a new scaling
governor to be attached to this policy or a new scaling algorithm
provided by the scaling driver to be applied to it (in the
- ``intel_pstate`` case), as indicated by the string written to this
+ |intel_pstate| case), as indicated by the string written to this
attribute (which must be one of the names listed by the
``scaling_available_governors`` attribute described above).
@@ -619,7 +620,7 @@ This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls
the "boost" setting for the whole system. It is not present if the underlying
scaling driver does not support the frequency boost mechanism (or supports it,
but provides a driver-specific interface for controlling it, like
-``intel_pstate``).
+|intel_pstate|).
If the value in this file is 1, the frequency boost mechanism is enabled. This
means that either the hardware can be put into states in which it is able to
diff --git a/Documentation/admin-guide/pm/index.rst b/Documentation/admin-guide/pm/index.rst
index c80f087321fc..7f148f76f432 100644
--- a/Documentation/admin-guide/pm/index.rst
+++ b/Documentation/admin-guide/pm/index.rst
@@ -6,6 +6,7 @@ Power Management
:maxdepth: 2
cpufreq
+ intel_pstate
.. only:: subproject and html
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
new file mode 100644
index 000000000000..33d703989ea8
--- /dev/null
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -0,0 +1,755 @@
+===============================================
+``intel_pstate`` CPU Performance Scaling Driver
+===============================================
+
+::
+
+ Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+General Information
+===================
+
+``intel_pstate`` is a part of the
+:doc:`CPU performance scaling subsystem <cpufreq>` in the Linux kernel
+(``CPUFreq``). It is a scaling driver for the Sandy Bridge and later
+generations of Intel processors. Note, however, that some of those processors
+may not be supported. [To understand ``intel_pstate`` it is necessary to know
+how ``CPUFreq`` works in general, so this is the time to read :doc:`cpufreq` if
+you have not done that yet.]
+
+For the processors supported by ``intel_pstate``, the P-state concept is broader
+than just an operating frequency or an operating performance point (see the
+`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
+information about that). For this reason, the representation of P-states used
+by ``intel_pstate`` internally follows the hardware specification (for details
+refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
+Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
+uses frequencies for identifying operating performance points of CPUs and
+frequencies are involved in the user space interface exposed by it, so
+``intel_pstate`` maps its internal representation of P-states to frequencies too
+(fortunately, that mapping is unambiguous). At the same time, it would not be
+practical for ``intel_pstate`` to supply the ``CPUFreq`` core with a table of
+available frequencies due to the possible size of it, so the driver does not do
+that. Some functionality of the core is limited by that.
+
+Since the hardware P-state selection interface used by ``intel_pstate`` is
+available at the logical CPU level, the driver always works with individual
+CPUs. Consequently, if ``intel_pstate`` is in use, every ``CPUFreq`` policy
+object corresponds to one logical CPU and ``CPUFreq`` policies are effectively
+equivalent to CPUs. In particular, this means that they become "inactive" every
+time the corresponding CPU is taken offline and need to be re-initialized when
+it goes back online.
+
+``intel_pstate`` is not modular, so it cannot be unloaded, which means that the
+only way to pass early-configuration-time parameters to it is via the kernel
+command line. However, its configuration can be adjusted via ``sysfs`` to a
+great extent. In some configurations it even is possible to unregister it via
+``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and
+registered (see `below <status_attr_>`_).
+
+
+Operation Modes
+===============
+
+``intel_pstate`` can operate in three different modes: in the active mode with
+or without hardware-managed P-states support and in the passive mode. Which of
+them will be in effect depends on what kernel command line options are used and
+on the capabilities of the processor.
+
+Active Mode
+-----------
+
+This is the default operation mode of ``intel_pstate``. If it works in this
+mode, the ``scaling_driver`` policy attribute in ``sysfs`` for all ``CPUFreq``
+policies contains the string "intel_pstate".
+
+In this mode the driver bypasses the scaling governors layer of ``CPUFreq`` and
+provides its own scaling algorithms for P-state selection. Those algorithms
+can be applied to ``CPUFreq`` policies in the same way as generic scaling
+governors (that is, through the ``scaling_governor`` policy attribute in
+``sysfs``). [Note that different P-state selection algorithms may be chosen for
+different policies, but that is not recommended.]
+
+They are not generic scaling governors, but their names are the same as the
+names of some of those governors. Moreover, confusingly enough, they generally
+do not work in the same way as the generic governors they share the names with.
+For example, the ``powersave`` P-state selection algorithm provided by
+``intel_pstate`` is not a counterpart of the generic ``powersave`` governor
+(roughly, it corresponds to the ``schedutil`` and ``ondemand`` governors).
+
+There are two P-state selection algorithms provided by ``intel_pstate`` in the
+active mode: ``powersave`` and ``performance``. The way they both operate
+depends on whether or not the hardware-managed P-states (HWP) feature has been
+enabled in the processor and possibly on the processor model.
+
+Which of the P-state selection algorithms is used by default depends on the
+:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option.
+Namely, if that option is set, the ``performance`` algorithm will be used by
+default, and the other one will be used by default if it is not set.
+
+Active Mode With HWP
+~~~~~~~~~~~~~~~~~~~~
+
+If the processor supports the HWP feature, it will be enabled during the
+processor initialization and cannot be disabled after that. It is possible
+to avoid enabling it by passing the ``intel_pstate=no_hwp`` argument to the
+kernel in the command line.
+
+If the HWP feature has been enabled, ``intel_pstate`` relies on the processor to
+select P-states by itself, but still it can give hints to the processor's
+internal P-state selection logic. What those hints are depends on which P-state
+selection algorithm has been applied to the given policy (or to the CPU it
+corresponds to).
+
+Even though the P-state selection is carried out by the processor automatically,
+``intel_pstate`` registers utilization update callbacks with the CPU scheduler
+in this mode. However, they are not used for running a P-state selection
+algorithm, but for periodic updates of the current CPU frequency information to
+be made available from the ``scaling_cur_freq`` policy attribute in ``sysfs``.
+
+HWP + ``performance``
+.....................
+
+In this configuration ``intel_pstate`` will write 0 to the processor's
+Energy-Performance Preference (EPP) knob (if supported) or its
+Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's
+internal P-state selection logic is expected to focus entirely on performance.
+
+This will override the EPP/EPB setting coming from the ``sysfs`` interface
+(see `Energy vs Performance Hints`_ below).
+
+Also, in this configuration the range of P-states available to the processor's
+internal P-state selection logic is always restricted to the upper boundary
+(that is, the maximum P-state that the driver is allowed to use).
+
+HWP + ``powersave``
+...................
+
+In this configuration ``intel_pstate`` will set the processor's
+Energy-Performance Preference (EPP) knob (if supported) or its
+Energy-Performance Bias (EPB) knob (otherwise) to whatever value it was
+previously set to via ``sysfs`` (or whatever default value it was
+set to by the platform firmware). This usually causes the processor's
+internal P-state selection logic to be less performance-focused.
+
+Active Mode Without HWP
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the default operation mode for processors that do not support the HWP
+feature. It also is used by default with the ``intel_pstate=no_hwp`` argument
+in the kernel command line. However, in this mode ``intel_pstate`` may refuse
+to work with the given processor if it does not recognize it. [Note that
+``intel_pstate`` will never refuse to work with any processor with the HWP
+feature enabled.]
+
+In this mode ``intel_pstate`` registers utilization update callbacks with the
+CPU scheduler in order to run a P-state selection algorithm, either
+``powersave`` or ``performance``, depending on the ``scaling_cur_freq`` policy
+setting in ``sysfs``. The current CPU frequency information to be made
+available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is
+periodically updated by those utilization update callbacks too.
+
+``performance``
+...............
+
+Without HWP, this P-state selection algorithm is always the same regardless of
+the processor model and platform configuration.
+
+It selects the maximum P-state it is allowed to use, subject to limits set via
+``sysfs``, every time the P-state selection computations are carried out by the
+driver's utilization update callback for the given CPU (that does not happen
+more often than every 10 ms), but the hardware configuration will not be changed
+if the new P-state is the same as the current one.
+
+This is the default P-state selection algorithm if the
+:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
+is set.
+
+``powersave``
+.............
+
+Without HWP, this P-state selection algorithm generally depends on the
+processor model and/or the system profile setting in the ACPI tables and there
+are two variants of it.
+
+One of them is used with processors from the Atom line and (regardless of the
+processor model) on platforms with the system profile in the ACPI tables set to
+"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
+"workstation". It is also used with processors supporting the HWP feature if
+that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
+argument in the kernel command line). It is similar to the algorithm
+implemented by the generic ``schedutil`` scaling governor except that the
+utilization metric used by it is based on numbers coming from feedback
+registers of the CPU. It generally selects P-states proportional to the
+current CPU utilization, so it is referred to as the "proportional" algorithm.
+
+The second variant of the ``powersave`` P-state selection algorithm, used in all
+of the other cases (generally, on processors from the Core line, so it is
+referred to as the "Core" algorithm), is based on the values read from the APERF
+and MPERF feedback registers and the previously requested target P-state.
+It does not really take CPU utilization into account explicitly, but as a rule
+it causes the CPU P-state to ramp up very quickly in response to increased
+utilization which is generally desirable in server environments.
+
+Regardless of the variant, this algorithm is run by the driver's utilization
+update callback for the given CPU when it is invoked by the CPU scheduler, but
+not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
+particular case <Tuning Interface in debugfs_>`_). Like in the ``performance``
+case, the hardware configuration is not touched if the new P-state turns out to
+be the same as the current one.
+
+This is the default P-state selection algorithm if the
+:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
+is not set.
+
+Passive Mode
+------------
+
+This mode is used if the ``intel_pstate=passive`` argument is passed to the
+kernel in the command line (it implies the ``intel_pstate=no_hwp`` setting too).
+Like in the active mode without HWP support, in this mode ``intel_pstate`` may
+refuse to work with the given processor if it does not recognize it.
+
+If the driver works in this mode, the ``scaling_driver`` policy attribute in
+``sysfs`` for all ``CPUFreq`` policies contains the string "intel_cpufreq".
+Then, the driver behaves like a regular ``CPUFreq`` scaling driver. That is,
+it is invoked by generic scaling governors when necessary to talk to the
+hardware in order to change the P-state of a CPU (in particular, the
+``schedutil`` governor can invoke it directly from scheduler context).
+
+While in this mode, ``intel_pstate`` can be used with all of the (generic)
+scaling governors listed by the ``scaling_available_governors`` policy attribute
+in ``sysfs`` (and the P-state selection algorithms described above are not
+used). Then, it is responsible for the configuration of policy objects
+corresponding to CPUs and provides the ``CPUFreq`` core (and the scaling
+governors attached to the policy objects) with accurate information on the
+maximum and minimum operating frequencies supported by the hardware (including
+the so-called "turbo" frequency ranges). In other words, in the passive mode
+the entire range of available P-states is exposed by ``intel_pstate`` to the
+``CPUFreq`` core. However, in this mode the driver does not register
+utilization update callbacks with the CPU scheduler and the ``scaling_cur_freq``
+information comes from the ``CPUFreq`` core (and is the last frequency selected
+by the current scaling governor for the given policy).
+
+
+.. _turbo:
+
+Turbo P-states Support
+======================
+
+In the majority of cases, the entire range of P-states available to
+``intel_pstate`` can be divided into two sub-ranges that correspond to
+different types of processor behavior, above and below a boundary that
+will be referred to as the "turbo threshold" in what follows.
+
+The P-states above the turbo threshold are referred to as "turbo P-states" and
+the whole sub-range of P-states they belong to is referred to as the "turbo
+range". These names are related to the Turbo Boost technology allowing a
+multicore processor to opportunistically increase the P-state of one or more
+cores if there is enough power to do that and if that is not going to cause the
+thermal envelope of the processor package to be exceeded.
+
+Specifically, if software sets the P-state of a CPU core within the turbo range
+(that is, above the turbo threshold), the processor is permitted to take over
+performance scaling control for that core and put it into turbo P-states of its
+choice going forward. However, that permission is interpreted differently by
+different processor generations. Namely, the Sandy Bridge generation of
+processors will never use any P-states above the last one set by software for
+the given core, even if it is within the turbo range, whereas all of the later
+processor generations will take it as a license to use any P-states from the
+turbo range, even above the one set by software. In other words, on those
+processors setting any P-state from the turbo range will enable the processor
+to put the given core into all turbo P-states up to and including the maximum
+supported one as it sees fit.
+
+One important property of turbo P-states is that they are not sustainable. More
+precisely, there is no guarantee that any CPUs will be able to stay in any of
+those states indefinitely, because the power distribution within the processor
+package may change over time or the thermal envelope it was designed for might
+be exceeded if a turbo P-state was used for too long.
+
+In turn, the P-states below the turbo threshold generally are sustainable. In
+fact, if one of them is set by software, the processor is not expected to change
+it to a lower one unless in a thermal stress or a power limit violation
+situation (a higher P-state may still be used if it is set for another CPU in
+the same package at the same time, for example).
+
+Some processors allow multiple cores to be in turbo P-states at the same time,
+but the maximum P-state that can be set for them generally depends on the number
+of cores running concurrently. The maximum turbo P-state that can be set for 3
+cores at the same time usually is lower than the analogous maximum P-state for
+2 cores, which in turn usually is lower than the maximum turbo P-state that can
+be set for 1 core. The one-core maximum turbo P-state is thus the maximum
+supported one overall.
+
+The maximum supported turbo P-state, the turbo threshold (the maximum supported
+non-turbo P-state) and the minimum supported P-state are specific to the
+processor model and can be determined by reading the processor's model-specific
+registers (MSRs). Moreover, some processors support the Configurable TDP
+(Thermal Design Power) feature and, when that feature is enabled, the turbo
+threshold effectively becomes a configurable value that can be set by the
+platform firmware.
+
+Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes
+the entire range of available P-states, including the whole turbo range, to the
+``CPUFreq`` core and (in the passive mode) to generic scaling governors. This
+generally causes turbo P-states to be set more often when ``intel_pstate`` is
+used relative to ACPI-based CPU performance scaling (see `below <acpi-cpufreq_>`_
+for more information).
+
+Moreover, since ``intel_pstate`` always knows what the real turbo threshold is
+(even if the Configurable TDP feature is enabled in the processor), its
+``no_turbo`` attribute in ``sysfs`` (described `below <no_turbo_attr_>`_) should
+work as expected in all cases (that is, if set to disable turbo P-states, it
+always should prevent ``intel_pstate`` from using them).
+
+
+Processor Support
+=================
+
+To handle a given processor ``intel_pstate`` requires a number of different
+pieces of information on it to be known, including:
+
+ * The minimum supported P-state.
+
+ * The maximum supported `non-turbo P-state <turbo_>`_.
+
+ * Whether or not turbo P-states are supported at all.
+
+ * The maximum supported `one-core turbo P-state <turbo_>`_ (if turbo P-states
+ are supported).
+
+ * The scaling formula to translate the driver's internal representation
+ of P-states into frequencies and the other way around.
+
+Generally, ways to obtain that information are specific to the processor model
+or family. Although it often is possible to obtain all of it from the processor
+itself (using model-specific registers), there are cases in which hardware
+manuals need to be consulted to get to it too.
+
+For this reason, there is a list of supported processors in ``intel_pstate`` and
+the driver initialization will fail if the detected processor is not in that
+list, unless it supports the `HWP feature <Active Mode_>`_. [The interface to
+obtain all of the information listed above is the same for all of the processors
+supporting the HWP feature, which is why they all are supported by
+``intel_pstate``.]
+
+
+User Space Interface in ``sysfs``
+=================================
+
+Global Attributes
+-----------------
+
+``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to
+control its functionality at the system level. They are located in the
+``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all
+CPUs.
+
+Some of them are not present if the ``intel_pstate=per_cpu_perf_limits``
+argument is passed to the kernel in the command line.
+
+``max_perf_pct``
+ Maximum P-state the driver is allowed to set in percent of the
+ maximum supported performance level (the highest supported `turbo
+ P-state <turbo_>`_).
+
+ This attribute will not be exposed if the
+ ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
+ command line.
+
+``min_perf_pct``
+ Minimum P-state the driver is allowed to set in percent of the
+ maximum supported performance level (the highest supported `turbo
+ P-state <turbo_>`_).
+
+ This attribute will not be exposed if the
+ ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
+ command line.
+
+``num_pstates``
+ Number of P-states supported by the processor (between 0 and 255
+ inclusive) including both turbo and non-turbo P-states (see
+ `Turbo P-states Support`_).
+
+ The value of this attribute is not affected by the ``no_turbo``
+ setting described `below <no_turbo_attr_>`_.
+
+ This attribute is read-only.
+
+``turbo_pct``
+ Ratio of the `turbo range <turbo_>`_ size to the size of the entire
+ range of supported P-states, in percent.
+
+ This attribute is read-only.
+
+.. _no_turbo_attr:
+
+``no_turbo``
+ If set (equal to 1), the driver is not allowed to set any turbo P-states
+ (see `Turbo P-states Support`_). If unset (equalt to 0, which is the
+ default), turbo P-states can be set by the driver.
+ [Note that ``intel_pstate`` does not support the general ``boost``
+ attribute (supported by some other scaling drivers) which is replaced
+ by this one.]
+
+ This attrubute does not affect the maximum supported frequency value
+ supplied to the ``CPUFreq`` core and exposed via the policy interface,
+ but it affects the maximum possible value of per-policy P-state limits
+ (see `Interpretation of Policy Attributes`_ below for details).
+
+.. _status_attr:
+
+``status``
+ Operation mode of the driver: "active", "passive" or "off".
+
+ "active"
+ The driver is functional and in the `active mode
+ <Active Mode_>`_.
+
+ "passive"
+ The driver is functional and in the `passive mode
+ <Passive Mode_>`_.
+
+ "off"
+ The driver is not functional (it is not registered as a scaling
+ driver with the ``CPUFreq`` core).
+
+ This attribute can be written to in order to change the driver's
+ operation mode or to unregister it. The string written to it must be
+ one of the possible values of it and, if successful, the write will
+ cause the driver to switch over to the operation mode represented by
+ that string - or to be unregistered in the "off" case. [Actually,
+ switching over from the active mode to the passive mode or the other
+ way around causes the driver to be unregistered and registered again
+ with a different set of callbacks, so all of its settings (the global
+ as well as the per-policy ones) are then reset to their default
+ values, possibly depending on the target operation mode.]
+
+ That only is supported in some configurations, though (for example, if
+ the `HWP feature is enabled in the processor <Active Mode With HWP_>`_,
+ the operation mode of the driver cannot be changed), and if it is not
+ supported in the current configuration, writes to this attribute with
+ fail with an appropriate error.
+
+Interpretation of Policy Attributes
+-----------------------------------
+
+The interpretation of some ``CPUFreq`` policy attributes described in
+:doc:`cpufreq` is special with ``intel_pstate`` as the current scaling driver
+and it generally depends on the driver's `operation mode <Operation Modes_>`_.
+
+First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and
+``scaling_cur_freq`` attributes are produced by applying a processor-specific
+multiplier to the internal P-state representation used by ``intel_pstate``.
+Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq``
+attributes are capped by the frequency corresponding to the maximum P-state that
+the driver is allowed to set.
+
+If the ``no_turbo`` `global attribute <no_turbo_attr_>`_ is set, the driver is
+not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq``
+and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency.
+Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and
+``scaling_min_freq`` to go down to that value if they were above it before.
+However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be
+restored after unsetting ``no_turbo``, unless these attributes have been written
+to after ``no_turbo`` was set.
+
+If ``no_turbo`` is not set, the maximum possible value of ``scaling_max_freq``
+and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state,
+which also is the value of ``cpuinfo_max_freq`` in either case.
+
+Next, the following policy attributes have special meaning if
+``intel_pstate`` works in the `active mode <Active Mode_>`_:
+
+``scaling_available_governors``
+ List of P-state selection algorithms provided by ``intel_pstate``.
+
+``scaling_governor``
+ P-state selection algorithm provided by ``intel_pstate`` currently in
+ use with the given policy.
+
+``scaling_cur_freq``
+ Frequency of the average P-state of the CPU represented by the given
+ policy for the time interval between the last two invocations of the
+ driver's utilization update callback by the CPU scheduler for that CPU.
+
+The meaning of these attributes in the `passive mode <Passive Mode_>`_ is the
+same as for other scaling drivers.
+
+Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate``
+depends on the operation mode of the driver. Namely, it is either
+"intel_pstate" (in the `active mode <Active Mode_>`_) or "intel_cpufreq" (in the
+`passive mode <Passive Mode_>`_).
+
+Coordination of P-State Limits
+------------------------------
+
+``intel_pstate`` allows P-state limits to be set in two ways: with the help of
+the ``max_perf_pct`` and ``min_perf_pct`` `global attributes
+<Global Attributes_>`_ or via the ``scaling_max_freq`` and ``scaling_min_freq``
+``CPUFreq`` policy attributes. The coordination between those limits is based
+on the following rules, regardless of the current operation mode of the driver:
+
+ 1. All CPUs are affected by the global limits (that is, none of them can be
+ requested to run faster than the global maximum and none of them can be
+ requested to run slower than the global minimum).
+
+ 2. Each individual CPU is affected by its own per-policy limits (that is, it
+ cannot be requested to run faster than its own per-policy maximum and it
+ cannot be requested to run slower than its own per-policy minimum).
+
+ 3. The global and per-policy limits can be set independently.
+
+If the `HWP feature is enabled in the processor <Active Mode With HWP_>`_, the
+resulting effective values are written into its registers whenever the limits
+change in order to request its internal P-state selection logic to always set
+P-states within these limits. Otherwise, the limits are taken into account by
+scaling governors (in the `passive mode <Passive Mode_>`_) and by the driver
+every time before setting a new P-state for a CPU.
+
+Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument
+is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed
+at all and the only way to set the limits is by using the policy attributes.
+
+
+Energy vs Performance Hints
+---------------------------
+
+If ``intel_pstate`` works in the `active mode with the HWP feature enabled
+<Active Mode With HWP_>`_ in the processor, additional attributes are present
+in every ``CPUFreq`` policy directory in ``sysfs``. They are intended to allow
+user space to help ``intel_pstate`` to adjust the processor's internal P-state
+selection logic by focusing it on performance or on energy-efficiency, or
+somewhere between the two extremes:
+
+``energy_performance_preference``
+ Current value of the energy vs performance hint for the given policy
+ (or the CPU represented by it).
+
+ The hint can be changed by writing to this attribute.
+
+``energy_performance_available_preferences``
+ List of strings that can be written to the
+ ``energy_performance_preference`` attribute.
+
+ They represent different energy vs performance hints and should be
+ self-explanatory, except that ``default`` represents whatever hint
+ value was set by the platform firmware.
+
+Strings written to the ``energy_performance_preference`` attribute are
+internally translated to integer values written to the processor's
+Energy-Performance Preference (EPP) knob (if supported) or its
+Energy-Performance Bias (EPB) knob.
+
+[Note that tasks may by migrated from one CPU to another by the scheduler's
+load-balancing algorithm and if different energy vs performance hints are
+set for those CPUs, that may lead to undesirable outcomes. To avoid such
+issues it is better to set the same energy vs performance hint for all CPUs
+or to pin every task potentially sensitive to them to a specific CPU.]
+
+.. _acpi-cpufreq:
+
+``intel_pstate`` vs ``acpi-cpufreq``
+====================================
+
+On the majority of systems supported by ``intel_pstate``, the ACPI tables
+provided by the platform firmware contain ``_PSS`` objects returning information
+that can be used for CPU performance scaling (refer to the `ACPI specification`_
+for details on the ``_PSS`` objects and the format of the information returned
+by them).
+
+The information returned by the ACPI ``_PSS`` objects is used by the
+``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
+the ``acpi-cpufreq`` driver uses the same hardware CPU performance scaling
+interface, but the set of P-states it can use is limited by the ``_PSS``
+output.
+
+On those systems each ``_PSS`` object returns a list of P-states supported by
+the corresponding CPU which basically is a subset of the P-states range that can
+be used by ``intel_pstate`` on the same system, with one exception: the whole
+`turbo range <turbo_>`_ is represented by one item in it (the topmost one). By
+convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz
+than the frequency of the highest non-turbo P-state listed by it, but the
+corresponding P-state representation (following the hardware specification)
+returned for it matches the maximum supported turbo P-state (or is the
+special value 255 meaning essentially "go as high as you can get").
+
+The list of P-states returned by ``_PSS`` is reflected by the table of
+available frequencies supplied by ``acpi-cpufreq`` to the ``CPUFreq`` core and
+scaling governors and the minimum and maximum supported frequencies reported by
+it come from that list as well. In particular, given the special representation
+of the turbo range described above, this means that the maximum supported
+frequency reported by ``acpi-cpufreq`` is higher by 1 MHz than the frequency
+of the highest supported non-turbo P-state listed by ``_PSS`` which, of course,
+affects decisions made by the scaling governors, except for ``powersave`` and
+``performance``.
+
+For example, if a given governor attempts to select a frequency proportional to
+estimated CPU load and maps the load of 100% to the maximum supported frequency
+(possibly multiplied by a constant), then it will tend to choose P-states below
+the turbo threshold if ``acpi-cpufreq`` is used as the scaling driver, because
+in that case the turbo range corresponds to a small fraction of the frequency
+band it can use (1 MHz vs 1 GHz or more). In consequence, it will only go to
+the turbo range for the highest loads and the other loads above 50% that might
+benefit from running at turbo frequencies will be given non-turbo P-states
+instead.
+
+One more issue related to that may appear on systems supporting the
+`Configurable TDP feature <turbo_>`_ allowing the platform firmware to set the
+turbo threshold. Namely, if that is not coordinated with the lists of P-states
+returned by ``_PSS`` properly, there may be more than one item corresponding to
+a turbo P-state in those lists and there may be a problem with avoiding the
+turbo range (if desirable or necessary). Usually, to avoid using turbo
+P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed
+by ``_PSS``, but that is not sufficient when there are other turbo P-states in
+the list returned by it.
+
+Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the
+`passive mode <Passive Mode_>`_, except that the number of P-states it can set
+is limited to the ones listed by the ACPI ``_PSS`` objects.
+
+
+Kernel Command Line Options for ``intel_pstate``
+================================================
+
+Several kernel command line options can be used to pass early-configuration-time
+parameters to ``intel_pstate`` in order to enforce specific behavior of it. All
+of them have to be prepended with the ``intel_pstate=`` prefix.
+
+``disable``
+ Do not register ``intel_pstate`` as the scaling driver even if the
+ processor is supported by it.
+
+``passive``
+ Register ``intel_pstate`` in the `passive mode <Passive Mode_>`_ to
+ start with.
+
+ This option implies the ``no_hwp`` one described below.
+
+``force``
+ Register ``intel_pstate`` as the scaling driver instead of
+ ``acpi-cpufreq`` even if the latter is preferred on the given system.
+
+ This may prevent some platform features (such as thermal controls and
+ power capping) that rely on the availability of ACPI P-states
+ information from functioning as expected, so it should be used with
+ caution.
+
+ This option does not work with processors that are not supported by
+ ``intel_pstate`` and on platforms where the ``pcc-cpufreq`` scaling
+ driver is used instead of ``acpi-cpufreq``.
+
+``no_hwp``
+ Do not enable the `hardware-managed P-states (HWP) feature
+ <Active Mode With HWP_>`_ even if it is supported by the processor.
+
+``hwp_only``
+ Register ``intel_pstate`` as the scaling driver only if the
+ `hardware-managed P-states (HWP) feature <Active Mode With HWP_>`_ is
+ supported by the processor.
+
+``support_acpi_ppc``
+ Take ACPI ``_PPC`` performance limits into account.
+
+ If the preferred power management profile in the FADT (Fixed ACPI
+ Description Table) is set to "Enterprise Server" or "Performance
+ Server", the ACPI ``_PPC`` limits are taken into account by default
+ and this option has no effect.
+
+``per_cpu_perf_limits``
+ Use per-logical-CPU P-State limits (see `Coordination of P-state
+ Limits`_ for details).
+
+
+Diagnostics and Tuning
+======================
+
+Trace Events
+------------
+
+There are two static trace events that can be used for ``intel_pstate``
+diagnostics. One of them is the ``cpu_frequency`` trace event generally used
+by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific
+to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if
+it works in the `active mode <Active Mode_>`_.
+
+The following sequence of shell commands can be used to enable them and see
+their output (if the kernel is generally configured to support event tracing)::
+
+ # cd /sys/kernel/debug/tracing/
+ # echo 1 > events/power/pstate_sample/enable
+ # echo 1 > events/power/cpu_frequency/enable
+ # cat trace
+ gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476
+ cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
+
+If ``intel_pstate`` works in the `passive mode <Passive Mode_>`_, the
+``cpu_frequency`` trace event will be triggered either by the ``schedutil``
+scaling governor (for the policies it is attached to), or by the ``CPUFreq``
+core (for the policies with other scaling governors).
+
+``ftrace``
+----------
+
+The ``ftrace`` interface can be used for low-level diagnostics of
+``intel_pstate``. For example, to check how often the function to set a
+P-state is called, the ``ftrace`` filter can be set to to
+:c:func:`intel_pstate_set_pstate`::
+
+ # cd /sys/kernel/debug/tracing/
+ # cat available_filter_functions | grep -i pstate
+ intel_pstate_set_pstate
+ intel_pstate_cpu_init
+ ...
+ # echo intel_pstate_set_pstate > set_ftrace_filter
+ # echo function > current_tracer
+ # cat trace | head -15
+ # tracer: function
+ #
+ # entries-in-buffer/entries-written: 80/80 #P:4
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
+ gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
+ gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
+ <idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
+
+Tuning Interface in ``debugfs``
+-------------------------------
+
+The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
+processors in the active mode <powersave_>`_ is based on a `PID controller`_
+whose parameters were chosen to address a number of different use cases at the
+same time. However, it still is possible to fine-tune it to a specific workload
+and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
+provided for this purpose. [Note that the ``pstate_snb`` directory will be
+present only if the specific P-state selection algorithm matching the interface
+in it actually is in use.]
+
+The following files present in that directory can be used to modify the PID
+controller parameters at run time:
+
+| ``deadband``
+| ``d_gain_pct``
+| ``i_gain_pct``
+| ``p_gain_pct``
+| ``sample_rate_ms``
+| ``setpoint``
+
+Note, however, that achieving desirable results this way generally requires
+expert-level understanding of the power vs performance tradeoff, so extra care
+is recommended when attempting to do that.
+
+
+.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
+.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
+.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
+.. _PID controller: https://en.wikipedia.org/wiki/PID_controller
diff --git a/Documentation/arm64/tagged-pointers.txt b/Documentation/arm64/tagged-pointers.txt
index d9995f1f51b3..a25a99e82bb1 100644
--- a/Documentation/arm64/tagged-pointers.txt
+++ b/Documentation/arm64/tagged-pointers.txt
@@ -11,24 +11,56 @@ in AArch64 Linux.
The kernel configures the translation tables so that translations made
via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
the virtual address ignored by the translation hardware. This frees up
-this byte for application use, with the following caveats:
+this byte for application use.
- (1) The kernel requires that all user addresses passed to EL1
- are tagged with tag 0x00. This means that any syscall
- parameters containing user virtual addresses *must* have
- their top byte cleared before trapping to the kernel.
- (2) Non-zero tags are not preserved when delivering signals.
- This means that signal handlers in applications making use
- of tags cannot rely on the tag information for user virtual
- addresses being maintained for fields inside siginfo_t.
- One exception to this rule is for signals raised in response
- to watchpoint debug exceptions, where the tag information
- will be preserved.
+Passing tagged addresses to the kernel
+--------------------------------------
- (3) Special care should be taken when using tagged pointers,
- since it is likely that C compilers will not hazard two
- virtual addresses differing only in the upper byte.
+All interpretation of userspace memory addresses by the kernel assumes
+an address tag of 0x00.
+
+This includes, but is not limited to, addresses found in:
+
+ - pointer arguments to system calls, including pointers in structures
+ passed to system calls,
+
+ - the stack pointer (sp), e.g. when interpreting it to deliver a
+ signal,
+
+ - the frame pointer (x29) and frame records, e.g. when interpreting
+ them to generate a backtrace or call graph.
+
+Using non-zero address tags in any of these locations may result in an
+error code being returned, a (fatal) signal being raised, or other modes
+of failure.
+
+For these reasons, passing non-zero address tags to the kernel via
+system calls is forbidden, and using a non-zero address tag for sp is
+strongly discouraged.
+
+Programs maintaining a frame pointer and frame records that use non-zero
+address tags may suffer impaired or inaccurate debug and profiling
+visibility.
+
+
+Preserving tags
+---------------
+
+Non-zero tags are not preserved when delivering signals. This means that
+signal handlers in applications making use of tags cannot rely on the
+tag information for user virtual addresses being maintained for fields
+inside siginfo_t. One exception to this rule is for signals raised in
+response to watchpoint debug exceptions, where the tag information will
+be preserved.
The architecture prevents the use of a tagged PC, so the upper byte will
be set to a sign-extension of bit 55 on exception return.
+
+
+Other considerations
+--------------------
+
+Special care should be taken when using tagged pointers, since it is
+likely that C compilers will not hazard two virtual addresses differing
+only in the upper byte.
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 1b87df6cd476..05e2822a80b3 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -11,6 +11,13 @@ controllers), BFQ's main features are:
groups (switching back to time distribution when needed to keep
throughput high).
+In its default configuration, BFQ privileges latency over
+throughput. So, when needed for achieving a lower latency, BFQ builds
+schedules that may lead to a lower throughput. If your main or only
+goal, for a given device, is to achieve the maximum-possible
+throughput at all times, then do switch off all low-latency heuristics
+for that device, by setting low_latency to 0. Full details in Section 3.
+
On average CPUs, the current version of BFQ can handle devices
performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
reference, 30-50 KIOPS correspond to very high bandwidths with
@@ -375,11 +382,19 @@ default, low latency mode is enabled. If enabled, interactive and soft
real-time applications are privileged and experience a lower latency,
as explained in more detail in the description of how BFQ works.
-DO NOT enable this mode if you need full control on bandwidth
+DISABLE this mode if you need full control on bandwidth
distribution. In fact, if it is enabled, then BFQ automatically
increases the bandwidth share of privileged applications, as the main
means to guarantee a lower latency to them.
+In addition, as already highlighted at the beginning of this document,
+DISABLE this mode if your only goal is to achieve a high throughput.
+In fact, privileging the I/O of some application over the rest may
+entail a lower throughput. To achieve the highest-possible throughput
+on a non-rotational device, setting slice_idle to 0 may be needed too
+(at the cost of giving up any strong guarantee on fairness and low
+latency).
+
timeout_sync
------------
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 01ddeaf64b0f..9490f2845f06 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -632,7 +632,7 @@ to i/o submission, if the bio fields are likely to be accessed after the
i/o is issued (since the bio may otherwise get freed in case i/o completion
happens in the meantime).
-The bio_clone() routine may be used to duplicate a bio, where the clone
+The bio_clone_fast() routine may be used to duplicate a bio, where the clone
shares the bio_vec_list with the original bio (i.e. both point to the
same bio_vec_list). This would typically be used for splitting i/o requests
in lvm or md.
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index e50b95c25868..dc5e2dcdbef4 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -918,6 +918,18 @@ PAGE_SIZE multiple when read back.
Number of major page faults incurred
+ workingset_refault
+
+ Number of refaults of previously evicted pages
+
+ workingset_activate
+
+ Number of refaulted pages that were immediately activated
+
+ workingset_nodereclaim
+
+ Number of times a shadow node has been reclaimed
+
memory.swap.current
A read-only single value file which exists on non-root
diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst
index 55e43f1c80de..fce929144ccd 100644
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -303,6 +303,11 @@ defined which accomplish this::
void smp_mb__before_atomic(void);
void smp_mb__after_atomic(void);
+Preceding a non-value-returning read-modify-write atomic operation with
+smp_mb__before_atomic() and following it with smp_mb__after_atomic()
+provides the same full ordering that is provided by value-returning
+read-modify-write atomic operations.
+
For example, smp_mb__before_atomic() can be used like so::
obj->dead = 1;
diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
deleted file mode 100644
index 3fdcdfd968ba..000000000000
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ /dev/null
@@ -1,281 +0,0 @@
-Intel P-State driver
---------------------
-
-This driver provides an interface to control the P-State selection for the
-SandyBridge+ Intel processors.
-
-The following document explains P-States:
-http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
-As stated in the document, P-State doesn’t exactly mean a frequency. However, for
-the sake of the relationship with cpufreq, P-State and frequency are used
-interchangeably.
-
-Understanding the cpufreq core governors and policies are important before
-discussing more details about the Intel P-State driver. Based on what callbacks
-a cpufreq driver provides to the cpufreq core, it can support two types of
-drivers:
-- with target_index() callback: In this mode, the drivers using cpufreq core
-simply provide the minimum and maximum frequency limits and an additional
-interface target_index() to set the current frequency. The cpufreq subsystem
-has a number of scaling governors ("performance", "powersave", "ondemand",
-etc.). Depending on which governor is in use, cpufreq core will call for
-transitions to a specific frequency using target_index() callback.
-- setpolicy() callback: In this mode, drivers do not provide target_index()
-callback, so cpufreq core can't request a transition to a specific frequency.
-The driver provides minimum and maximum frequency limits and callbacks to set a
-policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
-The cpufreq core can request the driver to operate in any of the two policies:
-"performance" and "powersave". The driver decides which frequency to use based
-on the above policy selection considering minimum and maximum frequency limits.
-
-The Intel P-State driver falls under the latter category, which implements the
-setpolicy() callback. This driver decides what P-State to use based on the
-requested policy from the cpufreq core. If the processor is capable of
-selecting its next P-State internally, then the driver will offload this
-responsibility to the processor (aka HWP: Hardware P-States). If not, the
-driver implements algorithms to select the next P-State.
-
-Since these policies are implemented in the driver, they are not same as the
-cpufreq scaling governors implementation, even if they have the same name in
-the cpufreq sysfs (scaling_governors). For example the "performance" policy is
-similar to cpufreq’s "performance" governor, but "powersave" is completely
-different than the cpufreq "powersave" governor. The strategy here is similar
-to cpufreq "ondemand", where the requested P-State is related to the system load.
-
-Sysfs Interface
-
-In addition to the frequency-controlling interfaces provided by the cpufreq
-core, the driver provides its own sysfs files to control the P-State selection.
-These files have been added to /sys/devices/system/cpu/intel_pstate/.
-Any changes made to these files are applicable to all CPUs (even in a
-multi-package system, Refer to later section on placing "Per-CPU limits").
-
- max_perf_pct: Limits the maximum P-State that will be requested by
- the driver. It states it as a percentage of the available performance. The
- available (P-State) performance may be reduced by the no_turbo
- setting described below.
-
- min_perf_pct: Limits the minimum P-State that will be requested by
- the driver. It states it as a percentage of the max (non-turbo)
- performance level.
-
- no_turbo: Limits the driver to selecting P-State below the turbo
- frequency range.
-
- turbo_pct: Displays the percentage of the total performance that
- is supported by hardware that is in the turbo range. This number
- is independent of whether turbo has been disabled or not.
-
- num_pstates: Displays the number of P-States that are supported
- by hardware. This number is independent of whether turbo has
- been disabled or not.
-
-For example, if a system has these parameters:
- Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State)
- Max non turbo ratio: 0x17
- Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio)
-
-Sysfs will show :
- max_perf_pct:100, which corresponds to 1 core ratio
- min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio
- no_turbo:0, turbo is not disabled
- num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1)
- turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates
-
-Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual
-Volume 3: System Programming Guide" to understand ratios.
-
-There is one more sysfs attribute in /sys/devices/system/cpu/intel_pstate/
-that can be used for controlling the operation mode of the driver:
-
- status: Three settings are possible:
- "off" - The driver is not in use at this time.
- "active" - The driver works as a P-state governor (default).
- "passive" - The driver works as a regular cpufreq one and collaborates
- with the generic cpufreq governors (it sets P-states as
- requested by those governors).
- The current setting is returned by reads from this attribute. Writing one
- of the above strings to it changes the operation mode as indicated by that
- string, if possible. If HW-managed P-states (HWP) are enabled, it is not
- possible to change the driver's operation mode and attempts to write to
- this attribute will fail.
-
-cpufreq sysfs for Intel P-State
-
-Since this driver registers with cpufreq, cpufreq sysfs is also presented.
-There are some important differences, which need to be considered.
-
-scaling_cur_freq: This displays the real frequency which was used during
-the last sample period instead of what is requested. Some other cpufreq driver,
-like acpi-cpufreq, displays what is requested (Some changes are on the
-way to fix this for acpi-cpufreq driver). The same is true for frequencies
-displayed at /proc/cpuinfo.
-
-scaling_governor: This displays current active policy. Since each CPU has a
-cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this
-is not possible with Intel P-States, as there is one common policy for all
-CPUs. Here, the last requested policy will be applicable to all CPUs. It is
-suggested that one use the cpupower utility to change policy to all CPUs at the
-same time.
-
-scaling_setspeed: This attribute can never be used with Intel P-State.
-
-scaling_max_freq/scaling_min_freq: This interface can be used similarly to
-the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies
-are converted to nearest possible P-State, this is prone to rounding errors.
-This method is not preferred to limit performance.
-
-affected_cpus: Not used
-related_cpus: Not used
-
-For contemporary Intel processors, the frequency is controlled by the
-processor itself and the P-State exposed to software is related to
-performance levels. The idea that frequency can be set to a single
-frequency is fictional for Intel Core processors. Even if the scaling
-driver selects a single P-State, the actual frequency the processor
-will run at is selected by the processor itself.
-
-Per-CPU limits
-
-The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
-the intel_pstate driver to use per-CPU performance limits. When it is set,
-the sysfs control interface described above is subject to limitations.
-- The following controls are not available for both read and write
- /sys/devices/system/cpu/intel_pstate/max_perf_pct
- /sys/devices/system/cpu/intel_pstate/min_perf_pct
-- The following controls can be used to set performance limits, as far as the
-architecture of the processor permits:
- /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
- /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
- /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
-- User can still observe turbo percent and number of P-States from
- /sys/devices/system/cpu/intel_pstate/turbo_pct
- /sys/devices/system/cpu/intel_pstate/num_pstates
-- User can read write system wide turbo status
- /sys/devices/system/cpu/no_turbo
-
-Support of energy performance hints
-It is possible to provide hints to the HWP algorithms in the processor
-to be more performance centric to more energy centric. When the driver
-is using HWP, two additional cpufreq sysfs attributes are presented for
-each logical CPU.
-These attributes are:
- - energy_performance_available_preferences
- - energy_performance_preference
-
-To get list of supported hints:
-$ cat energy_performance_available_preferences
- default performance balance_performance balance_power power
-
-The current preference can be read or changed via cpufreq sysfs
-attribute "energy_performance_preference". Reading from this attribute
-will display current effective setting. User can write any of the valid
-preference string to this attribute. User can always restore to power-on
-default by writing "default".
-
-Since threads can migrate to different CPUs, this is possible that the
-new CPU may have different energy performance preference than the previous
-one. To avoid such issues, either threads can be pinned to specific CPUs
-or set the same energy performance preference value to all CPUs.
-
-Tuning Intel P-State driver
-
-When the performance can be tuned using PID (Proportional Integral
-Derivative) controller, debugfs files are provided for adjusting performance.
-They are presented under:
-/sys/kernel/debug/pstate_snb/
-
-The PID tunable parameters are:
- deadband
- d_gain_pct
- i_gain_pct
- p_gain_pct
- sample_rate_ms
- setpoint
-
-To adjust these parameters, some understanding of driver implementation is
-necessary. There are some tweeks described here, but be very careful. Adjusting
-them requires expert level understanding of power and performance relationship.
-These limits are only useful when the "powersave" policy is active.
-
--To make the system more responsive to load changes, sample_rate_ms can
-be adjusted (current default is 10ms).
--To make the system use higher performance, even if the load is lower, setpoint
-can be adjusted to a lower number. This will also lead to faster ramp up time
-to reach the maximum P-State.
-If there are no derivative and integral coefficients, The next P-State will be
-equal to:
- current P-State - ((setpoint - current cpu load) * p_gain_pct)
-
-For example, if the current PID parameters are (Which are defaults for the core
-processors like SandyBridge):
- deadband = 0
- d_gain_pct = 0
- i_gain_pct = 0
- p_gain_pct = 20
- sample_rate_ms = 10
- setpoint = 97
-
-If the current P-State = 0x08 and current load = 100, this will result in the
-next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State
-goes up by only 1. If during next sample interval the current load doesn't
-change and still 100, then P-State goes up by one again. This process will
-continue as long as the load is more than the setpoint until the maximum P-State
-is reached.
-
-For the same load at setpoint = 60, this will result in the next P-State
-= 0x08 - ((60 - 100) * 0.2) = 16
-So by changing the setpoint from 97 to 60, there is an increase of the
-next P-State from 9 to 16. So this will make processor execute at higher
-P-State for the same CPU load. If the load continues to be more than the
-setpoint during next sample intervals, then P-State will go up again till the
-maximum P-State is reached. But the ramp up time to reach the maximum P-State
-will be much faster when the setpoint is 60 compared to 97.
-
-Debugging Intel P-State driver
-
-Event tracing
-To debug P-State transition, the Linux event tracing interface can be used.
-There are two specific events, which can be enabled (Provided the kernel
-configs related to event tracing are enabled).
-
-# cd /sys/kernel/debug/tracing/
-# echo 1 > events/power/pstate_sample/enable
-# echo 1 > events/power/cpu_frequency/enable
-# cat trace
-gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107
- scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618
- freq=2474476
-cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
-
-
-Using ftrace
-
-If function level tracing is required, the Linux ftrace interface can be used.
-For example if we want to check how often a function to set a P-State is
-called, we can set ftrace filter to intel_pstate_set_pstate.
-
-# cd /sys/kernel/debug/tracing/
-# cat available_filter_functions | grep -i pstate
-intel_pstate_set_pstate
-intel_pstate_cpu_init
-...
-
-# echo intel_pstate_set_pstate > set_ftrace_filter
-# echo function > current_tracer
-# cat trace | head -15
-# tracer: function
-#
-# entries-in-buffer/entries-written: 80/80 #P:4
-#
-# _-----=> irqs-off
-# / _----=> need-resched
-# | / _---=> hardirq/softirq
-# || / _--=> preempt-depth
-# ||| / delay
-# TASK-PID CPU# |||| TIMESTAMP FUNCTION
-# | | | |||| | |
- Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
- gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
- gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
- <idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst
index ffdcc97f6f5a..78aa00a604a0 100644
--- a/Documentation/dev-tools/sparse.rst
+++ b/Documentation/dev-tools/sparse.rst
@@ -103,9 +103,3 @@ have already built it.
The optional make variable CF can be used to pass arguments to sparse. The
build system passes -Wbitwise to sparse automatically.
-
-Checking RCU annotations
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-RCU annotations are not checked by default. To enable RCU annotation
-checks, include -DCONFIG_SPARSE_RCU_POINTER in your CF flags.
diff --git a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
new file mode 100644
index 000000000000..d38834c67dff
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
@@ -0,0 +1,31 @@
+OP-TEE Device Tree Bindings
+
+OP-TEE is a piece of software using hardware features to provide a Trusted
+Execution Environment. The security can be provided with ARM TrustZone, but
+also by virtualization or a separate chip.
+
+We're using "linaro" as the first part of the compatible property for
+the reference implementation maintained by Linaro.
+
+* OP-TEE based on ARM TrustZone required properties:
+
+- compatible : should contain "linaro,optee-tz"
+
+- method : The method of calling the OP-TEE Trusted OS. Permitted
+ values are:
+
+ "smc" : SMC #0, with the register assignments specified
+ in drivers/tee/optee/optee_smc.h
+
+ "hvc" : HVC #0, with the register assignments specified
+ in drivers/tee/optee/optee_smc.h
+
+
+
+Example:
+ firmware {
+ optee {
+ compatible = "linaro,optee-tz";
+ method = "smc";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt
index cb0054ac7121..cd977db7630c 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt
@@ -7,6 +7,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-apmixedsys"
+ - "mediatek,mt6797-apmixedsys"
- "mediatek,mt8135-apmixedsys"
- "mediatek,mt8173-apmixedsys"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt
index f6a916686f4c..047b11ae5f45 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt
@@ -7,6 +7,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-imgsys", "syscon"
+ - "mediatek,mt6797-imgsys", "syscon"
- "mediatek,mt8173-imgsys", "syscon"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt
index 1620ec2a5a3f..58d58e2006b8 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt
@@ -8,6 +8,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-infracfg", "syscon"
+ - "mediatek,mt6797-infracfg", "syscon"
- "mediatek,mt8135-infracfg", "syscon"
- "mediatek,mt8173-infracfg", "syscon"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt
index 67dd2e473d25..70529e0b58e9 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt
@@ -7,6 +7,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-mmsys", "syscon"
+ - "mediatek,mt6797-mmsys", "syscon"
- "mediatek,mt8173-mmsys", "syscon"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt
index 9f2fe7860114..ec93ecbb9f3c 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt
@@ -7,6 +7,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-topckgen"
+ - "mediatek,mt6797-topckgen"
- "mediatek,mt8135-topckgen"
- "mediatek,mt8173-topckgen"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt
index 2440f73450c3..d150104f928a 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt
@@ -7,6 +7,7 @@ Required Properties:
- compatible: Should be one of:
- "mediatek,mt2701-vdecsys", "syscon"
+ - "mediatek,mt6797-vdecsys", "syscon"
- "mediatek,mt8173-vdecsys", "syscon"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt
index 5bb2866a2b50..8a93be643647 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt
@@ -5,7 +5,8 @@ The Mediatek vencsys controller provides various clocks to the system.
Required Properties:
-- compatible: Should be:
+- compatible: Should be one of:
+ - "mediatek,mt6797-vencsys", "syscon"
- "mediatek,mt8173-vencsys", "syscon"
- #clock-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/clock/idt,versaclock5.txt b/Documentation/devicetree/bindings/clock/idt,versaclock5.txt
index 87e9c47a89a3..53d7e50ed875 100644
--- a/Documentation/devicetree/bindings/clock/idt,versaclock5.txt
+++ b/Documentation/devicetree/bindings/clock/idt,versaclock5.txt
@@ -6,18 +6,21 @@ from 3 to 12 output clocks.
==I2C device node==
Required properties:
-- compatible: shall be one of "idt,5p49v5923" , "idt,5p49v5933".
+- compatible: shall be one of "idt,5p49v5923" , "idt,5p49v5933" ,
+ "idt,5p49v5935".
- reg: i2c device address, shall be 0x68 or 0x6a.
- #clock-cells: from common clock binding; shall be set to 1.
- clocks: from common clock binding; list of parent clock handles,
- 5p49v5923: (required) either or both of XTAL or CLKIN
reference clock.
- - 5p49v5933: (optional) property not present (internal
+ - 5p49v5933 and
+ - 5p49v5935: (optional) property not present (internal
Xtal used) or CLKIN reference
clock.
- clock-names: from common clock binding; clock input names, can be
- 5p49v5923: (required) either or both of "xin", "clkin".
- - 5p49v5933: (optional) property not present or "clkin".
+ - 5p49v5933 and
+ - 5p49v5935: (optional) property not present or "clkin".
==Mapping between clock specifier and physical pins==
@@ -34,6 +37,13 @@ clock specifier, the following mapping applies:
1 -- OUT1
2 -- OUT4
+5P49V5935:
+ 0 -- OUT0_SEL_I2CB
+ 1 -- OUT1
+ 2 -- OUT2
+ 3 -- OUT3
+ 4 -- OUT4
+
==Example==
/* 25MHz reference crystal */
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt
index 4da126116cf0..161326a4f9c1 100644
--- a/Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt
+++ b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt
@@ -1,12 +1,12 @@
-* Rockchip RK1108 Clock and Reset Unit
+* Rockchip RV1108 Clock and Reset Unit
-The RK1108 clock controller generates and supplies clock to various
+The RV1108 clock controller generates and supplies clock to various
controllers within the SoC and also implements a reset controller for SoC
peripherals.
Required Properties:
-- compatible: should be "rockchip,rk1108-cru"
+- compatible: should be "rockchip,rv1108-cru"
- reg: physical base address of the controller and length of memory mapped
region.
- #clock-cells: should be 1.
@@ -19,7 +19,7 @@ Optional Properties:
Each clock is assigned an identifier and client nodes can use this identifier
to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk1108-cru.h headers and can be
+preprocessor macros in the dt-bindings/clock/rv1108-cru.h headers and can be
used in device tree sources. Similar macros exist for the reset sources in
these files.
@@ -38,7 +38,7 @@ clock-output-names:
Example: Clock controller node:
cru: cru@20200000 {
- compatible = "rockchip,rk1108-cru";
+ compatible = "rockchip,rv1108-cru";
reg = <0x20200000 0x1000>;
rockchip,grf = <&grf>;
@@ -50,7 +50,7 @@ Example: UART controller node that consumes the clock generated by the clock
controller:
uart0: serial@10230000 {
- compatible = "rockchip,rk1108-uart", "snps,dw-apb-uart";
+ compatible = "rockchip,rv1108-uart", "snps,dw-apb-uart";
reg = <0x10230000 0x100>;
interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>;
reg-shift = <2>;
diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
index bae5668cf427..f465647a4dd2 100644
--- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
+++ b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
@@ -7,9 +7,12 @@ Required properties :
- "allwinner,sun8i-a23-ccu"
- "allwinner,sun8i-a33-ccu"
- "allwinner,sun8i-h3-ccu"
+ - "allwinner,sun8i-h3-r-ccu"
- "allwinner,sun8i-v3s-ccu"
- "allwinner,sun9i-a80-ccu"
- "allwinner,sun50i-a64-ccu"
+ - "allwinner,sun50i-a64-r-ccu"
+ - "allwinner,sun50i-h5-ccu"
- reg: Must contain the registers base address and length
- clocks: phandle to the oscillators feeding the CCU. Two are needed:
@@ -19,7 +22,11 @@ Required properties :
- #clock-cells : must contain 1
- #reset-cells : must contain 1
-Example:
+For the PRCM CCUs on H3/A64, two more clocks are needed:
+- "pll-periph": the SoC's peripheral PLL from the main CCU
+- "iosc": the SoC's internal frequency oscillator
+
+Example for generic CCU:
ccu: clock@01c20000 {
compatible = "allwinner,sun8i-h3-ccu";
reg = <0x01c20000 0x400>;
@@ -28,3 +35,13 @@ ccu: clock@01c20000 {
#clock-cells = <1>;
#reset-cells = <1>;
};
+
+Example for PRCM CCU:
+r_ccu: clock@01f01400 {
+ compatible = "allwinner,sun50i-a64-r-ccu";
+ reg = <0x01f01400 0x100>;
+ clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu CLK_PLL_PERIPH0>;
+ clock-names = "hosc", "losc", "iosc", "pll-periph";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+};
diff --git a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt b/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt
index 7a5c0e204c8e..e5a8b363d829 100644
--- a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt
+++ b/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt
@@ -13,6 +13,8 @@ Required nodes:
Additional, the display node has to define properties:
- bits-per-pixel: Bits per pixel
- fsl,pcr: LCDC PCR value
+ A display node may optionally define
+ - fsl,aus-mode: boolean to enable AUS mode (only for imx21)
Optional properties:
- lcd-supply: Regulator for LCD supply voltage.
diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
index 42c3bb2d53e8..01e331a5f3e7 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
@@ -41,9 +41,9 @@ Required properties:
Optional properties:
In order to use the GPIO lines in PWM mode, some additional optional
-properties are required. Only Armada 370 and XP support these properties.
+properties are required.
-- compatible: Must contain "marvell,armada-370-xp-gpio"
+- compatible: Must contain "marvell,armada-370-gpio"
- reg: an additional register set is needed, for the GPIO Blink
Counter on/off registers.
@@ -71,7 +71,7 @@ Example:
};
gpio1: gpio@18140 {
- compatible = "marvell,armada-370-xp-gpio";
+ compatible = "marvell,armada-370-gpio";
reg = <0x18140 0x40>, <0x181c8 0x08>;
reg-names = "gpio", "pwm";
ngpios = <17>;
diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
index 6db22103e2dd..025cf8c9324a 100644
--- a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
+++ b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
@@ -36,7 +36,7 @@ Optional properties:
control gpios
- threshold: allows setting the "click"-threshold in the range
- from 20 to 80.
+ from 0 to 80.
- gain: allows setting the sensitivity in the range from 0 to
31. Note that lower values indicate higher
diff --git a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt
index 05485699d70e..9630ac0e4b56 100644
--- a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt
+++ b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt
@@ -16,6 +16,11 @@ Required properties:
- reg: Base address of PMIC on Hi6220 SoC.
- interrupt-controller: Hi655x has internal IRQs (has own IRQ domain).
- pmic-gpios: The GPIO used by PMIC IRQ.
+- #clock-cells: From common clock binding; shall be set to 0
+
+Optional properties:
+- clock-output-names: From common clock binding to override the
+ default output clock name
Example:
pmic: pmic@f8000000 {
@@ -24,4 +29,5 @@ Example:
interrupt-controller;
#interrupt-cells = <2>;
pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>;
+ #clock-cells = <0>;
}
diff --git a/Documentation/devicetree/bindings/mfd/stm32-timers.txt b/Documentation/devicetree/bindings/mfd/stm32-timers.txt
index bbd083f5600a..1db6e0057a63 100644
--- a/Documentation/devicetree/bindings/mfd/stm32-timers.txt
+++ b/Documentation/devicetree/bindings/mfd/stm32-timers.txt
@@ -31,7 +31,7 @@ Example:
compatible = "st,stm32-timers";
reg = <0x40010000 0x400>;
clocks = <&rcc 0 160>;
- clock-names = "clk_int";
+ clock-names = "int";
pwm {
compatible = "st,stm32-pwm";
diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt
index e25436861867..9029b45b8a22 100644
--- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.txt
@@ -18,6 +18,8 @@ Optional properties:
"ext_clock" (External clock provided to the card).
- post-power-on-delay-ms : Delay in ms after powering the card and
de-asserting the reset-gpios (if any)
+- power-off-delay-us : Delay in us after asserting the reset-gpios (if any)
+ during power off of the card.
Example:
diff --git a/Documentation/devicetree/bindings/mtd/atmel-nand.txt b/Documentation/devicetree/bindings/mtd/atmel-nand.txt
index 3e7ee99d3949..f6bee57e453a 100644
--- a/Documentation/devicetree/bindings/mtd/atmel-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/atmel-nand.txt
@@ -1,4 +1,109 @@
-Atmel NAND flash
+Atmel NAND flash controller bindings
+
+The NAND flash controller node should be defined under the EBI bus (see
+Documentation/devicetree/bindings/memory-controllers/atmel,ebi.txt).
+One or several NAND devices can be defined under this NAND controller.
+The NAND controller might be connected to an ECC engine.
+
+* NAND controller bindings:
+
+Required properties:
+- compatible: should be one of the following
+ "atmel,at91rm9200-nand-controller"
+ "atmel,at91sam9260-nand-controller"
+ "atmel,at91sam9261-nand-controller"
+ "atmel,at91sam9g45-nand-controller"
+ "atmel,sama5d3-nand-controller"
+- ranges: empty ranges property to forward EBI ranges definitions.
+- #address-cells: should be set to 2.
+- #size-cells: should be set to 1.
+- atmel,nfc-io: phandle to the NFC IO block. Only required for sama5d3
+ controllers.
+- atmel,nfc-sram: phandle to the NFC SRAM block. Only required for sama5d3
+ controllers.
+
+Optional properties:
+- ecc-engine: phandle to the PMECC block. Only meaningful if the SoC embeds
+ a PMECC engine.
+
+* NAND device/chip bindings:
+
+Required properties:
+- reg: describes the CS lines assigned to the NAND device. If the NAND device
+ exposes multiple CS lines (multi-dies chips), your reg property will
+ contain X tuples of 3 entries.
+ 1st entry: the CS line this NAND chip is connected to
+ 2nd entry: the base offset of the memory region assigned to this
+ device (always 0)
+ 3rd entry: the memory region size (always 0x800000)
+
+Optional properties:
+- rb-gpios: the GPIO(s) used to check the Ready/Busy status of the NAND.
+- cs-gpios: the GPIO(s) used to control the CS line.
+- det-gpios: the GPIO used to detect if a Smartmedia Card is present.
+- atmel,rb: an integer identifying the native Ready/Busy pin. Only meaningful
+ on sama5 SoCs.
+
+All generic properties described in
+Documentation/devicetree/bindings/mtd/{common,nand}.txt also apply to the NAND
+device node, and NAND partitions should be defined under the NAND node as
+described in Documentation/devicetree/bindings/mtd/partition.txt.
+
+* ECC engine (PMECC) bindings:
+
+Required properties:
+- compatible: should be one of the following
+ "atmel,at91sam9g45-pmecc"
+ "atmel,sama5d4-pmecc"
+ "atmel,sama5d2-pmecc"
+- reg: should contain 2 register ranges. The first one is pointing to the PMECC
+ block, and the second one to the PMECC_ERRLOC block.
+
+Example:
+
+ pmecc: ecc-engine@ffffc070 {
+ compatible = "atmel,at91sam9g45-pmecc";
+ reg = <0xffffc070 0x490>,
+ <0xffffc500 0x100>;
+ };
+
+ ebi: ebi@10000000 {
+ compatible = "atmel,sama5d3-ebi";
+ #address-cells = <2>;
+ #size-cells = <1>;
+ atmel,smc = <&hsmc>;
+ reg = <0x10000000 0x10000000
+ 0x40000000 0x30000000>;
+ ranges = <0x0 0x0 0x10000000 0x10000000
+ 0x1 0x0 0x40000000 0x10000000
+ 0x2 0x0 0x50000000 0x10000000
+ 0x3 0x0 0x60000000 0x10000000>;
+ clocks = <&mck>;
+
+ nand_controller: nand-controller {
+ compatible = "atmel,sama5d3-nand-controller";
+ atmel,nfc-sram = <&nfc_sram>;
+ atmel,nfc-io = <&nfc_io>;
+ ecc-engine = <&pmecc>;
+ #address-cells = <2>;
+ #size-cells = <1>;
+ ranges;
+
+ nand@3 {
+ reg = <0x3 0x0 0x800000>;
+ atmel,rb = <0>;
+
+ /*
+ * Put generic NAND/MTD properties and
+ * subnodes here.
+ */
+ };
+ };
+ };
+
+-----------------------------------------------------------------------
+
+Deprecated bindings (should not be used in new device trees):
Required properties:
- compatible: The possible values are:
diff --git a/Documentation/devicetree/bindings/mtd/denali-nand.txt b/Documentation/devicetree/bindings/mtd/denali-nand.txt
index b04d03a1d499..e593bbeb2115 100644
--- a/Documentation/devicetree/bindings/mtd/denali-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/denali-nand.txt
@@ -1,11 +1,11 @@
* Denali NAND controller
Required properties:
- - compatible : should be "denali,denali-nand-dt"
+ - compatible : should be one of the following:
+ "altr,socfpga-denali-nand" - for Altera SOCFPGA
- reg : should contain registers location and length for data and reg.
- reg-names: Should contain the reg names "nand_data" and "denali_reg"
- interrupts : The interrupt number.
- - dm-mask : DMA bit mask
The device tree may optionally contain sub-nodes describing partitions of the
address space. See partition.txt for more detail.
@@ -15,9 +15,8 @@ Examples:
nand: nand@ff900000 {
#address-cells = <1>;
#size-cells = <1>;
- compatible = "denali,denali-nand-dt";
+ compatible = "altr,socfpga-denali-nand";
reg = <0xff900000 0x100000>, <0xffb80000 0x10000>;
reg-names = "nand_data", "denali_reg";
interrupts = <0 144 4>;
- dma-mask = <0xffffffff>;
};
diff --git a/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt b/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt
index af8915b41ccf..486a17d533d7 100644
--- a/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt
@@ -12,7 +12,7 @@ Required properties:
- #address-cells, #size-cells : Must be present if the device has sub-nodes
representing partitions.
- gpios : Specifies the GPIO pins to control the NAND device. The order of
- GPIO references is: RDY, nCE, ALE, CLE, and an optional nWP.
+ GPIO references is: RDY, nCE, ALE, CLE, and nWP. nCE and nWP are optional.
Optional properties:
- bank-width : Width (in bytes) of the device. If not present, the width
@@ -36,7 +36,7 @@ gpio-nand@1,0 {
#address-cells = <1>;
#size-cells = <1>;
gpios = <&banka 1 0>, /* RDY */
- <&banka 2 0>, /* nCE */
+ <0>, /* nCE */
<&banka 3 0>, /* ALE */
<&banka 4 0>, /* CLE */
<0>; /* nWP */
diff --git a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
new file mode 100644
index 000000000000..ddd18c135148
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
@@ -0,0 +1,43 @@
+* STMicroelectronics Quad Serial Peripheral Interface(QuadSPI)
+
+Required properties:
+- compatible: should be "st,stm32f469-qspi"
+- reg: the first contains the register location and length.
+ the second contains the memory mapping address and length
+- reg-names: should contain the reg names "qspi" "qspi_mm"
+- interrupts: should contain the interrupt for the device
+- clocks: the phandle of the clock needed by the QSPI controller
+- A pinctrl must be defined to set pins in mode of operation for QSPI transfer
+
+Optional properties:
+- resets: must contain the phandle to the reset controller.
+
+A spi flash must be a child of the nor_flash node and could have some
+properties. Also see jedec,spi-nor.txt.
+
+Required properties:
+- reg: chip-Select number (QSPI controller may connect 2 nor flashes)
+- spi-max-frequency: max frequency of spi bus
+
+Optional property:
+- spi-rx-bus-width: see ../spi/spi-bus.txt for the description
+
+Example:
+
+qspi: spi@a0001000 {
+ compatible = "st,stm32f469-qspi";
+ reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>;
+ reg-names = "qspi", "qspi_mm";
+ interrupts = <91>;
+ resets = <&rcc STM32F4_AHB3_RESET(QSPI)>;
+ clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_qspi0>;
+
+ flash@0 {
+ reg = <0>;
+ spi-rx-bus-width = <4>;
+ spi-max-frequency = <108000000>;
+ ...
+ };
+};
diff --git a/Documentation/devicetree/bindings/net/dsa/b53.txt b/Documentation/devicetree/bindings/net/dsa/b53.txt
index d6c6e41648d4..8ec2ca21adeb 100644
--- a/Documentation/devicetree/bindings/net/dsa/b53.txt
+++ b/Documentation/devicetree/bindings/net/dsa/b53.txt
@@ -34,7 +34,7 @@ Required properties:
"brcm,bcm6328-switch"
"brcm,bcm6368-switch" and the mandatory "brcm,bcm63xx-switch"
-See Documentation/devicetree/bindings/dsa/dsa.txt for a list of additional
+See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
required and optional properties.
Examples:
diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt
index 7ef9dbb08957..1d4d0f49c9d0 100644
--- a/Documentation/devicetree/bindings/net/dsa/marvell.txt
+++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt
@@ -26,6 +26,10 @@ Optional properties:
- interrupt-controller : Indicates the switch is itself an interrupt
controller. This is used for the PHY interrupts.
#interrupt-cells = <2> : Controller uses two cells, number and flag
+- eeprom-length : Set to the length of an EEPROM connected to the
+ switch. Must be set if the switch can not detect
+ the presence and/or size of a connected EEPROM,
+ otherwise optional.
- mdio : Container of PHY and devices on the switches MDIO
bus.
- mdio? : Container of PHYs and devices on the external MDIO
diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt
index a1e3693cca16..6f55bdd52f8a 100644
--- a/Documentation/devicetree/bindings/net/fsl-fec.txt
+++ b/Documentation/devicetree/bindings/net/fsl-fec.txt
@@ -15,6 +15,10 @@ Optional properties:
- phy-reset-active-high : If present then the reset sequence using the GPIO
specified in the "phy-reset-gpios" property is reversed (H=reset state,
L=operation state).
+- phy-reset-post-delay : Post reset delay in milliseconds. If present then
+ a delay of phy-reset-post-delay milliseconds will be observed after the
+ phy-reset-gpios has been toggled. Can be omitted thus no delay is
+ observed. Delay is in range of 1ms to 1000ms. Other delays are invalid.
- phy-supply : regulator that powers the Ethernet PHY.
- phy-handle : phandle to the PHY device connected to this device.
- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt b/Documentation/devicetree/bindings/net/smsc911x.txt
index 16c3a9501f5d..acfafc8e143c 100644
--- a/Documentation/devicetree/bindings/net/smsc911x.txt
+++ b/Documentation/devicetree/bindings/net/smsc911x.txt
@@ -27,6 +27,7 @@ Optional properties:
of the device. On many systems this is wired high so the device goes
out of reset at power-on, but if it is under program control, this
optional GPIO can wake up in response to it.
+- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies
Examples:
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index 71a3c134af1b..f01d154090da 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -247,7 +247,6 @@ bias-bus-hold - latch weakly
bias-pull-up - pull up the pin
bias-pull-down - pull down the pin
bias-pull-pin-default - use pin-default pull state
-bi-directional - pin supports simultaneous input/output operations
drive-push-pull - drive actively high and low
drive-open-drain - drive with open drain
drive-open-source - drive with open source
@@ -260,7 +259,6 @@ input-debounce - debounce mode with debound time X
power-source - select between different power supplies
low-power-enable - enable low power mode
low-power-disable - disable low power mode
-output-enable - enable output on pin regardless of output value
output-low - set the pin to output mode with low level
output-high - set the pin to output mode with high level
slew-rate - set the slew rate
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 940707d095cc..14bd9e945ff6 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -81,7 +81,7 @@ Example 3:
child: power-controller@12341000 {
compatible = "foo,power-controller";
reg = <0x12341000 0x1000>;
- power-domains = <&parent 0>;
+ power-domains = <&parent>;
#power-domain-cells = <0>;
domain-idle-states = <&DOMAIN_PWR_DN>;
};
diff --git a/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt b/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt
new file mode 100644
index 000000000000..c24886676a60
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt
@@ -0,0 +1,20 @@
+AXP20x and AXP22x battery power supply
+
+Required Properties:
+ - compatible, one of:
+ "x-powers,axp209-battery-power-supply"
+ "x-powers,axp221-battery-power-supply"
+
+This node is a subnode of the axp20x/axp22x PMIC.
+
+The AXP20X and AXP22X can read the battery voltage, charge and discharge
+currents of the battery by reading ADC channels from the AXP20X/AXP22X
+ADC.
+
+Example:
+
+&axp209 {
+ battery_power_supply: battery-power-supply {
+ compatible = "x-powers,axp209-battery-power-supply";
+ }
+};
diff --git a/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt b/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt
new file mode 100644
index 000000000000..5af426e13334
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt
@@ -0,0 +1,248 @@
+*** NOTE ***
+This document is copied from OPAL firmware
+(skiboot/doc/device-tree/ibm,powerpc-cpu-features/binding.txt)
+
+There is more complete overview and documentation of features in that
+source tree. All patches and modifications should go there.
+************
+
+ibm,powerpc-cpu-features binding
+================================
+
+This device tree binding describes CPU features available to software, with
+enablement, privilege, and compatibility metadata.
+
+More general description of design and implementation of this binding is
+found in design.txt, which also points to documentation of specific features.
+
+
+/cpus/ibm,powerpc-cpu-features node binding
+-------------------------------------------
+
+Node: ibm,powerpc-cpu-features
+
+Description: Container of CPU feature nodes.
+
+The node name must be "ibm,powerpc-cpu-features".
+
+It is implemented as a child of the node "/cpus", but this must not be
+assumed by parsers.
+
+The node is optional but should be provided by new OPAL firmware.
+
+Properties:
+
+- compatible
+ Usage: required
+ Value type: string
+ Definition: "ibm,powerpc-cpu-features"
+
+ This compatibility refers to backwards compatibility of the overall
+ design with parsers that behave according to these guidelines. This can
+ be extended in a backward compatible manner which would not warrant a
+ revision of the compatible property.
+
+- isa
+ Usage: required
+ Value type: <u32>
+ Definition:
+
+ isa that the CPU is currently running in. This provides instruction set
+ compatibility, less the individual feature nodes. For example, an ISA v3.0
+ implementation that lacks the "transactional-memory" cpufeature node
+ should not use transactional memory facilities.
+
+ Value corresponds to the "Power ISA Version" multiplied by 1000.
+ For example, <3000> corresponds to Version 3.0, <2070> to Version 2.07.
+ The minor digit is available for revisions.
+
+- display-name
+ Usage: optional
+ Value type: string
+ Definition:
+
+ A human readable name for the CPU.
+
+/cpus/ibm,powerpc-cpu-features/example-feature node bindings
+----------------------------------------------------------------
+
+Each child node of cpu-features represents a CPU feature / capability.
+
+Node: A string describing an architected CPU feature, e.g., "floating-point".
+
+Description: A feature or capability supported by the CPUs.
+
+The name of the node is a human readable string that forms the interface
+used to describe features to software. Features are currently documented
+in the code where they are implemented in skiboot/core/cpufeatures.c
+
+Presence of the node indicates the feature is available.
+
+Properties:
+
+- isa
+ Usage: required
+ Value type: <u32>
+ Definition:
+
+ First level of the Power ISA that the feature appears in.
+ Software should filter out features when constraining the
+ environment to a particular ISA version.
+
+ Value is defined similarly to /cpus/features/isa
+
+- usable-privilege
+ Usage: required
+ Value type: <u32> bit mask
+ Definition:
+ Bit numbers are LSB0
+ bit 0 - PR (problem state / user mode)
+ bit 1 - OS (privileged state)
+ bit 2 - HV (hypervisor state)
+ All other bits reserved and should be zero.
+
+ This property describes the privilege levels and/or software components
+ that can use the feature.
+
+ If bit 0 is set, then the hwcap-bit-nr property will exist.
+
+
+- hv-support
+ Usage: optional
+ Value type: <u32> bit mask
+ Definition:
+ Bit numbers are LSB0
+ bit 0 - HFSCR
+ All other bits reserved and should be zero.
+
+ This property describes the HV privilege support required to enable the
+ feature to lesser privilege levels. If the property does not exist then no
+ support is required.
+
+ If no bits are set, the hypervisor must have explicit/custom support for
+ this feature.
+
+ If the HFSCR bit is set, then the hfscr-bit-nr property will exist and
+ the feature may be enabled by setting this bit in the HFSCR register.
+
+
+- os-support
+ Usage: optional
+ Value type: <u32> bit mask
+ Definition:
+ Bit numbers are LSB0
+ bit 0 - FSCR
+ All other bits reserved and should be zero.
+
+ This property describes the OS privilege support required to enable the
+ feature to lesser privilege levels. If the property does not exist then no
+ support is required.
+
+ If no bits are set, the operating system must have explicit/custom support
+ for this feature.
+
+ If the FSCR bit is set, then the fscr-bit-nr property will exist and
+ the feature may be enabled by setting this bit in the FSCR register.
+
+
+- hfscr-bit-nr
+ Usage: optional
+ Value type: <u32>
+ Definition: HFSCR bit position (LSB0)
+
+ This property exists when the hv-support property HFSCR bit is set. This
+ property describes the bit number in the HFSCR register that the
+ hypervisor must set in order to enable this feature.
+
+ This property also exists if an HFSCR bit corresponds with this feature.
+ This makes CPU feature parsing slightly simpler.
+
+
+- fscr-bit-nr
+ Usage: optional
+ Value type: <u32>
+ Definition: FSCR bit position (LSB0)
+
+ This property exists when the os-support property FSCR bit is set. This
+ property describes the bit number in the FSCR register that the
+ operating system must set in order to enable this feature.
+
+ This property also exists if an FSCR bit corresponds with this feature.
+ This makes CPU feature parsing slightly simpler.
+
+
+- hwcap-bit-nr
+ Usage: optional
+ Value type: <u32>
+ Definition: Linux ELF AUX vector bit position (LSB0)
+
+ This property may exist when the usable-privilege property value has PR bit set.
+ This property describes the bit number that should be set in the ELF AUX
+ hardware capability vectors in order to advertise this feature to userspace.
+ Bits 0-31 correspond to bits 0-31 in AT_HWCAP vector. Bits 32-63 correspond
+ to 0-31 in AT_HWCAP2 vector, and so on. Missing AT_HWCAPx vectors implies
+ that the feature is not enabled or can not be advertised. Operating systems
+ may provide a number of unassigned hardware capability bits to allow for new
+ features to be advertised.
+
+ Some properties representing features created before this binding are
+ advertised to userspace without a one-to-one hwcap bit number may not specify
+ this bit. Operating system will handle those bits specifically. All new
+ features usable by userspace will have a hwcap-bit-nr property.
+
+
+- dependencies
+ Usage: optional
+ Value type: <prop-encoded-array>
+ Definition:
+
+ If this property exists then it is a list of phandles to cpu feature
+ nodes that must be enabled for this feature to be enabled.
+
+
+Example
+-------
+
+ /cpus/ibm,powerpc-cpu-features {
+ compatible = "ibm,powerpc-cpu-features";
+
+ isa = <3020>;
+
+ darn {
+ isa = <3000>;
+ usable-privilege = <1 | 2 | 4>;
+ hwcap-bit-nr = <xx>;
+ };
+
+ scv {
+ isa = <3000>;
+ usable-privilege = <1 | 2>;
+ os-support = <0>;
+ hwcap-bit-nr = <xx>;
+ };
+
+ stop {
+ isa = <3000>;
+ usable-privilege = <2 | 4>;
+ hv-support = <0>;
+ os-support = <0>;
+ };
+
+ vsx2 (hypothetical) {
+ isa = <3010>;
+ usable-privilege = <1 | 2 | 4>;
+ hv-support = <0>;
+ os-support = <0>;
+ hwcap-bit-nr = <xx>;
+ };
+
+ vsx2-newinsns {
+ isa = <3020>;
+ usable-privilege = <1 | 2 | 4>;
+ os-support = <1>;
+ fscr-bit-nr = <xx>;
+ hwcap-bit-nr = <xx>;
+ dependencies = <&vsx2>;
+ };
+
+ };
diff --git a/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt b/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt
new file mode 100644
index 000000000000..45750ff3112d
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt
@@ -0,0 +1,18 @@
+Motorola CPCAP PMIC RTC
+-----------------------
+
+This module is part of the CPCAP. For more details about the whole
+chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt.
+
+Requires node properties:
+- compatible: should contain "motorola,cpcap-rtc"
+- interrupts: An interrupt specifier for alarm and 1 Hz irq
+
+Example:
+
+&cpcap {
+ cpcap_rtc: rtc {
+ compatible = "motorola,cpcap-rtc";
+ interrupts = <39 IRQ_TYPE_NONE>, <26 IRQ_TYPE_NONE>;
+ };
+};
diff --git a/Documentation/devicetree/bindings/rtc/rtc-sh.txt b/Documentation/devicetree/bindings/rtc/rtc-sh.txt
new file mode 100644
index 000000000000..7676c7d28874
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc-sh.txt
@@ -0,0 +1,28 @@
+* Real Time Clock for Renesas SH and ARM SoCs
+
+Required properties:
+- compatible: Should be "renesas,r7s72100-rtc" and "renesas,sh-rtc" as a
+ fallback.
+- reg: physical base address and length of memory mapped region.
+- interrupts: 3 interrupts for alarm, period, and carry.
+- interrupt-names: The interrupts should be labeled as "alarm", "period", and
+ "carry".
+- clocks: The functional clock source for the RTC controller must be listed
+ first (if exists). Additionally, potential clock counting sources are to be
+ listed.
+- clock-names: The functional clock must be labeled as "fck". Other clocks
+ may be named in accordance to the SoC hardware manuals.
+
+
+Example:
+rtc: rtc@fcff1000 {
+ compatible = "renesas,r7s72100-rtc", "renesas,sh-rtc";
+ reg = <0xfcff1000 0x2e>;
+ interrupts = <GIC_SPI 276 IRQ_TYPE_EDGE_RISING
+ GIC_SPI 277 IRQ_TYPE_EDGE_RISING
+ GIC_SPI 278 IRQ_TYPE_EDGE_RISING>;
+ interrupt-names = "alarm", "period", "carry";
+ clocks = <&mstp6_clks R7S72100_CLK_RTC>, <&rtc_x1_clk>,
+ <&rtc_x3_clk>, <&extal_clk>;
+ clock-names = "fck", "rtc_x1", "rtc_x3", "extal";
+};
diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt
index 349f79fd7076..626e1afa64a6 100644
--- a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt
@@ -13,8 +13,17 @@ Required properties:
- #gpio-cells : Should be two. The first cell is the pin number and the
second cell is used to specify optional parameters (currently unused).
- gpio-controller : Marks the port as GPIO controller.
+Optional properties:
+- fsl,cpm1-gpio-irq-mask : For banks having interrupt capability (like port C
+ on CPM1), this item tells which ports have an associated interrupt (ports are
+ listed in the same order as in PCINT register)
+- interrupts : This property provides the list of interrupt for each GPIO having
+ one as described by the fsl,cpm1-gpio-irq-mask property. There should be as
+ many interrupts as number of ones in the mask property. The first interrupt in
+ the list corresponds to the most significant bit of the mask.
+- interrupt-parent : Parent for the above interrupt property.
-Example of three SOC GPIO banks defined as gpio-controller nodes:
+Example of four SOC GPIO banks defined as gpio-controller nodes:
CPM1_PIO_A: gpio-controller@950 {
#gpio-cells = <2>;
@@ -30,6 +39,16 @@ Example of three SOC GPIO banks defined as gpio-controller nodes:
gpio-controller;
};
+ CPM1_PIO_C: gpio-controller@960 {
+ #gpio-cells = <2>;
+ compatible = "fsl,cpm1-pario-bank-c";
+ reg = <0x960 0x10>;
+ fsl,cpm1-gpio-irq-mask = <0x0fff>;
+ interrupts = <1 2 6 9 10 11 14 15 23 24 26 31>;
+ interrupt-parent = <&CPM_PIC>;
+ gpio-controller;
+ };
+
CPM1_PIO_E: gpio-controller@ac8 {
#gpio-cells = <2>;
compatible = "fsl,cpm1-pario-bank-e";
diff --git a/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt b/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt
deleted file mode 100644
index c59e27c632c1..000000000000
--- a/Documentation/devicetree/bindings/staging/ion/hi6220-ion.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-Hi6220 SoC ION
-===================================================================
-Required properties:
-- compatible : "hisilicon,hi6220-ion"
-- list of the ION heaps
- - heap name : maybe heap_sys_user@0
- - heap id : id should be unique in the system.
- - heap base : base ddr address of the heap,0 means that
- it is dynamic.
- - heap size : memory size and 0 means it is dynamic.
- - heap type : the heap type of the heap, please also
- see the define in ion.h(drivers/staging/android/uapi/ion.h)
--------------------------------------------------------------------
-Example:
- hi6220-ion {
- compatible = "hisilicon,hi6220-ion";
- heap_sys_user@0 {
- heap-name = "sys_user";
- heap-id = <0x0>;
- heap-base = <0x0>;
- heap-size = <0x0>;
- heap-type = "ion_system";
- };
- heap_sys_contig@0 {
- heap-name = "sys_contig";
- heap-id = <0x1>;
- heap-base = <0x0>;
- heap-size = <0x0>;
- heap-type = "ion_system_contig";
- };
- };
diff --git a/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt b/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt
index 474531d2b2c5..da8c5b73ad10 100644
--- a/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt
@@ -3,15 +3,39 @@ Binding for Thermal Sensor driver for BCM2835 SoCs.
Required parameters:
-------------------
-compatible: should be one of: "brcm,bcm2835-thermal",
- "brcm,bcm2836-thermal" or "brcm,bcm2837-thermal"
-reg: Address range of the thermal registers.
-clocks: Phandle of the clock used by the thermal sensor.
+compatible: should be one of: "brcm,bcm2835-thermal",
+ "brcm,bcm2836-thermal" or "brcm,bcm2837-thermal"
+reg: Address range of the thermal registers.
+clocks: Phandle of the clock used by the thermal sensor.
+#thermal-sensor-cells: should be 0 (see thermal.txt)
Example:
+thermal-zones {
+ cpu_thermal: cpu-thermal {
+ polling-delay-passive = <0>;
+ polling-delay = <1000>;
+
+ thermal-sensors = <&thermal>;
+
+ trips {
+ cpu-crit {
+ temperature = <80000>;
+ hysteresis = <0>;
+ type = "critical";
+ };
+ };
+
+ coefficients = <(-538) 407000>;
+
+ cooling-maps {
+ };
+ };
+};
+
thermal: thermal@7e212000 {
compatible = "brcm,bcm2835-thermal";
reg = <0x7e212000 0x8>;
clocks = <&clocks BCM2835_CLOCK_TSENS>;
+ #thermal-sensor-cells = <0>;
};
diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal
new file mode 100644
index 000000000000..68e047170039
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal
@@ -0,0 +1,37 @@
+* Broadcom Northstar Thermal
+
+This binding describes thermal sensor that is part of Northstar's DMU (Device
+Management Unit).
+
+Required properties:
+- compatible : Must be "brcm,ns-thermal"
+- reg : iomem address range of PVTMON registers
+- #thermal-sensor-cells : Should be <0>
+
+Example:
+
+thermal: thermal@1800c2c0 {
+ compatible = "brcm,ns-thermal";
+ reg = <0x1800c2c0 0x10>;
+ #thermal-sensor-cells = <0>;
+};
+
+thermal-zones {
+ cpu_thermal: cpu-thermal {
+ polling-delay-passive = <0>;
+ polling-delay = <1000>;
+ coefficients = <(-556) 418000>;
+ thermal-sensors = <&thermal>;
+
+ trips {
+ cpu-crit {
+ temperature = <125000>;
+ hysteresis = <0>;
+ type = "critical";
+ };
+ };
+
+ cooling-maps {
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/thermal/da9062-thermal.txt b/Documentation/devicetree/bindings/thermal/da9062-thermal.txt
new file mode 100644
index 000000000000..e241bb5a5584
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/da9062-thermal.txt
@@ -0,0 +1,36 @@
+* Dialog DA9062/61 TJUNC Thermal Module
+
+This module is part of the DA9061/DA9062. For more details about entire
+DA9062 and DA9061 chips see Documentation/devicetree/bindings/mfd/da9062.txt
+
+Junction temperature thermal module uses an interrupt signal to identify
+high THERMAL_TRIP_HOT temperatures for the PMIC device.
+
+Required properties:
+
+- compatible: should be one of the following valid compatible string lines:
+ "dlg,da9061-thermal", "dlg,da9062-thermal"
+ "dlg,da9062-thermal"
+
+Optional properties:
+
+- polling-delay-passive : Specify the polling period, measured in
+ milliseconds, between thermal zone device update checks.
+
+Example: DA9062
+
+ pmic0: da9062@58 {
+ thermal {
+ compatible = "dlg,da9062-thermal";
+ polling-delay-passive = <3000>;
+ };
+ };
+
+Example: DA9061 using a fall-back compatible for the DA9062 onkey driver
+
+ pmic0: da9061@58 {
+ thermal {
+ compatible = "dlg,da9061-thermal", "dlg,da9062-thermal";
+ polling-delay-passive = <3000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/trivial-devices.txt b/Documentation/devicetree/bindings/trivial-devices.txt
index ad10fbe61562..3e0a34c88e07 100644
--- a/Documentation/devicetree/bindings/trivial-devices.txt
+++ b/Documentation/devicetree/bindings/trivial-devices.txt
@@ -160,6 +160,7 @@ sii,s35390a 2-wire CMOS real-time clock
silabs,si7020 Relative Humidity and Temperature Sensors
skyworks,sky81452 Skyworks SKY81452: Six-Channel White LED Driver with Touch Panel Bias Supply
st,24c256 i2c serial eeprom (24cxx)
+st,m41t0 Serial real-time clock (RTC)
st,m41t00 Serial real-time clock (RTC)
st,m41t62 Serial real-time clock (RTC) with alarm
st,m41t80 M41T80 - SERIAL ACCESS RTC WITH ALARMS
diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt
index 00bea038639e..fcf199b64d3d 100644
--- a/Documentation/devicetree/bindings/usb/dwc2.txt
+++ b/Documentation/devicetree/bindings/usb/dwc2.txt
@@ -10,6 +10,7 @@ Required properties:
- "rockchip,rk3288-usb", "rockchip,rk3066-usb", "snps,dwc2": for rk3288 Soc;
- "lantiq,arx100-usb": The DWC2 USB controller instance in Lantiq ARX SoCs;
- "lantiq,xrx200-usb": The DWC2 USB controller instance in Lantiq XRX SoCs;
+ - "amlogic,meson8-usb": The DWC2 USB controller instance in Amlogic Meson8 SoCs;
- "amlogic,meson8b-usb": The DWC2 USB controller instance in Amlogic Meson8b SoCs;
- "amlogic,meson-gxbb-usb": The DWC2 USB controller instance in Amlogic S905 SoCs;
- "amcc,dwc-otg": The DWC2 USB controller instance in AMCC Canyonlands 460EX SoCs;
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index f9fe94535b46..c03d20140366 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -173,6 +173,7 @@ lego LEGO Systems A/S
lenovo Lenovo Group Ltd.
lg LG Corporation
licheepi Lichee Pi
+linaro Linaro Limited
linux Linux-specific binding
lltc Linear Technology Corporation
lsi LSI Corp. (LSI Logic)
@@ -196,6 +197,7 @@ minix MINIX Technology Ltd.
miramems MiraMEMS Sensing Technology Co., Ltd.
mitsubishi Mitsubishi Electric Corporation
mosaixtech Mosaix Technologies, Inc.
+motorola Motorola, Inc.
moxa Moxa
mpl MPL AG
mqmaker mqmaker Inc.
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
index f10dd590f69f..8444dc3d57e8 100644
--- a/Documentation/filesystems/autofs4.txt
+++ b/Documentation/filesystems/autofs4.txt
@@ -316,7 +316,7 @@ For version 5, the format of the message is:
struct autofs_v5_packet {
int proto_version; /* Protocol version */
int type; /* Type of packet */
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
__u32 dev;
__u64 ino;
__u32 uid;
@@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing
`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
most one packet, and any unread portion of a packet will be discarded.
-The `wait_queue_token` is a unique number which can identify a
+The `wait_queue_entry_token` is a unique number which can identify a
particular request to be acknowledged. When a message is sent over
the pipe the affected dentry is marked as either "active" or
"expiring" and other accesses to it block until the message is
acknowledged using one of the ioctls below and the relevant
-`wait_queue_token`.
+`wait_queue_entry_token`.
Communicating with autofs: root directory ioctls
------------------------------------------------
@@ -358,7 +358,7 @@ capability, or must be the automount daemon.
The available ioctl commands are:
- **AUTOFS_IOC_READY**: a notification has been handled. The argument
- to the ioctl command is the "wait_queue_token" number
+ to the ioctl command is the "wait_queue_entry_token" number
corresponding to the notification being acknowledged.
- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
the error code `ENOENT`.
@@ -382,14 +382,14 @@ The available ioctl commands are:
struct autofs_packet_expire_multi {
int proto_version; /* Protocol version */
int type; /* Type of packet */
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
int len;
char name[NAME_MAX+1];
};
is required. This is filled in with the name of something
that can be unmounted or removed. If nothing can be expired,
- `errno` is set to `EAGAIN`. Even though a `wait_queue_token`
+ `errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token`
is present in the structure, no "wait queue" is established
and no acknowledgment is needed.
- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
index 78043d5a8fc3..843ce91a2e40 100644
--- a/Documentation/filesystems/bfs.txt
+++ b/Documentation/filesystems/bfs.txt
@@ -54,4 +54,4 @@ The first 4 bytes should be 0x1badface.
If you have any patches, questions or suggestions regarding this BFS
implementation please contact the author:
-Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+Tigran Aivazian <aivazian.tigran@gmail.com>
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index 8de578a98222..80dc0bdc302a 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -64,46 +64,9 @@ table which are called by the nfs-client pnfs-core to implement the
different layout types.
Files-layout-driver code is in: fs/nfs/filelayout/.. directory
-Objects-layout-driver code is in: fs/nfs/objlayout/.. directory
Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory
Flexfiles-layout-driver code is in: fs/nfs/flexfilelayout/.. directory
-objects-layout setup
---------------------
-
-As part of the full STD implementation the objlayoutdriver.ko needs, at times,
-to automatically login to yet undiscovered iscsi/osd devices. For this the
-driver makes up-calles to a user-mode script called *osd_login*
-
-The path_name of the script to use is by default:
- /sbin/osd_login.
-This name can be overridden by the Kernel module parameter:
- objlayoutdriver.osd_login_prog
-
-If Kernel does not find the osd_login_prog path it will zero it out
-and will not attempt farther logins. An admin can then write new value
-to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it.
-
-The /sbin/osd_login is part of the nfs-utils package, and should usually
-be installed on distributions that support this Kernel version.
-
-The API to the login script is as follows:
- Usage: $0 -u <URI> -o <OSDNAME> -s <SYSTEMID>
- Options:
- -u target uri e.g. iscsi://<ip>:<port>
- (always exists)
- (More protocols can be defined in the future.
- The client does not interpret this string it is
- passed unchanged as received from the Server)
- -o osdname of the requested target OSD
- (Might be empty)
- (A string which denotes the OSD name, there is a
- limit of 64 chars on this string)
- -s systemid of the requested target OSD
- (Might be empty)
- (This string, if not empty is always an hex
- representation of the 20 bytes osd_system_id)
-
blocks-layout setup
-------------------
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index 634d03e20c2d..c9e884b52698 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -21,12 +21,19 @@ from accessing the corresponding object from the original filesystem.
This is most obvious from the 'st_dev' field returned by stat(2).
While directories will report an st_dev from the overlay-filesystem,
-all non-directory objects will report an st_dev from the lower or
+non-directory objects may report an st_dev from the lower filesystem or
upper filesystem that is providing the object. Similarly st_ino will
only be unique when combined with st_dev, and both of these can change
over the lifetime of a non-directory object. Many applications and
tools ignore these values and will not be affected.
+In the special case of all overlay layers on the same underlying
+filesystem, all objects will report an st_dev from the overlay
+filesystem and st_ino from the underlying filesystem. This will
+make the overlay mount more compliant with filesystem scanners and
+overlay objects will be distinguishable from the corresponding
+objects in the original filesystem.
+
Upper and Lower
---------------
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 61306a22888d..bc67dbf76eb0 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -67,6 +67,7 @@ needed).
driver-api/index
core-api/index
media/index
+ input/index
gpu/index
security/index
sound/index
diff --git a/Documentation/input/devices/edt-ft5x06.rst b/Documentation/input/devices/edt-ft5x06.rst
index 2032f0b7a8fa..1ccc94b192b7 100644
--- a/Documentation/input/devices/edt-ft5x06.rst
+++ b/Documentation/input/devices/edt-ft5x06.rst
@@ -15,7 +15,7 @@ It has been tested with the following devices:
The driver allows configuration of the touch screen via a set of sysfs files:
/sys/class/input/eventX/device/device/threshold:
- allows setting the "click"-threshold in the range from 20 to 80.
+ allows setting the "click"-threshold in the range from 0 to 80.
/sys/class/input/eventX/device/device/gain:
allows setting the sensitivity in the range from 0 to 31. Note that
diff --git a/Documentation/input/ff.rst b/Documentation/input/ff.rst
index 6a265a6934e6..26d461998e08 100644
--- a/Documentation/input/ff.rst
+++ b/Documentation/input/ff.rst
@@ -130,11 +130,11 @@ See <uapi/linux/input.h> for a description of the ff_effect struct. You
should also find help in a few sketches, contained in files shape.svg
and interactive.svg:
-.. figure:: shape.svg
+.. kernel-figure:: shape.svg
Shape
-.. figure:: interactive.svg
+.. kernel-figure:: interactive.svg
Interactive
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index eccb675a2852..1e9fcb4d0ec8 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -309,6 +309,7 @@ Code Seq#(hex) Include File Comments
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
+0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem
0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 9b9c4797fc55..e18daca65ccd 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -44,11 +44,11 @@ This document describes the Linux kernel Makefiles.
--- 6.11 Post-link pass
=== 7 Kbuild syntax for exported headers
- --- 7.1 header-y
+ --- 7.1 no-export-headers
--- 7.2 genhdr-y
- --- 7.3 destination-y
- --- 7.4 generic-y
- --- 7.5 generated-y
+ --- 7.3 generic-y
+ --- 7.4 generated-y
+ --- 7.5 mandatory-y
=== 8 Kbuild Variables
=== 9 Makefile language
@@ -1236,7 +1236,7 @@ When kbuild executes, the following steps are followed (roughly):
that may be shared between individual architectures.
The recommended approach how to use a generic header file is
to list the file in the Kbuild file.
- See "7.4 generic-y" for further info on syntax etc.
+ See "7.3 generic-y" for further info on syntax etc.
--- 6.11 Post-link pass
@@ -1263,53 +1263,32 @@ The pre-processing does:
- drop include of compiler.h
- drop all sections that are kernel internal (guarded by ifdef __KERNEL__)
-Each relevant directory contains a file name "Kbuild" which specifies the
-headers to be exported.
-See subsequent chapter for the syntax of the Kbuild file.
-
- --- 7.1 header-y
-
- header-y specifies header files to be exported.
-
- Example:
- #include/linux/Kbuild
- header-y += usb/
- header-y += aio_abi.h
+All headers under include/uapi/, include/generated/uapi/,
+arch/<arch>/include/uapi/ and arch/<arch>/include/generated/uapi/
+are exported.
- The convention is to list one file per line and
- preferably in alphabetic order.
+A Kbuild file may be defined under arch/<arch>/include/uapi/asm/ and
+arch/<arch>/include/asm/ to list asm files coming from asm-generic.
+See subsequent chapter for the syntax of the Kbuild file.
- header-y also specifies which subdirectories to visit.
- A subdirectory is identified by a trailing '/' which
- can be seen in the example above for the usb subdirectory.
+ --- 7.1 no-export-headers
- Subdirectories are visited before their parent directories.
+ no-export-headers is essentially used by include/uapi/linux/Kbuild to
+ avoid exporting specific headers (e.g. kvm.h) on architectures that do
+ not support it. It should be avoided as much as possible.
--- 7.2 genhdr-y
- genhdr-y specifies generated files to be exported.
- Generated files are special as they need to be looked
- up in another directory when doing 'make O=...' builds.
+ genhdr-y specifies asm files to be generated.
Example:
- #include/linux/Kbuild
- genhdr-y += version.h
+ #arch/x86/include/uapi/asm/Kbuild
+ genhdr-y += unistd_32.h
+ genhdr-y += unistd_64.h
+ genhdr-y += unistd_x32.h
- --- 7.3 destination-y
- When an architecture has a set of exported headers that needs to be
- exported to a different directory destination-y is used.
- destination-y specifies the destination directory for all exported
- headers in the file where it is present.
-
- Example:
- #arch/xtensa/platforms/s6105/include/platform/Kbuild
- destination-y := include/linux
-
- In the example above all exported headers in the Kbuild file
- will be located in the directory "include/linux" when exported.
-
- --- 7.4 generic-y
+ --- 7.3 generic-y
If an architecture uses a verbatim copy of a header from
include/asm-generic then this is listed in the file
@@ -1336,7 +1315,7 @@ See subsequent chapter for the syntax of the Kbuild file.
Example: termios.h
#include <asm-generic/termios.h>
- --- 7.5 generated-y
+ --- 7.4 generated-y
If an architecture generates other header files alongside generic-y
wrappers, and not included in genhdr-y, then generated-y specifies
@@ -1349,6 +1328,15 @@ See subsequent chapter for the syntax of the Kbuild file.
#arch/x86/include/asm/Kbuild
generated-y += syscalls_32.h
+ --- 7.5 mandatory-y
+
+ mandatory-y is essentially used by include/uapi/asm-generic/Kbuild.asm
+ to define the minimun set of headers that must be exported in
+ include/asm.
+
+ The convention is to list one subdir per line and
+ preferably in alphabetic order.
+
=== 8 Kbuild Variables
The top Makefile exports the following variables:
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index df31e30b6a02..2cb7dc5c0e0d 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -109,13 +109,12 @@ SCHED_SOFTIRQ: Do all of the following:
on that CPU. If a thread that expects to run on the de-jittered
CPU awakens, the scheduler will send an IPI that can result in
a subsequent SCHED_SOFTIRQ.
-2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
- CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
- to be de-jittered is marked as an adaptive-ticks CPU using the
- "nohz_full=" boot parameter. This reduces the number of
- scheduler-clock interrupts that the de-jittered CPU receives,
- minimizing its chances of being selected to do the load balancing
- work that runs in SCHED_SOFTIRQ context.
+2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+ is marked as an adaptive-ticks CPU using the "nohz_full="
+ boot parameter. This reduces the number of scheduler-clock
+ interrupts that the de-jittered CPU receives, minimizing its
+ chances of being selected to do the load balancing work that
+ runs in SCHED_SOFTIRQ context.
3. To the extent possible, keep the CPU out of the kernel when it
is non-idle, for example, by avoiding system calls and by
forcing both kernel threads and interrupts to execute elsewhere.
@@ -135,11 +134,10 @@ HRTIMER_SOFTIRQ: Do all of the following:
RCU_SOFTIRQ: Do at least one of the following:
1. Offload callbacks and keep the CPU in either dyntick-idle or
adaptive-ticks state by doing all of the following:
- a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
- CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU
- to be de-jittered is marked as an adaptive-ticks CPU using
- the "nohz_full=" boot parameter. Bind the rcuo kthreads
- to housekeeping CPUs, which can tolerate OS jitter.
+ a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+ de-jittered is marked as an adaptive-ticks CPU using the
+ "nohz_full=" boot parameter. Bind the rcuo kthreads to
+ housekeeping CPUs, which can tolerate OS jitter.
b. To the extent possible, keep the CPU out of the kernel
when it is non-idle, for example, by avoiding system
calls and by forcing both kernel threads and interrupts
@@ -236,11 +234,10 @@ To reduce its OS jitter, do at least one of the following:
is feasible only if your workload never requires RCU priority
boosting, for example, if you ensure frequent idle time on all
CPUs that might execute within the kernel.
-3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
- which offloads all RCU callbacks to kthreads that can be moved
- off of CPUs susceptible to OS jitter. This approach prevents the
- rcuc/%u kthreads from having any work to do, so that they are
- never awakened.
+3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+ boot parameter offloading RCU callbacks from all CPUs susceptible
+ to OS jitter. This approach prevents the rcuc/%u kthreads from
+ having any work to do, so that they are never awakened.
4. Ensure that the CPU never enters the kernel, and, in particular,
avoid initiating any CPU hotplug operations on this CPU. This is
another way of preventing any callbacks from being queued on the
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index d2b0a8d81258..9d5e0f853f08 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -27,7 +27,7 @@ The purpose of this document is twofold:
(2) to provide a guide as to how to use the barriers that are available.
Note that an architecture can provide more than the minimum requirement
-for any particular barrier, but if the architecure provides less than
+for any particular barrier, but if the architecture provides less than
that, that architecture is incorrect.
Note also that it is possible that a barrier may be a no-op for an
@@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to
transform the above code into the following:
q = READ_ONCE(a);
- WRITE_ONCE(b, 1);
+ WRITE_ONCE(b, 2);
do_something_else();
Given this transformation, the CPU is not required to respect the ordering
@@ -2373,7 +2373,7 @@ is performed:
spin_unlock(Q);
-See Documentation/DocBook/deviceiobook.tmpl for more information.
+See Documentation/driver-api/device-io.rst for more information.
=================================
@@ -2614,7 +2614,7 @@ might be needed:
relaxed memory access properties, then _mandatory_ memory barriers are
required to enforce ordering.
-See Documentation/DocBook/deviceiobook.tmpl for more information.
+See Documentation/driver-api/device-io.rst for more information.
INTERRUPTS
diff --git a/Documentation/networking/dpaa.txt b/Documentation/networking/dpaa.txt
new file mode 100644
index 000000000000..76e016d4d344
--- /dev/null
+++ b/Documentation/networking/dpaa.txt
@@ -0,0 +1,194 @@
+The QorIQ DPAA Ethernet Driver
+==============================
+
+Authors:
+Madalin Bucur <madalin.bucur@nxp.com>
+Camelia Groza <camelia.groza@nxp.com>
+
+Contents
+========
+
+ - DPAA Ethernet Overview
+ - DPAA Ethernet Supported SoCs
+ - Configuring DPAA Ethernet in your kernel
+ - DPAA Ethernet Frame Processing
+ - DPAA Ethernet Features
+ - Debugging
+
+DPAA Ethernet Overview
+======================
+
+DPAA stands for Data Path Acceleration Architecture and it is a
+set of networking acceleration IPs that are available on several
+generations of SoCs, both on PowerPC and ARM64.
+
+The Freescale DPAA architecture consists of a series of hardware blocks
+that support Ethernet connectivity. The Ethernet driver depends upon the
+following drivers in the Linux kernel:
+
+ - Peripheral Access Memory Unit (PAMU) (* needed only for PPC platforms)
+ drivers/iommu/fsl_*
+ - Frame Manager (FMan)
+ drivers/net/ethernet/freescale/fman
+ - Queue Manager (QMan), Buffer Manager (BMan)
+ drivers/soc/fsl/qbman
+
+A simplified view of the dpaa_eth interfaces mapped to FMan MACs:
+
+ dpaa_eth /eth0\ ... /ethN\
+ driver | | | |
+ ------------- ---- ----------- ---- -------------
+ -Ports / Tx Rx \ ... / Tx Rx \
+ FMan | | | |
+ -MACs | MAC0 | | MACN |
+ / dtsec0 \ ... / dtsecN \ (or tgec)
+ / \ / \(or memac)
+ --------- -------------- --- -------------- ---------
+ FMan, FMan Port, FMan SP, FMan MURAM drivers
+ ---------------------------------------------------------
+ FMan HW blocks: MURAM, MACs, Ports, SP
+ ---------------------------------------------------------
+
+The dpaa_eth relation to the QMan, BMan and FMan:
+ ________________________________
+ dpaa_eth / eth0 \
+ driver / \
+ --------- -^- -^- -^- --- ---------
+ QMan driver / \ / \ / \ \ / | BMan |
+ |Rx | |Rx | |Tx | |Tx | | driver |
+ --------- |Dfl| |Err| |Cnf| |FQs| | |
+ QMan HW |FQ | |FQ | |FQs| | | | |
+ / \ / \ / \ \ / | |
+ --------- --- --- --- -v- ---------
+ | FMan QMI | |
+ | FMan HW FMan BMI | BMan HW |
+ ----------------------- --------
+
+where the acronyms used above (and in the code) are:
+DPAA = Data Path Acceleration Architecture
+FMan = DPAA Frame Manager
+QMan = DPAA Queue Manager
+BMan = DPAA Buffers Manager
+QMI = QMan interface in FMan
+BMI = BMan interface in FMan
+FMan SP = FMan Storage Profiles
+MURAM = Multi-user RAM in FMan
+FQ = QMan Frame Queue
+Rx Dfl FQ = default reception FQ
+Rx Err FQ = Rx error frames FQ
+Tx Cnf FQ = Tx confirmation FQs
+Tx FQs = transmission frame queues
+dtsec = datapath three speed Ethernet controller (10/100/1000 Mbps)
+tgec = ten gigabit Ethernet controller (10 Gbps)
+memac = multirate Ethernet MAC (10/100/1000/10000)
+
+DPAA Ethernet Supported SoCs
+============================
+
+The DPAA drivers enable the Ethernet controllers present on the following SoCs:
+
+# PPC
+P1023
+P2041
+P3041
+P4080
+P5020
+P5040
+T1023
+T1024
+T1040
+T1042
+T2080
+T4240
+B4860
+
+# ARM
+LS1043A
+LS1046A
+
+Configuring DPAA Ethernet in your kernel
+========================================
+
+To enable the DPAA Ethernet driver, the following Kconfig options are required:
+
+# common for arch/arm64 and arch/powerpc platforms
+CONFIG_FSL_DPAA=y
+CONFIG_FSL_FMAN=y
+CONFIG_FSL_DPAA_ETH=y
+CONFIG_FSL_XGMAC_MDIO=y
+
+# for arch/powerpc only
+CONFIG_FSL_PAMU=y
+
+# common options needed for the PHYs used on the RDBs
+CONFIG_VITESSE_PHY=y
+CONFIG_REALTEK_PHY=y
+CONFIG_AQUANTIA_PHY=y
+
+DPAA Ethernet Frame Processing
+==============================
+
+On Rx, buffers for the incoming frames are retrieved from one of the three
+existing buffers pools. The driver initializes and seeds these, each with
+buffers of different sizes: 1KB, 2KB and 4KB.
+
+On Tx, all transmitted frames are returned to the driver through Tx
+confirmation frame queues. The driver is then responsible for freeing the
+buffers. In order to do this properly, a backpointer is added to the buffer
+before transmission that points to the skb. When the buffer returns to the
+driver on a confirmation FQ, the skb can be correctly consumed.
+
+DPAA Ethernet Features
+======================
+
+Currently the DPAA Ethernet driver enables the basic features required for
+a Linux Ethernet driver. The support for advanced features will be added
+gradually.
+
+The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx
+checksum offload feature is enabled by default and cannot be controlled through
+ethtool.
+
+The driver has support for multiple prioritized Tx traffic classes. Priorities
+range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with
+strict priority levels. Each traffic class contains NR_CPU TX queues. By
+default, only one traffic class is enabled and the lowest priority Tx queues
+are used. Higher priority traffic classes can be enabled with the mqprio
+qdisc. For example, all four traffic classes are enabled on an interface with
+the following command. Furthermore, skb priority levels are mapped to traffic
+classes as follows:
+
+ * priorities 0 to 3 - traffic class 0 (low priority)
+ * priorities 4 to 7 - traffic class 1 (medium-low priority)
+ * priorities 8 to 11 - traffic class 2 (medium-high priority)
+ * priorities 12 to 15 - traffic class 3 (high priority)
+
+tc qdisc add dev <int> root handle 1: \
+ mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1
+
+Debugging
+=========
+
+The following statistics are exported for each interface through ethtool:
+
+ - interrupt count per CPU
+ - Rx packets count per CPU
+ - Tx packets count per CPU
+ - Tx confirmed packets count per CPU
+ - Tx S/G frames count per CPU
+ - Tx error count per CPU
+ - Rx error count per CPU
+ - Rx error count per type
+ - congestion related statistics:
+ - congestion status
+ - time spent in congestion
+ - number of time the device entered congestion
+ - dropped packets count per cause
+
+The driver also exports the following information in sysfs:
+
+ - the FQ IDs for each FQ type
+ /sys/devices/platform/dpaa-ethernet.0/net/<int>/fqids
+
+ - the IDs of the buffer pools in use
+ /sys/devices/platform/dpaa-ethernet.0/net/<int>/bpids
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
index 59f4db2a0c85..f55639d71d35 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -122,7 +122,7 @@ associated flow of the packet. The hash is either provided by hardware
or will be computed in the stack. Capable hardware can pass the hash in
the receive descriptor for the packet; this would usually be the same
hash used for RSS (e.g. computed Toeplitz hash). The hash is saved in
-skb->rx_hash and can be used elsewhere in the stack as a hash of the
+skb->hash and can be used elsewhere in the stack as a hash of the
packet’s flow.
Each receive hardware queue has an associated list of CPUs to which
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
index bdc4c0db51e1..9c7139d57e57 100644
--- a/Documentation/networking/tcp.txt
+++ b/Documentation/networking/tcp.txt
@@ -1,7 +1,7 @@
TCP protocol
============
-Last updated: 9 February 2008
+Last updated: 3 June 2017
Contents
========
@@ -29,18 +29,19 @@ As of 2.6.13, Linux supports pluggable congestion control algorithms.
A congestion control mechanism can be registered through functions in
tcp_cong.c. The functions used by the congestion control mechanism are
registered via passing a tcp_congestion_ops struct to
-tcp_register_congestion_control. As a minimum name, ssthresh,
-cong_avoid must be valid.
+tcp_register_congestion_control. As a minimum, the congestion control
+mechanism must provide a valid name and must implement either ssthresh,
+cong_avoid and undo_cwnd hooks or the "omnipotent" cong_control hook.
Private data for a congestion control mechanism is stored in tp->ca_priv.
tcp_ca(tp) returns a pointer to this space. This is preallocated space - it
is important to check the size of your private data will fit this space, or
-alternatively space could be allocated elsewhere and a pointer to it could
+alternatively, space could be allocated elsewhere and a pointer to it could
be stored here.
There are three kinds of congestion control algorithms currently: The
simplest ones are derived from TCP reno (highspeed, scalable) and just
-provide an alternative the congestion window calculation. More complex
+provide an alternative congestion window calculation. More complex
ones like BIC try to look at other events to provide better
heuristics. There are also round trip time based algorithms like
Vegas and Westwood+.
@@ -49,21 +50,15 @@ Good TCP congestion control is a complex problem because the algorithm
needs to maintain fairness and performance. Please review current
research and RFC's before developing new modules.
-The method that is used to determine which congestion control mechanism is
-determined by the setting of the sysctl net.ipv4.tcp_congestion_control.
-The default congestion control will be the last one registered (LIFO);
-so if you built everything as modules, the default will be reno. If you
-build with the defaults from Kconfig, then CUBIC will be builtin (not a
-module) and it will end up the default.
+The default congestion control mechanism is chosen based on the
+DEFAULT_TCP_CONG Kconfig parameter. If you really want a particular default
+value then you can set it using sysctl net.ipv4.tcp_congestion_control. The
+module will be autoloaded if needed and you will get the expected protocol. If
+you ask for an unknown congestion method, then the sysctl attempt will fail.
-If you really want a particular default value then you will need
-to set it with the sysctl. If you use a sysctl, the module will be autoloaded
-if needed and you will get the expected protocol. If you ask for an
-unknown congestion method, then the sysctl attempt will fail.
-
-If you remove a tcp congestion control module, then you will get the next
+If you remove a TCP congestion control module, then you will get the next
available one. Since reno cannot be built as a module, and cannot be
-deleted, it will always be available.
+removed, it will always be available.
How the new TCP output machine [nyi] works.
===========================================
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index cbc1b46cbf70..e89e36ec15a5 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -7,6 +7,8 @@ CONTENTS
0. WARNING
1. Overview
2. Scheduling algorithm
+ 2.1 Main algorithm
+ 2.2 Bandwidth reclaiming
3. Scheduling Real-Time Tasks
3.1 Definitions
3.2 Schedulability Analysis for Uniprocessor Systems
@@ -44,6 +46,9 @@ CONTENTS
2. Scheduling algorithm
==================
+2.1 Main algorithm
+------------------
+
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
@@ -113,6 +118,160 @@ CONTENTS
remaining runtime = remaining runtime + runtime
+2.2 Bandwidth reclaiming
+------------------------
+
+ Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
+ Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
+ when flag SCHED_FLAG_RECLAIM is set.
+
+ The following diagram illustrates the state names for tasks handled by GRUB:
+
+ ------------
+ (d) | Active |
+ ------------->| |
+ | | Contending |
+ | ------------
+ | A |
+ ---------- | |
+ | | | |
+ | Inactive | |(b) | (a)
+ | | | |
+ ---------- | |
+ A | V
+ | ------------
+ | | Active |
+ --------------| Non |
+ (c) | Contending |
+ ------------
+
+ A task can be in one of the following states:
+
+ - ActiveContending: if it is ready for execution (or executing);
+
+ - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
+ time;
+
+ - Inactive: if it is blocked and has surpassed the 0-lag time.
+
+ State transitions:
+
+ (a) When a task blocks, it does not become immediately inactive since its
+ bandwidth cannot be immediately reclaimed without breaking the
+ real-time guarantees. It therefore enters a transitional state called
+ ActiveNonContending. The scheduler arms the "inactive timer" to fire at
+ the 0-lag time, when the task's bandwidth can be reclaimed without
+ breaking the real-time guarantees.
+
+ The 0-lag time for a task entering the ActiveNonContending state is
+ computed as
+
+ (runtime * dl_period)
+ deadline - ---------------------
+ dl_runtime
+
+ where runtime is the remaining runtime, while dl_runtime and dl_period
+ are the reservation parameters.
+
+ (b) If the task wakes up before the inactive timer fires, the task re-enters
+ the ActiveContending state and the "inactive timer" is canceled.
+ In addition, if the task wakes up on a different runqueue, then
+ the task's utilization must be removed from the previous runqueue's active
+ utilization and must be added to the new runqueue's active utilization.
+ In order to avoid races between a task waking up on a runqueue while the
+ "inactive timer" is running on a different CPU, the "dl_non_contending"
+ flag is used to indicate that a task is not on a runqueue but is active
+ (so, the flag is set when the task blocks and is cleared when the
+ "inactive timer" fires or when the task wakes up).
+
+ (c) When the "inactive timer" fires, the task enters the Inactive state and
+ its utilization is removed from the runqueue's active utilization.
+
+ (d) When an inactive task wakes up, it enters the ActiveContending state and
+ its utilization is added to the active utilization of the runqueue where
+ it has been enqueued.
+
+ For each runqueue, the algorithm GRUB keeps track of two different bandwidths:
+
+ - Active bandwidth (running_bw): this is the sum of the bandwidths of all
+ tasks in active state (i.e., ActiveContending or ActiveNonContending);
+
+ - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
+ runqueue, including the tasks in Inactive state.
+
+
+ The algorithm reclaims the bandwidth of the tasks in Inactive state.
+ It does so by decrementing the runtime of the executing task Ti at a pace equal
+ to
+
+ dq = -max{ Ui, (1 - Uinact) } dt
+
+ where Uinact is the inactive utilization, computed as (this_bq - running_bw),
+ and Ui is the bandwidth of task Ti.
+
+
+ Let's now see a trivial example of two deadline tasks with runtime equal
+ to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):
+
+ A Task T1
+ |
+ | |
+ | |
+ |-------- |----
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A Task T2
+ |
+ | |
+ | |
+ | ------------------------|
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A running_bw
+ |
+ 1 ----------------- ------
+ | | |
+ 0.5- -----------------
+ | |
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ - Time t = 0:
+
+ Both tasks are ready for execution and therefore in ActiveContending state.
+ Suppose Task T1 is the first task to start execution.
+ Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
+
+ - Time t = 2:
+
+ Suppose that task T1 blocks
+ Task T1 therefore enters the ActiveNonContending state. Since its remaining
+ runtime is equal to 2, its 0-lag time is equal to t = 4.
+ Task T2 start execution, with runtime still decreased as dq = -1 dt since
+ there are no inactive tasks.
+
+ - Time t = 4:
+
+ This is the 0-lag time for Task T1. Since it didn't woken up in the
+ meantime, it enters the Inactive state. Its bandwidth is removed from
+ running_bw.
+ Task T2 continues its execution. However, its runtime is now decreased as
+ dq = - 0.5 dt because Uinact = 0.5.
+ Task T2 therefore reclaims the bandwidth unused by Task T1.
+
+ - Time t = 8:
+
+ Task T1 wakes up. It enters the ActiveContending state again, and the
+ running_bw is incremented.
+
+
3. Scheduling Real-Time Tasks
=============================
@@ -330,6 +489,15 @@ CONTENTS
14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
Global EDF. Proceedings of the 22nd Euromicro Conference on
Real-Time Systems, 2010.
+ 15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
+ constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
+ Systems, 2000.
+ 16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
+ SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
+ Dusseldorf, Germany, 2014.
+ 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
+ or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
+ Computing, 2016.
4. Bandwidth management
diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst
index 5338673c88d9..773d2bfacc6c 100644
--- a/Documentation/sound/hd-audio/models.rst
+++ b/Documentation/sound/hd-audio/models.rst
@@ -16,6 +16,8 @@ ALC880
6-jack in back, 2-jack in front
6stack-digout
6-jack with a SPDIF out
+6stack-automute
+ 6-jack with headphone jack detection
ALC260
======
@@ -62,6 +64,8 @@ lenovo-dock
Enables docking station I/O for some Lenovos
hp-gpio-led
GPIO LED support on HP laptops
+hp-dock-gpio-mic1-led
+ HP dock with mic LED support
dell-headset-multi
Headset jack, which can also be used as mic-in
dell-headset-dock
@@ -72,6 +76,12 @@ alc283-sense-combo
Combo jack sensing on ALC283
tpt440-dock
Pin configs for Lenovo Thinkpad Dock support
+tpt440
+ Lenovo Thinkpad T440s setup
+tpt460
+ Lenovo Thinkpad T460/560 setup
+dual-codecs
+ Lenovo laptops with dual codecs
ALC66x/67x/892
==============
@@ -97,6 +107,8 @@ inv-dmic
Inverted internal mic workaround
dell-headset-multi
Headset jack, which can also be used as mic-in
+dual-codecs
+ Lenovo laptops with dual codecs
ALC680
======
@@ -114,6 +126,8 @@ inv-dmic
Inverted internal mic workaround
no-primary-hp
VAIO Z/VGC-LN51JGB workaround (for fixed speaker DAC)
+dual-codecs
+ ALC1220 dual codecs for Gaming mobos
ALC861/660
==========
@@ -206,65 +220,47 @@ auto
Conexant 5045
=============
-laptop-hpsense
- Laptop with HP sense (old model laptop)
-laptop-micsense
- Laptop with Mic sense (old model fujitsu)
-laptop-hpmicsense
- Laptop with HP and Mic senses
-benq
- Benq R55E
-laptop-hp530
- HP 530 laptop
-test
- for testing/debugging purpose, almost all controls can be
- adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y
+cap-mix-amp
+ Fix max input level on mixer widget
+toshiba-p105
+ Toshiba P105 quirk
+hp-530
+ HP 530 quirk
Conexant 5047
=============
-laptop
- Basic Laptop config
-laptop-hp
- Laptop config for some HP models (subdevice 30A5)
-laptop-eapd
- Laptop config with EAPD support
-test
- for testing/debugging purpose, almost all controls can be
- adjusted. Appearing only when compiled with $CONFIG_SND_DEBUG=y
+cap-mix-amp
+ Fix max input level on mixer widget
Conexant 5051
=============
-laptop
- Basic Laptop config (default)
-hp
- HP Spartan laptop
-hp-dv6736
- HP dv6736
-hp-f700
- HP Compaq Presario F700
-ideapad
- Lenovo IdeaPad laptop
-toshiba
- Toshiba Satellite M300
+lenovo-x200
+ Lenovo X200 quirk
Conexant 5066
=============
-laptop
- Basic Laptop config (default)
-hp-laptop
- HP laptops, e g G60
-asus
- Asus K52JU, Lenovo G560
-dell-laptop
- Dell laptops
-dell-vostro
- Dell Vostro
-olpc-xo-1_5
- OLPC XO 1.5
-ideapad
- Lenovo IdeaPad U150
+stereo-dmic
+ Workaround for inverted stereo digital mic
+gpio1
+ Enable GPIO1 pin
+headphone-mic-pin
+ Enable headphone mic NID 0x18 without detection
+tp410
+ Thinkpad T400 & co quirks
thinkpad
- Lenovo Thinkpad
+ Thinkpad mute/mic LED quirk
+lemote-a1004
+ Lemote A1004 quirk
+lemote-a1205
+ Lemote A1205 quirk
+olpc-xo
+ OLPC XO quirk
+mute-led-eapd
+ Mute LED control via EAPD
+hp-dock
+ HP dock support
+mute-led-gpio
+ Mute LED control via GPIO
STAC9200
========
@@ -444,6 +440,8 @@ dell-eq
Dell desktops/laptops
alienware
Alienware M17x
+asus-mobo
+ Pin configs for ASUS mobo with 5.1/SPDIF out
auto
BIOS setup (default)
@@ -477,6 +475,8 @@ hp-envy-ts-bass
Pin fixup for HP Envy TS bass speaker (NID 0x10)
hp-bnb13-eq
Hardware equalizer setup for HP laptops
+hp-envy-ts-bass
+ HP Envy TS bass support
auto
BIOS setup (default)
@@ -496,10 +496,22 @@ auto
Cirrus Logic CS4206/4207
========================
+mbp53
+ MacBook Pro 5,3
mbp55
MacBook Pro 5,5
imac27
IMac 27 Inch
+imac27_122
+ iMac 12,2
+apple
+ Generic Apple quirk
+mbp101
+ MacBookPro 10,1
+mbp81
+ MacBookPro 8,1
+mba42
+ MacBookAir 4,2
auto
BIOS setup (default)
@@ -509,6 +521,10 @@ mba6
MacBook Air 6,1 and 6,2
gpio0
Enable GPIO 0 amp
+mbp11
+ MacBookPro 11,2
+macmini
+ MacMini 7,1
auto
BIOS setup (default)
diff --git a/Documentation/target/target-export-device b/Documentation/target/target-export-device
new file mode 100755
index 000000000000..b803f4f886b5
--- /dev/null
+++ b/Documentation/target/target-export-device
@@ -0,0 +1,80 @@
+#!/bin/sh
+#
+# This script illustrates the sequence of operations in configfs to
+# create a very simple LIO iSCSI target with a file or block device
+# backstore.
+#
+# (C) Copyright 2014 Christophe Vu-Brugier <cvubrugier@fastmail.fm>
+#
+
+print_usage() {
+ cat <<EOF
+Usage: $(basename $0) [-p PORTAL] DEVICE|FILE
+Export a block device or a file as an iSCSI target with a single LUN
+EOF
+}
+
+die() {
+ echo $1
+ exit 1
+}
+
+while getopts "hp:" arg; do
+ case $arg in
+ h) print_usage; exit 0;;
+ p) PORTAL=${OPTARG};;
+ esac
+done
+shift $(($OPTIND - 1))
+
+DEVICE=$1
+[ -n "$DEVICE" ] || die "Missing device or file argument"
+[ -b $DEVICE -o -f $DEVICE ] || die "Invalid device or file: ${DEVICE}"
+IQN="iqn.2003-01.org.linux-iscsi.$(hostname):$(basename $DEVICE)"
+[ -n "$PORTAL" ] || PORTAL="0.0.0.0:3260"
+
+CONFIGFS=/sys/kernel/config
+CORE_DIR=$CONFIGFS/target/core
+ISCSI_DIR=$CONFIGFS/target/iscsi
+
+# Load the target modules and mount the config file system
+lsmod | grep -q configfs || modprobe configfs
+lsmod | grep -q target_core_mod || modprobe target_core_mod
+mount | grep -q ^configfs || mount -t configfs none $CONFIGFS
+mkdir -p $ISCSI_DIR
+
+# Create a backstore
+if [ -b $DEVICE ]; then
+ BACKSTORE_DIR=$CORE_DIR/iblock_0/data
+ mkdir -p $BACKSTORE_DIR
+ echo "udev_path=${DEVICE}" > $BACKSTORE_DIR/control
+else
+ BACKSTORE_DIR=$CORE_DIR/fileio_0/data
+ mkdir -p $BACKSTORE_DIR
+ DEVICE_SIZE=$(du -b $DEVICE | cut -f1)
+ echo "fd_dev_name=${DEVICE}" > $BACKSTORE_DIR/control
+ echo "fd_dev_size=${DEVICE_SIZE}" > $BACKSTORE_DIR/control
+ echo 1 > $BACKSTORE_DIR/attrib/emulate_write_cache
+fi
+echo 1 > $BACKSTORE_DIR/enable
+
+# Create an iSCSI target and a target portal group (TPG)
+mkdir $ISCSI_DIR/$IQN
+mkdir $ISCSI_DIR/$IQN/tpgt_1/
+
+# Create a LUN
+mkdir $ISCSI_DIR/$IQN/tpgt_1/lun/lun_0
+ln -s $BACKSTORE_DIR $ISCSI_DIR/$IQN/tpgt_1/lun/lun_0/data
+echo 1 > $ISCSI_DIR/$IQN/tpgt_1/enable
+
+# Create a network portal
+mkdir $ISCSI_DIR/$IQN/tpgt_1/np/$PORTAL
+
+# Disable authentication
+echo 0 > $ISCSI_DIR/$IQN/tpgt_1/attrib/authentication
+echo 1 > $ISCSI_DIR/$IQN/tpgt_1/attrib/generate_node_acls
+
+# Allow write access for non authenticated initiators
+echo 0 > $ISCSI_DIR/$IQN/tpgt_1/attrib/demo_mode_write_protect
+
+echo "Target ${IQN}, portal ${PORTAL} has been created"
diff --git a/Documentation/tee.txt b/Documentation/tee.txt
new file mode 100644
index 000000000000..718599357596
--- /dev/null
+++ b/Documentation/tee.txt
@@ -0,0 +1,118 @@
+TEE subsystem
+This document describes the TEE subsystem in Linux.
+
+A TEE (Trusted Execution Environment) is a trusted OS running in some
+secure environment, for example, TrustZone on ARM CPUs, or a separate
+secure co-processor etc. A TEE driver handles the details needed to
+communicate with the TEE.
+
+This subsystem deals with:
+
+- Registration of TEE drivers
+
+- Managing shared memory between Linux and the TEE
+
+- Providing a generic API to the TEE
+
+The TEE interface
+=================
+
+include/uapi/linux/tee.h defines the generic interface to a TEE.
+
+User space (the client) connects to the driver by opening /dev/tee[0-9]* or
+/dev/teepriv[0-9]*.
+
+- TEE_IOC_SHM_ALLOC allocates shared memory and returns a file descriptor
+ which user space can mmap. When user space doesn't need the file
+ descriptor any more, it should be closed. When shared memory isn't needed
+ any longer it should be unmapped with munmap() to allow the reuse of
+ memory.
+
+- TEE_IOC_VERSION lets user space know which TEE this driver handles and
+ the its capabilities.
+
+- TEE_IOC_OPEN_SESSION opens a new session to a Trusted Application.
+
+- TEE_IOC_INVOKE invokes a function in a Trusted Application.
+
+- TEE_IOC_CANCEL may cancel an ongoing TEE_IOC_OPEN_SESSION or TEE_IOC_INVOKE.
+
+- TEE_IOC_CLOSE_SESSION closes a session to a Trusted Application.
+
+There are two classes of clients, normal clients and supplicants. The latter is
+a helper process for the TEE to access resources in Linux, for example file
+system access. A normal client opens /dev/tee[0-9]* and a supplicant opens
+/dev/teepriv[0-9].
+
+Much of the communication between clients and the TEE is opaque to the
+driver. The main job for the driver is to receive requests from the
+clients, forward them to the TEE and send back the results. In the case of
+supplicants the communication goes in the other direction, the TEE sends
+requests to the supplicant which then sends back the result.
+
+OP-TEE driver
+=============
+
+The OP-TEE driver handles OP-TEE [1] based TEEs. Currently it is only the ARM
+TrustZone based OP-TEE solution that is supported.
+
+Lowest level of communication with OP-TEE builds on ARM SMC Calling
+Convention (SMCCC) [2], which is the foundation for OP-TEE's SMC interface
+[3] used internally by the driver. Stacked on top of that is OP-TEE Message
+Protocol [4].
+
+OP-TEE SMC interface provides the basic functions required by SMCCC and some
+additional functions specific for OP-TEE. The most interesting functions are:
+
+- OPTEE_SMC_FUNCID_CALLS_UID (part of SMCCC) returns the version information
+ which is then returned by TEE_IOC_VERSION
+
+- OPTEE_SMC_CALL_GET_OS_UUID returns the particular OP-TEE implementation, used
+ to tell, for instance, a TrustZone OP-TEE apart from an OP-TEE running on a
+ separate secure co-processor.
+
+- OPTEE_SMC_CALL_WITH_ARG drives the OP-TEE message protocol
+
+- OPTEE_SMC_GET_SHM_CONFIG lets the driver and OP-TEE agree on which memory
+ range to used for shared memory between Linux and OP-TEE.
+
+The GlobalPlatform TEE Client API [5] is implemented on top of the generic
+TEE API.
+
+Picture of the relationship between the different components in the
+OP-TEE architecture.
+
+ User space Kernel Secure world
+ ~~~~~~~~~~ ~~~~~~ ~~~~~~~~~~~~
+ +--------+ +-------------+
+ | Client | | Trusted |
+ +--------+ | Application |
+ /\ +-------------+
+ || +----------+ /\
+ || |tee- | ||
+ || |supplicant| \/
+ || +----------+ +-------------+
+ \/ /\ | TEE Internal|
+ +-------+ || | API |
+ + TEE | || +--------+--------+ +-------------+
+ | Client| || | TEE | OP-TEE | | OP-TEE |
+ | API | \/ | subsys | driver | | Trusted OS |
+ +-------+----------------+----+-------+----+-----------+-------------+
+ | Generic TEE API | | OP-TEE MSG |
+ | IOCTL (TEE_IOC_*) | | SMCCC (OPTEE_SMC_CALL_*) |
+ +-----------------------------+ +------------------------------+
+
+RPC (Remote Procedure Call) are requests from secure world to kernel driver
+or tee-supplicant. An RPC is identified by a special range of SMCCC return
+values from OPTEE_SMC_CALL_WITH_ARG. RPC messages which are intended for the
+kernel are handled by the kernel driver. Other RPC messages will be forwarded to
+tee-supplicant without further involvement of the driver, except switching
+shared memory buffer representation.
+
+References:
+[1] https://github.com/OP-TEE/optee_os
+[2] http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html
+[3] drivers/tee/optee/optee_smc.h
+[4] drivers/tee/optee/optee_msg.h
+[5] http://www.globalplatform.org/specificationsdevice.asp look for
+ "TEE Client API Specification v1.0" and click download.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index ef473dc7f55e..bb9a0a53e76b 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -582,3 +582,24 @@ platform data is provided, this uses the step_wise throttling policy.
This function serves as an arbitrator to set the state of a cooling
device. It sets the cooling device to the deepest cooling state if
possible.
+
+6. thermal_emergency_poweroff:
+
+On an event of critical trip temperature crossing. Thermal framework
+allows the system to shutdown gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of seconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If set to 0 emergency poweroff will not be supported. So a carefully
+profiled non-zero positive value is a must for emergerncy poweroff to be
+triggered.
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index 6eaf576294f3..2dcaf9adb7a7 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -194,32 +194,9 @@ that the RCU callbacks are processed in a timely fashion.
Another approach is to offload RCU callback processing to "rcuo" kthreads
using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to
-offload may be selected via several methods:
-
-1. One of three mutually exclusive Kconfig options specify a
- build-time default for the CPUs to offload:
-
- a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in
- no CPUs being offloaded.
-
- b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes
- CPU 0 to be offloaded.
-
- c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all
- CPUs to be offloaded. Note that the callbacks will be
- offloaded to "rcuo" kthreads, and that those kthreads
- will in fact run on some CPU. However, this approach
- gives fine-grained control on exactly which CPUs the
- callbacks run on, along with their scheduling priority
- (including the default of SCHED_OTHER), and it further
- allows this control to be varied dynamically at runtime.
-
-2. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated
- list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1,
- 3, 4, and 5. The specified CPUs will be offloaded in addition to
- any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or
- CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot
- parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y.
+offload may be selected using The "rcu_nocbs=" kernel boot parameter,
+which takes a comma-separated list of CPUs and CPU ranges, for example,
+"1,3-5" selects CPUs 1, 3, 4, and 5.
The offloaded CPUs will never queue RCU callbacks, and therefore RCU
never prevents offloaded CPUs from entering either dyntick-idle mode
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 94a987bd2bc5..fff8ff6d4893 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set.
<idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update
<idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz
<idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock
- <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit
+ <idle>-0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index ce0b48d69eaa..d05d4c54e8f7 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -2343,7 +2343,7 @@ ACQUIRE VS I/O 액세스
spin_unlock(Q);
-더 많은 정보를 위해선 Documenataion/DocBook/deviceiobook.tmpl 을 참고하세요.
+더 많은 정보를 위해선 Documentation/driver-api/device-io.rst 를 참고하세요.
=========================
@@ -2578,7 +2578,7 @@ CPU 에서는 사용되는 어토믹 인스트럭션 자체에 메모리 배리
(2) 만약 액세스 함수들이 완화된 메모리 액세스 속성을 갖는 I/O 메모리 윈도우를
사용한다면, 순서를 강제하기 위해선 _mandatory_ 메모리 배리어가 필요합니다.
-더 많은 정보를 위해선 Documentation/DocBook/deviceiobook.tmpl 을 참고하십시오.
+더 많은 정보를 위해선 Documentation/driver-api/device-io.rst 를 참고하십시오.
인터럽트
diff --git a/Documentation/usb/typec.rst b/Documentation/usb/typec.rst
index b67a46779de9..8a7249f2ff04 100644
--- a/Documentation/usb/typec.rst
+++ b/Documentation/usb/typec.rst
@@ -114,8 +114,7 @@ the details during registration. The class offers the following API for
registering/unregistering cables and their plugs:
.. kernel-doc:: drivers/usb/typec/typec.c
- :functions: typec_register_cable typec_unregister_cable typec_register_plug
- typec_unregister_plug
+ :functions: typec_register_cable typec_unregister_cable typec_register_plug typec_unregister_plug
The class will provide a handle to struct typec_cable and struct typec_plug if
the registration is successful, or NULL if it isn't.
@@ -137,8 +136,7 @@ during connection of a partner or cable, the port driver must use the following
APIs to report it to the class:
.. kernel-doc:: drivers/usb/typec/typec.c
- :functions: typec_set_data_role typec_set_pwr_role typec_set_vconn_role
- typec_set_pwr_opmode
+ :functions: typec_set_data_role typec_set_pwr_role typec_set_vconn_role typec_set_pwr_opmode
Alternate Modes
~~~~~~~~~~~~~~~
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
index 6081a5b7fc1e..eb06beb75960 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
@@ -32,7 +32,128 @@ Groups:
KVM_DEV_ARM_VGIC_CTRL_INIT
request the initialization of the ITS, no additional parameter in
kvm_device_attr.addr.
+
+ KVM_DEV_ARM_ITS_SAVE_TABLES
+ save the ITS table data into guest RAM, at the location provisioned
+ by the guest in corresponding registers/table entries.
+
+ The layout of the tables in guest memory defines an ABI. The entries
+ are laid out in little endian format as described in the last paragraph.
+
+ KVM_DEV_ARM_ITS_RESTORE_TABLES
+ restore the ITS tables from guest RAM to ITS internal structures.
+
+ The GICV3 must be restored before the ITS and all ITS registers but
+ the GITS_CTLR must be restored before restoring the ITS tables.
+
+ The GITS_IIDR read-only register must also be restored before
+ calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field
+ encodes the ABI revision.
+
+ The expected ordering when restoring the GICv3/ITS is described in section
+ "ITS Restore Sequence".
+
Errors:
-ENXIO: ITS not properly configured as required prior to setting
this attribute
-ENOMEM: Memory shortage when allocating ITS internal data
+ -EINVAL: Inconsistent restored data
+ -EFAULT: Invalid guest ram access
+ -EBUSY: One or more VCPUS are running
+
+ KVM_DEV_ARM_VGIC_GRP_ITS_REGS
+ Attributes:
+ The attr field of kvm_device_attr encodes the offset of the
+ ITS register, relative to the ITS control frame base address
+ (ITS_base).
+
+ kvm_device_attr.addr points to a __u64 value whatever the width
+ of the addressed register (32/64 bits). 64 bit registers can only
+ be accessed with full length.
+
+ Writes to read-only registers are ignored by the kernel except for:
+ - GITS_CREADR. It must be restored otherwise commands in the queue
+ will be re-executed after restoring CWRITER. GITS_CREADR must be
+ restored before restoring the GITS_CTLR which is likely to enable the
+ ITS. Also it must be restored after GITS_CBASER since a write to
+ GITS_CBASER resets GITS_CREADR.
+ - GITS_IIDR. The Revision field encodes the table layout ABI revision.
+ In the future we might implement direct injection of virtual LPIs.
+ This will require an upgrade of the table layout and an evolution of
+ the ABI. GITS_IIDR must be restored before calling
+ KVM_DEV_ARM_ITS_RESTORE_TABLES.
+
+ For other registers, getting or setting a register has the same
+ effect as reading/writing the register on real hardware.
+ Errors:
+ -ENXIO: Offset does not correspond to any supported register
+ -EFAULT: Invalid user pointer for attr->addr
+ -EINVAL: Offset is not 64-bit aligned
+ -EBUSY: one or more VCPUS are running
+
+ ITS Restore Sequence:
+ -------------------------
+
+The following ordering must be followed when restoring the GIC and the ITS:
+a) restore all guest memory and create vcpus
+b) restore all redistributors
+c) provide the its base address
+ (KVM_DEV_ARM_VGIC_GRP_ADDR)
+d) restore the ITS in the following order:
+ 1. Restore GITS_CBASER
+ 2. Restore all other GITS_ registers, except GITS_CTLR!
+ 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES)
+ 4. Restore GITS_CTLR
+
+Then vcpus can be started.
+
+ ITS Table ABI REV0:
+ -------------------
+
+ Revision 0 of the ABI only supports the features of a virtual GICv3, and does
+ not support a virtual GICv4 with support for direct injection of virtual
+ interrupts for nested hypervisors.
+
+ The device table and ITT are indexed by the DeviceID and EventID,
+ respectively. The collection table is not indexed by CollectionID, and the
+ entries in the collection are listed in no particular order.
+ All entries are 8 bytes.
+
+ Device Table Entry (DTE):
+
+ bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 |
+ values: | V | next | ITT_addr | Size |
+
+ where;
+ - V indicates whether the entry is valid. If not, other fields
+ are not meaningful.
+ - next: equals to 0 if this entry is the last one; otherwise it
+ corresponds to the DeviceID offset to the next DTE, capped by
+ 2^14 -1.
+ - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned).
+ - Size specifies the supported number of bits for the EventID,
+ minus one
+
+ Collection Table Entry (CTE):
+
+ bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 |
+ values: | V | RES0 | RDBase | ICID |
+
+ where:
+ - V indicates whether the entry is valid. If not, other fields are
+ not meaningful.
+ - RES0: reserved field with Should-Be-Zero-or-Preserved behavior.
+ - RDBase is the PE number (GICR_TYPER.Processor_Number semantic),
+ - ICID is the collection ID
+
+ Interrupt Translation Entry (ITE):
+
+ bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 |
+ values: | next | pINTID | ICID |
+
+ where:
+ - next: equals to 0 if this entry is the last one; otherwise it corresponds
+ to the EventID offset to the next ITE capped by 2^16 -1.
+ - pINTID is the physical LPI ID; if zero, it means the entry is not valid
+ and other fields are not meaningful.
+ - ICID is the collection ID
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
index c1a24612c198..9293b45abdb9 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
@@ -167,11 +167,17 @@ Groups:
KVM_DEV_ARM_VGIC_CTRL_INIT
request the initialization of the VGIC, no additional parameter in
kvm_device_attr.addr.
+ KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
+ save all LPI pending bits into guest RAM pending tables.
+
+ The first kB of the pending table is not altered by this operation.
Errors:
-ENXIO: VGIC not properly configured as required prior to calling
this attribute
-ENODEV: no online VCPU
-ENOMEM: memory shortage when allocating vgic internal data
+ -EFAULT: Invalid guest ram access
+ -EBUSY: One or more VCPUS are running
KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt
index 4f7d86dd0a5d..914518aeb972 100644
--- a/Documentation/watchdog/watchdog-parameters.txt
+++ b/Documentation/watchdog/watchdog-parameters.txt
@@ -117,7 +117,7 @@ nowayout: Watchdog cannot be stopped once started
-------------------------------------------------
iTCO_wdt:
heartbeat: Watchdog heartbeat in seconds.
- (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30)
+ (5<=heartbeat<=74 (TCO v1) or 1226 (TCO v2), default=30)
nowayout: Watchdog cannot be stopped once started
(default=kernel config parameter)
-------------------------------------------------
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index 0f6d8477b66c..c491a1b82de2 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -295,7 +295,7 @@ kernel and the tasks running there get 50% of the cache. They should
also get 50% of memory bandwidth assuming that the cores 4-7 are SMT
siblings and only the real time threads are scheduled on the cores 4-7.
-# echo C0 > p0/cpus
+# echo F0 > p0/cpus
4) Locking between applications
OpenPOWER on IntegriCloud