9 files changed, 741 insertions, 54 deletions
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index cdb3e40b9d14..a06b48d2f5cc 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1,49 +1,563 @@
-filter.txt: Linux Socket Filtering
-Written by: Jay Schulist <jschlst@samba.org>
+Linux Socket Filtering aka Berkeley Packet Filter (BPF)
+=======================================================
 
 Introduction
-============
-
-	Linux Socket Filtering is derived from the Berkeley
-Packet Filter. There are some distinct differences between
-the BSD and Linux Kernel Filtering.
-
-Linux Socket Filtering (LSF) allows a user-space program to
-attach a filter onto any socket and allow or disallow certain
-types of data to come through the socket. LSF follows exactly
-the same filter code structure as the BSD Berkeley Packet Filter
-(BPF), so referring to the BSD bpf.4 manpage is very helpful in
-creating filters.
-
-LSF is much simpler than BPF. One does not have to worry about
-devices or anything like that. You simply create your filter
-code, send it to the kernel via the SO_ATTACH_FILTER option and
-if your filter code passes the kernel check on it, you then
-immediately begin filtering data on that socket.
-
-You can also detach filters from your socket via the
-SO_DETACH_FILTER option. This will probably not be used much
-since when you close a socket that has a filter on it the
-filter is automagically removed. The other less common case
-may be adding a different filter on the same socket where you had another
-filter that is still running: the kernel takes care of removing
-the old one and placing your new one in its place, assuming your
-filter has passed the checks, otherwise if it fails the old filter
-will remain on that socket.
-
-SO_LOCK_FILTER option allows to lock the filter attached to a
-socket. Once set, a filter cannot be removed or changed. This allows
-one process to setup a socket, attach a filter, lock it then drop
-privileges and be assured that the filter will be kept until the
-socket is closed.
-
-Examples
-========
-
-Ioctls-
-setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &Filter, sizeof(Filter));
-setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &value, sizeof(value));
-setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &value, sizeof(value));
-
-See the BSD bpf.4 manpage and the BSD Packet Filter paper written by
-Steven McCanne and Van Jacobson of Lawrence Berkeley Laboratory.
+------------
+
+Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter.
+Though there are some distinct differences between the BSD and Linux
+Kernel filtering, but when we speak of BPF or LSF in Linux context, we
+mean the very same mechanism of filtering in the Linux kernel.
+
+BPF allows a user-space program to attach a filter onto any socket and
+allow or disallow certain types of data to come through the socket. LSF
+follows exactly the same filter code structure as BSD's BPF, so referring
+to the BSD bpf.4 manpage is very helpful in creating filters.
+
+On Linux, BPF is much simpler than on BSD. One does not have to worry
+about devices or anything like that. You simply create your filter code,
+send it to the kernel via the SO_ATTACH_FILTER option and if your filter
+code passes the kernel check on it, you then immediately begin filtering
+data on that socket.
+
+You can also detach filters from your socket via the SO_DETACH_FILTER
+option. This will probably not be used much since when you close a socket
+that has a filter on it the filter is automagically removed. The other
+less common case may be adding a different filter on the same socket where
+you had another filter that is still running: the kernel takes care of
+removing the old one and placing your new one in its place, assuming your
+filter has passed the checks, otherwise if it fails the old filter will
+remain on that socket.
+
+SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once
+set, a filter cannot be removed or changed. This allows one process to
+setup a socket, attach a filter, lock it then drop privileges and be
+assured that the filter will be kept until the socket is closed.
+
+The biggest user of this construct might be libpcap. Issuing a high-level
+filter command like `tcpdump -i em1 port 22` passes through the libpcap
+internal compiler that generates a structure that can eventually be loaded
+via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd`
+displays what is being placed into this structure.
+
+Although we were only speaking about sockets here, BPF in Linux is used
+in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel
+qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places
+such as team driver, PTP code, etc where BPF is being used.
+
+ [1] Documentation/prctl/seccomp_filter.txt
+
+Original BPF paper:
+
+Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new
+architecture for user-level packet capture. In Proceedings of the
+USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993
+Conference Proceedings (USENIX'93). USENIX Association, Berkeley,
+CA, USA, 2-2. [http://www.tcpdump.org/papers/bpf-usenix93.pdf]
+
+Structure
+---------
+
+User space applications include <linux/filter.h> which contains the
+following relevant structures:
+
+struct sock_filter {	/* Filter block */
+	__u16	code;   /* Actual filter code */
+	__u8	jt;	/* Jump true */
+	__u8	jf;	/* Jump false */
+	__u32	k;      /* Generic multiuse field */
+};
+
+Such a structure is assembled as an array of 4-tuples, that contains
+a code, jt, jf and k value. jt and jf are jump offsets and k a generic
+value to be used for a provided code.
+
+struct sock_fprog {			/* Required for SO_ATTACH_FILTER. */
+	unsigned short		   len;	/* Number of filter blocks */
+	struct sock_filter __user *filter;
+};
+
+For socket filtering, a pointer to this structure (as shown in
+follow-up example) is being passed to the kernel through setsockopt(2).
+
+Example
+-------
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+/* ... */
+
+/* From the example above: tcpdump -i em1 port 22 -dd */
+struct sock_filter code[] = {
+	{ 0x28,  0,  0, 0x0000000c },
+	{ 0x15,  0,  8, 0x000086dd },
+	{ 0x30,  0,  0, 0x00000014 },
+	{ 0x15,  2,  0, 0x00000084 },
+	{ 0x15,  1,  0, 0x00000006 },
+	{ 0x15,  0, 17, 0x00000011 },
+	{ 0x28,  0,  0, 0x00000036 },
+	{ 0x15, 14,  0, 0x00000016 },
+	{ 0x28,  0,  0, 0x00000038 },
+	{ 0x15, 12, 13, 0x00000016 },
+	{ 0x15,  0, 12, 0x00000800 },
+	{ 0x30,  0,  0, 0x00000017 },
+	{ 0x15,  2,  0, 0x00000084 },
+	{ 0x15,  1,  0, 0x00000006 },
+	{ 0x15,  0,  8, 0x00000011 },
+	{ 0x28,  0,  0, 0x00000014 },
+	{ 0x45,  6,  0, 0x00001fff },
+	{ 0xb1,  0,  0, 0x0000000e },
+	{ 0x48,  0,  0, 0x0000000e },
+	{ 0x15,  2,  0, 0x00000016 },
+	{ 0x48,  0,  0, 0x00000010 },
+	{ 0x15,  0,  1, 0x00000016 },
+	{ 0x06,  0,  0, 0x0000ffff },
+	{ 0x06,  0,  0, 0x00000000 },
+};
+
+struct sock_fprog bpf = {
+	.len = ARRAY_SIZE(code),
+	.filter = code,
+};
+
+sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+if (sock < 0)
+	/* ... bail out ... */
+
+ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
+if (ret < 0)
+	/* ... bail out ... */
+
+/* ... */
+close(sock);
+
+The above example code attaches a socket filter for a PF_PACKET socket
+in order to let all IPv4/IPv6 packets with port 22 pass. The rest will
+be dropped for this socket.
+
+The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments
+and SO_LOCK_FILTER for preventing the filter to be detached, takes an
+integer value with 0 or 1.
+
+Note that socket filters are not restricted to PF_PACKET sockets only,
+but can also be used on other socket families.
+
+Summary of system calls:
+
+ * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val));
+ * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val));
+ * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER,   &val, sizeof(val));
+
+Normally, most use cases for socket filtering on packet sockets will be
+covered by libpcap in high-level syntax, so as an application developer
+you should stick to that. libpcap wraps its own layer around all that.
+
+Unless i) using/linking to libpcap is not an option, ii) the required BPF
+filters use Linux extensions that are not supported by libpcap's compiler,
+iii) a filter might be more complex and not cleanly implementable with
+libpcap's compiler, or iv) particular filter codes should be optimized
+differently than libpcap's internal compiler does; then in such cases
+writing such a filter "by hand" can be of an alternative. For example,
+xt_bpf and cls_bpf users might have requirements that could result in
+more complex filter code, or one that cannot be expressed with libpcap
+(e.g. different return codes for various code paths). Moreover, BPF JIT
+implementors may wish to manually write test cases and thus need low-level
+access to BPF code as well.
+
+BPF engine and instruction set
+------------------------------
+
+Under tools/net/ there's a small helper tool called bpf_asm which can
+be used to write low-level filters for example scenarios mentioned in the
+previous section. Asm-like syntax mentioned here has been implemented in
+bpf_asm and will be used for further explanations (instead of dealing with
+less readable opcodes directly, principles are the same). The syntax is
+closely modelled after Steven McCanne's and Van Jacobson's BPF paper.
+
+The BPF architecture consists of the following basic elements:
+
+  Element          Description
+
+  A                32 bit wide accumulator
+  X                32 bit wide X register
+  M[]              16 x 32 bit wide misc registers aka "scratch memory
+                   store", addressable from 0 to 15
+
+A program, that is translated by bpf_asm into "opcodes" is an array that
+consists of the following elements (as already mentioned):
+
+  op:16, jt:8, jf:8, k:32
+
+The element op is a 16 bit wide opcode that has a particular instruction
+encoded. jt and jf are two 8 bit wide jump targets, one for condition
+"jump if true", the other one "jump if false". Eventually, element k
+contains a miscellaneous argument that can be interpreted in different
+ways depending on the given instruction in op.
+
+The instruction set consists of load, store, branch, alu, miscellaneous
+and return instructions that are also represented in bpf_asm syntax. This
+table lists all bpf_asm instructions available resp. what their underlying
+opcodes as defined in linux/filter.h stand for:
+
+  Instruction      Addressing mode      Description
+
+  ld               1, 2, 3, 4, 10       Load word into A
+  ldi              4                    Load word into A
+  ldh              1, 2                 Load half-word into A
+  ldb              1, 2                 Load byte into A
+  ldx              3, 4, 5, 10          Load word into X
+  ldxi             4                    Load word into X
+  ldxb             5                    Load byte into X
+
+  st               3                    Store A into M[]
+  stx              3                    Store X into M[]
+
+  jmp              6                    Jump to label
+  ja               6                    Jump to label
+  jeq              7, 8                 Jump on k == A
+  jneq             8                    Jump on k != A
+  jne              8                    Jump on k != A
+  jlt              8                    Jump on k < A
+  jle              8                    Jump on k <= A
+  jgt              7, 8                 Jump on k > A
+  jge              7, 8                 Jump on k >= A
+  jset             7, 8                 Jump on k & A
+
+  add              0, 4                 A + <x>
+  sub              0, 4                 A - <x>
+  mul              0, 4                 A * <x>
+  div              0, 4                 A / <x>
+  mod              0, 4                 A % <x>
+  neg              0, 4                 !A
+  and              0, 4                 A & <x>
+  or               0, 4                 A | <x>
+  xor              0, 4                 A ^ <x>
+  lsh              0, 4                 A << <x>
+  rsh              0, 4                 A >> <x>
+
+  tax                                   Copy A into X
+  txa                                   Copy X into A
+
+  ret              4, 9                 Return
+
+The next table shows addressing formats from the 2nd column:
+
+  Addressing mode  Syntax               Description
+
+   0               x/%x                 Register X
+   1               [k]                  BHW at byte offset k in the packet
+   2               [x + k]              BHW at the offset X + k in the packet
+   3               M[k]                 Word at offset k in M[]
+   4               #k                   Literal value stored in k
+   5               4*([k]&0xf)          Lower nibble * 4 at byte offset k in the packet
+   6               L                    Jump label L
+   7               #k,Lt,Lf             Jump to Lt if true, otherwise jump to Lf
+   8               #k,Lt                Jump to Lt if predicate is true
+   9               a/%a                 Accumulator A
+  10               extension            BPF extension
+
+The Linux kernel also has a couple of BPF extensions that are used along
+with the class of load instructions by "overloading" the k argument with
+a negative offset + a particular extension offset. The result of such BPF
+extensions are loaded into A.
+
+Possible BPF extensions are shown in the following table:
+
+  Extension                             Description
+
+  len                                   skb->len
+  proto                                 skb->protocol
+  type                                  skb->pkt_type
+  poff                                  Payload start offset
+  ifidx                                 skb->dev->ifindex
+  nla                                   Netlink attribute of type X with offset A
+  nlan                                  Nested Netlink attribute of type X with offset A
+  mark                                  skb->mark
+  queue                                 skb->queue_mapping
+  hatype                                skb->dev->type
+  rxhash                                skb->rxhash
+  cpu                                   raw_smp_processor_id()
+  vlan_tci                              vlan_tx_tag_get(skb)
+  vlan_pr                               vlan_tx_tag_present(skb)
+
+These extensions can also be prefixed with '#'.
+Examples for low-level BPF:
+
+** ARP packets:
+
+  ldh [12]
+  jne #0x806, drop
+  ret #-1
+  drop: ret #0
+
+** IPv4 TCP packets:
+
+  ldh [12]
+  jne #0x800, drop
+  ldb [23]
+  jneq #6, drop
+  ret #-1
+  drop: ret #0
+
+** (Accelerated) VLAN w/ id 10:
+
+  ld vlan_tci
+  jneq #10, drop
+  ret #-1
+  drop: ret #0
+
+** SECCOMP filter example:
+
+  ld [4]                  /* offsetof(struct seccomp_data, arch) */
+  jne #0xc000003e, bad    /* AUDIT_ARCH_X86_64 */
+  ld [0]                  /* offsetof(struct seccomp_data, nr) */
+  jeq #15, good           /* __NR_rt_sigreturn */
+  jeq #231, good          /* __NR_exit_group */
+  jeq #60, good           /* __NR_exit */
+  jeq #0, good            /* __NR_read */
+  jeq #1, good            /* __NR_write */
+  jeq #5, good            /* __NR_fstat */
+  jeq #9, good            /* __NR_mmap */
+  jeq #14, good           /* __NR_rt_sigprocmask */
+  jeq #13, good           /* __NR_rt_sigaction */
+  jeq #35, good           /* __NR_nanosleep */
+  bad: ret #0             /* SECCOMP_RET_KILL */
+  good: ret #0x7fff0000   /* SECCOMP_RET_ALLOW */
+
+The above example code can be placed into a file (here called "foo"), and
+then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf
+and cls_bpf understands and can directly be loaded with. Example with above
+ARP code:
+
+$ ./bpf_asm foo
+4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0,
+
+In copy and paste C-like output:
+
+$ ./bpf_asm -c foo
+{ 0x28,  0,  0, 0x0000000c },
+{ 0x15,  0,  1, 0x00000806 },
+{ 0x06,  0,  0, 0xffffffff },
+{ 0x06,  0,  0, 0000000000 },
+
+In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF
+filters that might not be obvious at first, it's good to test filters before
+attaching to a live system. For that purpose, there's a small tool called
+bpf_dbg under tools/net/ in the kernel source directory. This debugger allows
+for testing BPF filters against given pcap files, single stepping through the
+BPF code on the pcap's packets and to do BPF machine register dumps.
+
+Starting bpf_dbg is trivial and just requires issuing:
+
+# ./bpf_dbg
+
+In case input and output do not equal stdin/stdout, bpf_dbg takes an
+alternative stdin source as a first argument, and an alternative stdout
+sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`.
+
+Other than that, a particular libreadline configuration can be set via
+file "~/.bpf_dbg_init" and the command history is stored in the file
+"~/.bpf_dbg_history".
+
+Interaction in bpf_dbg happens through a shell that also has auto-completion
+support (follow-up example commands starting with '>' denote bpf_dbg shell).
+The usual workflow would be to ...
+
+> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0
+  Loads a BPF filter from standard output of bpf_asm, or transformed via
+  e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT
+  debugging (next section), this command creates a temporary socket and
+  loads the BPF code into the kernel. Thus, this will also be useful for
+  JIT developers.
+
+> load pcap foo.pcap
+  Loads standard tcpdump pcap file.
+
+> run [<n>]
+bpf passes:1 fails:9
+  Runs through all packets from a pcap to account how many passes and fails
+  the filter will generate. A limit of packets to traverse can be given.
+
+> disassemble
+l0:	ldh [12]
+l1:	jeq #0x800, l2, l5
+l2:	ldb [23]
+l3:	jeq #0x1, l4, l5
+l4:	ret #0xffff
+l5:	ret #0
+  Prints out BPF code disassembly.
+
+> dump
+/* { op, jt, jf, k }, */
+{ 0x28,  0,  0, 0x0000000c },
+{ 0x15,  0,  3, 0x00000800 },
+{ 0x30,  0,  0, 0x00000017 },
+{ 0x15,  0,  1, 0x00000001 },
+{ 0x06,  0,  0, 0x0000ffff },
+{ 0x06,  0,  0, 0000000000 },
+  Prints out C-style BPF code dump.
+
+> breakpoint 0
+breakpoint at: l0:	ldh [12]
+> breakpoint 1
+breakpoint at: l1:	jeq #0x800, l2, l5
+  ...
+  Sets breakpoints at particular BPF instructions. Issuing a `run` command
+  will walk through the pcap file continuing from the current packet and
+  break when a breakpoint is being hit (another `run` will continue from
+  the currently active breakpoint executing next instructions):
+
+  > run
+  -- register dump --
+  pc:       [0]                       <-- program counter
+  code:     [40] jt[0] jf[0] k[12]    <-- plain BPF code of current instruction
+  curr:     l0:	ldh [12]              <-- disassembly of current instruction
+  A:        [00000000][0]             <-- content of A (hex, decimal)
+  X:        [00000000][0]             <-- content of X (hex, decimal)
+  M[0,15]:  [00000000][0]             <-- folded content of M (hex, decimal)
+  -- packet dump --                   <-- Current packet from pcap (hex)
+  len: 42
+    0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01
+   16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26
+   32: 00 00 00 00 00 00 0a 3b 01 01
+  (breakpoint)
+  >
+
+> breakpoint
+breakpoints: 0 1
+  Prints currently set breakpoints.
+
+> step [-<n>, +<n>]
+  Performs single stepping through the BPF program from the current pc
+  offset. Thus, on each step invocation, above register dump is issued.
+  This can go forwards and backwards in time, a plain `step` will break
+  on the next BPF instruction, thus +1. (No `run` needs to be issued here.)
+
+> select <n>
+  Selects a given packet from the pcap file to continue from. Thus, on
+  the next `run` or `step`, the BPF program is being evaluated against
+  the user pre-selected packet. Numbering starts just as in Wireshark
+  with index 1.
+
+> quit
+#
+  Exits bpf_dbg.
+
+JIT compiler
+------------
+
+The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, PowerPC,
+ARM and s390 and can be enabled through CONFIG_BPF_JIT. The JIT compiler is
+transparently invoked for each attached filter from user space or for internal
+kernel users if it has been previously enabled by root:
+
+  echo 1 > /proc/sys/net/core/bpf_jit_enable
+
+For JIT developers, doing audits etc, each compile run can output the generated
+opcode image into the kernel log via:
+
+  echo 2 > /proc/sys/net/core/bpf_jit_enable
+
+Example output from dmesg:
+
+[ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f
+[ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68
+[ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00
+[ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00
+[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
+[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
+
+In the kernel source tree under tools/net/, there's bpf_jit_disasm for
+generating disassembly out of the kernel log's hexdump:
+
+# ./bpf_jit_disasm
+70 bytes emitted from JIT compiler (pass:3, flen:6)
+ffffffffa0069c8f + <x>:
+   0:	push   %rbp
+   1:	mov    %rsp,%rbp
+   4:	sub    $0x60,%rsp
+   8:	mov    %rbx,-0x8(%rbp)
+   c:	mov    0x68(%rdi),%r9d
+  10:	sub    0x6c(%rdi),%r9d
+  14:	mov    0xd8(%rdi),%r8
+  1b:	mov    $0xc,%esi
+  20:	callq  0xffffffffe0ff9442
+  25:	cmp    $0x800,%eax
+  2a:	jne    0x0000000000000042
+  2c:	mov    $0x17,%esi
+  31:	callq  0xffffffffe0ff945e
+  36:	cmp    $0x1,%eax
+  39:	jne    0x0000000000000042
+  3b:	mov    $0xffff,%eax
+  40:	jmp    0x0000000000000044
+  42:	xor    %eax,%eax
+  44:	leaveq
+  45:	retq
+
+Issuing option `-o` will "annotate" opcodes to resulting assembler
+instructions, which can be very useful for JIT developers:
+
+# ./bpf_jit_disasm -o
+70 bytes emitted from JIT compiler (pass:3, flen:6)
+ffffffffa0069c8f + <x>:
+   0:	push   %rbp
+	55
+   1:	mov    %rsp,%rbp
+	48 89 e5
+   4:	sub    $0x60,%rsp
+	48 83 ec 60
+   8:	mov    %rbx,-0x8(%rbp)
+	48 89 5d f8
+   c:	mov    0x68(%rdi),%r9d
+	44 8b 4f 68
+  10:	sub    0x6c(%rdi),%r9d
+	44 2b 4f 6c
+  14:	mov    0xd8(%rdi),%r8
+	4c 8b 87 d8 00 00 00
+  1b:	mov    $0xc,%esi
+	be 0c 00 00 00
+  20:	callq  0xffffffffe0ff9442
+	e8 1d 94 ff e0
+  25:	cmp    $0x800,%eax
+	3d 00 08 00 00
+  2a:	jne    0x0000000000000042
+	75 16
+  2c:	mov    $0x17,%esi
+	be 17 00 00 00
+  31:	callq  0xffffffffe0ff945e
+	e8 28 94 ff e0
+  36:	cmp    $0x1,%eax
+	83 f8 01
+  39:	jne    0x0000000000000042
+	75 07
+  3b:	mov    $0xffff,%eax
+	b8 ff ff 00 00
+  40:	jmp    0x0000000000000044
+	eb 02
+  42:	xor    %eax,%eax
+	31 c0
+  44:	leaveq
+	c9
+  45:	retq
+	c3
+
+For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful
+toolchain for developing and testing the kernel's JIT compiler.
+
+Misc
+----
+
+Also trinity, the Linux syscall fuzzer, has built-in support for BPF and
+SECCOMP-BPF kernel fuzzing.
+
+Written by
+----------
+
+The document was written in the hope that it is found useful and in order
+to give potential BPF hackers or security auditors a better overview of
+the underlying architecture.
+
+Jay Schulist <jschlst@samba.org>
+Daniel Borkmann <dborkman@redhat.com>
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8a984e994e61..f76d177895d9 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -160,6 +160,16 @@ tcp_app_win - INTEGER
 	buffer. Value 0 is special, it means that nothing is reserved.
 	Default: 31
 
+tcp_autocorking - BOOLEAN
+	Enable TCP auto corking :
+	When applications do consecutive small write()/sendmsg() system calls,
+	we try to coalesce these small writes as much as possible, to lower
+	total amount of sent packets. This is done if at least one prior
+	packet for the flow is waiting in Qdisc queues or device transmit
+	queue. Applications can still use TCP_CORK for optimal behavior
+	when they know how/when to uncork their sockets.
+	Default : 1
+
 tcp_available_congestion_control - STRING
 	Shows the available congestion control choices that are registered.
 	More congestion control algorithms may be available as modules,
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 8e48e3b14227..4288ffafba9f 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -953,6 +953,27 @@ int main(int argc, char **argp)
 }
 
 -------------------------------------------------------------------------------
++ PACKET_QDISC_BYPASS
+-------------------------------------------------------------------------------
+
+If there is a requirement to load the network with many packets in a similar
+fashion as pktgen does, you might set the following option after socket
+creation:
+
+    int one = 1;
+    setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
+
+This has the side-effect, that packets sent through PF_PACKET will bypass the
+kernel's qdisc layer and are forcedly pushed to the driver directly. Meaning,
+packet are not buffered, tc disciplines are ignored, increased loss can occur
+and such packets are also not visible to other PF_PACKET sockets anymore. So,
+you have been warned; generally, this can be useful for stress testing various
+components of a system.
+
+On default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled
+on PF_PACKET sockets.
+
+-------------------------------------------------------------------------------
 + PACKET_TIMESTAMP
 -------------------------------------------------------------------------------
 
diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index d5b1a3935245..ebf270719402 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -255,7 +255,8 @@ Writing a PHY driver
 
    config_init: configures PHY into a sane state after a reset.
      For instance, a Davicom PHY requires descrambling disabled.
-   probe: Does any setup needed by the driver
+   probe: Allocate phy->priv, optionally refuse to bind.
+   PHY may not have been reset or had fixups run yet.
    suspend/resume: power management
    config_aneg: Changes the speed/duplex/negotiation settings
    read_status: Reads the current speed/duplex/negotiation settings
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index 9551622d0a7b..356f791af574 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -159,10 +159,10 @@ struct ieee80211_regdomain mydriver_jp_regdom = {
 		REG_RULE(2412-20, 2484+20, 40, 6, 20, 0),
 		/* IEEE 802.11a, channels 34..48 */
 		REG_RULE(5170-20, 5240+20, 40, 6, 20,
-			NL80211_RRF_PASSIVE_SCAN),
+			NL80211_RRF_NO_IR),
 		/* IEEE 802.11a, channels 52..64 */
 		REG_RULE(5260-20, 5320+20, 40, 6, 20,
-			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_NO_IR|
 			NL80211_RRF_DFS),
 	}
 };
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 98097d8cb910..661d3c316a17 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -85,7 +85,7 @@ Filled in if SOF_TIMESTAMPING_SYS_HARDWARE is set. Requires support
 by the network device and will be empty without that support.
 
 
-SIOCSHWTSTAMP:
+SIOCSHWTSTAMP, SIOCGHWTSTAMP:
 
 Hardware time stamping must also be initialized for each device driver
 that is expected to do hardware time stamping. The parameter is defined in
@@ -115,6 +115,10 @@ Only a processes with admin rights may change the configuration. User
 space is responsible to ensure that multiple processes don't interfere
 with each other and that the settings are reset.
 
+Any process can read the actual configuration by passing this
+structure to ioctl(SIOCGHWTSTAMP) in the same way.  However, this has
+not been implemented in all drivers.
+
 /* possible values for hwtstamp_config->tx_type */
 enum {
 	/*
@@ -157,7 +161,8 @@ DEVICE IMPLEMENTATION
 
 A driver which supports hardware time stamping must support the
 SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with
-the actual values as described in the section on SIOCSHWTSTAMP.
+the actual values as described in the section on SIOCSHWTSTAMP.  It
+should also support SIOCGHWTSTAMP.
 
 Time stamps for received packets must be stored in the skb. To get a pointer
 to the shared time stamp structure of the skb call skb_hwtstamps(). Then
diff --git a/Documentation/networking/timestamping/.gitignore b/Documentation/networking/timestamping/.gitignore
index 71e81eb2e22f..a380159765ce 100644
--- a/Documentation/networking/timestamping/.gitignore
+++ b/Documentation/networking/timestamping/.gitignore
@@ -1 +1,2 @@
 timestamping
+hwtstamp_config
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile
index e79973443e9f..d934afc8306a 100644
--- a/Documentation/networking/timestamping/Makefile
+++ b/Documentation/networking/timestamping/Makefile
@@ -2,12 +2,13 @@
 obj- := dummy.o
 
 # List of programs to build
-hostprogs-y := timestamping
+hostprogs-y := timestamping hwtstamp_config
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 
 HOSTCFLAGS_timestamping.o += -I$(objtree)/usr/include
+HOSTCFLAGS_hwtstamp_config.o += -I$(objtree)/usr/include
 
 clean:
-	rm -f timestamping
+	rm -f timestamping hwtstamp_config
diff --git a/Documentation/networking/timestamping/hwtstamp_config.c b/Documentation/networking/timestamping/hwtstamp_config.c
new file mode 100644
index 000000000000..e8b685a7f15f
--- /dev/null
+++ b/Documentation/networking/timestamping/hwtstamp_config.c
@@ -0,0 +1,134 @@
+/* Test program for SIOC{G,S}HWTSTAMP
+ * Copyright 2013 Solarflare Communications
+ * Author: Ben Hutchings
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <linux/if.h>
+#include <linux/net_tstamp.h>
+#include <linux/sockios.h>
+
+static int
+lookup_value(const char **names, int size, const char *name)
+{
+	int value;
+
+	for (value = 0; value < size; value++)
+		if (names[value] && strcasecmp(names[value], name) == 0)
+			return value;
+
+	return -1;
+}
+
+static const char *
+lookup_name(const char **names, int size, int value)
+{
+	return (value >= 0 && value < size) ? names[value] : NULL;
+}
+
+static void list_names(FILE *f, const char **names, int size)
+{
+	int value;
+
+	for (value = 0; value < size; value++)
+		if (names[value])
+			fprintf(f, "    %s\n", names[value]);
+}
+
+static const char *tx_types[] = {
+#define TX_TYPE(name) [HWTSTAMP_TX_ ## name] = #name
+	TX_TYPE(OFF),
+	TX_TYPE(ON),
+	TX_TYPE(ONESTEP_SYNC)
+#undef TX_TYPE
+};
+#define N_TX_TYPES ((int)(sizeof(tx_types) / sizeof(tx_types[0])))
+
+static const char *rx_filters[] = {
+#define RX_FILTER(name) [HWTSTAMP_FILTER_ ## name] = #name
+	RX_FILTER(NONE),
+	RX_FILTER(ALL),
+	RX_FILTER(SOME),
+	RX_FILTER(PTP_V1_L4_EVENT),
+	RX_FILTER(PTP_V1_L4_SYNC),
+	RX_FILTER(PTP_V1_L4_DELAY_REQ),
+	RX_FILTER(PTP_V2_L4_EVENT),
+	RX_FILTER(PTP_V2_L4_SYNC),
+	RX_FILTER(PTP_V2_L4_DELAY_REQ),
+	RX_FILTER(PTP_V2_L2_EVENT),
+	RX_FILTER(PTP_V2_L2_SYNC),
+	RX_FILTER(PTP_V2_L2_DELAY_REQ),
+	RX_FILTER(PTP_V2_EVENT),
+	RX_FILTER(PTP_V2_SYNC),
+	RX_FILTER(PTP_V2_DELAY_REQ),
+#undef RX_FILTER
+};
+#define N_RX_FILTERS ((int)(sizeof(rx_filters) / sizeof(rx_filters[0])))
+
+static void usage(void)
+{
+	fputs("Usage: hwtstamp_config if_name [tx_type rx_filter]\n"
+	      "tx_type is any of (case-insensitive):\n",
+	      stderr);
+	list_names(stderr, tx_types, N_TX_TYPES);
+	fputs("rx_filter is any of (case-insensitive):\n", stderr);
+	list_names(stderr, rx_filters, N_RX_FILTERS);
+}
+
+int main(int argc, char **argv)
+{
+	struct ifreq ifr;
+	struct hwtstamp_config config;
+	const char *name;
+	int sock;
+
+	if ((argc != 2 && argc != 4) || (strlen(argv[1]) >= IFNAMSIZ)) {
+		usage();
+		return 2;
+	}
+
+	if (argc == 4) {
+		config.flags = 0;
+		config.tx_type = lookup_value(tx_types, N_TX_TYPES, argv[2]);
+		config.rx_filter = lookup_value(rx_filters, N_RX_FILTERS, argv[3]);
+		if (config.tx_type < 0 || config.rx_filter < 0) {
+			usage();
+			return 2;
+		}
+	}
+
+	sock = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sock < 0) {
+		perror("socket");
+		return 1;
+	}
+
+	strcpy(ifr.ifr_name, argv[1]);
+	ifr.ifr_data = (caddr_t)&config;
+
+	if (ioctl(sock, (argc == 2) ? SIOCGHWTSTAMP : SIOCSHWTSTAMP, &ifr)) {
+		perror("ioctl");
+		return 1;
+	}
+
+	printf("flags = %#x\n", config.flags);
+	name = lookup_name(tx_types, N_TX_TYPES, config.tx_type);
+	if (name)
+		printf("tx_type = %s\n", name);
+	else
+		printf("tx_type = %d\n", config.tx_type);
+	name = lookup_name(rx_filters, N_RX_FILTERS, config.rx_filter);
+	if (name)
+		printf("rx_filter = %s\n", name);
+	else
+		printf("rx_filter = %d\n", config.rx_filter);
+
+	return 0;
+}