Diffstat (limited to 'drivers/vhost')
-rw-r--r--	drivers/vhost/net.c	326
-rw-r--r--	drivers/vhost/scsi.c	426
-rw-r--r--	drivers/vhost/vhost.c	26
3 files changed, 588 insertions, 190 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 4e656f89cb22..ab11b2bee273 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -116,6 +116,8 @@ struct vhost_net_virtqueue { * For RX, number of batched heads */ int done_idx; + /* Number of XDP frames batched */ + int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info *ubuf_info; /* Reference counting for outstanding ubufs. @@ -123,6 +125,8 @@ struct vhost_net_virtqueue { struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; + /* Batched XDP buffs */ + struct xdp_buff *xdp; }; struct vhost_net { @@ -338,6 +342,11 @@ static bool vhost_sock_zcopy(struct socket *sock) sock_flag(sock->sk, SOCK_ZEROCOPY); } +static bool vhost_sock_xdp(struct socket *sock) +{ + return sock_flag(sock->sk, SOCK_XDP); +} + /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -444,32 +453,120 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) nvq->done_idx = 0; } +static void vhost_tx_batch(struct vhost_net *net, + struct vhost_net_virtqueue *nvq, + struct socket *sock, + struct msghdr *msghdr) +{ + struct tun_msg_ctl ctl = { + .type = TUN_MSG_PTR, + .num = nvq->batched_xdp, + .ptr = nvq->xdp, + }; + int err; + + if (nvq->batched_xdp == 0) + goto signal_used; + + msghdr->msg_control = &ctl; + err = sock->ops->sendmsg(sock, msghdr, 0); + if (unlikely(err < 0)) { + vq_err(&nvq->vq, "Fail to batch sending packets\n"); + return; + } + +signal_used: + vhost_net_signal_used(nvq); + nvq->batched_xdp = 0; +} + +static int sock_has_rx_data(struct socket *sock) +{ + if (unlikely(!sock)) + return 0; + + if (sock->ops->peek_len) + return sock->ops->peek_len(sock); + + return skb_queue_empty(&sock->sk->sk_receive_queue); +} + +static void vhost_net_busy_poll_try_queue(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + if (!vhost_vq_avail_empty(&net->dev, vq)) { + vhost_poll_queue(&vq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { + vhost_disable_notify(&net->dev, vq); + vhost_poll_queue(&vq->poll); + } +} + +static void vhost_net_busy_poll(struct vhost_net *net, + struct vhost_virtqueue *rvq, + struct vhost_virtqueue *tvq, + bool *busyloop_intr, + bool poll_rx) +{ + unsigned long busyloop_timeout; + unsigned long endtime; + struct socket *sock; + struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; + + mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX); + vhost_disable_notify(&net->dev, vq); + sock = rvq->private_data; + + busyloop_timeout = poll_rx ? rvq->busyloop_timeout: + tvq->busyloop_timeout; + + preempt_disable(); + endtime = busy_clock() + busyloop_timeout; + + while (vhost_can_busy_poll(endtime)) { + if (vhost_has_work(&net->dev)) { + *busyloop_intr = true; + break; + } + + if ((sock_has_rx_data(sock) && + !vhost_vq_avail_empty(&net->dev, rvq)) || + !vhost_vq_avail_empty(&net->dev, tvq)) + break; + + cpu_relax(); + } + + preempt_enable(); + + if (poll_rx || sock_has_rx_data(sock)) + vhost_net_busy_poll_try_queue(net, vq); + else if (!poll_rx) /* On tx here, sock has no rx data. 
*/ + vhost_enable_notify(&net->dev, rvq); + + mutex_unlock(&vq->mutex); +} + static int vhost_net_tx_get_vq_desc(struct vhost_net *net, - struct vhost_net_virtqueue *nvq, + struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, - bool *busyloop_intr) + struct msghdr *msghdr, bool *busyloop_intr) { - struct vhost_virtqueue *vq = &nvq->vq; - unsigned long uninitialized_var(endtime); - int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; + struct vhost_virtqueue *rvq = &rnvq->vq; + struct vhost_virtqueue *tvq = &tnvq->vq; + + int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); - if (r == vq->num && vq->busyloop_timeout) { - if (!vhost_sock_zcopy(vq->private_data)) - vhost_net_signal_used(nvq); - preempt_disable(); - endtime = busy_clock() + vq->busyloop_timeout; - while (vhost_can_busy_poll(endtime)) { - if (vhost_has_work(vq->dev)) { - *busyloop_intr = true; - break; - } - if (!vhost_vq_avail_empty(vq->dev, vq)) - break; - cpu_relax(); - } - preempt_enable(); - r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + if (r == tvq->num && tvq->busyloop_timeout) { + /* Flush batched packets first */ + if (!vhost_sock_zcopy(tvq->private_data)) + vhost_tx_batch(net, tnvq, tvq->private_data, msghdr); + + vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); + + r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } @@ -512,7 +609,7 @@ static int get_tx_bufs(struct vhost_net *net, struct vhost_virtqueue *vq = &nvq->vq; int ret; - ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr); + ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; @@ -540,6 +637,80 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) + +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, + struct iov_iter *from) +{ + struct vhost_virtqueue *vq = &nvq->vq; + struct socket *sock = vq->private_data; + struct page_frag *alloc_frag = ¤t->task_frag; + struct virtio_net_hdr *gso; + struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; + struct tun_xdp_hdr *hdr; + size_t len = iov_iter_count(from); + int headroom = vhost_sock_xdp(sock) ? 
XDP_PACKET_HEADROOM : 0; + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); + int sock_hlen = nvq->sock_hlen; + void *buf; + int copied; + + if (unlikely(len < nvq->sock_hlen)) + return -EFAULT; + + if (SKB_DATA_ALIGN(len + pad) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) + return -ENOSPC; + + buflen += SKB_DATA_ALIGN(len + pad); + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) + return -ENOMEM; + + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; + copied = copy_page_from_iter(alloc_frag->page, + alloc_frag->offset + + offsetof(struct tun_xdp_hdr, gso), + sock_hlen, from); + if (copied != sock_hlen) + return -EFAULT; + + hdr = buf; + gso = &hdr->gso; + + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + vhost16_to_cpu(vq, gso->csum_start) + + vhost16_to_cpu(vq, gso->csum_offset) + 2 > + vhost16_to_cpu(vq, gso->hdr_len)) { + gso->hdr_len = cpu_to_vhost16(vq, + vhost16_to_cpu(vq, gso->csum_start) + + vhost16_to_cpu(vq, gso->csum_offset) + 2); + + if (vhost16_to_cpu(vq, gso->hdr_len) > len) + return -EINVAL; + } + + len -= sock_hlen; + copied = copy_page_from_iter(alloc_frag->page, + alloc_frag->offset + pad, + len, from); + if (copied != len) + return -EFAULT; + + xdp->data_hard_start = buf; + xdp->data = buf + pad; + xdp->data_end = xdp->data + len; + hdr->buflen = buflen; + + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + + ++nvq->batched_xdp; + + return 0; +} + static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; @@ -556,10 +727,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) size_t len, total_len = 0; int err; int sent_pkts = 0; + bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); for (;;) { bool busyloop_intr = false; + if (nvq->done_idx == VHOST_NET_BATCH) + vhost_tx_batch(net, nvq, sock, &msg); + head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ @@ -577,14 +752,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) break; } - vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); - vq->heads[nvq->done_idx].len = 0; - total_len += len; - if (tx_can_batch(vq, total_len)) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags &= ~MSG_MORE; + + /* For simplicity, TX batching is only enabled if + * sndbuf is unlimited. + */ + if (sock_can_batch) { + err = vhost_net_build_xdp(nvq, &msg.msg_iter); + if (!err) { + goto done; + } else if (unlikely(err != -ENOSPC)) { + vhost_tx_batch(net, nvq, sock, &msg); + vhost_discard_vq_desc(vq, 1); + vhost_net_enable_vq(net, vq); + break; + } + + /* We can't build XDP buff, go for single + * packet path but let's flush batched + * packets. + */ + vhost_tx_batch(net, nvq, sock, &msg); + msg.msg_control = NULL; + } else { + if (tx_can_batch(vq, total_len)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags &= ~MSG_MORE; + } /* TODO: Check specific error and bomb out unless ENOBUFS? 
*/ err = sock->ops->sendmsg(sock, &msg, len); @@ -596,15 +791,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) if (err != len) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); - if (++nvq->done_idx >= VHOST_NET_BATCH) - vhost_net_signal_used(nvq); +done: + vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); + vq->heads[nvq->done_idx].len = 0; + ++nvq->done_idx; if (vhost_exceeds_weight(++sent_pkts, total_len)) { vhost_poll_queue(&vq->poll); break; } } - vhost_net_signal_used(nvq); + vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) @@ -620,6 +817,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; + struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); @@ -664,8 +862,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; refcount_set(&ubuf->refcnt, 1); - msg.msg_control = ubuf; - msg.msg_controllen = sizeof(ubuf); + msg.msg_control = &ctl; + ctl.type = TUN_MSG_UBUF; + ctl.ptr = ubuf; + msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; @@ -716,7 +916,7 @@ static void handle_tx(struct vhost_net *net) struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; - mutex_lock(&vq->mutex); + mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vq->private_data; if (!sock) goto out; @@ -757,16 +957,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) return len; } -static int sk_has_rx_data(struct sock *sk) -{ - struct socket *sock = sk->sk_socket; - - if (sock->ops->peek_len) - return sock->ops->peek_len(sock); - - return skb_queue_empty(&sk->sk_receive_queue); -} - static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr) { @@ -774,41 +964,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; - unsigned long uninitialized_var(endtime); int len = peek_head_len(rnvq, sk); - if (!len && tvq->busyloop_timeout) { + if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq); /* Both tx vq and rx socket were polled here */ - mutex_lock_nested(&tvq->mutex, 1); - vhost_disable_notify(&net->dev, tvq); - - preempt_disable(); - endtime = busy_clock() + tvq->busyloop_timeout; - - while (vhost_can_busy_poll(endtime)) { - if (vhost_has_work(&net->dev)) { - *busyloop_intr = true; - break; - } - if ((sk_has_rx_data(sk) && - !vhost_vq_avail_empty(&net->dev, rvq)) || - !vhost_vq_avail_empty(&net->dev, tvq)) - break; - cpu_relax(); - } - - preempt_enable(); - - if (!vhost_vq_avail_empty(&net->dev, tvq)) { - vhost_poll_queue(&tvq->poll); - } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) { - vhost_disable_notify(&net->dev, tvq); - vhost_poll_queue(&tvq->poll); - } - - mutex_unlock(&tvq->mutex); + vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } @@ -923,7 +1085,7 @@ static void handle_rx(struct vhost_net *net) __virtio16 num_buffers; int recv_pkts = 0; - mutex_lock_nested(&vq->mutex, 0); + mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vq->private_data; if (!sock) goto out; @@ 
-1078,6 +1240,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; + struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); @@ -1098,6 +1261,15 @@ static int vhost_net_open(struct inode *inode, struct file *f) } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; + xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); + if (!xdp) { + kfree(vqs); + kvfree(n); + kfree(queue); + return -ENOMEM; + } + n->vqs[VHOST_NET_VQ_TX].xdp = xdp; + dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; @@ -1108,6 +1280,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; + n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; @@ -1191,6 +1364,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); + kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); kvfree(n); return 0; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index c24bb690680b..50dffe83714c 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -203,6 +203,19 @@ struct vhost_scsi { int vs_events_nr; /* num of pending events, protected by vq->mutex */ }; +/* + * Context for processing request and control queue operations. + */ +struct vhost_scsi_ctx { + int head; + unsigned int out, in; + size_t req_size, rsp_size; + size_t out_size, in_size; + u8 *target, *lunp; + void *req; + struct iov_iter out_iter; +}; + static struct workqueue_struct *vhost_scsi_workqueue; /* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */ @@ -800,24 +813,120 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs, pr_err("Faulted on virtio_scsi_cmd_resp\n"); } +static int +vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + int ret = -ENXIO; + + vc->head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &vc->out, &vc->in, + NULL, NULL); + + pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", + vc->head, vc->out, vc->in); + + /* On error, stop handling until the next kick. */ + if (unlikely(vc->head < 0)) + goto done; + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (vc->head == vq->num) { + if (unlikely(vhost_enable_notify(&vs->dev, vq))) { + vhost_disable_notify(&vs->dev, vq); + ret = -EAGAIN; + } + goto done; + } + + /* + * Get the size of request and response buffers. + * FIXME: Not correct for BIDI operation + */ + vc->out_size = iov_length(vq->iov, vc->out); + vc->in_size = iov_length(&vq->iov[vc->out], vc->in); + + /* + * Copy over the virtio-scsi request header, which for a + * ANY_LAYOUT enabled guest may span multiple iovecs, or a + * single iovec may contain both the header + outgoing + * WRITE payloads. + * + * copy_from_iter() will advance out_iter, so that it will + * point at the start of the outgoing WRITE payload, if + * DMA_TO_DEVICE is set. 
+ */ + iov_iter_init(&vc->out_iter, WRITE, vq->iov, vc->out, vc->out_size); + ret = 0; + +done: + return ret; +} + +static int +vhost_scsi_chk_size(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc) +{ + if (unlikely(vc->in_size < vc->rsp_size)) { + vq_err(vq, + "Response buf too small, need min %zu bytes got %zu", + vc->rsp_size, vc->in_size); + return -EINVAL; + } else if (unlikely(vc->out_size < vc->req_size)) { + vq_err(vq, + "Request buf too small, need min %zu bytes got %zu", + vc->req_size, vc->out_size); + return -EIO; + } + + return 0; +} + +static int +vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc, + struct vhost_scsi_tpg **tpgp) +{ + int ret = -EIO; + + if (unlikely(!copy_from_iter_full(vc->req, vc->req_size, + &vc->out_iter))) { + vq_err(vq, "Faulted on copy_from_iter\n"); + } else if (unlikely(*vc->lunp != 1)) { + /* virtio-scsi spec requires byte 0 of the lun to be 1 */ + vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp); + } else { + struct vhost_scsi_tpg **vs_tpg, *tpg; + + vs_tpg = vq->private_data; /* validated at handler entry */ + + tpg = READ_ONCE(vs_tpg[*vc->target]); + if (unlikely(!tpg)) { + vq_err(vq, "Target 0x%x does not exist\n", *vc->target); + } else { + if (tpgp) + *tpgp = tpg; + ret = 0; + } + } + + return ret; +} + static void vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) { struct vhost_scsi_tpg **vs_tpg, *tpg; struct virtio_scsi_cmd_req v_req; struct virtio_scsi_cmd_req_pi v_req_pi; + struct vhost_scsi_ctx vc; struct vhost_scsi_cmd *cmd; - struct iov_iter out_iter, in_iter, prot_iter, data_iter; + struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - unsigned int out = 0, in = 0; - int head, ret, prot_bytes; - size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); - size_t out_size, in_size; + int ret, prot_bytes; u16 lun; - u8 *target, *lunp, task_attr; + u8 task_attr; bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); - void *req, *cdb; + void *cdb; mutex_lock(&vq->mutex); /* @@ -828,85 +937,47 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (!vs_tpg) goto out; + memset(&vc, 0, sizeof(vc)); + vc.rsp_size = sizeof(struct virtio_scsi_cmd_resp); + vhost_disable_notify(&vs->dev, vq); for (;;) { - head = vhost_get_vq_desc(vq, vq->iov, - ARRAY_SIZE(vq->iov), &out, &in, - NULL, NULL); - pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", - head, out, in); - /* On error, stop handling until the next kick. */ - if (unlikely(head < 0)) - break; - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vs->dev, vq))) { - vhost_disable_notify(&vs->dev, vq); - continue; - } - break; - } - /* - * Check for a sane response buffer so we can report early - * errors back to the guest. - */ - if (unlikely(vq->iov[out].iov_len < rsp_size)) { - vq_err(vq, "Expecting at least virtio_scsi_cmd_resp" - " size, got %zu bytes\n", vq->iov[out].iov_len); - break; - } + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + /* * Setup pointers and values based upon different virtio-scsi * request header if T10_PI is enabled in KVM guest. 
*/ if (t10_pi) { - req = &v_req_pi; - req_size = sizeof(v_req_pi); - lunp = &v_req_pi.lun[0]; - target = &v_req_pi.lun[1]; + vc.req = &v_req_pi; + vc.req_size = sizeof(v_req_pi); + vc.lunp = &v_req_pi.lun[0]; + vc.target = &v_req_pi.lun[1]; } else { - req = &v_req; - req_size = sizeof(v_req); - lunp = &v_req.lun[0]; - target = &v_req.lun[1]; + vc.req = &v_req; + vc.req_size = sizeof(v_req); + vc.lunp = &v_req.lun[0]; + vc.target = &v_req.lun[1]; } - /* - * FIXME: Not correct for BIDI operation - */ - out_size = iov_length(vq->iov, out); - in_size = iov_length(&vq->iov[out], in); /* - * Copy over the virtio-scsi request header, which for a - * ANY_LAYOUT enabled guest may span multiple iovecs, or a - * single iovec may contain both the header + outgoing - * WRITE payloads. - * - * copy_from_iter() will advance out_iter, so that it will - * point at the start of the outgoing WRITE payload, if - * DMA_TO_DEVICE is set. + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. */ - iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; - if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { - vq_err(vq, "Faulted on copy_from_iter\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } - /* virtio-scsi spec requires byte 0 of the lun to be 1 */ - if (unlikely(*lunp != 1)) { - vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } + ret = vhost_scsi_get_req(vq, &vc, &tpg); + if (ret) + goto err; + + ret = -EIO; /* bad target on any error from here on */ - tpg = READ_ONCE(vs_tpg[*target]); - if (unlikely(!tpg)) { - /* Target does not exist, fail the request */ - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; - } /* * Determine data_direction by calculating the total outgoing * iovec sizes + incoming iovec sizes vs. 
virtio-scsi request + @@ -924,17 +995,17 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ prot_bytes = 0; - if (out_size > req_size) { + if (vc.out_size > vc.req_size) { data_direction = DMA_TO_DEVICE; - exp_data_len = out_size - req_size; - data_iter = out_iter; - } else if (in_size > rsp_size) { + exp_data_len = vc.out_size - vc.req_size; + data_iter = vc.out_iter; + } else if (vc.in_size > vc.rsp_size) { data_direction = DMA_FROM_DEVICE; - exp_data_len = in_size - rsp_size; + exp_data_len = vc.in_size - vc.rsp_size; - iov_iter_init(&in_iter, READ, &vq->iov[out], in, - rsp_size + exp_data_len); - iov_iter_advance(&in_iter, rsp_size); + iov_iter_init(&in_iter, READ, &vq->iov[vc.out], vc.in, + vc.rsp_size + exp_data_len); + iov_iter_advance(&in_iter, vc.rsp_size); data_iter = in_iter; } else { data_direction = DMA_NONE; @@ -950,21 +1021,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (data_direction != DMA_TO_DEVICE) { vq_err(vq, "Received non zero pi_bytesout," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout); } else if (v_req_pi.pi_bytesin) { if (data_direction != DMA_FROM_DEVICE) { vq_err(vq, "Received non zero pi_bytesin," " but wrong data_direction\n"); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin); } /* - * Set prot_iter to data_iter, and advance past any + * Set prot_iter to data_iter and truncate it to + * prot_bytes, and advance data_iter past any * preceeding prot_bytes that may be present. * * Also fix up the exp_data_len to reflect only the @@ -973,6 +1043,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (prot_bytes) { exp_data_len -= prot_bytes; prot_iter = data_iter; + iov_iter_truncate(&prot_iter, prot_bytes); iov_iter_advance(&data_iter, prot_bytes); } tag = vhost64_to_cpu(vq, v_req_pi.tag); @@ -996,8 +1067,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) vq_err(vq, "Received SCSI CDB with command_size: %d that" " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n", scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr, exp_data_len + prot_bytes, @@ -1005,13 +1075,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) if (IS_ERR(cmd)) { vq_err(vq, "vhost_scsi_get_tag failed %ld\n", PTR_ERR(cmd)); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - cmd->tvc_resp_iov = vq->iov[out]; - cmd->tvc_in_iovs = in; + cmd->tvc_resp_iov = vq->iov[vc.out]; + cmd->tvc_in_iovs = vc.in; pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", cmd->tvc_cdb[0], cmd->tvc_lun); @@ -1019,14 +1088,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) " %d\n", cmd, exp_data_len, prot_bytes, data_direction); if (data_direction != DMA_NONE) { - ret = vhost_scsi_mapal(cmd, - prot_bytes, &prot_iter, - exp_data_len, &data_iter); - if (unlikely(ret)) { + if (unlikely(vhost_scsi_mapal(cmd, prot_bytes, + &prot_iter, exp_data_len, + &data_iter))) { vq_err(vq, "Failed to map iov to sgl\n"); vhost_scsi_release_cmd(&cmd->tvc_se_cmd); - vhost_scsi_send_bad_target(vs, vq, head, out); - continue; + goto err; } } /* @@ -1034,7 +1101,7 @@ vhost_scsi_handle_vq(struct vhost_scsi 
*vs, struct vhost_virtqueue *vq) * complete the virtio-scsi request in TCM callback context via * vhost_scsi_queue_data_in() and vhost_scsi_queue_status() */ - cmd->tvc_vq_desc = head; + cmd->tvc_vq_desc = vc.head; /* * Dispatch cmd descriptor for cmwq execution in process * context provided by vhost_scsi_workqueue. This also ensures @@ -1043,6 +1110,166 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ INIT_WORK(&cmd->work, vhost_scsi_submission_work); queue_work(vhost_scsi_workqueue, &cmd->work); + ret = 0; +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + } +out: + mutex_unlock(&vq->mutex); +} + +static void +vhost_scsi_send_tmf_reject(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_tmf_resp __user *resp; + struct virtio_scsi_ctrl_tmf_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n"); +} + +static void +vhost_scsi_send_an_resp(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc) +{ + struct virtio_scsi_ctrl_an_resp __user *resp; + struct virtio_scsi_ctrl_an_resp rsp; + int ret; + + pr_debug("%s\n", __func__); + memset(&rsp, 0, sizeof(rsp)); /* event_actual = 0 */ + rsp.response = VIRTIO_SCSI_S_OK; + resp = vq->iov[vc->out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio_scsi_ctrl_an_resp\n"); +} + +static void +vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) +{ + union { + __virtio32 type; + struct virtio_scsi_ctrl_an_req an; + struct virtio_scsi_ctrl_tmf_req tmf; + } v_req; + struct vhost_scsi_ctx vc; + size_t typ_size; + int ret; + + mutex_lock(&vq->mutex); + /* + * We can handle the vq only after the endpoint is setup by calling the + * VHOST_SCSI_SET_ENDPOINT ioctl. + */ + if (!vq->private_data) + goto out; + + memset(&vc, 0, sizeof(vc)); + + vhost_disable_notify(&vs->dev, vq); + + for (;;) { + ret = vhost_scsi_get_desc(vs, vq, &vc); + if (ret) + goto err; + + /* + * Get the request type first in order to setup + * other parameters dependent on the type. + */ + vc.req = &v_req.type; + typ_size = sizeof(v_req.type); + + if (unlikely(!copy_from_iter_full(vc.req, typ_size, + &vc.out_iter))) { + vq_err(vq, "Faulted on copy_from_iter tmf type\n"); + /* + * The size of the response buffer depends on the + * request type and must be validated against it. + * Since the request type is not known, don't send + * a response. 
+ */ + continue; + } + + switch (v_req.type) { + case VIRTIO_SCSI_T_TMF: + vc.req = &v_req.tmf; + vc.req_size = sizeof(struct virtio_scsi_ctrl_tmf_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp); + vc.lunp = &v_req.tmf.lun[0]; + vc.target = &v_req.tmf.lun[1]; + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: + vc.req = &v_req.an; + vc.req_size = sizeof(struct virtio_scsi_ctrl_an_req); + vc.rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp); + vc.lunp = &v_req.an.lun[0]; + vc.target = NULL; + break; + default: + vq_err(vq, "Unknown control request %d", v_req.type); + continue; + } + + /* + * Validate the size of request and response buffers. + * Check for a sane response buffer so we can report + * early errors back to the guest. + */ + ret = vhost_scsi_chk_size(vq, &vc); + if (ret) + goto err; + + /* + * Get the rest of the request now that its size is known. + */ + vc.req += typ_size; + vc.req_size -= typ_size; + + ret = vhost_scsi_get_req(vq, &vc, NULL); + if (ret) + goto err; + + if (v_req.type == VIRTIO_SCSI_T_TMF) + vhost_scsi_send_tmf_reject(vs, vq, &vc); + else + vhost_scsi_send_an_resp(vs, vq, &vc); +err: + /* + * ENXIO: No more requests, or read error, wait for next kick + * EINVAL: Invalid response buffer, drop the request + * EIO: Respond with bad target + * EAGAIN: Pending request + */ + if (ret == -ENXIO) + break; + else if (ret == -EIO) + vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); } out: mutex_unlock(&vq->mutex); @@ -1050,7 +1277,12 @@ out: static void vhost_scsi_ctl_handle_kick(struct vhost_work *work) { + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev); + pr_debug("%s: The handling func for control queue.\n", __func__); + vhost_scsi_ctl_handle_vq(vs, vq); } static void diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b13c6b4b2c66..3a5f81a66d34 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -30,6 +30,7 @@ #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/interval_tree_generic.h> +#include <linux/nospec.h> #include "vhost.h" @@ -294,8 +295,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d) { int i; - for (i = 0; i < d->nvqs; ++i) + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i]->mutex); __vhost_vq_meta_reset(d->vqs[i]); + mutex_unlock(&d->vqs[i]->mutex); + } } static void vhost_vq_reset(struct vhost_dev *dev, @@ -891,20 +895,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, #define vhost_get_used(vq, x, ptr) \ vhost_get_user(vq, x, ptr, VHOST_ADDR_USED) -static void vhost_dev_lock_vqs(struct vhost_dev *d) -{ - int i = 0; - for (i = 0; i < d->nvqs; ++i) - mutex_lock_nested(&d->vqs[i]->mutex, i); -} - -static void vhost_dev_unlock_vqs(struct vhost_dev *d) -{ - int i = 0; - for (i = 0; i < d->nvqs; ++i) - mutex_unlock(&d->vqs[i]->mutex); -} - static int vhost_new_umem_range(struct vhost_umem *umem, u64 start, u64 size, u64 end, u64 userspace_addr, int perm) @@ -954,7 +944,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, if (msg->iova <= vq_msg->iova && msg->iova + msg->size - 1 >= vq_msg->iova && vq_msg->type == VHOST_IOTLB_MISS) { + mutex_lock(&node->vq->mutex); vhost_poll_queue(&node->vq->poll); + mutex_unlock(&node->vq->mutex); + list_del(&node->node); kfree(node); } @@ -986,7 +979,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, int ret = 0; mutex_lock(&dev->mutex); - 
vhost_dev_lock_vqs(dev);
 	switch (msg->type) {
 	case VHOST_IOTLB_UPDATE:
 		if (!dev->iotlb) {
@@ -1020,7 +1012,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		break;
 	}
 
-	vhost_dev_unlock_vqs(dev);
 	mutex_unlock(&dev->mutex);
 
 	return ret;
@@ -1397,6 +1388,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	if (idx >= d->nvqs)
 		return -ENOBUFS;
 
+	idx = array_index_nospec(idx, d->nvqs);
 	vq = d->vqs[idx];
 
 	mutex_lock(&vq->mutex);
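
The net.c side of the patch batches up to VHOST_NET_BATCH XDP frames per TX virtqueue and hands the whole batch to the tap socket in a single sendmsg() whose msg_control points at a struct tun_msg_ctl of type TUN_MSG_PTR; used heads are only signalled when the batch is flushed. Below is a minimal userspace model of that accumulate-then-flush control flow — BATCH, build_one() and flush_all() are illustrative stand-ins for VHOST_NET_BATCH, vhost_net_build_xdp() and vhost_tx_batch(), not the kernel code.

/* Illustrative model of the accumulate-then-flush TX path added to
 * vhost-net; build_one()/flush_all() stand in for vhost_net_build_xdp()
 * and vhost_tx_batch(). */
#include <stdio.h>

#define BATCH 64			/* mirrors VHOST_NET_BATCH */

struct frame { int len; };

struct txq {
	struct frame batch[BATCH];
	int batched;			/* like nvq->batched_xdp */
	int done;			/* like nvq->done_idx */
};

/* Stand-in for vhost_net_build_xdp(): 0 on success, -1 ("ENOSPC")
 * when the frame is too large to fit the batched buffer. */
static int build_one(struct txq *q, int len)
{
	if (len > 1500)
		return -1;
	q->batch[q->batched++].len = len;
	return 0;
}

/* Stand-in for vhost_tx_batch(): one "sendmsg" for the whole batch,
 * then signal every completed head at once. */
static void flush_all(struct txq *q)
{
	if (q->batched)
		printf("sendmsg: %d frames in one call\n", q->batched);
	printf("signal used: %d heads\n", q->done);
	q->batched = 0;
	q->done = 0;
}

int main(void)
{
	struct txq q = { .batched = 0, .done = 0 };
	int lens[] = { 64, 128, 2000, 256, 512 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		if (q.done == BATCH)		/* batch full: flush first */
			flush_all(&q);
		if (build_one(&q, lens[i])) {
			flush_all(&q);		/* cannot batch: flush, then
						 * send this one by itself */
			printf("sendmsg: 1 oversized frame (%d bytes)\n",
			       lens[i]);
		}
		q.done++;			/* head consumed either way */
	}
	flush_all(&q);				/* final flush, as the copy TX
						 * handler does before returning */
	return 0;
}

Submitting the whole batch through one sendmsg() lets tuntap run the frames through its XDP program together instead of paying the per-packet sendmsg overhead.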
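
vhost_net_build_xdp() places each frame in a page fragment from current->task_frag: the tun_xdp_hdr (with the virtio-net header) sits at the front, packet data begins after a cache-line-aligned pad covering NET_SKB_PAD + NET_IP_ALIGN, the socket header and, when the tap socket has SOCK_XDP set, XDP_PACKET_HEADROOM, and tailroom for skb_shared_info is reserved so the buffer can later be turned into an skb. Frames that would not fit in one page get -ENOSPC and fall back to the single-packet path. The arithmetic below reproduces that layout with assumed x86-64 values (cache-line size, headroom and the skb_shared_info size are guesses, not figures from the patch).

/* Rough model of the per-frame buffer layout that vhost_net_build_xdp()
 * carves out of a page fragment; all constants are assumptions chosen to
 * look like x86-64, not values taken from the patch. */
#include <stdio.h>

#define SMP_CACHE_BYTES		64
#define NET_SKB_PAD		32
#define NET_IP_ALIGN		0	/* x86 */
#define XDP_PACKET_HEADROOM	256
#define SHINFO_SIZE		320	/* ~sizeof(struct skb_shared_info) */
#define PAGE_SZ			4096

static long cache_align(long x)
{
	return (x + SMP_CACHE_BYTES - 1) & ~(long)(SMP_CACHE_BYTES - 1);
}

int main(void)
{
	long sock_hlen = 12;		/* virtio_net_hdr_mrg_rxbuf */
	long len = 1500;		/* payload copied from the guest */
	int sock_xdp = 1;		/* SOCK_XDP set on the tap socket */

	long headroom = sock_xdp ? XDP_PACKET_HEADROOM : 0;
	long pad = cache_align(NET_IP_ALIGN + NET_SKB_PAD + headroom + sock_hlen);
	long buflen = cache_align(SHINFO_SIZE) + cache_align(len + pad);

	printf("xdp->data starts at offset %ld\n", pad);
	printf("fragment consumes %ld of %d bytes", buflen, PAGE_SZ);
	printf(" -> %s\n",
	       cache_align(len + pad) + cache_align(SHINFO_SIZE) <= PAGE_SZ ?
	       "batched" : "-ENOSPC, single-packet fallback");
	return 0;
}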
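
On the scsi.c side, the new struct vhost_scsi_ctx plus vhost_scsi_get_desc(), vhost_scsi_chk_size() and vhost_scsi_get_req() let both the request handler and the new control-queue handler funnel every per-descriptor failure to a single err: label and pick the reaction from the errno: -ENXIO stops handling until the next kick, -EIO answers with a bad-target response, -EAGAIN and -EINVAL simply move on to the next descriptor. A toy model of that dispatch loop, with get_one() as a hypothetical stand-in for the three helpers:

/* Toy model of the single error label + errno dispatch used by the
 * vhost_scsi_ctx refactor; get_one() stands in for
 * vhost_scsi_get_desc()/vhost_scsi_chk_size()/vhost_scsi_get_req(). */
#include <errno.h>
#include <stdio.h>

static int get_one(int i)
{
	switch (i) {
	case 2:  return -EINVAL;	/* bad response buffer: drop request */
	case 3:  return -EIO;		/* bad target: respond with error */
	case 5:  return -ENXIO;		/* ring empty: wait for next kick */
	default: return 0;
	}
}

int main(void)
{
	for (int i = 0; ; i++) {
		int ret = get_one(i);

		if (!ret) {
			printf("desc %d: queued for submission\n", i);
			continue;
		}
		/* err: */
		if (ret == -ENXIO) {
			printf("desc %d: no more work, stop handling\n", i);
			break;
		}
		if (ret == -EIO)
			printf("desc %d: send bad-target response\n", i);
		/* -EINVAL / -EAGAIN: just move on to the next descriptor */
	}
	return 0;
}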
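
The vhost.c hunk closes a Spectre-v1 gadget: vhost_vring_ioctl() bounds-checks a user-supplied index and then clamps it with array_index_nospec() so a mispredicted check cannot speculatively index past d->vqs. Below is a userspace sketch of the same check-then-clamp pattern; index_mask() mirrors the kernel's generic fallback and is shown purely for illustration.

/* Userspace sketch of the Spectre-v1 clamp that the vhost.c hunk adds via
 * array_index_nospec(); index_mask() mirrors the kernel's generic fallback
 * and is not meant as the real implementation. */
#include <stdio.h>
#include <stddef.h>

#define BITS_PER_LONG (sizeof(long) * 8)

/* All-ones when index < size, zero otherwise, computed without a branch
 * the CPU could mispredict (relies on arithmetic right shift, as the
 * kernel's generic helper does). */
static size_t index_mask(size_t index, size_t size)
{
	return (size_t)(~(long)(index | (size - 1 - index)) >>
			(BITS_PER_LONG - 1));
}

static size_t clamp_index(size_t index, size_t size)
{
	return index & index_mask(index, size);
}

int main(void)
{
	int vqs[4] = { 10, 11, 12, 13 };
	size_t nvqs = 4;

	for (size_t idx = 0; idx < 6; idx++) {	/* idx as if from an ioctl */
		if (idx >= nvqs) {
			printf("idx %zu rejected (-ENOBUFS)\n", idx);
			continue;
		}
		/* Even on the in-bounds path, clamp before the array access,
		 * as in: idx = array_index_nospec(idx, d->nvqs); */
		size_t safe = clamp_index(idx, nvqs);
		printf("idx %zu -> vq %d\n", safe, vqs[safe]);
	}
	return 0;
}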