From e876f208af18b074f800656e4d1b99da75b2135f Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:52 -0300 Subject: net: Add a software TSO helper API Although the implementation probably needs a lot of work, this initial API allows to implement software TSO in mvneta and mv643xx_eth drivers in a not so intrusive way. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- include/net/tso.h | 20 ++++++++++++++++ net/core/Makefile | 2 +- net/core/tso.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 include/net/tso.h create mode 100644 net/core/tso.c diff --git a/include/net/tso.h b/include/net/tso.h new file mode 100644 index 000000000000..47e5444f7d15 --- /dev/null +++ b/include/net/tso.h @@ -0,0 +1,20 @@ +#ifndef _TSO_H +#define _TSO_H + +#include + +struct tso_t { + int next_frag_idx; + void *data; + size_t size; + u16 ip_id; + u32 tcp_seq; +}; + +int tso_count_descs(struct sk_buff *skb); +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last); +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size); +void tso_start(struct sk_buff *skb, struct tso_t *tso); + +#endif /* _TSO_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 826b925aa453..71093d94ad2b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o + sock_diag.o dev_ioctl.o tso.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 000000000000..097821dd3a8c --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,72 @@ +#include +#include + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ + /* The Marvell Way */ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + iph = (struct iphdr *)(hdr + mac_hdr_len); + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); + tcph->seq = htonl(tso->tcp_seq); + tso->ip_id++; + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } +} + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->next_frag_idx = 0; + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} -- cgit v1.2.1 From 01ef26ca44fdfc04a611975f9abe34a04261f98e Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:53 -0300 Subject: net: mvneta: Factorize feature setting In order to ease the addition of new features, let's factorize the feature list. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index c7cbaf3b4afe..ed7f924a41e2 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2896,8 +2896,8 @@ static int mvneta_probe(struct platform_device *pdev) netif_napi_add(dev, &pp->napi, mvneta_poll, pp->weight); dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; - dev->hw_features |= NETIF_F_SG | NETIF_F_IP_CSUM; - dev->vlan_features |= NETIF_F_SG | NETIF_F_IP_CSUM; + dev->hw_features |= dev->features; + dev->vlan_features |= dev->features; dev->priv_flags |= IFF_UNICAST_FLT; err = register_netdev(dev); -- cgit v1.2.1 From e19d2dda90c5305a201282317aa492ce0337aa62 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:54 -0300 Subject: net: mvneta: Clean mvneta_tx() sk_buff handling Rework mvneta_tx() so that the code that performs the final handling before a sk_buff is transmitted is done only if the numbers of fragments processed if positive. This is preparation work to add the support for software TSO. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index ed7f924a41e2..85090291e210 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1584,7 +1584,6 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) u16 txq_id = skb_get_queue_mapping(skb); struct mvneta_tx_queue *txq = &pp->txqs[txq_id]; struct mvneta_tx_desc *tx_desc; - struct netdev_queue *nq; int frags = 0; u32 tx_cmd; @@ -1592,7 +1591,6 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) goto out; frags = skb_shinfo(skb)->nr_frags + 1; - nq = netdev_get_tx_queue(dev, txq_id); /* Get a descriptor for the first part of the packet */ tx_desc = mvneta_txq_next_desc_get(txq); @@ -1635,15 +1633,16 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) } } - txq->count += frags; - mvneta_txq_pend_desc_add(pp, txq, frags); - - if (txq->size - txq->count < MAX_SKB_FRAGS + 1) - netif_tx_stop_queue(nq); - out: if (frags > 0) { struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats); + struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id); + + txq->count += frags; + mvneta_txq_pend_desc_add(pp, txq, frags); + + if (txq->size - txq->count < MAX_SKB_FRAGS + 1) + netif_tx_stop_queue(nq); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; -- cgit v1.2.1 From 2adb719d74f6e174071e5c913290b9bbd8c2c0e8 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:55 -0300 Subject: net: mvneta: Implement software TSO Now that the TSO helper API has been introduced, this commit makes use of it to implement the TSO in this driver. Using iperf to test and vmstat to check the CPU usage, shows a substantial CPU usage drop when TSO is on (~15% vs. ~25%). HTTP-based tests performed by Willy Tarreau have shown performance improvements. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 153 +++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 85090291e210..18c698d9ef9b 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -244,6 +245,9 @@ #define MVNETA_TX_MTU_MAX 0x3ffff +/* TSO header size */ +#define TSO_HEADER_SIZE 128 + /* Max number of Rx descriptors */ #define MVNETA_MAX_RXD 128 @@ -413,6 +417,12 @@ struct mvneta_tx_queue { /* Index of the next TX DMA descriptor to process */ int next_desc_to_proc; + + /* DMA buffers for TSO headers */ + char *tso_hdrs; + + /* DMA address of TSO headers */ + dma_addr_t tso_hdrs_phys; }; struct mvneta_rx_queue { @@ -1519,6 +1529,126 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, return rx_done; } +static inline void +mvneta_tso_put_hdr(struct sk_buff *skb, + struct mvneta_port *pp, struct mvneta_tx_queue *txq) +{ + struct mvneta_tx_desc *tx_desc; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + txq->tx_skb[txq->txq_put_index] = NULL; + tx_desc = mvneta_txq_next_desc_get(txq); + tx_desc->data_size = hdr_len; + tx_desc->command = mvneta_skb_tx_csum(pp, skb); + tx_desc->command |= MVNETA_TXD_F_DESC; + tx_desc->buf_phys_addr = txq->tso_hdrs_phys + + txq->txq_put_index * TSO_HEADER_SIZE; + mvneta_txq_inc_put(txq); +} + +static inline int +mvneta_tso_put_data(struct net_device *dev, struct mvneta_tx_queue *txq, + struct sk_buff *skb, char *data, int size, + bool last_tcp, bool is_last) +{ + struct mvneta_tx_desc *tx_desc; + + tx_desc = mvneta_txq_next_desc_get(txq); + tx_desc->data_size = size; + tx_desc->buf_phys_addr = dma_map_single(dev->dev.parent, data, + size, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev->dev.parent, + tx_desc->buf_phys_addr))) { + mvneta_txq_desc_put(txq); + return -ENOMEM; + } + + tx_desc->command = 0; + txq->tx_skb[txq->txq_put_index] = NULL; + + if (last_tcp) { + /* last descriptor in the TCP packet */ + tx_desc->command = MVNETA_TXD_L_DESC; + + /* last descriptor in SKB */ + if (is_last) + txq->tx_skb[txq->txq_put_index] = skb; + } + mvneta_txq_inc_put(txq); + return 0; +} + +static int mvneta_tx_tso(struct sk_buff *skb, struct net_device *dev, + struct mvneta_tx_queue *txq) +{ + int total_len, data_left; + int desc_count = 0; + struct mvneta_port *pp = netdev_priv(dev); + struct tso_t tso; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int i; + + /* Count needed descriptors */ + if ((txq->count + tso_count_descs(skb)) >= txq->size) + return 0; + + if (skb_headlen(skb) < (skb_transport_offset(skb) + tcp_hdrlen(skb))) { + pr_info("*** Is this even possible???!?!?\n"); + return 0; + } + + /* Initialize the TSO handler, and prepare the first payload */ + tso_start(skb, &tso); + + total_len = skb->len - hdr_len; + while (total_len > 0) { + char *hdr; + + data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len); + total_len -= data_left; + desc_count++; + + /* prepare packet headers: MAC + IP + TCP */ + hdr = txq->tso_hdrs + txq->txq_put_index * TSO_HEADER_SIZE; + tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0); + + mvneta_tso_put_hdr(skb, pp, txq); + + while (data_left > 0) { + int size; + desc_count++; + + size = min_t(int, tso.size, data_left); + + if (mvneta_tso_put_data(dev, txq, skb, + tso.data, size, + size == data_left, + total_len == 0)) + goto err_release; + data_left -= size; + + tso_build_data(skb, &tso, size); + } + } + + return desc_count; + +err_release: + /* Release all used data descriptors; header descriptors must not + * be DMA-unmapped. + */ + for (i = desc_count - 1; i >= 0; i--) { + struct mvneta_tx_desc *tx_desc = txq->descs + i; + if (!(tx_desc->command & MVNETA_TXD_F_DESC)) + dma_unmap_single(pp->dev->dev.parent, + tx_desc->buf_phys_addr, + tx_desc->data_size, + DMA_TO_DEVICE); + mvneta_txq_desc_put(txq); + } + return 0; +} + /* Handle tx fragmentation processing */ static int mvneta_tx_frag_process(struct mvneta_port *pp, struct sk_buff *skb, struct mvneta_tx_queue *txq) @@ -1590,6 +1720,11 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) if (!netif_running(dev)) goto out; + if (skb_is_gso(skb)) { + frags = mvneta_tx_tso(skb, dev, txq); + goto out; + } + frags = skb_shinfo(skb)->nr_frags + 1; /* Get a descriptor for the first part of the packet */ @@ -2108,6 +2243,18 @@ static int mvneta_txq_init(struct mvneta_port *pp, txq->descs, txq->descs_phys); return -ENOMEM; } + + /* Allocate DMA buffers for TSO MAC/IP/TCP headers */ + txq->tso_hdrs = dma_alloc_coherent(pp->dev->dev.parent, + txq->size * TSO_HEADER_SIZE, + &txq->tso_hdrs_phys, GFP_KERNEL); + if (txq->tso_hdrs == NULL) { + kfree(txq->tx_skb); + dma_free_coherent(pp->dev->dev.parent, + txq->size * MVNETA_DESC_ALIGNED_SIZE, + txq->descs, txq->descs_phys); + return -ENOMEM; + } mvneta_tx_done_pkts_coal_set(pp, txq, txq->done_pkts_coal); return 0; @@ -2119,6 +2266,10 @@ static void mvneta_txq_deinit(struct mvneta_port *pp, { kfree(txq->tx_skb); + if (txq->tso_hdrs) + dma_free_coherent(pp->dev->dev.parent, + txq->size * TSO_HEADER_SIZE, + txq->tso_hdrs, txq->tso_hdrs_phys); if (txq->descs) dma_free_coherent(pp->dev->dev.parent, txq->size * MVNETA_DESC_ALIGNED_SIZE, @@ -2894,7 +3045,7 @@ static int mvneta_probe(struct platform_device *pdev) netif_napi_add(dev, &pp->napi, mvneta_poll, pp->weight); - dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; + dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; dev->hw_features |= dev->features; dev->vlan_features |= dev->features; dev->priv_flags |= IFF_UNICAST_FLT; -- cgit v1.2.1 From 0a8fa93310779ee5334e64635f98f9c72781e643 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:56 -0300 Subject: net: mv643xx_eth: Factorize initial checksum and command preparation Make the code more readable by moving the initial checksum setup and the command/status preparation to its own function. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 113 +++++++++++++++++------------ 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index df1d1b97187f..14a0dbb6284e 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -661,6 +661,64 @@ static inline unsigned int has_tiny_unaligned_frags(struct sk_buff *skb) return 0; } +static inline __be16 sum16_as_be(__sum16 sum) +{ + return (__force __be16)sum; +} + +static int skb_tx_csum(struct mv643xx_eth_private *mp, struct sk_buff *skb, + u16 *l4i_chk, u32 *command, int length) +{ + int ret; + u32 cmd = 0; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdr_len; + int tag_bytes; + + BUG_ON(skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_8021Q)); + + hdr_len = (void *)ip_hdr(skb) - (void *)skb->data; + tag_bytes = hdr_len - ETH_HLEN; + + if (length - hdr_len > mp->shared->tx_csum_limit || + unlikely(tag_bytes & ~12)) { + ret = skb_checksum_help(skb); + if (!ret) + goto no_csum; + return ret; + } + + if (tag_bytes & 4) + cmd |= MAC_HDR_EXTRA_4_BYTES; + if (tag_bytes & 8) + cmd |= MAC_HDR_EXTRA_8_BYTES; + + cmd |= GEN_TCP_UDP_CHECKSUM | + GEN_IP_V4_CHECKSUM | + ip_hdr(skb)->ihl << TX_IHL_SHIFT; + + switch (ip_hdr(skb)->protocol) { + case IPPROTO_UDP: + cmd |= UDP_FRAME; + *l4i_chk = ntohs(sum16_as_be(udp_hdr(skb)->check)); + break; + case IPPROTO_TCP: + *l4i_chk = ntohs(sum16_as_be(tcp_hdr(skb)->check)); + break; + default: + WARN(1, "protocol not supported"); + } + } else { +no_csum: + /* Errata BTS #50, IHL must be 5 if no HW checksum */ + cmd |= 5 << TX_IHL_SHIFT; + } + *command = cmd; + return 0; +} + static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) { struct mv643xx_eth_private *mp = txq_to_mp(txq); @@ -699,11 +757,6 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) } } -static inline __be16 sum16_as_be(__sum16 sum) -{ - return (__force __be16)sum; -} - static int txq_submit_skb(struct tx_queue *txq, struct sk_buff *skb) { struct mv643xx_eth_private *mp = txq_to_mp(txq); @@ -712,53 +765,17 @@ static int txq_submit_skb(struct tx_queue *txq, struct sk_buff *skb) struct tx_desc *desc; u32 cmd_sts; u16 l4i_chk; - int length; + int length, ret; - cmd_sts = TX_FIRST_DESC | GEN_CRC | BUFFER_OWNED_BY_DMA; + cmd_sts = 0; l4i_chk = 0; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - int hdr_len; - int tag_bytes; - - BUG_ON(skb->protocol != htons(ETH_P_IP) && - skb->protocol != htons(ETH_P_8021Q)); - - hdr_len = (void *)ip_hdr(skb) - (void *)skb->data; - tag_bytes = hdr_len - ETH_HLEN; - if (skb->len - hdr_len > mp->shared->tx_csum_limit || - unlikely(tag_bytes & ~12)) { - if (skb_checksum_help(skb) == 0) - goto no_csum; - dev_kfree_skb_any(skb); - return 1; - } - - if (tag_bytes & 4) - cmd_sts |= MAC_HDR_EXTRA_4_BYTES; - if (tag_bytes & 8) - cmd_sts |= MAC_HDR_EXTRA_8_BYTES; - - cmd_sts |= GEN_TCP_UDP_CHECKSUM | - GEN_IP_V4_CHECKSUM | - ip_hdr(skb)->ihl << TX_IHL_SHIFT; - - switch (ip_hdr(skb)->protocol) { - case IPPROTO_UDP: - cmd_sts |= UDP_FRAME; - l4i_chk = ntohs(sum16_as_be(udp_hdr(skb)->check)); - break; - case IPPROTO_TCP: - l4i_chk = ntohs(sum16_as_be(tcp_hdr(skb)->check)); - break; - default: - BUG(); - } - } else { -no_csum: - /* Errata BTS #50, IHL must be 5 if no HW checksum */ - cmd_sts |= 5 << TX_IHL_SHIFT; + ret = skb_tx_csum(mp, skb, &l4i_chk, &cmd_sts, skb->len); + if (ret) { + dev_kfree_skb_any(skb); + return ret; } + cmd_sts |= TX_FIRST_DESC | GEN_CRC | BUFFER_OWNED_BY_DMA; tx_index = txq->tx_curr_desc++; if (txq->tx_curr_desc == txq->tx_ring_size) -- cgit v1.2.1 From 84411f73b8849c494fc6dd3bb6b5f10ee6e58a5b Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:57 -0300 Subject: net: mv643xx_eth: Avoid setting the initial TCP checksum As specified in the datasheet, the driver can set the "L4Chk_Mode" flag (bit 10) in the Tx descriptor command/status to specify that a frame is not IP fragmented and that the controller is in charge of generating the TCP/IP checksum. This must be used together with the "GL4chk" flag (bit 17). These two flags allow to avoid setting the initial TCP checksum in the l4i_chk field of the Tx descriptor, which is needed to support software TSO. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 14a0dbb6284e..5ad19d2e2f2d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -250,6 +250,7 @@ struct tx_desc { #define GEN_TCP_UDP_CHECKSUM 0x00020000 #define UDP_FRAME 0x00010000 #define MAC_HDR_EXTRA_4_BYTES 0x00008000 +#define GEN_TCP_UDP_CHK_FULL 0x00000400 #define MAC_HDR_EXTRA_8_BYTES 0x00000200 #define TX_IHL_SHIFT 11 @@ -695,17 +696,19 @@ static int skb_tx_csum(struct mv643xx_eth_private *mp, struct sk_buff *skb, if (tag_bytes & 8) cmd |= MAC_HDR_EXTRA_8_BYTES; - cmd |= GEN_TCP_UDP_CHECKSUM | + cmd |= GEN_TCP_UDP_CHECKSUM | GEN_TCP_UDP_CHK_FULL | GEN_IP_V4_CHECKSUM | ip_hdr(skb)->ihl << TX_IHL_SHIFT; + /* TODO: Revisit this. With the usage of GEN_TCP_UDP_CHK_FULL + * it seems we don't need to pass the initial checksum. */ switch (ip_hdr(skb)->protocol) { case IPPROTO_UDP: cmd |= UDP_FRAME; - *l4i_chk = ntohs(sum16_as_be(udp_hdr(skb)->check)); + *l4i_chk = 0; break; case IPPROTO_TCP: - *l4i_chk = ntohs(sum16_as_be(tcp_hdr(skb)->check)); + *l4i_chk = 0; break; default: WARN(1, "protocol not supported"); -- cgit v1.2.1 From 4d48d58907169110fe6887775e8465475ef062ff Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:58 -0300 Subject: net: mv643xx_eth: Factorize feature setting In order to ease the addition of new features, let's factorize the feature list. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 5ad19d2e2f2d..01fde2c3f56d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2941,9 +2941,11 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->watchdog_timeo = 2 * HZ; dev->base_addr = 0; - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM; + dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; + dev->vlan_features = dev->features; + + dev->features |= NETIF_F_RXCSUM; + dev->hw_features = dev->features; dev->priv_flags |= IFF_UNICAST_FLT; -- cgit v1.2.1 From 69ad0dd7af22b61d9e0e68e56b6290121618b0fb Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 13:59:59 -0300 Subject: net: mv643xx_eth: Use dma_map_single() to map the skb fragments Using dma_map_single() instead of skb_frag_dma_map() allows to unmap all the descriptors using dma_unmap_single(). This change allows to introduce software TSO in a less intrusive way. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 01fde2c3f56d..8854751182ba 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -732,8 +732,10 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) skb_frag_t *this_frag; int tx_index; struct tx_desc *desc; + void *addr; this_frag = &skb_shinfo(skb)->frags[frag]; + addr = page_address(this_frag->page.p) + this_frag->page_offset; tx_index = txq->tx_curr_desc++; if (txq->tx_curr_desc == txq->tx_ring_size) txq->tx_curr_desc = 0; @@ -753,10 +755,8 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) desc->l4i_chk = 0; desc->byte_cnt = skb_frag_size(this_frag); - desc->buf_ptr = skb_frag_dma_map(mp->dev->dev.parent, - this_frag, 0, - skb_frag_size(this_frag), - DMA_TO_DEVICE); + desc->buf_ptr = dma_map_single(mp->dev->dev.parent, addr, + desc->byte_cnt, DMA_TO_DEVICE); } } @@ -927,14 +927,8 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force) mp->dev->stats.tx_errors++; } - if (cmd_sts & TX_FIRST_DESC) { - dma_unmap_single(mp->dev->dev.parent, desc->buf_ptr, - desc->byte_cnt, DMA_TO_DEVICE); - } else { - dma_unmap_page(mp->dev->dev.parent, desc->buf_ptr, - desc->byte_cnt, DMA_TO_DEVICE); - } - + dma_unmap_single(mp->dev->dev.parent, desc->buf_ptr, + desc->byte_cnt, DMA_TO_DEVICE); dev_kfree_skb(skb); } -- cgit v1.2.1 From 3ae8f4e0b98b640aadf410c21185ccb6b5b02351 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Mon, 19 May 2014 14:00:00 -0300 Subject: net: mv643xx_eth: Implement software TSO Now that the TSO helper API has been introduced, this commit makes use of it to add support for software TSO in this driver. This feature allows to improve outbound throughput performance significantly. Running iperf tests shows a 30% improvement, tested on a Kirkwood Openblocks A6 board. $ ethtool -K eth0 tso off $ iperf -c 192.168.0.45 -t 3 ------------------------------------------------------------ Client connecting to 192.168.0.45, TCP port 5001 TCP window size: 43.8 KByte (default) ------------------------------------------------------------ [ 3] local 192.168.0.159 port 46389 connected with 192.168.0.45 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0- 3.0 sec 217 MBytes 607 Mbits/sec $ ethtool -K eth0 tso on $ iperf -c 192.168.0.45 -t 3 ------------------------------------------------------------ Client connecting to 192.168.0.45, TCP port 5001 TCP window size: 43.8 KByte (default) ------------------------------------------------------------ [ 3] local 192.168.0.159 port 46390 connected with 192.168.0.45 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0- 3.0 sec 336 MBytes 938 Mbits/sec This commit is just an example of the usage of the TSO API, it works fine but needs some more work. In particular, the descriptor unmapping path must avoid unmapping the TSO headers. Signed-off-by: Ezequiel Garcia Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 164 ++++++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 8854751182ba..3b0f818a4f5c 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -179,9 +180,10 @@ static char mv643xx_eth_driver_version[] = "1.4"; * Misc definitions. */ #define DEFAULT_RX_QUEUE_SIZE 128 -#define DEFAULT_TX_QUEUE_SIZE 256 +#define DEFAULT_TX_QUEUE_SIZE 512 #define SKB_DMA_REALIGN ((PAGE_SIZE - NET_SKB_PAD) % SMP_CACHE_BYTES) +#define TSO_HEADER_SIZE 128 /* * RX/TX descriptors. @@ -346,6 +348,9 @@ struct tx_queue { int tx_curr_desc; int tx_used_desc; + char *tso_hdrs; + dma_addr_t tso_hdrs_dma; + struct tx_desc *tx_desc_area; dma_addr_t tx_desc_dma; int tx_desc_area_size; @@ -722,6 +727,138 @@ no_csum: return 0; } +static inline int +txq_put_data_tso(struct net_device *dev, struct tx_queue *txq, + struct sk_buff *skb, char *data, int length, + bool last_tcp, bool is_last) +{ + int tx_index; + u32 cmd_sts; + struct tx_desc *desc; + + tx_index = txq->tx_curr_desc++; + if (txq->tx_curr_desc == txq->tx_ring_size) + txq->tx_curr_desc = 0; + desc = &txq->tx_desc_area[tx_index]; + + desc->l4i_chk = 0; + desc->byte_cnt = length; + desc->buf_ptr = dma_map_single(dev->dev.parent, data, + length, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev->dev.parent, desc->buf_ptr))) { + WARN(1, "dma_map_single failed!\n"); + return -ENOMEM; + } + + cmd_sts = BUFFER_OWNED_BY_DMA; + if (last_tcp) { + /* last descriptor in the TCP packet */ + cmd_sts |= ZERO_PADDING | TX_LAST_DESC; + /* last descriptor in SKB */ + if (is_last) + cmd_sts |= TX_ENABLE_INTERRUPT; + } + desc->cmd_sts = cmd_sts; + return 0; +} + +static inline void +txq_put_hdr_tso(struct sk_buff *skb, struct tx_queue *txq, int length) +{ + struct mv643xx_eth_private *mp = txq_to_mp(txq); + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tx_index; + struct tx_desc *desc; + int ret; + u32 cmd_csum = 0; + u16 l4i_chk = 0; + + tx_index = txq->tx_curr_desc; + desc = &txq->tx_desc_area[tx_index]; + + ret = skb_tx_csum(mp, skb, &l4i_chk, &cmd_csum, length); + if (ret) + WARN(1, "failed to prepare checksum!"); + + /* Should we set this? Can't use the value from skb_tx_csum() + * as it's not the correct initial L4 checksum to use. */ + desc->l4i_chk = 0; + + desc->byte_cnt = hdr_len; + desc->buf_ptr = txq->tso_hdrs_dma + + txq->tx_curr_desc * TSO_HEADER_SIZE; + desc->cmd_sts = cmd_csum | BUFFER_OWNED_BY_DMA | TX_FIRST_DESC | + GEN_CRC; + + txq->tx_curr_desc++; + if (txq->tx_curr_desc == txq->tx_ring_size) + txq->tx_curr_desc = 0; +} + +static int txq_submit_tso(struct tx_queue *txq, struct sk_buff *skb, + struct net_device *dev) +{ + struct mv643xx_eth_private *mp = txq_to_mp(txq); + int total_len, data_left, ret; + int desc_count = 0; + struct tso_t tso; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + /* Count needed descriptors */ + if ((txq->tx_desc_count + tso_count_descs(skb)) >= txq->tx_ring_size) { + netdev_dbg(dev, "not enough descriptors for TSO!\n"); + return -EBUSY; + } + + /* Initialize the TSO handler, and prepare the first payload */ + tso_start(skb, &tso); + + total_len = skb->len - hdr_len; + while (total_len > 0) { + char *hdr; + + data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len); + total_len -= data_left; + desc_count++; + + /* prepare packet headers: MAC + IP + TCP */ + hdr = txq->tso_hdrs + txq->tx_curr_desc * TSO_HEADER_SIZE; + tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0); + txq_put_hdr_tso(skb, txq, data_left); + + while (data_left > 0) { + int size; + desc_count++; + + size = min_t(int, tso.size, data_left); + ret = txq_put_data_tso(dev, txq, skb, tso.data, size, + size == data_left, + total_len == 0); + if (ret) + goto err_release; + data_left -= size; + tso_build_data(skb, &tso, size); + } + } + + __skb_queue_tail(&txq->tx_skb, skb); + skb_tx_timestamp(skb); + + /* clear TX_END status */ + mp->work_tx_end &= ~(1 << txq->index); + + /* ensure all descriptors are written before poking hardware */ + wmb(); + txq_enable(txq); + txq->tx_desc_count += desc_count; + return 0; +err_release: + /* TODO: Release all used data descriptors; header descriptors must not + * be DMA-unmapped. + */ + return ret; +} + static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) { struct mv643xx_eth_private *mp = txq_to_mp(txq); @@ -821,7 +958,7 @@ static int txq_submit_skb(struct tx_queue *txq, struct sk_buff *skb) static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) { struct mv643xx_eth_private *mp = netdev_priv(dev); - int length, queue; + int length, queue, ret; struct tx_queue *txq; struct netdev_queue *nq; @@ -845,7 +982,11 @@ static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) length = skb->len; - if (!txq_submit_skb(txq, skb)) { + if (skb_is_gso(skb)) + ret = txq_submit_tso(txq, skb, dev); + else + ret = txq_submit_skb(txq, skb); + if (!ret) { int entries_left; txq->tx_bytes += length; @@ -854,6 +995,8 @@ static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) entries_left = txq->tx_ring_size - txq->tx_desc_count; if (entries_left < MAX_SKB_FRAGS + 1) netif_tx_stop_queue(nq); + } else if (ret == -EBUSY) { + return NETDEV_TX_BUSY; } return NETDEV_TX_OK; @@ -1885,6 +2028,15 @@ static int txq_init(struct mv643xx_eth_private *mp, int index) nexti * sizeof(struct tx_desc); } + /* Allocate DMA buffers for TSO MAC/IP/TCP headers */ + txq->tso_hdrs = dma_alloc_coherent(mp->dev->dev.parent, + txq->tx_ring_size * TSO_HEADER_SIZE, + &txq->tso_hdrs_dma, GFP_KERNEL); + if (txq->tso_hdrs == NULL) { + dma_free_coherent(mp->dev->dev.parent, txq->tx_desc_area_size, + txq->tx_desc_area, txq->tx_desc_dma); + return -ENOMEM; + } skb_queue_head_init(&txq->tx_skb); return 0; @@ -1905,6 +2057,10 @@ static void txq_deinit(struct tx_queue *txq) else dma_free_coherent(mp->dev->dev.parent, txq->tx_desc_area_size, txq->tx_desc_area, txq->tx_desc_dma); + if (txq->tso_hdrs) + dma_free_coherent(mp->dev->dev.parent, + txq->tx_ring_size * TSO_HEADER_SIZE, + txq->tso_hdrs, txq->tso_hdrs_dma); } @@ -2935,7 +3091,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->watchdog_timeo = 2 * HZ; dev->base_addr = 0; - dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; + dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; dev->vlan_features = dev->features; dev->features |= NETIF_F_RXCSUM; -- cgit v1.2.1