summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/igmp.c19
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c11
-rw-r--r--net/ipv4/sysctl_net_ipv4.c8
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_bic.c12
-rw-r--r--net/ipv4/tcp_cong.c40
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c13
-rw-r--r--net/ipv4/tcp_hybla.c6
-rw-r--r--net/ipv4/tcp_input.c288
-rw-r--r--net/ipv4/tcp_ipv4.c28
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c61
-rw-r--r--net/ipv4/tcp_scalable.c14
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/tcp_vegas.c42
-rw-r--r--net/ipv4/udp.c7
19 files changed, 360 insertions, 235 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec564..e3eceecd0496 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
goto error;
- default:;
}
if (!pskb_pull(skb, sizeof(struct icmphdr)))
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc84060..c04607b49212 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
return 0;
}
- if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
- (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
- in_dev_put(in_dev);
- kfree_skb(skb);
- return 0;
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+ goto drop;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
+ goto drop;
}
ih = skb->h.igmph;
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb)
default:
NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
+
+drop:
in_dev_put(in_dev);
kfree_skb(skb);
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53a..4e9c74b54b15 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
goto drop_nolock;
if (flags&GRE_CSUM) {
- if (skb->ip_summed == CHECKSUM_HW) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
csum = (u16)csum_fold(skb->csum);
- if (csum)
- skb->ip_summed = CHECKSUM_NONE;
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ if (!csum)
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
skb->ip_summed = CHECKSUM_HW;
- csum = (u16)csum_fold(skb->csum);
}
offset += 4;
}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 5198f3a1e2cd..e4d6b268e8c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
#include <linux/in.h>
#include <linux/icmp.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/checksum.h>
#include <linux/netfilter.h>
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: bad HW ICMP checksum ");
- return -NF_ACCEPT;
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb)) {
if (LOG_INVALID(IPPROTO_ICMP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
- default:
- break;
}
checksum_skipped:
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 652685623519..01444a02b48b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_tcp_congestion_control,
.strategy = &sysctl_tcp_congestion_control,
},
+ {
+ .ctl_name = NET_TCP_ABC,
+ .procname = "tcp_abc",
+ .data = &sysctl_tcp_abc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea5..9ac7a4f46bd8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
- /* The last check adjusts for discrepance of Linux wrt. RFC
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
+ tp->bytes_acked = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e0609047..1d0cd86621b1 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization(sk, data_acked);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
bictcp_update(ca, tp->snd_cwnd);
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e89..c7cc62c8dc12 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+
+ /* In dangerous area, increase slowly. */
+ else if (sysctl_tcp_abc) {
+ /* RFC3465: Apppriate Byte Count
+ * increase once for each full cwnd acked
+ */
+ if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+ tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ }
+ } else {
+ /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde080..82b3c189bd7d 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
}
static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
- u32 in_flight, int good)
+ u32 in_flight, u32 pkts_acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
/* Update AIMD parameters */
if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e95..3284cfb993e6 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
+
measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
htcp_alpha_update(ca);
}
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623df..40dbb3877510 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca->minrtt = tp->srtt;
}
+ if (!tcp_is_cwnd_limited(sk, in_flight))
+ return;
+
if (!ca->hybla_en)
return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
- if (in_flight < tp->snd_cwnd)
- return;
-
if (ca->rho == 0)
hybla_recalc_param(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
- * Andrey Savochkin: Fix RTT measurements in the presnce of
+ * Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
- int window = tcp_full_space(sk)/2;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
/* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and correspoding skbs will fit to our rcvbuf.
+ * will fit to window and corresponding skbs will fit to our rcvbuf.
* (was 3; 4 is minimum to allow fast retransmit to work.)
*/
while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}
-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
- unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
- int ofo_win = 0;
icsk->icsk_ack.quick = 0;
- skb_queue_walk(&tp->out_of_order_queue, skb) {
- ofo_win += skb->len;
- }
-
- /* If overcommit is due to out of order segments,
- * do not clamp window. Try to expand rcvbuf instead.
- */
- if (ofo_win) {
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !tcp_memory_pressure &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- app_win += ofo_win;
- if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
- app_win >>= 1;
- if (app_win > icsk->icsk_ack.rcv_mss)
- app_win -= icsk->icsk_ack.rcv_mss;
- app_win = max(app_win, 2U*tp->advmss);
-
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
- }
}
/* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
- * non-timestamp case, we do not smoothe things out
- * else with timestamps disabled convergance takes too
+ * non-timestamp case, we do not smoother things out
+ * else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
} else if (m < new_sample)
new_sample = m << 3;
} else {
- /* No previous mesaure. */
+ /* No previous measure. */
new_sample = m << 3;
}
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
- /* Too long gap. Apparently sender falled to
+ /* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
- * too slowly, when it should be incresed fastly, decrease too fastly
+ * too slowly, when it should be increased fastly, decrease too fastly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
tp->rtt_seq = tp->snd_nxt;
}
-
- if (icsk->icsk_ca_ops->rtt_sample)
- icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
- * ACKs in some curcumstances.
+ * ACKs in some circumstances.
*/
inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
- * with correct one. It is exaclty, which we pretend to do.
+ * with correct one. It is exactly, which we pretend to do.
*/
}
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
* to make it more realistic.
*
* A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
* packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int prior_fackets;
u32 lost_retrans = 0;
int flag = 0;
+ int dup_sack = 0;
int i;
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
- for (i=0; i<num_sacks; i++, sp++) {
- struct sk_buff *skb;
- __u32 start_seq = ntohl(sp->start_seq);
- __u32 end_seq = ntohl(sp->end_seq);
- int fack_count = 0;
- int dup_sack = 0;
+ /* SACK fastpath:
+ * if the only SACK change is the increase of the end_seq of
+ * the first block then only apply that SACK block
+ * and use retrans queue hinting otherwise slowpath */
+ flag = 1;
+ for (i = 0; i< num_sacks; i++) {
+ __u32 start_seq = ntohl(sp[i].start_seq);
+ __u32 end_seq = ntohl(sp[i].end_seq);
+
+ if (i == 0){
+ if (tp->recv_sack_cache[i].start_seq != start_seq)
+ flag = 0;
+ } else {
+ if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+ (tp->recv_sack_cache[i].end_seq != end_seq))
+ flag = 0;
+ }
+ tp->recv_sack_cache[i].start_seq = start_seq;
+ tp->recv_sack_cache[i].end_seq = end_seq;
/* Check for D-SACK. */
if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (before(ack, prior_snd_una - tp->max_window))
return 0;
}
+ }
+
+ if (flag)
+ num_sacks = 1;
+ else {
+ int j;
+ tp->fastpath_skb_hint = NULL;
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = num_sacks-1; i > 0; i--) {
+ for (j = 0; j < i; j++){
+ if (after(ntohl(sp[j].start_seq),
+ ntohl(sp[j+1].start_seq))){
+ sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+ sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+ sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+ sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+ }
+
+ }
+ }
+ }
+
+ /* clear flag as used for different purpose in following code */
+ flag = 0;
+
+ for (i=0; i<num_sacks; i++, sp++) {
+ struct sk_buff *skb;
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count;
+
+ /* Use SACK fastpath hint if valid */
+ if (tp->fastpath_skb_hint) {
+ skb = tp->fastpath_skb_hint;
+ fack_count = tp->fastpath_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ fack_count = 0;
+ }
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
- sk_stream_for_retrans_queue(skb, sk) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
int in_sack, pcount;
u8 sacked;
+ tp->fastpath_skb_hint = skb;
+ tp->fastpath_cnt_hint = fack_count;
+
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
} else {
/* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
}
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retransmit_skb_hint = NULL;
}
}
}
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->bytes_acked = 0;
tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int packets, u32 high_seq)
{
struct sk_buff *skb;
- int cnt = packets;
+ int cnt;
- BUG_TRAP(cnt <= tp->packets_out);
+ BUG_TRAP(packets <= tp->packets_out);
+ if (tp->lost_skb_hint) {
+ skb = tp->lost_skb_hint;
+ cnt = tp->lost_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ cnt = 0;
+ }
- sk_stream_for_retrans_queue(skb, sk) {
- cnt -= tcp_skb_pcount(skb);
- if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ /* TODO: do this better */
+ /* this is not the most efficient way to do this... */
+ tp->lost_skb_hint = skb;
+ tp->lost_cnt_hint = cnt;
+ cnt += tcp_skb_pcount(skb);
+ if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retransmit_queue hints
+ * if this is beyond hint */
+ if(tp->retransmit_skb_hint != NULL &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+ tp->retransmit_skb_hint = NULL;
+ }
}
}
tcp_sync_left_out(tp);
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
if (tcp_head_timedout(sk, tp)) {
struct sk_buff *skb;
- sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(sk, skb) &&
- !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+ : sk->sk_write_queue.next;
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retrans hint */
+ if (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+ tp->retransmit_skb_hint = NULL;
}
}
+
+ tp->scoreboard_skb_hint = skb;
+
tcp_sync_left_out(tp);
}
}
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ /* There is something screwy going on with the retrans hints after
+ an undo */
+ clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
+
+ clear_all_retrans_hints(tp);
+
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
TCP_ECN_queue_cwr(tp);
}
+ tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overstimated rto. With timestamps
+ * If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
*/
struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt, u32 *usrtt)
+ const s32 seq_rtt)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, usrtt, flag);
+ tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
+ tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked;
}
+static inline u32 tcp_usrtt(const struct sk_buff *skb)
+{
+ struct timeval tv, now;
+
+ do_gettimeofday(&now);
+ skb_get_timestamp(skb, &tv);
+ return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+}
/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
__u32 now = tcp_time_stamp;
int acked = 0;
__s32 seq_rtt = -1;
- struct timeval usnow;
u32 pkts_acked = 0;
-
- if (seq_usrtt)
- do_gettimeofday(&usnow);
+ void (*rtt_sample)(struct sock *sk, u32 usrtt)
+ = icsk->icsk_ca_ops->rtt_sample;
while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) {
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
tp->retrans_out -= tcp_skb_pcount(skb);
acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
- if (seq_usrtt) {
- struct timeval tv;
-
- skb_get_timestamp(skb, &tv);
- *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
- + (usnow.tv_usec - tv.tv_usec);
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
}
-
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
!before(scb->end_seq, tp->snd_up))
tp->urg_mode = 0;
}
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
+ }
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
__skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
+ clear_all_retrans_hints(tp);
}
if (acked&FLAG_ACKED) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
+ tcp_ack_update_rtt(sk, acked, seq_rtt);
tcp_ack_packets_out(sk, tp);
if (icsk->icsk_ca_ops->pkts_acked)
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
}
/* F-RTO affects on two new ACKs following RTO.
- * At latest on third ACK the TCP behavor is back to normal.
+ * At latest on third ACK the TCP behavior is back to normal.
*/
tp->frto_counter = (tp->frto_counter + 1) % 3;
}
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
- s32 seq_usrtt = 0;
int prior_packets;
/* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una))
goto old_ack;
+ if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
+ tp->bytes_acked += ack - prior_snd_una;
+
if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
- icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
+ flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advanve CWND, if state allows this. */
+ /* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
{
struct sk_buff *skb;
- /* First, check that queue is collapsable and find
+ /* First, check that queue is collapsible and find
* the point where collapsing can be useful. */
for (skb = head; skb != tail; ) {
/* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/*
* This routine is only called when we have urgent data
- * signalled. Its the 'slow' part of tcp_urg. It could be
+ * signaled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
- * or we break the sematics of SIOCATMARK (and thus sockatmark())
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
- * if header_predition is to be made
+ * if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(sk, NULL, 0);
+ tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:
EXPORT_SYMBOL(sysctl_tcp_ecn);
EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 634dabb558fd..4d5021e1929b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -39,7 +39,7 @@
* request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
- * Added new listen sematics.
+ * Added new listen semantics.
* Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes.
@@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr, skb->csum))
+ skb->nh.iph->daddr, skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
-
- LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
+ }
}
+
+ skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len, IPPROTO_TCP, 0);
+
if (skb->len <= 76) {
- if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr,
- skb_checksum(skb, 0, skb->len, 0)))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
- skb->nh.iph->saddr,
- skb->nh.iph->daddr, 0);
+ return __skb_checksum_complete(skb);
}
return 0;
}
@@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
- * provided case of th->doff==0 is elimineted.
+ * provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v4_checksum_init(skb) < 0))
+ tcp_v4_checksum_init(skb)))
goto bad_packet;
th = skb->h.th;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b1a63b2c6b4a..1b66a2ac4321 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -158,7 +158,7 @@ kill_with_rst:
/* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6
- * do not undertsnad recycling in any case, it not
+ * do not understand recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
@@ -194,7 +194,7 @@ kill_with_rst:
/* In window segment, it may be only reset or bare ack. */
if (th->rst) {
- /* This is TIME_WAIT assasination, in two flavors.
+ /* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0;
+ newtp->bytes_acked = 0;
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
- * sent (the segment carries an unaccaptable ACK) ...
+ * sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
* Invalid ACK: reset will be sent by listening socket
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b907456a79f4..029c70dfb585 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16 flags;
BUG_ON(len > skb->len);
+
+ clear_all_retrans_hints(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
- It is minumum of user_mss and mss received with SYN.
+ It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* MSS for the peer's data. Previous verions used mss_clamp
+ /* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1);
- /* Ok. We will be able to collapse the packet. */
+ /* changing transmit queue under us so clear hints */
+ clear_all_retrans_hints(tp);
+
+ /* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
}
}
+ clear_all_retrans_hints(tp);
+
if (!lost)
return;
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err;
/* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: frgagmentation, tunneling, mangling etc.
+ * copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int packet_cnt = tp->lost_out;
+ int packet_cnt;
+
+ if (tp->retransmit_skb_hint) {
+ skb = tp->retransmit_skb_hint;
+ packet_cnt = tp->retransmit_cnt_hint;
+ }else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
/* First pass: retransmit lost packets. */
- if (packet_cnt) {
- sk_stream_for_retrans_queue(skb, sk) {
+ if (tp->lost_out) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ /* we could do better than to assign each time */
+ tp->retransmit_skb_hint = skb;
+ tp->retransmit_cnt_hint = packet_cnt;
+
/* Assume this retransmit will generate
* only one packet for congestion window
* calculation purposes. This works because
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return;
- if (sacked&TCPCB_LOST) {
+ if (sacked & TCPCB_LOST) {
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->retransmit_skb_hint = NULL;
return;
+ }
if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX);
}
- packet_cnt -= tcp_skb_pcount(skb);
- if (packet_cnt <= 0)
+ packet_cnt += tcp_skb_pcount(skb);
+ if (packet_cnt >= tp->lost_out)
break;
}
}
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_may_send_now(sk, tp))
return;
- packet_cnt = 0;
+ if (tp->forward_skb_hint) {
+ skb = tp->forward_skb_hint;
+ packet_cnt = tp->forward_cnt_hint;
+ } else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ tp->forward_cnt_hint = packet_cnt;
+ tp->forward_skb_hint = skb;
- sk_stream_for_retrans_queue(skb, sk) {
/* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB
* we send out here will be composed of one
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue;
/* Ok, retransmit it. */
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->forward_skb_hint = NULL;
break;
+ }
if (skb == skb_peek(&sk->sk_write_queue))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 327770bf5522..26d7486ee501 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
tp->snd_cwnd_cnt++;
if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
}
}
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
- tp->snd_cwnd_stamp = tcp_time_stamp;
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 415ee47ac1c5..e1880959614a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
- * Criterium is still not confirmed experimentally and may change.
+ * Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-(
It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
+ to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for
us to implement this:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 93c5f92070f9..b7d296a8ac6d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- if (tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd++;
+ tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
} else {
u32 rtt, target_cwnd, diff;
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
if (diff > gamma) {
/* Going too fast. Time to slow down
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT)+1);
}
+ tcp_slow_start(tp);
} else {
/* Congestion avoidance. */
u32 next_snd_cwnd;
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--;
}
- }
- /* Wipe the slate clean for the next RTT. */
- vegas->cntRTT = 0;
- vegas->minRTT = 0x7fffffff;
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+ }
}
- /* The following code is executed for every ack we receive,
- * except for conditions checked in should_advance_cwnd()
- * before the call to tcp_cong_avoid(). Mainly this means that
- * we only execute this code if the ack actually acked some
- * data.
- */
-
- /* If we are in slow start, increase our cwnd in response to this ACK.
- * (If we are not in slow start then we are in congestion avoidance,
- * and adjust our congestion window only once per RTT. See the code
- * above.)
- */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tp->snd_cwnd++;
-
- /* to keep cwnd from growing without bound */
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
- /* Make sure that we are never so timid as to reduce our cwnd below
- * 2 MSS.
- *
- * Going below 2 MSS would risk huge delayed ACKs from our receiver.
- */
- tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0bd1013cb0d..2422a5f7195d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
{
- return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ return __skb_checksum_complete(skb);
}
static __inline__ int udp_checksum_complete(struct sk_buff *skb)
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
if (uh->check == 0) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
} else if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
- return 0;
- LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
OpenPOWER on IntegriCloud