Videos :: Linux Kernel TCP Congestion Control Algorithms
Written by: Kiran Kankipati - Published: 20-Nov-2017
Watch Video: 303 TCP Tune-up and Performance Analysis Graphs - Congestion Control - Research - Dos and Don'ts
* Click the image above to watch this video on Youtube ↗
Watch Video: 268 TCP Congestion Control Algorithms - implemented in Linux Kernel - Introduction - Ep1
* Click the image above to watch this video on Youtube ↗ Refer:
TCP congestion control :: https://en.wikipedia.org/wiki/TCP_congestion_control
-----
Linux Kernel Source:
Linux Kernel - IPv4 Stack /net/ipv4 :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4
TCP CUBIC: Binary Increase Congestion control for TCP - /net/ipv4/tcp_cubic.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cubic.c
TCP-HYBLA Congestion control algorithm - /net/ipv4/tcp_hybla.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_hybla.c
* A TCP Enhancement for Heterogeneous Networks
TCP Illinois congestion control - /net/ipv4/tcp_illinois.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_illinois.c
TCP Westwood+: end-to-end bandwidth estimation for TCP - /net/ipv4/tcp_westwood.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_westwood.c
TCP Vegas congestion control - /net/ipv4/tcp_vegas.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_vegas.c
TCP Veno congestion control - /net/ipv4/tcp_veno.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_veno.c
H-TCP congestion control: TCP for high-speed and long-distance networks - /net/ipv4/tcp_htcp.c :: http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_htcp.c
Also refer to the kernel architecture differences between a standard Linux system and Android OS, discussed in this video:
Watch Video: 269 Linux Kernel TCP Congestion Control - CUBIC or BIC-TCP, Pluggable TCP congestion control datastructures - Ep2
* Click the image above to watch this video on Youtube ↗ Refer:
Wiki:
CUBIC TCP - https://en.wikipedia.org/wiki/CUBIC_TCP
BIC TCP - https://en.wikipedia.org/wiki/BIC_TCP
LFN: long fat networks - https://en.wikipedia.org/wiki/Bandwidth-delay_product
Bandwidth-delay product - https://en.wikipedia.org/wiki/Bandwidth-delay_product
-----
Linux Kernel Source:
TCP CUBIC: Binary Increase Congestion control for TCP v2.3 - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cubic.c
static struct tcp_congestion_ops cubictcp - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cubic.c#L457
static int __init cubictcp_register(void) - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cubic.c#L469
static void __exit cubictcp_unregister(void) - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cubic.c#L504
tcp_register_congestion_control(), tcp_unregister_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/include/net/tcp.h#L1029
Pluggable TCP congestion control support - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c
struct tcp_congestion_ops data-structure - http://elixir.free-electrons.com/linux/latest/source/include/net/tcp.h#L989
Here is the struct tcp_congestion_ops data-structure
(/include/net/tcp.h)
from the Kernel-source version 4.14 for quick reference:
/*
 * Operations table describing one pluggable TCP congestion control
 * algorithm. An instance is handed to tcp_register_congestion_control(),
 * which rejects it unless ssthresh, undo_cwnd and at least one of
 * cong_avoid / cong_control are set.
 */
struct tcp_congestion_ops {
/* linkage on the list of registered algorithms (tcp_cong_list) */
struct list_head list;
/* hash of name[], filled in by tcp_register_congestion_control() */
u32 key;
/* flag bits, e.g. TCP_CONG_NON_RESTRICTED */
u32 flags;
/* initialize private data (optional) */
void (*init)(struct sock *sk);
/* cleanup private data (optional) */
void (*release)(struct sock *sk);
/* return slow start threshold (required) */
u32 (*ssthresh)(struct sock *sk);
/* do new cwnd calculation (required unless cong_control is set) */
void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
/* call when ack arrives (optional) */
void (*in_ack_event)(struct sock *sk, u32 flags);
/* new value of cwnd after loss (required) */
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* suggest number of segments for each skb to transmit (optional) */
u32 (*tso_segs_goal)(struct sock *sk);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
u32 (*sndbuf_expand)(struct sock *sk);
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional; registration accepts
* this hook in place of cong_avoid)
*/
void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
/* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
/* algorithm name; tcp_ca_find() matches on it with strcmp() */
char name[TCP_CA_NAME_MAX];
/* module that provides this algorithm (THIS_MODULE in tcp_cubic.c) */
struct module *owner;
};
Here are the tcp_register_congestion_control() and tcp_unregister_congestion_control() pluggable congestion control registration APIs
(/net/ipv4/tcp_cong.c)
from the Kernel-source version 4.14 for quick reference:
/*
 * Attach new congestion control algorithm to the list
 * of available options.
 *
 * Fills in ca->key and, on success, appends ca to tcp_cong_list.
 *
 * Returns 0 on success, -EINVAL if a required op is missing, or
 * -EEXIST if the computed key equals TCP_CA_UNSPEC or is already
 * present on the list.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
int ret = 0;
/* all algorithms must implement these: ssthresh and undo_cwnd,
 * plus at least one of cong_avoid / cong_control
 */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
/* hash covers the whole name[] buffer (sizeof), with strlen(name)
 * passed as the third jhash() argument
 */
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
/* the duplicate check and the insertion happen under one lock hold */
spin_lock(&tcp_cong_list_lock);
if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
pr_notice("%s already registered or non-unique key\n",
ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
pr_debug("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
/*
 * Remove congestion control algorithm, called from
 * the module's remove function. Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 *
 * Unlinks ca from tcp_cong_list and then blocks in synchronize_rcu()
 * before returning.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
/* unlink under the same lock that registration takes */
spin_lock(&tcp_cong_list_lock);
list_del_rcu(&ca->list);
spin_unlock(&tcp_cong_list_lock);
/* Wait for outstanding readers to complete before the
* module gets removed entirely.
*
* A try_module_get() should fail by now as our module is
* in "going" state since no refs are held anymore and
* module_exit() handler being called.
*/
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
Here are the struct tcp_congestion_ops cubictcp data-structure instance and the cubictcp_register() and cubictcp_unregister() APIs
(/net/ipv4/tcp_cubic.c)
from the Kernel-source version 4.14 for quick reference:
/*
 * CUBIC's operations table, registered under the name "cubic".
 * It sets the hooks that registration insists on (ssthresh, undo_cwnd,
 * cong_avoid) plus init, set_state, cwnd_event and pkts_acked.
 * The release, in_ack_event, tso_segs_goal, sndbuf_expand, cong_control
 * and get_info hooks are left unset.
 */
static struct tcp_congestion_ops cubictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
/* uses tcp_reno_undo_cwnd rather than a bictcp_* routine */
.undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
};
/*
 * Module init handler: precomputes beta_scale, cube_rtt_scale and
 * cube_factor, then registers the cubictcp ops table.
 *
 * Returns whatever tcp_register_congestion_control() returns.
 */
static int __init cubictcp_register(void)
{
/* build fails if struct bictcp is larger than ICSK_CA_PRIV_SIZE */
BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
/* Precompute a bunch of the scaling factors that are used per-packet
* based on SRTT of 100ms
*/
beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
/ (BICTCP_BETA_SCALE - beta);
cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
* so K = cubic_root( (wmax-cwnd)*rtt/c )
* the unit of K is bictcp_HZ=2^10, not HZ
*
* c = bic_scale >> 10
* rtt = 100ms
*
* the following code has been designed and tested for
* cwnd < 1 million packets
* RTT < 100 seconds
* HZ < 1,000,00 (corresponding to 10 nano-second)
*/
/* NOTE(review): the digit grouping in "1,000,00" above is malformed and
 * does not obviously correspond to "10 nano-second"; confirm the
 * intended bound before relying on it.
 */
/* 1/c * 2^2*bictcp_HZ * srtt */
cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
return tcp_register_congestion_control(&cubictcp);
}
/* Module exit handler: removes the cubictcp ops table from the registry. */
static void __exit cubictcp_unregister(void)
{
tcp_unregister_congestion_control(&cubictcp);
}
/* hook the two handlers above into module load and unload */
module_init(cubictcp_register);
module_exit(cubictcp_unregister);
MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");
Watch Video: 271 Linux Kernel TCP Congestion Control - tcp_cong.c - tcp_ca_find(), tcp_cong_list, tcp_init_congestion_control(), tcp_get_allowed_congestion_control() - Ep3
* Click the image above to watch this video on Youtube ↗ Refer:
Linux Kernel Source:
Main TCP congestion control support implementation file:
/net/ipv4/tcp_cong.c - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c
-------
APIs:
tcp_register_congestion_control(), tcp_unregister_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/include/net/tcp.h#L1029
tcp_ca_find() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L23
tcp_set_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L341
tcp_init_congestion_control(), tcp_reinit_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L179
tcp_get_allowed_congestion_control(), tcp_set_allowed_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L280
tcp_set_default_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L217
tcp_get_available_congestion_control() - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L252
-------
Data-structures:
struct tcp_congestion_ops data-structure - http://elixir.free-electrons.com/linux/latest/source/include/net/tcp.h#L989
static LIST_HEAD(tcp_cong_list); - Linked List - http://elixir.free-electrons.com/linux/latest/source/net/ipv4/tcp_cong.c#L20
Here are the tcp_cong_list_lock spinlock, the LIST_HEAD(tcp_cong_list) linked list and the tcp_ca_find() API
(/net/ipv4/tcp_cong.c)
from the Kernel-source version 4.14 for quick reference:
/* taken around every add, delete and move on tcp_cong_list */
static DEFINE_SPINLOCK(tcp_cong_list_lock);
/* all registered congestion control algorithms; the first entry is the
 * one tcp_get_default_congestion_control() reports as the default
 */
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries!
 *
 * Returns the first registered algorithm whose name equals @name
 * (strcmp), or NULL if none matches. Takes no lock itself; the callers
 * shown alongside it (tcp_set_default_congestion_control(),
 * tcp_set_allowed_congestion_control()) hold tcp_cong_list_lock.
 */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
list_for_each_entry_rcu(e, &tcp_cong_list, list) {
if (strcmp(e->name, name) == 0)
return e;
}
return NULL;
}
Here are the tcp_init_congestion_control() and tcp_reinit_congestion_control() APIs
(/net/ipv4/tcp_cong.c)
from the Kernel-source version 4.14 for quick reference:
/*
 * Prepare the socket's currently attached congestion control algorithm
 * (icsk->icsk_ca_ops) for use: clear prior_ssthresh, run the optional
 * init hook, and set the socket's ECN transmit state.
 */
void tcp_init_congestion_control(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
tcp_sk(sk)->prior_ssthresh = 0;
/* init is an optional hook, so it may be NULL */
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
/* enable or disable ECN transmit according to tcp_ca_needs_ecn() */
if (tcp_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
else
INET_ECN_dontxmit(sk);
}
/*
 * Switch socket @sk over to congestion control algorithm @ca:
 * clean up the current algorithm, install @ca, zero the private area,
 * and re-run initialization unless the socket is in TCP_CLOSE.
 */
static void tcp_reinit_congestion_control(struct sock *sk,
const struct tcp_congestion_ops *ca)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* clean up while icsk_ca_ops still points at the old algorithm */
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;
/* NOTE(review): presumably records that the algorithm was chosen
 * explicitly (e.g. via setsockopt) — confirm against the field's users
 */
icsk->icsk_ca_setsockopt = 1;
/* wipe the private area before the new algorithm uses it */
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
/* for a TCP_CLOSE socket, initialization is not run here */
if (sk->sk_state != TCP_CLOSE)
tcp_init_congestion_control(sk);
}
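Here are the tcp_set_default_congestion_control(), tcp_get_available_congestion_control(), tcp_get_default_congestion_control() and tcp_set_allowed_congestion_control() APIs, together with the tcp_congestion_default() boot-time initcall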
(/net/ipv4/tcp_cong.c)
from the Kernel-source version 4.14 for quick reference:
/* Used by sysctl to change default congestion control
 *
 * Looks @name up in tcp_cong_list and, if found, marks it
 * TCP_CONG_NON_RESTRICTED and moves it to the head of the list.
 *
 * Returns 0 on success, -ENOENT if no algorithm of that name is
 * registered (even after the optional module load attempt).
 */
int tcp_set_default_congestion_control(const char *name)
{
struct tcp_congestion_ops *ca;
int ret = -ENOENT;
spin_lock(&tcp_cong_list_lock);
ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
/* not registered yet: a CAP_NET_ADMIN caller may trigger a load of
 * module "tcp_<name>". The spinlock is dropped around request_module()
 * (presumably because module loading can sleep — confirm) and the
 * lookup is repeated afterwards.
 */
if (!ca && capable(CAP_NET_ADMIN)) {
spin_unlock(&tcp_cong_list_lock);
request_module("tcp_%s", name);
spin_lock(&tcp_cong_list_lock);
ca = tcp_ca_find(name);
}
#endif
if (ca) {
ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
/* the head of the list is what tcp_get_default_congestion_control()
 * reports as the default
 */
list_move(&ca->list, &tcp_cong_list);
ret = 0;
}
spin_unlock(&tcp_cong_list_lock);
return ret;
}
/* Set default value from kernel configuration at bootup
 *
 * Registered with late_initcall(). Passes CONFIG_DEFAULT_TCP_CONG to
 * tcp_set_default_congestion_control() and returns its result.
 */
static int __init tcp_congestion_default(void)
{
return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);
/* Build string with list of available congestion control values
 *
 * Writes the names of all registered algorithms into @buf, separated
 * by single spaces, walking tcp_cong_list under rcu_read_lock().
 *
 * @buf:    destination buffer
 * @maxlen: size of @buf in bytes
 *
 * Deviation from the verbatim 4.14 excerpt: the bounds check after
 * snprintf() is an addition. snprintf() returns the length the output
 * would have had without truncation, so once a name no longer fits,
 * offs can reach or exceed maxlen; without the check the next
 * "maxlen - offs" would wrap around (size_t) and "buf + offs" would
 * point past the buffer.
 */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);
		/* output was truncated (or exactly filled): stop appending */
		if (offs >= maxlen)
			break;
	}
	rcu_read_unlock();
}
/* Get current default congestion control
 *
 * Copies the name of the first entry on tcp_cong_list into @name.
 * Up to TCP_CA_NAME_MAX bytes are written, so @name must have room
 * for that many.
 */
void tcp_get_default_congestion_control(char *name)
{
struct tcp_congestion_ops *ca;
/* We will always have reno... */
BUG_ON(list_empty(&tcp_cong_list));
rcu_read_lock();
/* the head of the list is the default */
ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
/* NOTE(review): strncpy() adds no terminator when the source fills all
 * TCP_CA_NAME_MAX bytes; this relies on ca->name being NUL-terminated
 * within its buffer — confirm
 */
strncpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
/* Change list of non-restricted congestion control
 *
 * @val is a space-separated list of algorithm names. If every name is
 * registered, TCP_CONG_NON_RESTRICTED is cleared on all algorithms and
 * then set on exactly the listed ones. If any name is unknown, no
 * flags are changed.
 *
 * Note that pass 3 tokenizes @val in place with strsep(), so the
 * caller's buffer is modified on the success path.
 *
 * Returns 0 on success, -ENOMEM if the working copy cannot be
 * allocated, -ENOENT if a listed name is not registered.
 */
int tcp_set_allowed_congestion_control(char *val)
{
struct tcp_congestion_ops *ca;
char *saved_clone, *clone, *name;
int ret = 0;
/* strsep() advances clone, so keep the original pointer for kfree() */
saved_clone = clone = kstrdup(val, GFP_USER);
if (!clone)
return -ENOMEM;
spin_lock(&tcp_cong_list_lock);
/* pass 1 check for bad entries
 * (runs on the private copy, so val is untouched if validation fails;
 * the loop also stops at the first empty token)
 */
while ((name = strsep(&clone, " ")) && *name) {
ca = tcp_ca_find(name);
if (!ca) {
ret = -ENOENT;
goto out;
}
}
/* pass 2 clear old values */
list_for_each_entry_rcu(ca, &tcp_cong_list, list)
ca->flags &= ~TCP_CONG_NON_RESTRICTED;
/* pass 3 mark as allowed
 * (every name was found in pass 1 under the same lock hold, hence the
 * WARN_ON if a lookup fails here)
 */
while ((name = strsep(&val, " ")) && *name) {
ca = tcp_ca_find(name);
WARN_ON(!ca);
if (ca)
ca->flags |= TCP_CONG_NON_RESTRICTED;
}
out:
spin_unlock(&tcp_cong_list_lock);
kfree(saved_clone);
return ret;
}
Suggested Topics: