grub-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH V2] tcp: add window scaling and RTTM support


From: Andrei Borzenkov
Subject: Re: [PATCH V2] tcp: add window scaling and RTTM support
Date: Mon, 1 Feb 2016 11:43:48 +0300

On Sat, Jan 30, 2016 at 12:48 AM, Josef Bacik <address@hidden> wrote:
> Sometimes we have to provision boxes across regions, such as California to
> Sweden.  The http server has a 10 minute timeout, so if we can't get our 250mb
> image transferred fast enough our provisioning fails, which is not ideal.  So
> add tcp window scaling on open connections and set the window size to 1mb.  
> With
> this change we're able to get higher sustained transfers between regions and 
> can
> transfer our image in well below 10 minutes.  Without this patch we'd time out
> every time halfway through the transfer.
>
> RTTM is needed in order to make window scaling work well under heavy 
> congestion
> or packet loss.  In most cases grub could recover with just window scaling
> enabled, but on some machines the congestion would be so high that it would
> never recover and would timeout.
>
> I've made the window size configureable with the grub env variable
> "tcp_window_size".  By default this is set to 1mb but can be configured to
> whatever a user wants, and we will calculate the appropriate window size and
> scale settings.  Thanks,
>

Please make it net_tcp_window_size to match other net_* variables.

I'm still unsure about increasing it by default. GRUB network
processing is not lightning fast and it looks like it enables partner
on local network to send too much at once which can overflow hardware
receiver buffers. Not sure if this is actually possible.

I hoped DHCP defines standard option for window size but apparently not.

> Signed-off-by: Josef Bacik <address@hidden>
> ---
> V1->V2:
> -Address Andrei's concerns about making the window size configurable.
> -Also make the tcp option stuff more dynamic.
> -Add RTTM support to make higher window sizes more stable.
>
>  grub-core/net/tcp.c | 141 
> ++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 132 insertions(+), 9 deletions(-)
>
> diff --git a/grub-core/net/tcp.c b/grub-core/net/tcp.c
> index 5da8b11..be5ef30 100644
> --- a/grub-core/net/tcp.c
> +++ b/grub-core/net/tcp.c
> @@ -22,6 +22,7 @@
>  #include <grub/net/netbuff.h>
>  #include <grub/time.h>
>  #include <grub/priority_queue.h>
> +#include <grub/env.h>
>
>  #define TCP_SYN_RETRANSMISSION_TIMEOUT GRUB_NET_INTERVAL
>  #define TCP_SYN_RETRANSMISSION_COUNT GRUB_NET_TRIES
> @@ -65,6 +66,7 @@ struct grub_net_tcp_socket
>    grub_uint32_t my_cur_seq;
>    grub_uint32_t their_start_seq;
>    grub_uint32_t their_cur_seq;
> +  grub_uint32_t cur_tsecr;
>    grub_uint16_t my_window;
>    struct unacked *unack_first;
>    struct unacked *unack_last;
> @@ -94,6 +96,8 @@ struct grub_net_tcp_listen
>    void *hook_data;
>  };
>
> +#define ALIGNWORD(var) ((var) + 3) & (~3)
> +

Can we use ALIGN_UP here?

>  struct tcphdr
>  {
>    grub_uint16_t src;
> @@ -106,6 +110,25 @@ struct tcphdr
>    grub_uint16_t urgent;
>  } GRUB_PACKED;
>
> +struct tcp_opt

This probably should be tcp_opt_hdr, it is not really complete option.

> +{
> +  grub_uint8_t kind;
> +  grub_uint8_t length;
> +} GRUB_PACKED;
> +
> +struct tcp_scale_opt
> +{
> +  struct tcp_opt opt;
> +  grub_uint8_t scale;
> +} GRUB_PACKED;
> +
> +struct tcp_timestamp_opt
> +{
> +  struct tcp_opt opt;
> +  grub_uint32_t tsval;
> +  grub_uint32_t tsecr;
> +} GRUB_PACKED;
> +
>  struct tcp_pseudohdr
>  {
>    grub_uint32_t src;
> @@ -299,9 +322,12 @@ ack_real (grub_net_tcp_socket_t sock, int res)
>  {
>    struct grub_net_buff *nb_ack;
>    struct tcphdr *tcph_ack;
> +  struct tcp_timestamp_opt *timestamp;
> +  grub_size_t headersize;
>    grub_err_t err;
>
> -  nb_ack = grub_netbuff_alloc (sizeof (*tcph_ack) + 128);
> +  headersize = ALIGNWORD(sizeof (*tcph_ack) + sizeof (*timestamp));
> +  nb_ack = grub_netbuff_alloc (headersize + 128);

RFC 7323 recommends to use timestamp option only if mutually
negotiated(i.e. both initial SYN and SYN+ACK contain timestamp
option). So it probably should not be done unconditionally.

>    if (!nb_ack)
>      return;
>    err = grub_netbuff_reserve (nb_ack, 128);
> @@ -313,7 +339,7 @@ ack_real (grub_net_tcp_socket_t sock, int res)
>        return;
>      }
>
> -  err = grub_netbuff_put (nb_ack, sizeof (*tcph_ack));
> +  err = grub_netbuff_put (nb_ack, headersize);
>    if (err)
>      {
>        grub_netbuff_free (nb_ack);
> @@ -322,22 +348,28 @@ ack_real (grub_net_tcp_socket_t sock, int res)
>        return;
>      }
>    tcph_ack = (void *) nb_ack->data;
> +  grub_memset (tcph_ack, 0, headersize);

We did not do grub_memset before, why is it necessary now?

>    if (res)
>      {
>        tcph_ack->ack = grub_cpu_to_be32_compile_time (0);
> -      tcph_ack->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_RST);
> +      tcph_ack->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | 
> TCP_RST);

grub_cpu_to_XX_compile_time is intended for compile time constants
only which it is no more.

This either needs comment or be written as ((headersize >> 2) << 12)
to avoid head scratching in the future.

>        tcph_ack->window = grub_cpu_to_be16_compile_time (0);
>      }
>    else
>      {
>        tcph_ack->ack = grub_cpu_to_be32 (sock->their_cur_seq);
> -      tcph_ack->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_ACK);
> +      tcph_ack->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | 
> TCP_ACK);

Ditto.
>        tcph_ack->window = !sock->i_stall ? grub_cpu_to_be16 (sock->my_window)
>         : 0;
>      }
>    tcph_ack->urgent = 0;
>    tcph_ack->src = grub_cpu_to_be16 (sock->in_port);
>    tcph_ack->dst = grub_cpu_to_be16 (sock->out_port);
> +  timestamp = (struct tcp_timestamp_opt *)(tcph_ack + 1);
> +  timestamp->opt.kind = 8;

No magic constants please; add enum with TCP options (GRUB_TCP_OPT_XXX
as example) and use them everywhere,.

> +  timestamp->opt.length = 10;

sizeof (struct tcp_timestamp_opt) please.

> +  timestamp->tsval = grub_cpu_to_be32 (grub_get_time_ms ());
> +  timestamp->tsecr = grub_cpu_to_be32 (sock->cur_tsecr);
>    err = tcp_send (nb_ack, sock);
>    if (err)
>      {
> @@ -503,7 +535,7 @@ grub_net_tcp_accept (grub_net_tcp_socket_t sock,
>    sock->error_hook = error_hook;
>    sock->fin_hook = fin_hook;
>    sock->hook_data = hook_data;
> -  nb_ack = grub_netbuff_alloc (sizeof (*tcph)
> +  nb_ack = grub_netbuff_alloc (sizeof (*tcph) +

Redundant change

>                                + GRUB_NET_OUR_MAX_IP_HEADER_SIZE
>                                + GRUB_NET_MAX_LINK_HEADER_SIZE);
>    if (!nb_ack)
> @@ -536,6 +568,37 @@ grub_net_tcp_accept (grub_net_tcp_socket_t sock,
>    return GRUB_ERR_NONE;
>  }
>
> +static grub_uint32_t
> +tcp_window_size (void)
> +{
> +  const char *val;
> +  grub_uint32_t ret = 1048576;

Please make default #define somewhere on top of file with clear enough name.

> +
> +  val = grub_env_get ("tcp_window_size");
> +  if (!val)
> +    return ret;
> +
> +  grub_error_push ();
> +  ret = (grub_uint32_t) grub_strtoul (val, 0, 0);
> +
> +  if (grub_errno != GRUB_ERR_NONE)
> +    {
> +      grub_env_unset ("tcp_window_size");
> +      grub_errno = GRUB_ERR_NONE;
> +      ret = 1048576;
> +    }
> +

This should go in variable ->read_hook method so even if variable is
not explicitly set its value can be queried

old_window=$net_tcp_window_size
do something with known long haul network
net_tcp_window_size=$old_window

Make value be stored in static variable and ->read_hook return it.

> +  grub_error_pop ();
> +
> +  /* A window size greater than 1gb is invalid */
> +  if (ret > 1024 * 1024 * 1024)
> +    {
> +      grub_env_unset ("tcp_window_size");
> +      ret = 1048576;
> +    }

And this in variable ->write_hook method with useful error message

> +  return ret;
> +}
> +
>  grub_net_tcp_socket_t
>  grub_net_tcp_open (char *server,
>                    grub_uint16_t out_port,
> @@ -556,9 +619,14 @@ grub_net_tcp_open (char *server,
>    static grub_uint16_t in_port = 21550;
>    struct grub_net_buff *nb;
>    struct tcphdr *tcph;
> +  struct tcp_scale_opt *scale;
> +  struct tcp_timestamp_opt *timestamp;
>    int i;
>    grub_uint8_t *nbd;
>    grub_net_link_level_address_t ll_target_addr;
> +  grub_size_t headersize;

Same comment

> +  grub_uint32_t window_size = tcp_window_size ();
> +  grub_uint8_t window_scale = 0;
>
>    err = grub_net_resolve_address (server, &addr);
>    if (err)
> @@ -583,6 +651,14 @@ grub_net_tcp_open (char *server,
>    if (socket == NULL)
>      return NULL;
>
> +  /* Have to scale down the window_size so it fits in 16bits and calculate 
> the
> +     scale at the same time. */
> +  while (window_size > 65535)
> +    {
> +      window_size >>= 1;
> +      window_scale += 1;
> +    }
> +
>    socket->out_port = out_port;
>    socket->inf = inf;
>    socket->out_nla = addr;
> @@ -593,7 +669,9 @@ grub_net_tcp_open (char *server,
>    socket->fin_hook = fin_hook;
>    socket->hook_data = hook_data;
>
> -  nb = grub_netbuff_alloc (sizeof (*tcph) + 128);
> +  headersize = ALIGNWORD (sizeof (*tcph) + sizeof (*scale) +
> +                         sizeof (*timestamp));
> +  nb = grub_netbuff_alloc (headersize + 128);
>    if (!nb)
>      return NULL;
>    err = grub_netbuff_reserve (nb, 128);
> @@ -603,7 +681,7 @@ grub_net_tcp_open (char *server,
>        return NULL;
>      }
>
> -  err = grub_netbuff_put (nb, sizeof (*tcph));
> +  err = grub_netbuff_put (nb, headersize);
>    if (err)
>      {
>        grub_netbuff_free (nb);
> @@ -617,17 +695,30 @@ grub_net_tcp_open (char *server,
>      }
>
>    tcph = (void *) nb->data;
> +  grub_memset(tcph, 0, headersize);

Same question.

>    socket->my_start_seq = grub_get_time_ms ();
>    socket->my_cur_seq = socket->my_start_seq + 1;
> -  socket->my_window = 8192;
> +  socket->my_window = window_size;
>    tcph->seqnr = grub_cpu_to_be32 (socket->my_start_seq);
>    tcph->ack = grub_cpu_to_be32_compile_time (0);
> -  tcph->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_SYN);
> +  tcph->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | TCP_SYN);

See above.

>    tcph->window = grub_cpu_to_be16 (socket->my_window);
>    tcph->urgent = 0;
>    tcph->src = grub_cpu_to_be16 (socket->in_port);
>    tcph->dst = grub_cpu_to_be16 (socket->out_port);
>    tcph->checksum = 0;
> +
> +  scale = (struct tcp_scale_opt *)(tcph + 1);
> +  scale->opt.kind = 3;

Magic constant

> +  scale->opt.length = 3;

sizeof

> +  scale->scale = window_scale;
> +
> +  timestamp = (struct tcp_timestamp_opt *)(scale + 1);
> +  timestamp->opt.kind = 8;

Magic constant

> +  timestamp->opt.length = 10;

Sizeof

> +  timestamp->tsval = grub_cpu_to_be32 (grub_get_time_ms ());
> +  timestamp->tsecr = 0;
> +
>    tcph->checksum = grub_net_ip_transport_checksum (nb, GRUB_NET_IP_TCP,
>                                                    &socket->inf->address,
>                                                    &socket->out_nla);
> @@ -745,6 +836,7 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb,
>  {
>    struct tcphdr *tcph;
>    grub_net_tcp_socket_t sock;
> +  grub_uint32_t tsecr = 0;
>    grub_err_t err;
>
>    /* Ignore broadcast.  */
> @@ -771,6 +863,35 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb,
>        return GRUB_ERR_NONE;
>      }
>
> +  if ((grub_be_to_cpu16 (tcph->flags >> 12)) > 5)

sizeof (tcp_hdr) please. Also it could check for >= sizeof (tcp_hdr) +
sizeof (tcp_timestamp_opt) as this is the only option we are
interested in right now anyway.

> +    {
> +      struct tcp_opt *opt;
> +      grub_size_t remaining = nb->tail - nb->data;
> +
> +      opt = (struct tcp_opt *)(tcph + 1);
> +      while (remaining > 0)
> +       {
> +         grub_uint8_t len = opt->length;

out of bound access. You need to check you have at least 2 bytes left.

> +         if (opt->kind == 8 || opt->kind == 0)
> +           break;
> +         if (opt->kind == 1)
> +           len = 1;
> +         if (len > remaining)
> +           remaining = 0;
> +         remaining -= len;

Sorry? This wraps around.

> +         opt = (struct tcp_opt *)((grub_uint8_t *)opt + len);
> +       }
> +
> +      /* Ok we definitely have the timestamp option. */
> +      if (opt->kind == 8)
> +       {
> +         struct tcp_timestamp_opt *timestamp;
> +
> +         timestamp = (struct tcp_timestamp_opt *)opt;
> +         tsecr = grub_be_to_cpu32 (timestamp->tsval);
> +       }
> +    }
> +
>    FOR_TCP_SOCKETS (sock)
>    {
>      if (!(grub_be_to_cpu16 (tcph->dst) == sock->in_port
> @@ -906,6 +1027,8 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb,
>               return err;
>             }
>
> +         /* We only update the tsecr when we advance the window. */
> +         sock->cur_tsecr = tsecr;

As far as I can tell, this does not agree with algorithm proposed in
RFC 7323. It recommends that TSecr be advanced on immediate segment
arrival and only if TSval is greater than previous one. I read it that
TSecr advance should be done before queue processing, probably at the
spot where we look for TS option.

(2)  If:

            SEG.TSval >= TS.Recent and SEG.SEQ <= Last.ACK.sent

        then SEG.TSval is copied to TS.Recent; otherwise, it is ignored.

and

   It is important to note that the timestamp MUST be checked only when
   a segment first arrives at the receiver, regardless of whether it is
   in sequence or it must be queued for later delivery.


Note that it also recommends that if TS option was negotiated, packets
without TS option should be discarded. Not sure if we need to follow
it.


>           sock->their_cur_seq += (nb_top->tail - nb_top->data);
>           if (grub_be_to_cpu16 (tcph->flags) & TCP_FIN)
>             {
> --
> 1.8.1
>
>
> _______________________________________________
> Grub-devel mailing list
> address@hidden
> https://lists.gnu.org/mailman/listinfo/grub-devel



reply via email to

[Prev in Thread] Current Thread [Next in Thread]