Raw Ethernet Programming: TSO - Code Example

Version 7

    A TCP Segmentation Offload (TSO) enabled adapter card accepts a large chunk of data whose size is greater than the MTU. The TSO engine splits the data into separate packets and automatically inserts the user-specified L2/L3/L4 headers into each packet. With TSO, the CPU is relieved of segmenting the data and building the per-packet headers itself.

    This feature is available starting with the OFED 3.4 release.

     

    References

     

    Configuration

    1. Confirm that the adapter hardware supports TSO.

    # ibv_devinfo -v

     

    ...

    tso_caps:

            max_tso:                        262144

            supported_qp:

                                            SUPPORT_RAW_PACKET

    ...

     

    max_tso is the maximum payload size (in bytes) supported for segmentation by the TSO engine.

    supported_qp lists the QP types that support TSO operations.

     

    2. Create a TSO-eligible QP by calling ibv_exp_create_qp. The user application should pass the maximum TSO header size (max_tso_header) in the ibv_exp_qp_init_attr struct (defined in /usr/include/infiniband/verbs_exp.h) so that the SQ buffer is prepared accordingly.

     

    struct ibv_exp_qp_init_attr {

    ...

      uint16_t max_tso_header; /* Maximum TSO header size */

    ...

    };

     

    TSO traffic is sent by calling ibv_exp_post_send with the opcode IBV_EXP_WR_TSO. The user application should specify the following information within the associated send Work Request (wr):

     

    • A pointer to the packet header.
    • Header size.
    • The maximum segment size (mss) that hardware should generate in its TSO engine.

    These values are passed in the tso member of the following struct:

    struct ibv_exp_send_wr {

    ...

      struct {

      void *hdr;   /* Pointer address of inline header */

      uint16_t hdr_sz; /* Inline header size */

      uint16_t mss;    /* Maximum segment size for each TSO fragment */

      } tso;

    ...

    };

     

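    This example is based on the Sender section of Raw Ethernet Programming: Basic Introduction - Code Example. The changes required to enable TSO (marked in red in the original article) are the max_tso_header and comp_mask settings in the QP attributes, the IBV_EXP_WR_TSO opcode, and the wr.tso fields.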

       /* 5. Initialize QP */

        struct ibv_qp *qp;

        struct ibv_exp_qp_init_attr qp_init_attr = {

            .qp_context = NULL,

            /* report send completion to cq */

            .send_cq = cq,

            .recv_cq = NULL,                        

          

            .cap = {

                /* number of allowed outstanding sends without waiting for a completion */

                .max_send_wr  = SQ_NUM_DESC,

                /* maximum number of pointers in each descriptor */

                .max_send_sge = 1,

                /* if inline maximum of payload data in the descriptors themselves */

                .max_inline_data = 512,

                .max_recv_wr = 0

            },

            .qp_type = IBV_QPT_RAW_PACKET,

            .max_tso_header = 54;  // ETH/IPv4/TCP header example

            .comp_mask = IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_MAX_TSO_HEADER;

        };

     

        /* 6. Create Queue Pair (QP) - Send Ring */

        qp = ibv_exp_create_qp(pd, &qp_init_attr);

        if (!qp)  {

            fprintf(stderr, "Couldn't create RSS QP\n");

            exit(1);

        }

     

        /* 7. Initialize the QP (send ring) and assign a port */

        struct ibv_qp_attr qp_attr;

        int qp_flags;

        memset(&qp_attr, 0, sizeof(qp_attr));

     

        qp_flags = IBV_QP_STATE | IBV_QP_PORT;

        qp_attr.qp_state        = IBV_QPS_INIT;

        qp_attr.port_num        = 1;

        ret = ibv_modify_qp(qp, &qp_attr, qp_flags);

        if (ret < 0) {

            fprintf(stderr, "failed modify qp to init\n");

            exit(1);

        }

        memset(&qp_attr, 0, sizeof(qp_attr));

     

        /* 8. Move the ring to ready to send in two steps (a,b) */

        /*    a. Move ring state to ready to receive, this is needed to be able to move ring to ready to send even if receive queue is not enabled */

        qp_flags = IBV_QP_STATE;

        qp_attr.qp_state = IBV_QPS_RTR;

        ret = ibv_modify_qp(qp, &qp_attr, qp_flags);

        if (ret < 0) {

            fprintf(stderr, "failed modify qp to recevie\n");

            exit(1);

        }

     

        /*    b. Move the ring to ready to send */

        qp_flags = IBV_QP_STATE;

        qp_attr.qp_state = IBV_QPS_RTS;

        ret = ibv_modify_qp(qp, &qp_attr, qp_flags);

        if (ret < 0) {

            fprintf(stderr, "failed modify qp to recevie\n");

            exit(1);

        }

     

        /* 9. Allocate Memory */

        int buf_size = ENTRY_SIZE*SQ_NUM_DESC;  /* maximum size of data to be access directly by hw */

        void *buf;

        buf = malloc(buf_size);

        if (!buf) {

            fprintf(stderr, "Coudln't allocate memory\n");

            exit(1);

        }

     

        /* 10. Register the user memory so it can be accessed by the HW directly */

        struct ibv_mr *mr;

        mr = ibv_reg_mr(pd, buf, buf_size, IBV_ACCESS_LOCAL_WRITE);

        if (!mr) {

            fprintf(stderr, "Couldn't register mr\n");

            exit(1);

        }

        memcpy(buf, packet, sizeof(packet));

     

        int n;

        struct ibv_sge sg_entry;

        struct ibv_exp_send_wr wr, *bad_wr;

        int msgs_completed;

        struct ibv_wc wc;

     

     

        /* scatter/gather entry describes location and size of data to send*/

        sg_entry.addr   = (uint64_t)buf;

        sg_entry.length = sizeof(packet);

        sg_entry.lkey   = mr->lkey;

     

        memset(&wr, 0, sizeof(wr));

     

      

        /*

         * descriptor for send transaction - details:

         *      - how many pointer to data to use

         *      - if this is a single descriptor or a list (next == NULL single)

         *      - if we want inline and/or completion

         */

        wr.num_sge  = 1;

        wr.sg_list  = &sg_entry;

        wr.next     = NULL;

        wr.opcode   = IBV_EXP_WR_TSO;

     

        wr.tso.mss = 1000; // Maximum Segment Size example

        wr.tso.hdr_sz =  54; // ETH/IPv4/TCP header example

        char hdr[54]; // ETH/IPv4/TCP header example

        memcpy(hdr, header, 54); // Assuming that the header buffer was define before.

        wr.tso.hdr = hdr; // There is no need to use malloc operation in this case, local definition of hdr is ok.

     

        /* 10. Send Operation */

        while(1) {

            /*

             * inline means data will be copied to space pre-allocated in descriptor

             * as long as it is small enough. otherwise pointer reference will be used.

             * see max_inline_data = 512 above.

             */

            wr.send_flags = IBV_SEND_INLINE;

     

            /*

             * we ask for a completion every half queue. only interested in completions to monitor progress.

             */

            if ( (n % (SQ_NUM_DESC/2)) == 0) {

                wr.wr_id = n;

                wr.send_flags |=  IBV_SEND_SIGNALED;

            }

     

            /* push descriptor to hardware */

            ret = ibv_exp_post_send(qp, &wr, &bad_wr);

            if (ret < 0) {

                fprintf(stderr, "failed in post send\n");

                exit(1);

            }

            n++;

          

            /* poll for completion after half ring is posted */

            if ( (n % (SQ_NUM_DESC/2)) == 0 && n > 0) {

                msgs_completed = ibv_poll_cq(cq, 1, &wc);

                if (msgs_completed > 0) {

                    printf("completed message %ld\n", wc.wr_id);

                } else if (msgs_completed < 0) {

                    printf("Polling error\n");

                    exit(1);

                }

            }

        }

     

        printf("We are done\n");

     

        return 0;