1 vote

I've got a question about how the size of the buffers passed to recv() and send() affects TCP performance. Consider the following fully working C++ example, which transfers 1 GB of (arbitrary) data from the client to the server via TCP.

#include <unistd.h>
#include <netdb.h>
#include <errno.h>
#include <netinet/tcp.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/ioctl.h>

#include <iostream>
#include <memory>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <algorithm>
#include <string>
#include <sstream>

typedef unsigned long long TimePoint;
typedef unsigned long long Duration;

inline TimePoint getTimePoint() {
    struct ::timeval tv;
    ::gettimeofday(&tv, nullptr);
    return tv.tv_sec * 1000000ULL + tv.tv_usec;
}

const size_t totalSize = 1024 * 1024 * 1024;
const int one = 1;

void server(const size_t blockSize, const std::string& serviceName) {
    std::unique_ptr<char[]> block(new char[blockSize]);
    const size_t atLeastReads = totalSize / blockSize;
    std::cout << "Starting server. Receiving block size is " << blockSize << ", which requires at least " << atLeastReads << " reads." << std::endl;
    addrinfo hints;
    memset(&hints, 0, sizeof(addrinfo));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE;
    hints.ai_protocol = 0;
    addrinfo* firstAddress;
    int result = getaddrinfo(nullptr, serviceName.c_str(), &hints, &firstAddress);
    if (result != 0) return;
    int listener = socket(firstAddress->ai_family, firstAddress->ai_socktype, firstAddress->ai_protocol);
    if (listener == -1) return;
    if (setsockopt(listener, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) != 0) return;
    if (bind(listener, firstAddress->ai_addr, firstAddress->ai_addrlen) != 0) return;
    freeaddrinfo(firstAddress);
    if (listen(listener, 1) != 0) return;
    while (true) {
        int server = accept(listener, nullptr, nullptr);
        if (server == -1) return;
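        // Put the accepted socket into non-blocking mode: recv() below then returns immediately, and we block in select() instead.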
        u_long mode = 1;
        if (::ioctl(server, FIONBIO, &mode) != 0) return;
//        if (setsockopt(server, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) != 0) return;
//        int size = 64000;
//        if (setsockopt(server, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) != 0) return;
//        if (setsockopt(server, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) != 0) return;
        std::cout << "Server accepted connection." << std::endl;
        size_t leftToRead = totalSize;
        size_t numberOfReads = 0;
        size_t numberOfIncompleteReads = 0;
        const TimePoint totalStart = ::getTimePoint();
        Duration selectDuration = 0;
        Duration readDuration = 0;
        while (leftToRead > 0) {
            fd_set readSet;
            FD_ZERO(&readSet);
            FD_SET(server, &readSet);
            TimePoint selectStart = ::getTimePoint();
            if (select(server + 1, &readSet, nullptr, nullptr, nullptr) == -1) return;
            selectDuration += ::getTimePoint() - selectStart;
            if (FD_ISSET(server, &readSet) != 0) {
                const size_t toRead = std::min(leftToRead, blockSize);
                TimePoint readStart = ::getTimePoint();
                const ssize_t actuallyRead = recv(server, block.get(), toRead, 0);
                readDuration += ::getTimePoint() - readStart;
                if (actuallyRead == -1)
                    return;
                else if (actuallyRead == 0) {
                    std::cout << "Got 0 bytes, which signals that the client closed the socket." << std::endl;
                    break;
                }
                else if (toRead != actuallyRead)
                    ++numberOfIncompleteReads;
                ++numberOfReads;
                leftToRead -= actuallyRead;
            }
        }
        const Duration totalDuration = ::getTimePoint() - totalStart;
        std::cout << "Receiving took " << totalDuration << " us, transfer rate was " << totalSize / (totalDuration / 1000000.0) << " bytes/s." << std::endl;
        std::cout << "Selects took " << selectDuration << " us, while reads took " << readDuration << " us." << std::endl;
        std::cout << "There were " << numberOfReads << " reads (factor " << numberOfReads / ((double)atLeastReads) << "), of which " << numberOfIncompleteReads << " (" << (numberOfIncompleteReads / ((double)numberOfReads)) * 100.0 << "%) were incomplete." << std::endl << std::endl;
        close(server);
    }
}

bool client(const size_t blockSize, const std::string& hostName, const std::string& serviceName) {
    std::unique_ptr<char[]> block(new char[blockSize]);
    const size_t atLeastWrites = totalSize / blockSize;
    std::cout << "Starting client... " << std::endl;
    addrinfo hints;
    memset(&hints, 0, sizeof(addrinfo));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = 0;
    hints.ai_protocol = 0;
    addrinfo* firstAddress;
    if (getaddrinfo(hostName.c_str(), serviceName.c_str(), &hints, &firstAddress) != 0) return false;
    int client = socket(firstAddress->ai_family, firstAddress->ai_socktype, firstAddress->ai_protocol);
    if (client == -1) return false;
    if (connect(client, firstAddress->ai_addr, firstAddress->ai_addrlen) != 0) return false;
    freeaddrinfo(firstAddress);
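    // Put the socket into non-blocking mode: send() below then returns immediately, and we block in select() instead.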
    u_long mode = 1;
    if (::ioctl(client, FIONBIO, &mode) != 0) return false;
//    if (setsockopt(client, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) != 0) return false;
//    int size = 64000;
//    if (setsockopt(client, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) != 0) return false;
//    if (setsockopt(client, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) != 0) return false;
    std::cout << "Client connected. Sending block size is " << blockSize << ", which requires at least " << atLeastWrites << " writes." << std::endl;
    size_t leftToWrite = totalSize;
    size_t numberOfWrites = 0;
    size_t numberOfIncompleteWrites = 0;
    const TimePoint totalStart = ::getTimePoint();
    Duration selectDuration = 0;
    Duration writeDuration = 0;
    while (leftToWrite > 0) {
        fd_set writeSet;
        FD_ZERO(&writeSet);
        FD_SET(client, &writeSet);
        TimePoint selectStart = ::getTimePoint();
        if (select(client + 1, nullptr, &writeSet, nullptr, nullptr) == -1) return false;
        selectDuration += ::getTimePoint() - selectStart;
        if (FD_ISSET(client, &writeSet) != 0) {
            const size_t toWrite = std::min(leftToWrite, blockSize);
            TimePoint writeStart = ::getTimePoint();
            const ssize_t actuallyWritten = send(client, block.get(), toWrite, 0);
            writeDuration += ::getTimePoint() - writeStart;
            if (actuallyWritten == -1)
                return false;
            else if (actuallyWritten == 0) {
                std::cout << "Got 0 bytes, which shouldn't happen!" << std::endl;
                break;
            }
            else if (toWrite != actuallyWritten)
                ++numberOfIncompleteWrites;
            ++numberOfWrites;
            leftToWrite -= actuallyWritten;
        }
    }
    const Duration totalDuration = ::getTimePoint() - totalStart;
    std::cout << "Writing took " << totalDuration << " us, transfer rate was " << totalSize / (totalDuration / 1000000.0) << " bytes/s." << std::endl;
    std::cout << "Selects took " << selectDuration << " us, while writes took " << writeDuration << " us." << std::endl;
    std::cout << "There were " << numberOfWrites << " writes (factor " << numberOfWrites / ((double)atLeastWrites) << "), of which " << numberOfIncompleteWrites << " (" << (numberOfIncompleteWrites / ((double)numberOfWrites)) * 100.0 << "%) were incomplete." << std::endl << std::endl;
    if (shutdown(client, SHUT_WR) != 0) return false;
    if (close(client) != 0) return false;
    return true;
}

int main(int argc, char* argv[]) {
    if (argc < 2)
        std::cout << "Block size is missing." << std::endl;
    else {
        const size_t blockSize = static_cast<size_t>(std::atoll(argv[argc - 1]));
        if (blockSize > 1024 * 1024)
            std::cout << "Block size " << blockSize << " is suspicious." << std::endl;
        else {
            if (argc >= 3) {
                if (!client(blockSize, argv[1], "12000"))
                    std::cout << "The client encountered an error." << std::endl;
            }
            else {
                server(blockSize, "12000");
                std::cout << "The server encountered an error." << std::endl;
            }
        }
    }
    return 0;
}

I'm running the example on two Linux machines (kernel version 4.1.10-200.fc22.x86_64) connected by a 1 Gbit/s LAN, and I observe the following behaviour: if the recv() and send() calls use a buffer of 40 bytes or more, then I use all of the available bandwidth; however, if I use smaller buffers on either the server or the client, the throughput drops. This behaviour seems to be unaffected by the commented-out socket options (disabling Nagle's algorithm and/or changing the send/receive buffer sizes).

I can understand that sending data in small chunks might be inefficient: if Nagle's algorithm is turned off and the chunks are small, the TCP and IP headers could dominate the useful payload. However, I don't expect the receiving buffer size to affect the transfer rate: I would expect the cost of a recv() system call to be small compared to the cost of actually sending the data over the LAN. Thus, if I send data in, say, 5000-byte chunks, I would expect the transfer rate to be largely independent of the size of the receiving buffer, because the rate at which I call recv() should still exceed the LAN transfer rate. Alas, this isn't the case!
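
(To put rough numbers on the overhead argument: each segment carries about 20 bytes of IPv4 header and 20 bytes of TCP header, typically 32 with timestamp options, plus roughly 38 bytes of Ethernet framing. So a 10-byte payload sent in its own segment accounts for roughly 10% of the bytes on the wire, whereas a 1460-byte payload accounts for roughly 95%.)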

I would really appreciate it if someone could explain to me what is causing the slowdown: is it simply the cost of system calls, or is something happening at the protocol level?

I encountered this problem while writing a message-based cloud application, and I would appreciate it if someone could tell me how, in their opinion, this issue should affect the system's architecture. For a variety of reasons, I'm not using a messaging library such as ZeroMQ, but am writing the message-passing layer myself.

The computation in the cloud is such that the flow of messages between servers is not symmetric (i.e., depending on the workload, server A could send much more data to server B than vice versa), messages are asynchronous (i.e., the times between messages are not predictable, but many messages can be sent in bursts), and messages are of variable size and usually small (10 to 20 bytes). Furthermore, messages could in principle be delivered out of order, but it is important that they are not dropped, and some flow/congestion control is needed as well; therefore, I'm using TCP rather than UDP.

Since messages vary in size, each message starts with an integer specifying the message size, followed by the payload. To read a message from the socket, I first read the message size and then the payload; thus, reading a single message requires at least two recv() calls (and possibly more, because recv() can return less data than requested). Because both the size field and the payloads are small, I end up issuing many small recv() requests, which, as my example demonstrates, prevents me from using the available bandwidth fully. Does anyone have any advice about the "right" way to structure message passing in such a scenario?
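
To make the framing concrete, the read path I'm describing looks roughly like the sketch below (simplified, not my actual code; the 4-byte length field and the readFully helper are just for illustration, and blocking sockets are assumed for brevity):

#include <sys/types.h>
#include <sys/socket.h>
#include <cstdint>
#include <cstddef>
#include <vector>

// Keeps calling recv() until exactly 'size' bytes have been read, because
// recv() may return fewer bytes than requested. Returns false on error or EOF.
static bool readFully(int fd, void* buffer, size_t size) {
    char* out = static_cast<char*>(buffer);
    while (size > 0) {
        const ssize_t n = recv(fd, out, size, 0);
        if (n <= 0) return false;
        out += n;
        size -= static_cast<size_t>(n);
    }
    return true;
}

// Reads one length-prefixed message: first the size, then the payload.
static bool readMessage(int fd, std::vector<char>& payload) {
    uint32_t length;                                                // assumed 4-byte size prefix
    if (!readFully(fd, &length, sizeof(length))) return false;     // recv() call(s) #1
    payload.resize(length);
    return readFully(fd, payload.data(), length);                  // recv() call(s) #2
}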

Many thanks in advance for all your help!

"is it simply the cost of system calls" . Seems likely. Can't you run your 2 versions with a profiler and see? Good luck. - shellter
The number of bytes processed by each recv() call does not affect the bytes sent over the wire. The recv() call merely copies the next N bytes from the kernel's received-data buffer for that socket into user-space. But there is overhead (context-switch, etc) associated with every system call, so reducing the number of times you need to call recv() per second does make things more CPU-efficient. (I think two recv() calls per incoming message is a reasonable tradeoff between ease-of-coding and efficiency, though) - Jeremy Friesner
I'm not sure I understand how to isolate the cost of a system call from the cost of the underlying work: in a profiler, I'll see the total time spent in a system call, but I cannot know how much of that is used up in the kernel and how much is due to protocol issues (e.g., data not received yet). Is there a way to measure the cost of the kernel switch? - Boris
If you set the socket to non-blocking, then you are guaranteed that the recv() call will always return immediately. (Of course you will then need to block elsewhere, e.g. inside select(), to avoid spinning the CPU, but by doing that you will be able to separate time spent waiting for incoming data from time spent inside recv()) - Jeremy Friesner
That is a good idea -- I've updated the test code as Jeremy suggested. After running some experiments, it turned out that the time spent in select() is about ten times larger than the time spent in recv()/send(). This, in my opinion, suggests that the problem doesn't arise due to the overhead of system calls; rather, something is happening at the protocol level. - Boris

2 Answers

1 vote
  • You don't need two recv() calls to read the data you describe. Smarter code, or recvmsg(), will solve that. You just have to be able to cope with the fact that some data from the next message may already have been read (see the sketch after this list).

  • The socket receive buffer should be at least as large as the bandwidth-delay product of the link. Normally that will be many kilobytes.

  • The socket send buffer should be at least as large as the socket receive buffer of the peer.

Otherwise you can't use all the available bandwidth.
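
To illustrate the first point, here is a rough sketch (assuming the question's length-prefixed framing with a 4-byte size field in host byte order; the name drainMessages and the 64 KB chunk size are arbitrary choices for the example) of issuing one large recv() per wakeup and then peeling every complete message out of a staging buffer:

#include <sys/types.h>
#include <sys/socket.h>
#include <cstdint>
#include <cstddef>
#include <cstring>
#include <functional>
#include <vector>

// One large recv() per wakeup; then parse out every complete
// [4-byte length][payload] message already sitting in 'pending'.
// Returns false on error or when the peer has closed the connection.
bool drainMessages(int fd, std::vector<char>& pending,
                   const std::function<void(const char*, uint32_t)>& handleMessage) {
    char chunk[64 * 1024];
    const ssize_t n = recv(fd, chunk, sizeof(chunk), 0);
    if (n <= 0) return false;                       // 0 = orderly close, -1 = error (or EAGAIN)
    pending.insert(pending.end(), chunk, chunk + n);

    size_t offset = 0;
    while (pending.size() - offset >= sizeof(uint32_t)) {
        uint32_t length;
        std::memcpy(&length, pending.data() + offset, sizeof(length));
        if (pending.size() - offset - sizeof(uint32_t) < length)
            break;                                  // payload of the next message not complete yet
        handleMessage(pending.data() + offset + sizeof(uint32_t), length);
        offset += sizeof(uint32_t) + length;
    }
    // Keep only the partial tail (data "from the next message") for the next call.
    pending.erase(pending.begin(), pending.begin() + static_cast<std::ptrdiff_t>(offset));
    return true;
}

With bursts of 10 to 20 byte messages, one such recv() can deliver many messages at once, so the per-message system-call cost largely disappears.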

EDIT: Addressing your comment below:

"I don't understand why the size of the recv()/send() buffers in user space should affect the throughput."

It affects the throughput because it affects the amount of data that can be in flight, whose maximum is given by the bandwidth-delay product of the link.
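
For a rough idea of the numbers involved (a sketch only; the 2 ms round-trip time is an assumed value that you would replace with a measured one), on the 1 Gbit/s link from the question:

#include <sys/socket.h>

// Rough sizing of the kernel socket buffers to the bandwidth-delay product.
// The 1 Gbit/s figure is the link from the question; the 2 ms RTT is an
// assumed value -- measure your own (e.g. with ping) and substitute it.
bool sizeSocketBuffers(int fd) {
    const double bitsPerSecond    = 1e9;    // 1 Gbit/s link
    const double roundTripSeconds = 0.002;  // assumed RTT
    // Maximum number of bytes that can be in flight at once.
    const int bdp = static_cast<int>(bitsPerSecond * roundTripSeconds / 8.0);  // 250000 bytes
    // Note: Linux doubles the requested value for bookkeeping and clamps it to
    // net.core.rmem_max / net.core.wmem_max, so check the effective size with getsockopt().
    if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bdp, sizeof(bdp)) != 0) return false;
    if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bdp, sizeof(bdp)) != 0) return false;
    return true;
}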

"As people have said above, requests to recv()/send() do not affect the protocol."

This is rubbish. Requests to send() cause data to be sent, which affects the protocol by engaging its sending side, and requests to recv() cause data to be removed from the receive buffer, which affects the protocol by changing the receive window advertised in the next ACK.

"Hence, I would expect that, as long as the kernel has enough space in its buffers, and as long as I read this data sufficiently quickly, there shouldn't be any problems. This, however, is not what I observed: (i) changing the sizes of the kernel buffers had no effect, and (ii) I used the available bandwidth already with 40-byte buffers."

No, you didn't. There was a study published in the early 1980s that showed a tripling of throughput over the early, slow Ethernet of the day simply by raising the socket buffers from 1024 to 4096 bytes. If you think you observed otherwise, you didn't. Any socket buffer size smaller than the bandwidth-delay product is going to inhibit performance, by definition.

-1 votes
  • It would help to align the kernel TCP socket buffers (SO_RCVBUF/SO_SNDBUF) on both ends using setsockopt()...