I've got a question about how the size of the user-space buffers passed to recv() and send() affects the performance of TCP. Consider the following fully working C++ example that transfers 1 GB of (arbitrary) data from the client to the server via TCP.
// C / POSIX system headers
#include <errno.h>
#include <netdb.h>
#include <netinet/tcp.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
// C++ standard library
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
// Time stamps and durations are expressed in microseconds.
typedef unsigned long long TimePoint;
typedef unsigned long long Duration;

/// Returns the current time in microseconds.
/// Uses the monotonic std::chrono::steady_clock instead of gettimeofday():
/// the wall clock can jump backwards or forwards (NTP corrections, manual
/// changes), which would corrupt the duration measurements computed from
/// differences of these values. Callers only ever use differences, so the
/// change of epoch is invisible to them.
inline TimePoint getTimePoint() {
    const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
    return static_cast<TimePoint>(
        std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch).count());
}
// Total number of bytes transferred per connection: 1 GiB.
const size_t totalSize = 1024 * 1024 * 1024;
// Generic "enable" flag passed by address to setsockopt() calls below.
const int one = 1;
/// Listens on the given service (port) and, for each accepted connection,
/// receives totalSize bytes in chunks of at most blockSize bytes, then
/// prints throughput and timing statistics. Loops accepting connections
/// until an error occurs; returns (after releasing its resources) on error.
///
/// Fixes over the original version:
///  - sockets and the getaddrinfo() result are released on every error
///    path (the original leaked them on early returns),
///  - the incomplete-read check no longer compares signed with unsigned,
///  - blockSize == 0 is rejected instead of dividing by zero,
///  - the connection descriptor no longer shadows the function name.
void server(const size_t blockSize, const std::string& serviceName) {
    if (blockSize == 0) {
        std::cout << "Block size must be positive." << std::endl;
        return;
    }
    std::unique_ptr<char[]> block(new char[blockSize]);
    const size_t atLeastReads = totalSize / blockSize;
    std::cout << "Starting server. Receiving block size is " << blockSize << ", which requires at least " << atLeastReads << " reads." << std::endl;

    // Resolve a passive (wildcard) IPv4 TCP address for the service name.
    addrinfo hints;
    memset(&hints, 0, sizeof(addrinfo));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE;
    hints.ai_protocol = 0;
    addrinfo* firstAddress;
    if (getaddrinfo(nullptr, serviceName.c_str(), &hints, &firstAddress) != 0) return;

    const int listener = socket(firstAddress->ai_family, firstAddress->ai_socktype, firstAddress->ai_protocol);
    if (listener == -1) {
        freeaddrinfo(firstAddress);
        return;
    }
    // SO_REUSEADDR lets the server restart without waiting out TIME_WAIT.
    if (setsockopt(listener, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) != 0 ||
        bind(listener, firstAddress->ai_addr, firstAddress->ai_addrlen) != 0) {
        freeaddrinfo(firstAddress);
        close(listener);
        return;
    }
    freeaddrinfo(firstAddress);
    if (listen(listener, 1) != 0) {
        close(listener);
        return;
    }

    while (true) {
        const int connection = accept(listener, nullptr, nullptr);
        if (connection == -1) {
            close(listener);
            return;
        }
        // Switch to non-blocking mode; readiness is awaited via select() below.
        u_long mode = 1;
        if (::ioctl(connection, FIONBIO, &mode) != 0) {
            close(connection);
            close(listener);
            return;
        }
        // if (setsockopt(connection, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) != 0) return;
        // int size = 64000;
        // if (setsockopt(connection, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) != 0) return;
        // if (setsockopt(connection, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) != 0) return;
        std::cout << "Server accepted connection." << std::endl;

        size_t leftToRead = totalSize;
        size_t numberOfReads = 0;
        size_t numberOfIncompleteReads = 0;
        const TimePoint totalStart = ::getTimePoint();
        Duration selectDuration = 0;
        Duration readDuration = 0;
        while (leftToRead > 0) {
            // Block (no timeout) until the socket has data to read.
            fd_set readSet;
            FD_ZERO(&readSet);
            FD_SET(connection, &readSet);
            const TimePoint selectStart = ::getTimePoint();
            if (select(connection + 1, &readSet, nullptr, nullptr, nullptr) == -1) {
                close(connection);
                close(listener);
                return;
            }
            selectDuration += ::getTimePoint() - selectStart;
            if (FD_ISSET(connection, &readSet) != 0) {
                const size_t toRead = std::min(leftToRead, blockSize);
                const TimePoint readStart = ::getTimePoint();
                const ssize_t actuallyRead = recv(connection, block.get(), toRead, 0);
                readDuration += ::getTimePoint() - readStart;
                if (actuallyRead == -1) {
                    close(connection);
                    close(listener);
                    return;
                }
                if (actuallyRead == 0) {
                    // Orderly shutdown by the peer before all data arrived.
                    std::cout << "Got 0 bytes, which signals that the client closed the socket." << std::endl;
                    break;
                }
                if (static_cast<size_t>(actuallyRead) != toRead)
                    ++numberOfIncompleteReads;
                ++numberOfReads;
                leftToRead -= static_cast<size_t>(actuallyRead);
            }
        }
        const Duration totalDuration = ::getTimePoint() - totalStart;
        std::cout << "Receiving took " << totalDuration << " us, transfer rate was " << totalSize / (totalDuration / 1000000.0) << " bytes/s." << std::endl;
        std::cout << "Selects took " << selectDuration << " us, while reads took " << readDuration << " us." << std::endl;
        std::cout << "There were " << numberOfReads << " reads (factor " << numberOfReads / ((double)atLeastReads) << "), of which " << numberOfIncompleteReads << " (" << (numberOfIncompleteReads / ((double)numberOfReads)) * 100.0 << "%) were incomplete." << std::endl << std::endl;
        close(connection);
    }
}
/// Connects to hostName:serviceName and sends totalSize bytes in chunks of
/// at most blockSize bytes, then prints throughput and timing statistics.
/// Returns true on success, false on any error.
///
/// Fixes over the original version:
///  - the getaddrinfo() result and the socket are released on every error
///    path (the original leaked both when connect() or a later call failed),
///  - the send buffer is value-initialized, so defined (zero) bytes are
///    sent instead of indeterminate heap memory,
///  - the incomplete-write check no longer compares signed with unsigned,
///  - blockSize == 0 is rejected instead of dividing by zero,
///  - the connection descriptor no longer shadows the function name.
bool client(const size_t blockSize, const std::string& hostName, const std::string& serviceName) {
    if (blockSize == 0) {
        std::cout << "Block size must be positive." << std::endl;
        return false;
    }
    // The payload content is irrelevant to the measurement; zero-fill it.
    std::unique_ptr<char[]> block(new char[blockSize]());
    const size_t atLeastWrites = totalSize / blockSize;
    std::cout << "Starting client... " << std::endl;

    // Resolve the server's IPv4 TCP address.
    addrinfo hints;
    memset(&hints, 0, sizeof(addrinfo));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = 0;
    hints.ai_protocol = 0;
    addrinfo* firstAddress;
    if (getaddrinfo(hostName.c_str(), serviceName.c_str(), &hints, &firstAddress) != 0) return false;

    const int connection = socket(firstAddress->ai_family, firstAddress->ai_socktype, firstAddress->ai_protocol);
    if (connection == -1) {
        freeaddrinfo(firstAddress);
        return false;
    }
    if (connect(connection, firstAddress->ai_addr, firstAddress->ai_addrlen) != 0) {
        freeaddrinfo(firstAddress);
        close(connection);
        return false;
    }
    freeaddrinfo(firstAddress);

    // Switch to non-blocking mode; readiness is awaited via select() below.
    u_long mode = 1;
    if (::ioctl(connection, FIONBIO, &mode) != 0) {
        close(connection);
        return false;
    }
    // if (setsockopt(connection, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) != 0) return false;
    // int size = 64000;
    // if (setsockopt(connection, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) != 0) return false;
    // if (setsockopt(connection, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) != 0) return false;
    std::cout << "Client connected. Sending block size is " << blockSize << ", which requires at least " << atLeastWrites << " writes." << std::endl;

    size_t leftToWrite = totalSize;
    size_t numberOfWrites = 0;
    size_t numberOfIncompleteWrites = 0;
    const TimePoint totalStart = ::getTimePoint();
    Duration selectDuration = 0;
    Duration writeDuration = 0;
    while (leftToWrite > 0) {
        // Block (no timeout) until the socket can accept more data.
        fd_set writeSet;
        FD_ZERO(&writeSet);
        FD_SET(connection, &writeSet);
        const TimePoint selectStart = ::getTimePoint();
        if (select(connection + 1, nullptr, &writeSet, nullptr, nullptr) == -1) {
            close(connection);
            return false;
        }
        selectDuration += ::getTimePoint() - selectStart;
        if (FD_ISSET(connection, &writeSet) != 0) {
            const size_t toWrite = std::min(leftToWrite, blockSize);
            const TimePoint writeStart = ::getTimePoint();
            const ssize_t actuallyWritten = send(connection, block.get(), toWrite, 0);
            writeDuration += ::getTimePoint() - writeStart;
            if (actuallyWritten == -1) {
                close(connection);
                return false;
            }
            if (actuallyWritten == 0) {
                // send() should never report zero progress on a writable socket.
                std::cout << "Got 0 bytes, which shouldn't happen!" << std::endl;
                break;
            }
            if (static_cast<size_t>(actuallyWritten) != toWrite)
                ++numberOfIncompleteWrites;
            ++numberOfWrites;
            leftToWrite -= static_cast<size_t>(actuallyWritten);
        }
    }
    const Duration totalDuration = ::getTimePoint() - totalStart;
    std::cout << "Writing took " << totalDuration << " us, transfer rate was " << totalSize / (totalDuration / 1000000.0) << " bytes/s." << std::endl;
    std::cout << "Selects took " << selectDuration << " us, while writes took " << writeDuration << " us." << std::endl;
    std::cout << "There were " << numberOfWrites << " writes (factor " << numberOfWrites / ((double)atLeastWrites) << "), of which " << numberOfIncompleteWrites << " (" << (numberOfIncompleteWrites / ((double)numberOfWrites)) * 100.0 << "%) were incomplete." << std::endl << std::endl;

    // Half-close to signal end-of-data; the server sees recv() == 0.
    if (shutdown(connection, SHUT_WR) != 0) {
        close(connection);
        return false;
    }
    if (close(connection) != 0) return false;
    return true;
}
/// Entry point. Usage:
///   server mode: prog <blockSize>
///   client mode: prog <hostName> <blockSize>
/// The last argument is always the block size; a second-to-last argument,
/// if present, is the server host to connect to.
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cout << "Block size is missing." << std::endl;
        return 0;
    }
    // Validate before casting: std::atoll yields 0 for non-numeric input and
    // can yield negative values, either of which would previously have been
    // cast to a bogus size_t and caused a division by zero downstream.
    const long long requested = std::atoll(argv[argc - 1]);
    if (requested <= 0)
        std::cout << "Block size must be a positive integer." << std::endl;
    else {
        const size_t blockSize = static_cast<size_t>(requested);
        if (blockSize > 1024 * 1024)
            std::cout << "Block size " << blockSize << " is suspicious." << std::endl;
        else if (argc >= 3) {
            // A host name was supplied: run as the sending client.
            if (!client(blockSize, argv[1], "12000"))
                std::cout << "The client encountered an error." << std::endl;
        }
        else {
            // No host name: run as the receiving server. server() only
            // returns on error, hence the unconditional message.
            server(blockSize, "12000");
            std::cout << "The server encountered an error." << std::endl;
        }
    }
    return 0;
}
I'm running the example on two Linux (kernel version 4.1.10-200.fc22.x86_64) machines connected by a 1 Gbit/s LAN, on which I get the following behaviour: if the recv() and send() system calls use a buffer of 40 bytes or more, then I use all the available bandwidth; however, if I use smaller buffers on either the server or the client, then the throughput drops. This behaviour seems to be unaffected by the commented-out socket options (disabling Nagle's algorithm and/or setting the send/receive buffer sizes).
I can understand that sending data in small chunks might be inefficient: if Nagle's algorithm is turned off and chunks are small, then the header sizes of TCP and IP could dominate the useful payload. However, I don't expect the receiving buffer size to affect the transfer rate: I would expect the cost of the recv() system call to be cheap compared to the cost of actually sending the data over the LAN. Thus, if I send data in, say, 5000-byte chunks, I would expect the transfer rate to be largely independent of the size of the receiving buffer, because the rate at which I call recv() should still be higher than the LAN transfer rate. Alas, this isn't the case!
I would really appreciate it if someone could explain to me what is causing the slowdown: is it simply the cost of system calls, or is something happening at the protocol level?
I encountered this problem while writing a message-based cloud application, and I would appreciate it if someone could tell me about how this issue, in their opinion, should affect the system's architecture. For a variety of reasons, I'm not using a messaging library such as ZeroMQ, but am writing the message passing interface myself. The computation in the cloud is such that the flow of messages between servers is not symmetric (i.e., depending on the workload, server A could send much more data to server B than vice versa), messages are asynchronous (i.e., the times between messages are not predictable, but many messages can be sent in bursts), and messages are of variable size and are usually small (10 to 20 bytes). Furthermore, messages could in principle be delivered out of order, but it is important that messages are not dropped and some flow/congestion control is needed as well; therefore, I'm using TCP rather than UDP. Since messages are of varying size, each message starts with an integer specifying the message size, followed by the message payload. To read messages from the socket, I first read the message size and then the payload; thus, reading a single message requires at least two recv() calls (and maybe more because recv() can return less data than requested). Now because both message size and message payloads are small, I end up having many small recv() requests, which, as my example demonstrates, doesn't let me use the available bandwidth fully. Does anyone have any advice about the "right" way to structure message passing in such a scenario?
Many thanks in advance for all your help!