tinc/src/protocol_key.c

518 lines
16 KiB
C
Raw Normal View History

2002-02-11 10:05:58 +00:00
/*
protocol_key.c -- handle the meta-protocol, key exchange
Copyright (C) 1999-2005 Ivo Timmermans,
2000-2014 Guus Sliepen <guus@tinc-vpn.org>
2002-02-11 10:05:58 +00:00
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
2002-02-11 10:05:58 +00:00
*/
#include "system.h"
2002-02-11 10:05:58 +00:00
#include "cipher.h"
#include "connection.h"
#include "crypto.h"
#include "logger.h"
2002-02-11 10:05:58 +00:00
#include "net.h"
#include "netutl.h"
#include "node.h"
2011-07-03 13:59:49 +00:00
#include "prf.h"
#include "protocol.h"
#include "sptps.h"
#include "utils.h"
#include "xalloc.h"
2002-02-11 10:05:58 +00:00
/* Set to true by send_ans_key() each time a legacy key is handed out to a peer.
   NOTE(review): nothing in the visible part of this file reads it back. */
static bool mykeyused = false;
2002-02-11 10:05:58 +00:00
void send_key_changed(void) {
#ifndef DISABLE_LEGACY
send_request(everyone, "%d %x %s", KEY_CHANGED, rand(), myself->name);
2002-09-09 21:25:28 +00:00
/* Immediately send new keys to directly connected nodes to keep UDP mappings alive */
2002-09-09 21:25:28 +00:00
2012-10-07 22:35:38 +00:00
for list_each(connection_t, c, connection_list)
if(c->edge && c->node && c->node->status.reachable && !c->node->status.sptps)
2012-10-07 22:35:38 +00:00
send_ans_key(c->node);
#endif
/* Force key exchange for connections using SPTPS */
if(experimental) {
2012-10-07 22:35:38 +00:00
for splay_each(node_t, n, node_tree)
if(n->status.reachable && n->status.validkey && n->status.sptps)
sptps_force_kex(&n->sptps);
}
2002-02-11 10:05:58 +00:00
}
/* Handle an incoming KEY_CHANGED request.

   c       - connection the request arrived on
   request - raw request text

   Returns false only when the request cannot be parsed; every other
   outcome (duplicate, unknown origin, processed) returns true. */
bool key_changed_h(connection_t *c, const char *request) {
	char origin[MAX_STRING_SIZE];

	if(sscanf(request, "%*d %*x " MAX_STRING, origin) != 1) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s)", "KEY_CHANGED", c->name, c->hostname);
		return false;
	}

	/* Already handled this exact request: nothing more to do. */
	if(seen_request(request)) {
		return true;
	}

	node_t *changed = lookup_node(origin);

	if(!changed) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got %s from %s (%s) origin %s which does not exist", "KEY_CHANGED", c->name, c->hostname, origin);
		return true;
	}

	/* Only non-SPTPS nodes have their key state invalidated here. */
	if(!changed->status.sptps) {
		changed->status.validkey = false;
		changed->last_req_key = 0;
	}

	/* Tell the others */
	if(!tunnelserver) {
		forward_request(c, request);
	}

	return true;
}
/* SPTPS send callback: hands the record to send_sptps_data() with `myself`
   as the second argument and `handle` passed through unchanged as the first. */
static bool send_sptps_data_myself(void *handle, uint8_t type, const void *data, size_t len) {
	const bool sent = send_sptps_data(handle, myself, type, data, len);

	return sent;
}
/* SPTPS send callback used for the first record of a new session.

   The record is base64-encoded and sent as an extended REQ_KEY request via
   the node's next hop.  Before sending, the session's send callback is
   switched to send_sptps_data_myself, so only the first record takes this
   path.  `type` is not used. */
static bool send_initial_sptps_data(void *handle, uint8_t type, const void *data, size_t len) {
	node_t *peer = handle;
	char encoded[len * 4 / 3 + 5];

	peer->sptps.send_data = send_sptps_data_myself;
	b64encode(data, encoded, len);

	return send_request(peer->nexthop->connection, "%d %s %s %d %s", REQ_KEY, myself->name, peer->name, REQ_KEY, encoded);
}
/* Ask node `to` for a key.

   Non-SPTPS nodes get a plain REQ_KEY request.  For SPTPS nodes the peer's
   public key is needed first: if node_read_ecdsa_public_key() fails, a
   REQ_PUBKEY is sent instead and true is returned.  Otherwise the existing
   SPTPS session is stopped and a new one is started.

   Returns the result of send_request()/sptps_start(), or true when only a
   public key request was sent. */
bool send_req_key(node_t *to) {
	/* Legacy path. */
	if(!to->status.sptps) {
		return send_request(to->nexthop->connection, "%d %s %s", REQ_KEY, myself->name, to->name);
	}

	if(!node_read_ecdsa_public_key(to)) {
		logger(DEBUG_PROTOCOL, LOG_DEBUG, "No Ed25519 key known for %s (%s)", to->name, to->hostname);
		send_request(to->nexthop->connection, "%d %s %s %d", REQ_KEY, myself->name, to->name, REQ_PUBKEY);
		return true;
	}

	/* The label's full size (sizeof label) is what gets passed to sptps_start(). */
	char label[25 + strlen(myself->name) + strlen(to->name)];
	snprintf(label, sizeof label, "tinc UDP key expansion %s %s", myself->name, to->name);

	/* Tear down the old session and mark the key as pending. */
	sptps_stop(&to->sptps);
	to->status.validkey = false;
	to->status.waitingforkey = true;
	to->last_req_key = now.tv_sec;
	to->incompression = myself->incompression;

	return sptps_start(&to->sptps, to, true, true, myself->connection->ecdsa, to->ecdsa, label, sizeof label, send_initial_sptps_data, receive_sptps_record);
}
/* REQ_KEY is overloaded to allow arbitrary requests to be routed between two nodes. */
static bool req_key_ext_h(connection_t *c, const char *request, node_t *from, node_t *to, int reqno) {
/* If this is a SPTPS packet, see if sending UDP info helps.
Note that we only do this if we're the destination or the static relay;
otherwise every hop would initiate its own UDP info message, resulting in elevated chatter. */
if((reqno == REQ_KEY || reqno == SPTPS_PACKET) && to->via == myself)
send_udp_info(myself, from);
if(reqno == SPTPS_PACKET) {
/* This is a SPTPS data packet. */
char buf[MAX_STRING_SIZE];
int len;
if(sscanf(request, "%*d %*s %*s %*d " MAX_STRING, buf) != 1 || !(len = b64decode(buf, buf, strlen(buf)))) {
logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s) to %s (%s): %s", "SPTPS_PACKET", from->name, from->hostname, to->name, to->hostname, "invalid SPTPS data");
return true;
}
if(to != myself) {
/* We don't just forward the request, because we want to use UDP if it's available. */
send_sptps_data(to, from, 0, buf, len);
try_tx(to, true);
} else {
/* The packet is for us */
Proactively restart the SPTPS tunnel if we get receive errors. There are a number of ways a SPTPS tunnel can get into a corrupt state. For example, during key regeneration, the KEX and SIG messages from other nodes might arrive out of order, which confuses the hell out of the SPTPS code. Another possible scenario is not noticing another node crashed and restarted because there was no point in time where the node was seen completely disconnected from *all* nodes; this could result in using the wrong (old) key. There are probably other scenarios which have not even been considered yet. Distributed systems are hard. When SPTPS got confused by a packet, it used to crash the entire process; fortunately that was fixed by commit 2e7f68ad2b51648b89c4b5c61aeb4cec67c2fbbb. However, the error handling (or lack thereof) leaves a lot to be desired. Currently, when SPTPS encounters an error when receiving a packet, it just shrugs it off and continues as if nothing happened. The problem is, sometimes getting receive errors mean the tunnel is completely stuck and will not recover on its own. In that case, the node will become unreachable - possibly indefinitely. The goal of this commit is to improve SPTPS error handling by taking proactive action when an incoming packet triggers a failure, which is often an indicator that the tunnel is stuck in some way. When that happens, we simply restart SPTPS entirely, which should make the tunnel recover quickly. To prevent "storms" where two buggy nodes flood each other with invalid packets and therefore spend all their time negotiating new tunnels, we limit the frequency at which tunnel restarts happen to ten seconds. It is likely this commit will solve the "Invalid KEX record length during key regeneration" issue that has been seen in the wild. It is difficult to be sure though because we do not have a full understanding of all the possible conditions that can trigger this problem.
2015-05-17 17:50:11 +00:00
if(!sptps_receive_data(&from->sptps, buf, len)) {
/* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
so let's restart SPTPS in case that helps. But don't do that too often
to prevent storms. */
if(from->last_req_key < now.tv_sec - 10) {
logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode TCP packet from %s (%s), restarting SPTPS", from->name, from->hostname);
send_req_key(from);
}
return true;
}
send_mtu_info(myself, from, MTU);
}
return true;
}
/* Requests that are not SPTPS data packets are forwarded as-is. */
if (to != myself)
return send_request(to->nexthop->connection, "%s", request);
/* The request is for us */
switch(reqno) {
case REQ_PUBKEY: {
if(!node_read_ecdsa_public_key(from)) {
/* Request their key *before* we send our key back. Otherwise the first SPTPS packet from them will get dropped. */
logger(DEBUG_PROTOCOL, LOG_DEBUG, "Preemptively requesting Ed25519 key for %s (%s)", from->name, from->hostname);
send_request(from->nexthop->connection, "%d %s %s %d", REQ_KEY, myself->name, from->name, REQ_PUBKEY);
}
char *pubkey = ecdsa_get_base64_public_key(myself->connection->ecdsa);
send_request(from->nexthop->connection, "%d %s %s %d %s", REQ_KEY, myself->name, from->name, ANS_PUBKEY, pubkey);
free(pubkey);
return true;
}
case ANS_PUBKEY: {
if(node_read_ecdsa_public_key(from)) {
2012-09-28 15:51:48 +00:00
logger(DEBUG_PROTOCOL, LOG_WARNING, "Got ANS_PUBKEY from %s (%s) even though we already have his pubkey", from->name, from->hostname);
return true;
}
char pubkey[MAX_STRING_SIZE];
if(sscanf(request, "%*d %*s %*s %*d " MAX_STRING, pubkey) != 1 || !(from->ecdsa = ecdsa_set_base64_public_key(pubkey))) {
logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s): %s", "ANS_PUBKEY", from->name, from->hostname, "invalid pubkey");
return true;
}
2014-05-18 18:47:04 +00:00
logger(DEBUG_PROTOCOL, LOG_INFO, "Learned Ed25519 public key from %s (%s)", from->name, from->hostname);
append_config_file(from->name, "Ed25519PublicKey", pubkey);
return true;
}
case REQ_KEY: {
if(!node_read_ecdsa_public_key(from)) {
2014-05-18 18:47:04 +00:00
logger(DEBUG_PROTOCOL, LOG_DEBUG, "No Ed25519 key known for %s (%s)", from->name, from->hostname);
send_request(from->nexthop->connection, "%d %s %s %d", REQ_KEY, myself->name, from->name, REQ_PUBKEY);
return true;
}
if(from->sptps.label)
logger(DEBUG_ALWAYS, LOG_DEBUG, "Got REQ_KEY from %s while we already started a SPTPS session!", from->name);
char buf[MAX_STRING_SIZE];
int len;
if(sscanf(request, "%*d %*s %*s %*d " MAX_STRING, buf) != 1 || !(len = b64decode(buf, buf, strlen(buf)))) {
logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s): %s", "REQ_SPTPS_START", from->name, from->hostname, "invalid SPTPS data");
return true;
}
char label[25 + strlen(from->name) + strlen(myself->name)];
snprintf(label, sizeof label, "tinc UDP key expansion %s %s", from->name, myself->name);
2012-08-02 15:23:51 +00:00
sptps_stop(&from->sptps);
from->status.validkey = false;
from->status.waitingforkey = true;
from->last_req_key = now.tv_sec;
sptps_start(&from->sptps, from, false, true, myself->connection->ecdsa, from->ecdsa, label, sizeof label, send_sptps_data_myself, receive_sptps_record);
sptps_receive_data(&from->sptps, buf, len);
Add MTU_INFO protocol message. In this commit, nodes use MTU_INFO messages to provide MTU information. The issue this code is meant to address is the non-trivial problem of finding the proper MTU when UDP SPTPS relays are involved. Currently, tinc has no idea what the MTU looks like beyond the first relay, and will arbitrarily use the first relay's MTU as the limit. This will fail miserably if the MTU decreases after the first relay, forcing relays to fall back to TCP. More generally, one should keep in mind that relay paths can be arbitrarily complex, resulting in packets taking "epic journeys" through the graph, switching back and forth between UDP (with variable MTUs) and TCP multiple times along the path. A solution that was considered consists in sending standard MTU probes through the relays. This is inefficient (if there are 3 nodes on one side of relay and 3 nodes on the other side, we end up with 3*3=9 MTU discoveries taking place at the same time, while technically only 3+3=6 are needed) and would involve eyebrow-raising behaviors such as probes being sent over TCP. This commit implements an alternative solution, which consists in the packet receiver sending MTU_INFO messages to the packet sender. The message contains an MTU value which is set to maximum when the message is originally sent. The message gets altered as it travels through the metagraph, such that when the message arrives to the destination, the MTU value contained in the message can be used to send packets while making sure no relays will be forced to fall back to TCP to deliver them. The operating principles behind such a protocol message are similar to how the UDP_INFO message works, but there is a key difference that prevents us from simply reusing the same message: the UDP_INFO message only cares about relay-to-relay links (i.e. 
it is sent between static relays and the information it contains only makes sense between two adjacent static relays), while the MTU_INFO cares about the end-to-end MTU, including the entire relay path. Therefore, UDP_INFO messages stop when they encounter static relays, while MTU_INFO messages don't stop until they get to the original packet sender. Note that, technically, the MTU that is obtained through this mechanism can be slightly pessimistic, because it can be lowered by an intermediate node that is not being used as a relay. Since nodes have no way of knowing whether they'll be used as dynamic relays or not (and have no say in the matter), this is not a trivial problem. That said, this is highly unlikely to result in noticeable issues in realistic scenarios.
2015-03-08 18:54:50 +00:00
send_mtu_info(myself, from, MTU);
return true;
}
default:
logger(DEBUG_ALWAYS, LOG_ERR, "Unknown extended REQ_KEY request from %s (%s): %s", from->name, from->hostname, request);
return true;
}
2002-02-11 10:05:58 +00:00
}
/* Handle an incoming REQ_KEY request.

   c       - connection the request arrived on
   request - raw request text: origin name, destination name and an
             optional extended request number

   Returns false when the request is malformed or carries invalid node
   names.  Extended requests (when `experimental` is set) return whatever
   req_key_ext_h() returns.  All other outcomes return true. */
bool req_key_h(connection_t *c, const char *request) {
	char from_name[MAX_STRING_SIZE];
	char to_name[MAX_STRING_SIZE];
	node_t *from, *to;
	int reqno = 0;

	/* The request number is optional, so two converted fields are enough. */
	if(sscanf(request, "%*d " MAX_STRING " " MAX_STRING " %d", from_name, to_name, &reqno) < 2) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s)", "REQ_KEY", c->name,
		       c->hostname);
		return false;
	}

	if(!check_id(from_name) || !check_id(to_name)) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s): %s", "REQ_KEY", c->name, c->hostname, "invalid name");
		return false;
	}

	from = lookup_node(from_name);

	if(!from) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got %s from %s (%s) origin %s which does not exist in our connection list",
		       "REQ_KEY", c->name, c->hostname, from_name);
		return true;
	}

	to = lookup_node(to_name);

	if(!to) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got %s from %s (%s) destination %s which does not exist in our connection list",
		       "REQ_KEY", c->name, c->hostname, to_name);
		return true;
	}

	/* Check if this key request is for us */

	if(to == myself) {                      /* Yes */
		/* Is this an extended REQ_KEY message? */
		if(experimental && reqno) {
			return req_key_ext_h(c, request, from, to, reqno);
		}

		/* send_ans_key() calls abort() for SPTPS nodes.  A plain REQ_KEY
		   naming such a node as origin must therefore be rejected here,
		   otherwise a single request from a peer terminates the process. */
		if(from->status.sptps) {
			logger(DEBUG_ALWAYS, LOG_ERR, "Got legacy %s from %s (%s) origin %s which uses SPTPS",
			       "REQ_KEY", c->name, c->hostname, from_name);
			return true;
		}

		/* No, just send our key back */
		send_ans_key(from);
	} else {
		if(tunnelserver) {
			return true;
		}

		if(!to->status.reachable) {
			logger(DEBUG_PROTOCOL, LOG_WARNING, "Got %s from %s (%s) destination %s which is not reachable",
			       "REQ_KEY", c->name, c->hostname, to_name);
			return true;
		}

		/* Is this an extended REQ_KEY message? */
		if(experimental && reqno) {
			return req_key_ext_h(c, request, from, to, reqno);
		}

		/* Plain request for somebody else: pass it on unchanged. */
		send_request(to->nexthop->connection, "%s", request);
	}

	return true;
}
/* Generate a fresh legacy key for packets coming from node `to` and send
   it to that node in an ANS_KEY request.

   Must not be called for SPTPS nodes (aborts).  In DISABLE_LEGACY builds
   it does nothing and returns false.  Otherwise the node's incoming
   cipher/digest are rebuilt to mirror those of `myself`, the replay state
   is reset, and the send_request() result is returned.  Failure to open or
   key the cipher/digest aborts the process. */
bool send_ans_key(node_t *to) {
	if(to->status.sptps) {
		abort();
	}

#ifdef DISABLE_LEGACY
	return false;
#else
	size_t keylen = myself->incipher ? cipher_keylength(myself->incipher) : 1;
	/* Large enough to hold the hex form of the key plus a terminating NUL. */
	char key[keylen * 2 + 1];

	randomize(key, keylen);

	/* Clear the handles after closing them.  They are only reassigned below
	   when myself has a cipher/digest configured; without the reset, closed
	   handles would be passed to cipher_get_nid()/digest_get_nid() at the end
	   of this function.  ans_key_h() resets its out* pointers the same way. */
	cipher_close(to->incipher);
	to->incipher = NULL;
	digest_close(to->indigest);
	to->indigest = NULL;

	if(myself->incipher) {
		to->incipher = cipher_open_by_nid(cipher_get_nid(myself->incipher));

		if(!to->incipher) {
			abort();
		}

		if(!cipher_set_key(to->incipher, key, false)) {
			abort();
		}
	}

	if(myself->indigest) {
		to->indigest = digest_open_by_nid(digest_get_nid(myself->indigest), digest_length(myself->indigest));

		if(!to->indigest) {
			abort();
		}

		if(!digest_set_key(to->indigest, key, keylen)) {
			abort();
		}
	}

	to->incompression = myself->incompression;

	/* Convert the binary key to hex in place for transmission. */
	bin2hex(key, key, keylen);

	// Reset sequence number and late packet window
	mykeyused = true;
	to->received_seqno = 0;
	to->received = 0;

	if(replaywin) {
		memset(to->late, 0, replaywin);
	}

	to->status.validkey_in = true;

	return send_request(to->nexthop->connection, "%d %s %s %s %d %d %d %d", ANS_KEY,
	                    myself->name, to->name, key,
	                    cipher_get_nid(to->incipher),
	                    digest_get_nid(to->indigest),
	                    (int)digest_length(to->indigest),
	                    to->incompression);
#endif
}
/* Handle an incoming ANS_KEY request.

   c       - connection the request arrived on
   request - raw request text: origin, destination, key, cipher, digest,
             MAC length, compression level and an optional address/port pair

   Requests for other nodes are forwarded; when the request carries no
   address and the origin's address is known, that address is appended.
   Requests for us either feed the SPTPS session of the origin or install
   the legacy cipher/digest/key for packets sent to the origin.

   Returns false for malformed requests, invalid names, an unknown
   cipher/digest, a bogus MAC length, failure to set the key, or (in
   DISABLE_LEGACY builds) a legacy key; a forwarded request returns the
   send_request() result; other cases return true. */
bool ans_key_h(connection_t *c, const char *request) {
	char from_name[MAX_STRING_SIZE];
	char to_name[MAX_STRING_SIZE];
	char key[MAX_STRING_SIZE];
	char address[MAX_STRING_SIZE] = "";
	char port[MAX_STRING_SIZE] = "";
	int cipher, digest, maclength, compression, keylen;
	node_t *from, *to;

	/* address and port are optional, hence "< 7" rather than "!= 9". */
	if(sscanf(request, "%*d "MAX_STRING" "MAX_STRING" "MAX_STRING" %d %d %d %d "MAX_STRING" "MAX_STRING,
	                from_name, to_name, key, &cipher, &digest, &maclength,
	                &compression, address, port) < 7) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s)", "ANS_KEY", c->name,
		       c->hostname);
		return false;
	}

	if(!check_id(from_name) || !check_id(to_name)) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got bad %s from %s (%s): %s", "ANS_KEY", c->name, c->hostname, "invalid name");
		return false;
	}

	from = lookup_node(from_name);

	if(!from) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got %s from %s (%s) origin %s which does not exist in our connection list",
		       "ANS_KEY", c->name, c->hostname, from_name);
		return true;
	}

	to = lookup_node(to_name);

	if(!to) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Got %s from %s (%s) destination %s which does not exist in our connection list",
		       "ANS_KEY", c->name, c->hostname, to_name);
		return true;
	}

	/* Forward it if necessary */

	if(to != myself) {
		if(tunnelserver) {
			return true;
		}

		if(!to->status.reachable) {
			logger(DEBUG_ALWAYS, LOG_WARNING, "Got %s from %s (%s) destination %s which is not reachable",
			       "ANS_KEY", c->name, c->hostname, to_name);
			return true;
		}

		if(!*address && from->address.sa.sa_family != AF_UNSPEC) {
			/* Distinct names so the parsed address/port arrays are not shadowed. */
			char *reflex_address, *reflex_port;
			logger(DEBUG_PROTOCOL, LOG_DEBUG, "Appending reflexive UDP address to ANS_KEY from %s to %s", from->name, to->name);
			sockaddr2str(&from->address, &reflex_address, &reflex_port);
			send_request(to->nexthop->connection, "%s %s %s", request, reflex_address, reflex_port);
			free(reflex_address);
			free(reflex_port);
			return true;
		}

		return send_request(to->nexthop->connection, "%s", request);
	}

#ifndef DISABLE_LEGACY
	/* Don't use key material until every check has passed.
	   The handles are cleared right after closing so that none of the early
	   returns below (bogus compression, SPTPS handling) leaves from->outcipher
	   or from->outdigest referring to a closed context. */
	cipher_close(from->outcipher);
	from->outcipher = NULL;
	digest_close(from->outdigest);
	from->outdigest = NULL;
#endif

	/* SPTPS handshakes also arrive via ANS_KEY; keep validkey set for them. */
	if(!from->status.sptps) {
		from->status.validkey = false;
	}

	if(compression < 0 || compression > 11) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses bogus compression level!", from->name, from->hostname);
		return true;
	}

	from->outcompression = compression;

	/* SPTPS or old-style key exchange? */

	if(from->status.sptps) {
		/* key holds at least one character here (sscanf matched it), so the array is never empty. */
		char buf[strlen(key)];
		int len = b64decode(key, buf, strlen(key));

		if(!len || !sptps_receive_data(&from->sptps, buf, len)) {
			/* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
			   so let's restart SPTPS in case that helps. But don't do that too often
			   to prevent storms.
			   Note that simply relying on handshake timeout is not enough, because
			   that doesn't apply to key regeneration. */
			if(from->last_req_key < now.tv_sec - 10) {
				logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode handshake TCP packet from %s (%s), restarting SPTPS", from->name, from->hostname);
				send_req_key(from);
			}

			return true;
		}

		if(from->status.validkey) {
			if(*address && *port) {
				logger(DEBUG_PROTOCOL, LOG_DEBUG, "Using reflexive UDP address from %s: %s port %s", from->name, address, port);
				sockaddr_t sa = str2sockaddr(address, port);
				update_node_udp(from, &sa);
			}
		}

		send_mtu_info(myself, from, MTU);
		return true;
	}

#ifdef DISABLE_LEGACY
	logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses legacy protocol!", from->name, from->hostname);
	return false;
#else
	/* Check and lookup cipher and digest algorithms.
	   Both handles are already NULL at this point, so a zero cipher/digest
	   number simply leaves them unset. */

	if(cipher) {
		if(!(from->outcipher = cipher_open_by_nid(cipher))) {
			logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses unknown cipher!", from->name, from->hostname);
			return false;
		}
	}

	if(digest) {
		if(!(from->outdigest = digest_open_by_nid(digest, maclength))) {
			logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses unknown digest!", from->name, from->hostname);
			return false;
		}
	}

	if(maclength != digest_length(from->outdigest)) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses bogus MAC length!", from->name, from->hostname);
		return false;
	}

	/* Process key: convert the hex text to binary in place. */

	keylen = hex2bin(key, key, sizeof key);

	if(keylen != (from->outcipher ? cipher_keylength(from->outcipher) : 1)) {
		logger(DEBUG_ALWAYS, LOG_ERR, "Node %s (%s) uses wrong keylength!", from->name, from->hostname);
		return true;
	}

	/* Update our copy of the origin's packet key */

	if(from->outcipher && !cipher_set_key(from->outcipher, key, true)) {
		return false;
	}

	if(from->outdigest && !digest_set_key(from->outdigest, key, keylen)) {
		return false;
	}

	from->status.validkey = true;
	from->sent_seqno = 0;

	if(*address && *port) {
		logger(DEBUG_PROTOCOL, LOG_DEBUG, "Using reflexive UDP address from %s: %s port %s", from->name, address, port);
		sockaddr_t sa = str2sockaddr(address, port);
		update_node_udp(from, &sa);
	}

	return true;
#endif
}