RADIUS client: Cease endless retry for message for multiple servers

In the previous RADIUS client implementation, when there are multiple
RADIUS servers, we kept trying the next server when the current message
can not be acked. It leads to endless retry when all the RADIUS servers
are down.

Fix this by keeping a counter for the accumulated retransmit attempts
for the message, and guarantee that after all the servers failover
RADIUS_CLIENT_MAX_FAILOVER times the message will be dropped.

Another issue with the previous code was that the decision regarding
whether the server should fail over was made immediately after we send
out the message. This patch guarantees we consider whether a server
needs failover after pending ack times out.

Signed-off-by: Bo Chen<bochen@meraki.com>
This commit is contained in:
Bo Chen 2018-05-10 07:48:41 +00:00 committed by Jouni Malinen
parent 08e6721e87
commit 27ebadccfb

View file

@ -26,12 +26,12 @@
#define RADIUS_CLIENT_MAX_WAIT 120
/**
* RADIUS_CLIENT_MAX_RETRIES - RADIUS client maximum retries
* RADIUS_CLIENT_MAX_FAILOVER - RADIUS client maximum retries
*
* Maximum number of retransmit attempts before the entry is removed from
* Maximum number of server failovers before the entry is removed from
* retransmit list.
*/
#define RADIUS_CLIENT_MAX_RETRIES 10
#define RADIUS_CLIENT_MAX_FAILOVER 3
/**
* RADIUS_CLIENT_MAX_ENTRIES - RADIUS client maximum pending messages
@ -110,10 +110,15 @@ struct radius_msg_list {
os_time_t next_try;
/**
* attempts - Number of transmission attempts
* attempts - Number of transmission attempts for one server
*/
int attempts;
/**
* accu_attempts - Number of accumulated attempts
*/
int accu_attempts;
/**
* next_wait - Next retransmission wait time in seconds
*/
@ -367,9 +372,11 @@ static int radius_client_retransmit(struct radius_client_data *radius,
size_t prev_num_msgs;
u8 *acct_delay_time;
size_t acct_delay_time_len;
int num_servers;
if (entry->msg_type == RADIUS_ACCT ||
entry->msg_type == RADIUS_ACCT_INTERIM) {
num_servers = conf->num_acct_servers;
if (radius->acct_sock < 0)
radius_client_init_acct(radius);
if (radius->acct_sock < 0 && conf->num_acct_servers > 1) {
@ -386,6 +393,7 @@ static int radius_client_retransmit(struct radius_client_data *radius,
conf->acct_server->retransmissions++;
}
} else {
num_servers = conf->num_auth_servers;
if (radius->auth_sock < 0)
radius_client_init_auth(radius);
if (radius->auth_sock < 0 && conf->num_auth_servers > 1) {
@ -449,7 +457,15 @@ static int radius_client_retransmit(struct radius_client_data *radius,
}
/* retransmit; remove entry if too many attempts */
if (entry->accu_attempts > RADIUS_CLIENT_MAX_FAILOVER *
RADIUS_CLIENT_NUM_FAILOVER * num_servers) {
wpa_printf(MSG_INFO,
"RADIUS: Removing un-ACKed message due to too many failed retransmit attempts");
return 1;
}
entry->attempts++;
entry->accu_attempts++;
hostapd_logger(radius->ctx, entry->addr, HOSTAPD_MODULE_RADIUS,
HOSTAPD_LEVEL_DEBUG, "Resending RADIUS message (id=%d)",
radius_msg_get_hdr(entry->msg)->identifier);
@ -466,10 +482,6 @@ static int radius_client_retransmit(struct radius_client_data *radius,
entry->next_wait *= 2;
if (entry->next_wait > RADIUS_CLIENT_MAX_WAIT)
entry->next_wait = RADIUS_CLIENT_MAX_WAIT;
if (entry->attempts >= RADIUS_CLIENT_MAX_RETRIES) {
wpa_printf(MSG_INFO, "RADIUS: Removing un-ACKed message due to too many failed retransmit attempts");
return 1;
}
return 0;
}
@ -490,6 +502,30 @@ static void radius_client_timer(void *eloop_ctx, void *timeout_ctx)
return;
os_get_reltime(&now);
while (entry) {
if (now.sec >= entry->next_try) {
s = entry->msg_type == RADIUS_AUTH ? radius->auth_sock :
radius->acct_sock;
if (entry->attempts > RADIUS_CLIENT_NUM_FAILOVER ||
(s < 0 && entry->attempts > 0)) {
if (entry->msg_type == RADIUS_ACCT ||
entry->msg_type == RADIUS_ACCT_INTERIM)
acct_failover++;
else
auth_failover++;
}
}
entry = entry->next;
}
if (auth_failover)
radius_client_auth_failover(radius);
if (acct_failover)
radius_client_acct_failover(radius);
entry = radius->msgs;
first = 0;
prev = NULL;
@ -517,17 +553,6 @@ static void radius_client_timer(void *eloop_ctx, void *timeout_ctx)
continue;
}
s = entry->msg_type == RADIUS_AUTH ? radius->auth_sock :
radius->acct_sock;
if (entry->attempts > RADIUS_CLIENT_NUM_FAILOVER ||
(s < 0 && entry->attempts > 0)) {
if (entry->msg_type == RADIUS_ACCT ||
entry->msg_type == RADIUS_ACCT_INTERIM)
acct_failover++;
else
auth_failover++;
}
if (first == 0 || entry->next_try < first)
first = entry->next_try;
@ -538,6 +563,7 @@ static void radius_client_timer(void *eloop_ctx, void *timeout_ctx)
if (radius->msgs) {
if (first < now.sec)
first = now.sec;
eloop_cancel_timeout(radius_client_timer, radius, NULL);
eloop_register_timeout(first - now.sec, 0,
radius_client_timer, radius, NULL);
hostapd_logger(radius->ctx, NULL, HOSTAPD_MODULE_RADIUS,
@ -545,12 +571,6 @@ static void radius_client_timer(void *eloop_ctx, void *timeout_ctx)
"retransmit in %ld seconds",
(long int) (first - now.sec));
}
if (auth_failover)
radius_client_auth_failover(radius);
if (acct_failover)
radius_client_acct_failover(radius);
}
@ -674,7 +694,10 @@ static void radius_client_list_add(struct radius_client_data *radius,
entry->first_try = entry->last_attempt.sec;
entry->next_try = entry->first_try + RADIUS_CLIENT_FIRST_WAIT;
entry->attempts = 1;
entry->accu_attempts = 1;
entry->next_wait = RADIUS_CLIENT_FIRST_WAIT * 2;
if (entry->next_wait > RADIUS_CLIENT_MAX_WAIT)
entry->next_wait = RADIUS_CLIENT_MAX_WAIT;
entry->next = radius->msgs;
radius->msgs = entry;
radius_client_update_timeout(radius);
@ -713,9 +736,9 @@ static void radius_client_list_add(struct radius_client_data *radius,
*
* The message is added on the retransmission queue and will be retransmitted
* automatically until a response is received or maximum number of retries
* (RADIUS_CLIENT_MAX_RETRIES) is reached. No such retries are used with
* RADIUS_ACCT_INTERIM, i.e., such a pending message is removed from the queue
* automatically on transmission failure.
* (RADIUS_CLIENT_MAX_FAILOVER * RADIUS_CLIENT_NUM_FAILOVER) is reached. No
* such retries are used with RADIUS_ACCT_INTERIM, i.e., such a pending message
* is removed from the queue automatically on transmission failure.
*
* The related device MAC address can be used to identify pending messages that
* can be removed with radius_client_flush_auth().
@ -1087,14 +1110,13 @@ radius_change_server(struct radius_client_data *radius,
}
}
/* Reset retry counters for the new server */
for (entry = radius->msgs; oserv && oserv != nserv && entry;
entry = entry->next) {
/* Reset retry counters */
for (entry = radius->msgs; oserv && entry; entry = entry->next) {
if ((auth && entry->msg_type != RADIUS_AUTH) ||
(!auth && entry->msg_type != RADIUS_ACCT))
continue;
entry->next_try = entry->first_try + RADIUS_CLIENT_FIRST_WAIT;
entry->attempts = 0;
entry->attempts = 1;
entry->next_wait = RADIUS_CLIENT_FIRST_WAIT * 2;
}