blob: 2735a1cb965231520b5ae285f770ef4feeedd896 [file] [log] [blame]
/* MIT License
*
* Copyright (c) 1998 Massachusetts Institute of Technology
* Copyright (c) 2010 Daniel Stenberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* SPDX-License-Identifier: MIT
*/
#include "ares_private.h"
#ifdef HAVE_STRINGS_H
# include <strings.h>
#endif
#ifdef HAVE_SYS_IOCTL_H
# include <sys/ioctl.h>
#endif
#ifdef NETWARE
# include <sys/filio.h>
#endif
#ifdef HAVE_STDINT_H
# include <stdint.h>
#endif
#include <assert.h>
#include <fcntl.h>
#include <limits.h>
/* Forward declarations for file-local helpers that are referenced before
 * their definitions further down in this file. */
static void timeadd(ares_timeval_t *now, size_t millisecs);
static ares_status_t process_write(ares_channel_t *channel,
ares_socket_t write_fd);
static ares_status_t process_read(ares_channel_t *channel,
ares_socket_t read_fd,
const ares_timeval_t *now);
static ares_status_t process_timeouts(ares_channel_t *channel,
const ares_timeval_t *now);
static ares_status_t process_answer(ares_channel_t *channel,
const unsigned char *abuf, size_t alen,
ares_conn_t *conn,
const ares_timeval_t *now);
static void handle_conn_error(ares_conn_t *conn, ares_bool_t critical_failure,
ares_status_t failure_status);
static ares_bool_t same_questions(const ares_query_t *query,
const ares_dns_record_t *arec);
static void end_query(ares_channel_t *channel, ares_server_t *server,
ares_query_t *query, ares_status_t status,
const ares_dns_record_t *dnsrec);
/* Unlink a query from the connection it was sent on.  A query that is not
 * attached to a connection cannot be tracked for timeouts either, so its
 * entry in the timeout list is dropped as well. */
static void ares_query_remove_from_conn(ares_query_t *query)
{
  /* Stop tracking the timeout for this query */
  ares_slist_node_destroy(query->node_queries_by_timeout);
  query->node_queries_by_timeout = NULL;

  /* Remove it from the connection's list of outstanding queries */
  ares_llist_node_destroy(query->node_queries_to_conn);
  query->node_queries_to_conn = NULL;

  query->conn = NULL;
}
/* Notify the user-registered server state callback (if any) that a request
 * to the given server succeeded or failed.  The server is passed to the
 * callback as a string produced by ares_get_server_addr(); the notification
 * is silently skipped if that string cannot be built. */
static void invoke_server_state_cb(const ares_server_t *server,
                                   ares_bool_t success, int flags)
{
  const ares_channel_t *channel = server->channel;
  ares_buf_t           *addr_buf;
  char                 *addr_str;

  /* Nothing to do when no callback is registered */
  if (channel->server_state_cb == NULL) {
    return;
  }

  addr_buf = ares_buf_create();
  if (addr_buf == NULL) {
    return; /* LCOV_EXCL_LINE: OutOfMemory */
  }

  if (ares_get_server_addr(server, addr_buf) != ARES_SUCCESS) {
    ares_buf_destroy(addr_buf); /* LCOV_EXCL_LINE: OutOfMemory */
    return;                     /* LCOV_EXCL_LINE: OutOfMemory */
  }

  /* The buffer is handed to ares_buf_finish_str() and not used afterwards */
  addr_str = ares_buf_finish_str(addr_buf, NULL);
  if (addr_str == NULL) {
    return; /* LCOV_EXCL_LINE: OutOfMemory */
  }

  channel->server_state_cb(addr_str, success, flags,
                           channel->server_state_cb_data);
  ares_free(addr_str);
}
/* Record a failure against a server: bump its consecutive failure count,
 * re-sort it within the channel's server list, schedule the earliest time it
 * may be retried, and notify the server state callback.
 *
 * server   - server that failed
 * used_tcp - ARES_TRUE if the failed request used TCP, otherwise UDP
 */
static void server_increment_failures(ares_server_t *server,
                                      ares_bool_t    used_tcp)
{
  const ares_channel_t *channel = server->channel;
  ares_slist_node_t    *entry   = ares_slist_node_find(channel->servers, server);
  ares_timeval_t        retry_at;
  int                   proto_flag;

  if (entry == NULL) {
    return; /* LCOV_EXCL_LINE: DefensiveCoding */
  }

  /* The failure count changed, so the node must be reinserted */
  server->consec_failures++;
  ares_slist_node_reinsert(entry);

  /* Next retry is "now" plus the configured retry delay */
  ares_tvnow(&retry_at);
  timeadd(&retry_at, channel->server_retry_delay);
  server->next_retry_time = retry_at;

  if (used_tcp == ARES_TRUE) {
    proto_flag = ARES_SERV_STATE_TCP;
  } else {
    proto_flag = ARES_SERV_STATE_UDP;
  }
  invoke_server_state_cb(server, ARES_FALSE, proto_flag);
}
/* Record a success against a server: reset its consecutive failure count
 * (re-sorting it in the channel's server list if the count changed), clear
 * its retry time, and notify the server state callback.
 *
 * server   - server that answered successfully
 * used_tcp - ARES_TRUE if the request used TCP, otherwise UDP
 */
static void server_set_good(ares_server_t *server, ares_bool_t used_tcp)
{
  const ares_channel_t *channel = server->channel;
  ares_slist_node_t    *entry   = ares_slist_node_find(channel->servers, server);
  int                   proto_flag;

  if (entry == NULL) {
    return; /* LCOV_EXCL_LINE: DefensiveCoding */
  }

  /* Only reinsert when the failure count actually changes */
  if (server->consec_failures > 0) {
    server->consec_failures = 0;
    ares_slist_node_reinsert(entry);
  }

  server->next_retry_time.sec  = 0;
  server->next_retry_time.usec = 0;

  if (used_tcp == ARES_TRUE) {
    proto_flag = ARES_SERV_STATE_TCP;
  } else {
    proto_flag = ARES_SERV_STATE_UDP;
  }
  invoke_server_state_cb(server, ARES_TRUE, proto_flag);
}
/* Report whether a deadline has been reached.
 *
 * now   - current time
 * check - deadline to compare against
 *
 * Returns ARES_TRUE when now is exactly the check time or later, otherwise
 * ARES_FALSE.
 */
ares_bool_t ares_timedout(const ares_timeval_t *now,
                          const ares_timeval_t *check)
{
  ares_int64_t sec_delta = (now->sec - check->sec);

  /* Whole seconds differ: the sign of the difference decides */
  if (sec_delta != 0) {
    return (sec_delta > 0) ? ARES_TRUE : ARES_FALSE;
  }

  /* Same second: fall back to comparing the sub-second parts */
  if ((ares_int64_t)now->usec >= (ares_int64_t)check->usec) {
    return ARES_TRUE;
  }
  return ARES_FALSE;
}
/* Add the specified number of milliseconds to the time in the first argument.
 *
 * now       - timestamp to advance in place
 * millisecs - number of milliseconds to add
 *
 * After the call the microsecond field is always below one second; any excess
 * is carried into the seconds field.
 */
static void timeadd(ares_timeval_t *now, size_t millisecs)
{
  /* Divide before converting to the signed type.  Casting first would turn a
   * very large size_t into a negative value and move the time backwards. */
  now->sec  += (ares_int64_t)(millisecs / 1000);
  now->usec += (unsigned int)((millisecs % 1000) * 1000);

  if (now->usec >= 1000000) {
    now->sec  += now->usec / 1000000;
    now->usec %= 1000000;
  }
}
/* Process socket events and (optionally) timeouts for a channel.  The caller
 * is expected to already hold the channel lock.
 *
 * channel - channel to process
 * events  - array of fd/event pairs, may be NULL only when nevents is 0
 * nevents - number of entries in events
 * flags   - ARES_PROCESS_FLAG_* values; ARES_PROCESS_FLAG_SKIP_NON_FD skips
 *           connection cleanup and timeout processing
 *
 * Returns ARES_EFORMERR on invalid arguments, ARES_ENOMEM if any step ran out
 * of memory, otherwise ARES_SUCCESS.  All other failures are handled through
 * connection retries and are not reported.
 */
static ares_status_t ares_process_fds_nolock(ares_channel_t         *channel,
                                             const ares_fd_events_t *events,
                                             size_t nevents, unsigned int flags)
{
  ares_timeval_t now;
  size_t         idx;

  if (channel == NULL || (events == NULL && nevents != 0)) {
    return ARES_EFORMERR; /* LCOV_EXCL_LINE: DefensiveCoding */
  }

  ares_tvnow(&now);

  /* Pass 1: all write events */
  for (idx = 0; idx < nevents; idx++) {
    const ares_fd_events_t *ev = &events[idx];

    if (ev->fd != ARES_SOCKET_BAD && (ev->events & ARES_FD_EVENT_WRITE)) {
      /* Only ENOMEM aborts processing */
      if (process_write(channel, ev->fd) == ARES_ENOMEM) {
        return ARES_ENOMEM;
      }
    }
  }

  /* Pass 2: all read events */
  for (idx = 0; idx < nevents; idx++) {
    const ares_fd_events_t *ev = &events[idx];

    if (ev->fd != ARES_SOCKET_BAD && (ev->events & ARES_FD_EVENT_READ)) {
      if (process_read(channel, ev->fd, &now) == ARES_ENOMEM) {
        return ARES_ENOMEM;
      }
    }
  }

  /* Caller asked for socket events only */
  if (flags & ARES_PROCESS_FLAG_SKIP_NON_FD) {
    return ARES_SUCCESS;
  }

  ares_check_cleanup_conns(channel);
  if (process_timeouts(channel, &now) == ARES_ENOMEM) {
    return ARES_ENOMEM;
  }

  return ARES_SUCCESS;
}
/* Locked wrapper around ares_process_fds_nolock().
 *
 * Returns ARES_EFORMERR when channel is NULL, otherwise whatever
 * ares_process_fds_nolock() returns.
 */
ares_status_t ares_process_fds(ares_channel_t         *channel,
                               const ares_fd_events_t *events, size_t nevents,
                               unsigned int flags)
{
  ares_status_t result = ARES_EFORMERR;

  if (channel != NULL) {
    ares_channel_lock(channel);
    result = ares_process_fds_nolock(channel, events, nevents, flags);
    ares_channel_unlock(channel);
  }

  return result;
}
/* Process at most one readable and one writable socket.
 *
 * read_fd  - socket with data to read, or ARES_SOCKET_BAD for none
 * write_fd - socket ready for writing, or ARES_SOCKET_BAD for none
 *
 * The two descriptors are translated into an event array and handed to
 * ares_process_fds().  When both refer to the same socket a single entry
 * carrying both event bits is used.
 */
void ares_process_fd(ares_channel_t *channel, ares_socket_t read_fd,
                     ares_socket_t write_fd)
{
  ares_fd_events_t events[2];
  size_t           nevents = 0;

  memset(events, 0, sizeof(events));

  if (read_fd != ARES_SOCKET_BAD) {
    events[nevents].fd      = read_fd;
    events[nevents].events |= ARES_FD_EVENT_READ;
    nevents++;
  }

  if (write_fd != ARES_SOCKET_BAD) {
    size_t slot;

    if (write_fd == read_fd) {
      /* Same socket as the read entry just added: share its slot */
      slot = nevents - 1;
    } else {
      slot = nevents;
      nevents++;
    }
    events[slot].fd      = write_fd;
    events[slot].events |= ARES_FD_EVENT_WRITE;
  }

  ares_process_fds(channel, events, nevents, ARES_PROCESS_FLAG_NONE);
}
static ares_socket_t *channel_socket_list(const ares_channel_t *channel,
size_t *num)
{
ares_slist_node_t *snode;
ares_array_t *arr = ares_array_create(sizeof(ares_socket_t), NULL);
*num = 0;
if (arr == NULL) {
return NULL; /* LCOV_EXCL_LINE: OutOfMemory */
}
for (snode = ares_slist_node_first(channel->servers); snode != NULL;
snode = ares_slist_node_next(snode)) {
ares_server_t *server = ares_slist_node_val(snode);
ares_llist_node_t *node;
for (node = ares_llist_node_first(server->connections); node != NULL;
node = ares_llist_node_next(node)) {
const ares_conn_t *conn = ares_llist_node_val(node);
ares_socket_t *sptr;
ares_status_t status;
if (conn->fd == ARES_SOCKET_BAD) {
continue;
}
status = ares_array_insert_last((void **)&sptr, arr);
if (status != ARES_SUCCESS) {
ares_array_destroy(arr); /* LCOV_EXCL_LINE: OutOfMemory */
return NULL; /* LCOV_EXCL_LINE: OutOfMemory */
}
*sptr = conn->fd;
}
}
return ares_array_finish(arr, num);
}
/* Something interesting happened on the wire, or there was a timeout.
 * Translate the select()-style fd_sets into an event list and process it.
 *
 * read_fds  - sockets reported readable, may be NULL
 * write_fds - sockets reported writable, may be NULL
 */
void ares_process(ares_channel_t *channel, fd_set *read_fds, fd_set *write_fds)
{
  ares_socket_t    *fds;
  size_t            nfds;
  ares_fd_events_t *events  = NULL;
  size_t            nevents = 0;
  size_t            idx;

  if (channel == NULL) {
    return;
  }

  ares_channel_lock(channel);

  /* An fd_set cannot be iterated directly, so fetch every socket we know
   * about and test each one against the sets. */
  fds = channel_socket_list(channel, &nfds);

  /* There can never be more events than sockets, so size the array by the
   * socket count and track the used portion with nevents. */
  if (nfds > 0) {
    events = ares_malloc_zero(sizeof(*events) * nfds);
  }

  /* If there are no sockets, or the allocation failed, no fd events are
   * gathered but processing below still runs with an empty list. */
  if (events != NULL) {
    for (idx = 0; idx < nfds; idx++) {
      ares_socket_t fd = fds[idx];
      ares_bool_t   readable =
        (read_fds != NULL && FD_ISSET(fd, read_fds)) ? ARES_TRUE : ARES_FALSE;
      ares_bool_t writable =
        (write_fds != NULL && FD_ISSET(fd, write_fds)) ? ARES_TRUE : ARES_FALSE;

      if (!readable && !writable) {
        continue;
      }

      events[nevents].fd = fd;
      if (readable) {
        events[nevents].events |= ARES_FD_EVENT_READ;
      }
      if (writable) {
        events[nevents].events |= ARES_FD_EVENT_WRITE;
      }
      nevents++;
    }
  }

  ares_process_fds_nolock(channel, events, nevents, ARES_PROCESS_FLAG_NONE);

  ares_free(events);
  ares_free(fds);
  ares_channel_unlock(channel);
}
/* Handle a writable notification for a socket: flush any pending output on
 * the matching connection.
 *
 * Returns ARES_SUCCESS if the socket does not belong to a known connection,
 * otherwise the result of ares_conn_flush().  On a flush failure the
 * connection is passed to handle_conn_error() as a critical failure.
 */
static ares_status_t process_write(ares_channel_t *channel,
                                   ares_socket_t   write_fd)
{
  ares_conn_t  *conn = ares_conn_from_fd(channel, write_fd);
  ares_status_t rc;

  if (conn == NULL) {
    return ARES_SUCCESS;
  }

  /* Getting a write event means we are connected, unless the TFO initial
   * flag is still set on the connection */
  if ((conn->flags & ARES_CONN_FLAG_TFO_INITIAL) == 0) {
    conn->state_flags |= ARES_CONN_STATE_CONNECTED;
  }

  rc = ares_conn_flush(conn);
  if (rc == ARES_SUCCESS) {
    return ARES_SUCCESS;
  }

  handle_conn_error(conn, ARES_TRUE, rc);
  return rc;
}
/* Flush queued output on each server's TCP connection, if a pending write
 * was flagged on the channel.  Does nothing when channel is NULL or when no
 * pending write notification is outstanding. */
void ares_process_pending_write(ares_channel_t *channel)
{
  ares_slist_node_t *srv_node;

  if (channel == NULL) {
    return;
  }

  ares_channel_lock(channel);

  if (channel->notify_pending_write) {
    /* Clear the flag before calling ares_conn_flush(): flushing may itself
     * enqueue more data on some form of exception, and that needs to be able
     * to raise the notification again. */
    channel->notify_pending_write = ARES_FALSE;

    srv_node = ares_slist_node_first(channel->servers);
    while (srv_node != NULL) {
      ares_server_t *server   = ares_slist_node_val(srv_node);
      ares_conn_t   *tcp_conn = server->tcp_conn;

      if (tcp_conn != NULL) {
        /* Push out whatever is queued on this connection */
        ares_status_t rc = ares_conn_flush(tcp_conn);
        if (rc != ARES_SUCCESS) {
          handle_conn_error(tcp_conn, ARES_TRUE, rc);
        }
      }

      srv_node = ares_slist_node_next(srv_node);
    }
  }

  ares_channel_unlock(channel);
}
/* Read available data from a connection's socket into conn->in_buf.
 *
 * For UDP connections a 2-byte big-endian length is written in front of each
 * message read, so read_answers() can consume length-prefixed messages for
 * both UDP and TCP.
 *
 * Returns:
 *   ARES_ENOMEM       - the input buffer could not be extended; the
 *                       connection has been passed to handle_conn_error()
 *                       as a non-critical failure
 *   ARES_ECONNREFUSED - the read failed with something other than
 *                       ARES_CONN_ERR_WOULDBLOCK; the connection has been
 *                       passed to handle_conn_error() as a critical failure
 *   ARES_SUCCESS      - otherwise
 */
static ares_status_t read_conn_packets(ares_conn_t *conn)
{
ares_bool_t read_again;
ares_conn_err_t err;
const ares_channel_t *channel = conn->server->channel;
do {
size_t count;
size_t len = 65535;
unsigned char *ptr;
/* Remember where this message starts so the UDP length prefix can be
 * filled in (or removed again) later */
size_t start_len = ares_buf_len(conn->in_buf);
/* If UDP, let's write out a placeholder for the length indicator */
if (!(conn->flags & ARES_CONN_FLAG_TCP) &&
ares_buf_append_be16(conn->in_buf, 0) != ARES_SUCCESS) {
handle_conn_error(conn, ARES_FALSE /* not critical to connection */,
ARES_SUCCESS);
return ARES_ENOMEM;
}
/* Get a buffer of sufficient size */
ptr = ares_buf_append_start(conn->in_buf, &len);
if (ptr == NULL) {
handle_conn_error(conn, ARES_FALSE /* not critical to connection */,
ARES_SUCCESS);
return ARES_ENOMEM;
}
/* Read from socket */
err = ares_conn_read(conn, ptr, len, &count);
if (err != ARES_CONN_ERR_SUCCESS) {
/* Nothing was read: finish the append with zero bytes and, for UDP, drop
 * the placeholder length written above */
ares_buf_append_finish(conn->in_buf, 0);
if (!(conn->flags & ARES_CONN_FLAG_TCP)) {
ares_buf_set_length(conn->in_buf, start_len);
}
break;
}
/* Record amount of data read */
ares_buf_append_finish(conn->in_buf, count);
/* Only loop if we're not overwriting socket functions, and are using UDP
 * or are using TCP and read the maximum buffer size */
read_again = ARES_FALSE;
if (channel->sock_funcs == NULL) {
if (!(conn->flags & ARES_CONN_FLAG_TCP)) {
read_again = ARES_TRUE;
} else if (count == len) {
read_again = ARES_TRUE;
}
}
/* If UDP, overwrite length: rewind to the placeholder, write the real
 * byte count, then restore the buffer length to include the payload */
if (!(conn->flags & ARES_CONN_FLAG_TCP)) {
len = ares_buf_len(conn->in_buf);
ares_buf_set_length(conn->in_buf, start_len);
ares_buf_append_be16(conn->in_buf, (unsigned short)count);
ares_buf_set_length(conn->in_buf, len);
}
/* Try to read again only if *we* set up the socket, otherwise it may be
 * a blocking socket and would cause recvfrom to hang. */
} while (read_again);
/* WOULDBLOCK just means there is no more data for now; anything else is
 * treated as a connection failure */
if (err != ARES_CONN_ERR_SUCCESS && err != ARES_CONN_ERR_WOULDBLOCK) {
handle_conn_error(conn, ARES_TRUE, ARES_ECONNREFUSED);
return ARES_ECONNREFUSED;
}
return ARES_SUCCESS;
}
/* Consume every complete, length-prefixed message currently held in
 * conn->in_buf and hand each one to process_answer().
 *
 * conn - connection whose input buffer is drained
 * now  - current time, passed through to process_answer()
 *
 * Returns the failure status from process_answer() if it rejects a message
 * (the connection is then passed to handle_conn_error() as a critical
 * failure).  Otherwise returns the status of the step that ended the loop,
 * which can be a non-success status from the buffer fetch/consume calls when
 * the buffer simply holds no further complete message.
 *
 * NOTE(review): process_answer() can call ares_send_query(), whose error
 * paths call handle_conn_error().  Confirm that this cannot close `conn`
 * before conn->in_buf is used again below.
 */
static ares_status_t read_answers(ares_conn_t *conn, const ares_timeval_t *now)
{
ares_status_t status;
ares_channel_t *channel = conn->server->channel;
/* Process all queued answers */
while (1) {
unsigned short dns_len = 0;
const unsigned char *data = NULL;
size_t data_len = 0;
/* Tag so we can roll back */
ares_buf_tag(conn->in_buf);
/* Read length indicator */
status = ares_buf_fetch_be16(conn->in_buf, &dns_len);
if (status != ARES_SUCCESS) {
ares_buf_tag_rollback(conn->in_buf);
break;
}
/* Not enough data for a full response yet */
status = ares_buf_consume(conn->in_buf, dns_len);
if (status != ARES_SUCCESS) {
ares_buf_tag_rollback(conn->in_buf);
break;
}
/* Can't fail except for misuse */
data = ares_buf_tag_fetch(conn->in_buf, &data_len);
if (data == NULL || data_len < 2) {
ares_buf_tag_clear(conn->in_buf);
break;
}
/* Strip off 2 bytes length */
data += 2;
data_len -= 2;
/* We finished reading this answer; process it */
status = process_answer(channel, data, data_len, conn, now);
if (status != ARES_SUCCESS) {
handle_conn_error(conn, ARES_TRUE, status);
return status;
}
/* Since we processed the answer, clear the tag so space can be reclaimed */
ares_buf_tag_clear(conn->in_buf);
}
return status;
}
/* Handle a readable notification for a socket: pull data off the wire and
 * then process any complete answers.
 *
 * Returns ARES_SUCCESS if the socket does not belong to a known connection,
 * the failure from read_conn_packets() if reading failed, otherwise the
 * result of read_answers().
 */
static ares_status_t process_read(ares_channel_t       *channel,
                                  ares_socket_t         read_fd,
                                  const ares_timeval_t *now)
{
  ares_conn_t  *conn = ares_conn_from_fd(channel, read_fd);
  ares_status_t rc;

  if (conn == NULL) {
    return ARES_SUCCESS;
  }

  /* TODO: There might be a potential issue here where there was a read that
   *       read some data, then looped and read again and got a disconnect.
   *       Right now, that would cause a resend instead of processing the data
   *       we have.  This is fairly unlikely to occur due to only looping if
   *       a full buffer of 65535 bytes was read. */
  rc = read_conn_packets(conn);
  if (rc == ARES_SUCCESS) {
    rc = read_answers(conn, now);
  }

  return rc;
}
/* Expire queries whose deadline has passed: count the timeout, record a
 * failure against the server used, and requeue the query.
 *
 * Returns ARES_ENOMEM if requeuing ran out of memory, otherwise ARES_SUCCESS.
 */
static ares_status_t process_timeouts(ares_channel_t       *channel,
                                      const ares_timeval_t *now)
{
  /* Always re-fetch the head of the list rather than following 'next': the
   * list re-sorts as queries come and go, and an operation below may clean up
   * whatever node we would otherwise be holding on to. */
  for (;;) {
    ares_slist_node_t *head = ares_slist_node_first(channel->queries_by_timeout);
    ares_query_t      *query;

    if (head == NULL) {
      break;
    }

    query = ares_slist_node_val(head);

    /* The list is sorted, so the first unexpired query ends the scan */
    if (!ares_timedout(now, &query->timeout)) {
      break;
    }

    query->timeouts++;
    server_increment_failures(query->conn->server, query->using_tcp);

    if (ares_requeue_query(query, now, ARES_ETIMEOUT, ARES_TRUE, NULL) ==
        ARES_ENOMEM) {
      return ARES_ENOMEM;
    }
  }

  return ARES_SUCCESS;
}
/* Strip EDNS from a query by deleting the first OPT RR found in its
 * additional section.
 *
 * Returns ARES_SUCCESS if an OPT RR was found and deleted, ARES_EFORMERR if
 * the query has no OPT RR.
 */
static ares_status_t rewrite_without_edns(ares_query_t *query)
{
  size_t idx = 0;

  while (idx < ares_dns_record_rr_cnt(query->query, ARES_SECTION_ADDITIONAL)) {
    const ares_dns_rr_t *rr =
      ares_dns_record_rr_get(query->query, ARES_SECTION_ADDITIONAL, idx);

    if (ares_dns_rr_get_type(rr) == ARES_REC_TYPE_OPT) {
      ares_dns_record_rr_del(query->query, ARES_SECTION_ADDITIONAL, idx);
      return ARES_SUCCESS;
    }

    idx++;
  }

  /* No OPT RR present, nothing to rewrite */
  return ARES_EFORMERR;
}
/* Handle an answer from a server. This must NEVER cleanup the
 * server connection! Return something other than ARES_SUCCESS to cause
 * the connection to be terminated after this call.
 *
 * channel - channel the connection belongs to
 * abuf    - raw DNS message bytes (without any length prefix)
 * alen    - number of bytes in abuf; 0-length messages are ignored
 * conn    - connection the message arrived on
 * now     - current time
 *
 * Returns ARES_EBADRESP if the message cannot be parsed, the failure from
 * rewrite_without_edns() if an EDNS retry could not be prepared, and
 * ARES_SUCCESS in every other case, including when the response is dropped
 * or the query is resent/requeued.
 */
static ares_status_t process_answer(ares_channel_t *channel,
const unsigned char *abuf, size_t alen,
ares_conn_t *conn,
const ares_timeval_t *now)
{
ares_query_t *query;
/* Cache these as once ares_send_query() gets called, it may end up
 * invalidating the connection all-together */
ares_server_t *server = conn->server;
ares_dns_record_t *rdnsrec = NULL;
ares_status_t status;
/* Set once the query cache has taken ownership of rdnsrec, in which case it
 * must not be destroyed at cleanup */
ares_bool_t is_cached = ARES_FALSE;
/* UDP can have 0-byte messages, drop them to the ground */
if (alen == 0) {
return ARES_SUCCESS;
}
/* Parse the response */
status = ares_dns_parse(abuf, alen, 0, &rdnsrec);
if (status != ARES_SUCCESS) {
/* Malformations are never accepted */
status = ARES_EBADRESP;
goto cleanup;
}
/* Find the query corresponding to this packet. The queries are
 * hashed/bucketed by query id, so this lookup should be quick.
 */
query = ares_htable_szvp_get_direct(channel->queries_by_qid,
ares_dns_record_get_id(rdnsrec));
if (!query) {
/* We may have stopped listening for this query, that's ok */
status = ARES_SUCCESS;
goto cleanup;
}
/* Both the query id and the questions must be the same. We will drop any
 * replies that aren't for the same query as this is considered invalid. */
if (!same_questions(query, rdnsrec)) {
/* Possible qid conflict due to delayed response, that's ok */
status = ARES_SUCCESS;
goto cleanup;
}
/* Validate DNS cookie in response. This function may need to requeue the
 * query. */
if (ares_cookie_validate(query, rdnsrec, conn, now) != ARES_SUCCESS) {
/* Drop response and return */
status = ARES_SUCCESS;
goto cleanup;
}
/* At this point we know we've received an answer for this query, so we should
 * remove it from the connection's queue so we can possibly invalidate the
 * connection. Delay cleaning up the connection though as we may enqueue
 * something new. */
ares_llist_node_destroy(query->node_queries_to_conn);
query->node_queries_to_conn = NULL;
/* If we use EDNS and server answers with FORMERR without an OPT RR, the
 * protocol extension is not understood by the responder. We must retry the
 * query without EDNS enabled. */
if (ares_dns_record_get_rcode(rdnsrec) == ARES_RCODE_FORMERR &&
ares_dns_get_opt_rr_const(query->query) != NULL &&
ares_dns_get_opt_rr_const(rdnsrec) == NULL) {
status = rewrite_without_edns(query);
if (status != ARES_SUCCESS) {
end_query(channel, server, query, status, NULL);
goto cleanup;
}
/* The result of the resend is intentionally not propagated */
ares_send_query(query, now);
status = ARES_SUCCESS;
goto cleanup;
}
/* If we got a truncated UDP packet and are not ignoring truncation,
 * don't accept the packet, and switch the query to TCP if we hadn't
 * done so already.
 */
if (ares_dns_record_get_flags(rdnsrec) & ARES_FLAG_TC &&
!(conn->flags & ARES_CONN_FLAG_TCP) &&
!(channel->flags & ARES_FLAG_IGNTC)) {
query->using_tcp = ARES_TRUE;
ares_send_query(query, now);
status = ARES_SUCCESS; /* Switched to TCP is ok */
goto cleanup;
}
/* If we aren't passing through all error packets, discard packets
 * with SERVFAIL, NOTIMP, or REFUSED response codes.
 */
if (!(channel->flags & ARES_FLAG_NOCHECKRESP)) {
ares_dns_rcode_t rcode = ares_dns_record_get_rcode(rdnsrec);
if (rcode == ARES_RCODE_SERVFAIL || rcode == ARES_RCODE_NOTIMP ||
rcode == ARES_RCODE_REFUSED) {
/* Map the DNS rcode to the matching library status for the requeue */
switch (rcode) {
case ARES_RCODE_SERVFAIL:
status = ARES_ESERVFAIL;
break;
case ARES_RCODE_NOTIMP:
status = ARES_ENOTIMP;
break;
case ARES_RCODE_REFUSED:
status = ARES_EREFUSED;
break;
default:
break;
}
server_increment_failures(server, query->using_tcp);
ares_requeue_query(query, now, status, ARES_TRUE, rdnsrec);
/* Should any of these cause a connection termination?
 * Maybe SERVER_FAILURE? */
status = ARES_SUCCESS;
goto cleanup;
}
}
/* If cache insertion was successful, it took ownership. We ignore
 * other cache insertion failures. */
if (ares_qcache_insert(channel, now, query, rdnsrec) == ARES_SUCCESS) {
is_cached = ARES_TRUE;
}
server_set_good(server, query->using_tcp);
end_query(channel, server, query, ARES_SUCCESS, rdnsrec);
status = ARES_SUCCESS;
cleanup:
/* Don't cleanup the cached pointer to the dns response */
if (!is_cached) {
ares_dns_record_destroy(rdnsrec);
}
return status;
}
/* React to an error on a connection by closing it.
 *
 * conn             - connection that failed
 * critical_failure - when true, a failure is also recorded against the
 *                    connection's server
 * failure_status   - status forwarded to ares_close_connection()
 */
static void handle_conn_error(ares_conn_t *conn, ares_bool_t critical_failure,
                              ares_status_t failure_status)
{
  /* Record the failure before closing, so that a requeue triggered by the
   * close is unlikely to pick the same server again */
  if (critical_failure) {
    ares_bool_t was_tcp =
      (conn->flags & ARES_CONN_FLAG_TCP) ? ARES_TRUE : ARES_FALSE;

    server_increment_failures(conn->server, was_tcp);
  }

  /* This will requeue any connections automatically */
  ares_close_connection(conn, failure_status);
}
/* Detach a query from its current connection and either send it again or,
 * if it may not be retried any more, finish it with its recorded error.
 *
 * query         - query to requeue
 * now           - current time
 * status        - reason for the requeue; stored as the query's error status
 *                 unless it is ARES_SUCCESS
 * inc_try_count - whether this requeue counts as an attempt
 * dnsrec        - DNS record handed to the callback if the query is ended
 *
 * Returns the result of ares_send_query() when the query is retried, or
 * ARES_ETIMEOUT when the query was ended.
 */
ares_status_t ares_requeue_query(ares_query_t *query, const ares_timeval_t *now,
                                 ares_status_t            status,
                                 ares_bool_t              inc_try_count,
                                 const ares_dns_record_t *dnsrec)
{
  ares_channel_t *channel   = query->channel;
  size_t          max_tries = ares_slist_len(channel->servers) * channel->tries;
  ares_bool_t     can_retry;

  ares_query_remove_from_conn(query);

  if (status != ARES_SUCCESS) {
    query->error_status = status;
  }

  if (inc_try_count) {
    query->try_count++;
  }

  can_retry = (query->try_count < max_tries && !query->no_retries) ? ARES_TRUE
                                                                   : ARES_FALSE;
  if (can_retry) {
    return ares_send_query(query, now);
  }

  /* Every attempt failed.  Make sure some error is reported. */
  if (query->error_status == ARES_SUCCESS) {
    query->error_status = ARES_ETIMEOUT;
  }

  end_query(channel, NULL, query, query->error_status, dnsrec);
  return ARES_ETIMEOUT;
}
/* Pick a random server from the channel's list.  One random byte is reduced
 * modulo the number of servers to get a position, and the list is then
 * walked to that position.
 *
 * Returns the chosen server, or NULL if the channel has no servers.
 */
static ares_server_t *ares_random_server(ares_channel_t *channel)
{
  unsigned char      rnd;
  size_t             target;
  size_t             pos         = 0;
  ares_slist_node_t *node;
  size_t             num_servers = ares_slist_len(channel->servers);

  /* Silence coverity, not possible */
  if (num_servers == 0) {
    return NULL;
  }

  ares_rand_bytes(channel->rand_state, &rnd, 1);
  target = (size_t)rnd % num_servers;

  /* Step forward until we sit on the target position or run off the list */
  node = ares_slist_node_first(channel->servers);
  while (node != NULL && pos < target) {
    node = ares_slist_node_next(node);
    pos++;
  }

  if (node == NULL) {
    return NULL;
  }

  return ares_slist_node_val(node);
}
/* Pick a server from the list with failover behavior.
 *
 * The default choice is the first server in the sorted list: the one with the
 * fewest consecutive failures, ties broken by priority (idx).
 *
 * On its own that rule would starve a server that failed a few times: it
 * would not be tried again until every other server had accumulated the same
 * number of failures, which could take a long time.  To counter that, with
 * some probability a failed server whose retry time has passed is selected
 * instead.
 *
 * Returns the selected server, or NULL if the channel has no servers.
 */
static ares_server_t *ares_failover_server(ares_channel_t *channel)
{
  ares_server_t       *first_server = ares_slist_first_val(channel->servers);
  const ares_server_t *last_server  = ares_slist_last_val(channel->servers);
  unsigned short       roll;
  ares_timeval_t       now;
  ares_slist_node_t   *node;

  /* Defensive code against no servers being available on the channel. */
  if (first_server == NULL) {
    return NULL; /* LCOV_EXCL_LINE: DefensiveCoding */
  }

  /* The last server in the sorted list having no failures means that no
   * server has failures, so the first one is preferred. */
  if (last_server != NULL && last_server->consec_failures == 0) {
    return first_server;
  }

  /* Retrying failed servers is disabled */
  if (channel->server_retry_chance == 0) {
    return first_server;
  }

  /* Roll the dice on whether to retry a failed server.  The probability is
   * 1/channel->server_retry_chance, rounded up to a precision of 1/2^B where
   * B is the number of bits in the random value.  An unsigned short is used
   * for the random value for increased precision. */
  ares_rand_bytes(channel->rand_state, (unsigned char *)&roll, sizeof(roll));
  if (roll % channel->server_retry_chance != 0) {
    return first_server;
  }

  /* Look for a failed server that is due for a retry */
  ares_tvnow(&now);
  for (node = ares_slist_node_first(channel->servers); node != NULL;
       node = ares_slist_node_next(node)) {
    ares_server_t *candidate = ares_slist_node_val(node);

    if (candidate == NULL) {
      continue;
    }
    if (candidate->consec_failures > 0) {
      if (ares_timedout(&now, &candidate->next_retry_time)) {
        return candidate;
      }
    }
  }

  /* No suitable failed server, fall back to the first one */
  return first_server;
}
/* Compute the timeout, in milliseconds, to use for the next attempt of a
 * query against a server.
 *
 * query  - query being sent; its try_count selects the backoff round
 * server - server the query is being sent to
 * now    - current time, forwarded to ares_metrics_server_timeout()
 *
 * The base timeout comes from ares_metrics_server_timeout().  It is doubled
 * for every full trip through the server list, capped at the channel's
 * maxtimeout when one is set, reduced by a random jitter of up to 50%, and
 * finally never allowed to fall below the base timeout.
 *
 * Returns the timeout, or 0 if the channel has no servers.
 */
static size_t ares_calc_query_timeout(const ares_query_t   *query,
                                      const ares_server_t  *server,
                                      const ares_timeval_t *now)
{
  const ares_channel_t *channel     = query->channel;
  size_t                timeout     = ares_metrics_server_timeout(server, now);
  size_t                timeplus    = timeout;
  size_t                rounds;
  size_t                num_servers = ares_slist_len(channel->servers);

  if (num_servers == 0) {
    return 0; /* LCOV_EXCL_LINE: DefensiveCoding */
  }

  /* For each trip through the entire server list, we want to double the
   * retry from the last retry */
  rounds = (query->try_count / num_servers);
  if (rounds > 0) {
    /* Shifting by the full width of the type (or more) is undefined, and a
     * shift that overflows would silently wrap to a small value.  Saturate
     * instead.  The ceiling is half the size_t range so the value stays
     * positive when it is later converted to a signed 64-bit number of
     * seconds while being added to a timestamp. */
    const size_t ceiling    = ((size_t)-1) / 2;
    const size_t width_bits = sizeof(timeplus) * CHAR_BIT;

    if (rounds >= width_bits || timeplus > (ceiling >> rounds)) {
      timeplus = ceiling;
    } else {
      timeplus <<= rounds;
    }
  }

  if (channel->maxtimeout && timeplus > channel->maxtimeout) {
    timeplus = channel->maxtimeout;
  }

  /* Add some jitter to the retry timeout.
   *
   * Jitter is needed in situation when resolve requests are performed
   * simultaneously from multiple hosts and DNS server throttle these requests.
   * Adding randomness allows to avoid synchronisation of retries.
   *
   * Value of timeplus adjusted randomly to the range [0.5 * timeplus,
   * timeplus].
   */
  if (rounds > 0) {
    unsigned short r;
    float          delta_multiplier;

    ares_rand_bytes(channel->rand_state, (unsigned char *)&r, sizeof(r));
    delta_multiplier  = ((float)r / USHRT_MAX) * 0.5f;
    timeplus         -= (size_t)((float)timeplus * delta_multiplier);
  }

  /* We want explicitly guarantee that timeplus is greater or equal to timeout
   * specified in channel options. */
  if (timeplus < timeout) {
    timeplus = timeout;
  }

  return timeplus;
}
/* Find an existing connection to a server that the query can be sent on.
 *
 * For TCP queries this is the server's tcp_conn (which may be NULL).  For UDP
 * queries only the first connection in the server's list is considered; it
 * is rejected if it is a TCP connection or has already carried the
 * configured maximum number of queries.
 *
 * Returns the connection to reuse, or NULL if none is suitable.
 */
static ares_conn_t *ares_fetch_connection(const ares_channel_t *channel,
                                          ares_server_t        *server,
                                          const ares_query_t   *query)
{
  ares_llist_node_t *head;
  ares_conn_t       *candidate;
  ares_bool_t        is_udp;
  ares_bool_t        exhausted;

  if (query->using_tcp) {
    return server->tcp_conn;
  }

  head = ares_llist_node_first(server->connections);
  if (head == NULL) {
    return NULL;
  }

  candidate = ares_llist_node_val(head);

  is_udp    = (candidate->flags & ARES_CONN_FLAG_TCP) ? ARES_FALSE : ARES_TRUE;
  exhausted = (channel->udp_max_queries > 0 &&
               candidate->total_queries >= channel->udp_max_queries)
                ? ARES_TRUE
                : ARES_FALSE;

  if (!is_udp || exhausted) {
    return NULL;
  }

  return candidate;
}
/* Serialize a query into a connection's output buffer and, when possible,
 * push it onto the wire.
 *
 * Returns the failure from ares_cookie_apply() or ares_dns_write_buf_tcp()
 * if either fails, ARES_SUCCESS if the write was deferred, otherwise the
 * result of ares_conn_flush().
 */
static ares_status_t ares_conn_query_write(ares_conn_t          *conn,
                                           ares_query_t         *query,
                                           const ares_timeval_t *now)
{
  ares_channel_t *channel = conn->server->channel;
  ares_status_t   rc;
  ares_bool_t     is_tcp;

  rc = ares_cookie_apply(query->query, conn, now);
  if (rc != ARES_SUCCESS) {
    return rc;
  }

  /* The TCP wire format is used even for UDP; the length is stripped again
   * before the data goes on the wire */
  rc = ares_dns_write_buf_tcp(query->query, conn->out_buf);
  if (rc != ARES_SUCCESS) {
    return rc;
  }

  is_tcp = (conn->flags & ARES_CONN_FLAG_TCP) ? ARES_TRUE : ARES_FALSE;

  /* A TCP connection that is neither connected nor waiting on a TFO initial
   * write cannot be written to until we are signalled */
  if (is_tcp && !(conn->state_flags & ARES_CONN_STATE_CONNECTED) &&
      !(conn->flags & ARES_CONN_FLAG_TFO_INITIAL)) {
    return ARES_SUCCESS;
  }

  /* Defer the actual write when possible: TCP only, a notification callback
   * must be configured, and no notification may already be outstanding */
  if (channel->notify_pending_write_cb && !channel->notify_pending_write &&
      is_tcp) {
    channel->notify_pending_write = ARES_TRUE;
    channel->notify_pending_write_cb(channel->notify_pending_write_cb_data);
    return ARES_SUCCESS;
  }

  /* Otherwise write immediately; multiple queries cannot be aggregated into
   * a single write here */
  return ares_conn_flush(conn);
}
/* Send a query to a server and register it for timeout and per-connection
 * tracking.
 *
 * query - query to send
 * now   - current time; used as the send timestamp and as the base of the
 *         timeout
 *
 * Steps: pick a server (random when channel->rotate is set, otherwise with
 * failover behavior), reuse or open a connection, write the query, then
 * insert the query into the channel's timeout list and the connection's
 * query list.
 *
 * Returns ARES_SUCCESS once the query is written and tracked.  On failure
 * the query has either been ended via end_query() or handed to
 * ares_requeue_query(), so the caller must not assume it is still valid.
 */
ares_status_t ares_send_query(ares_query_t *query, const ares_timeval_t *now)
{
ares_channel_t *channel = query->channel;
ares_server_t *server;
ares_conn_t *conn;
size_t timeplus;
ares_status_t status;
/* Choose the server to send the query to */
if (channel->rotate) {
/* Pull random server */
server = ares_random_server(channel);
} else {
/* Pull server with failover behavior */
server = ares_failover_server(channel);
}
if (server == NULL) {
end_query(channel, server, query, ARES_ENOSERVER /* ? */, NULL);
return ARES_ENOSERVER;
}
/* Reuse an existing connection if one is suitable, otherwise open one */
conn = ares_fetch_connection(channel, server, query);
if (conn == NULL) {
status = ares_open_connection(&conn, channel, server, query->using_tcp);
switch (status) {
/* Good result, continue on */
case ARES_SUCCESS:
break;
/* These conditions are retryable as they are server-specific
 * error codes */
case ARES_ECONNREFUSED:
case ARES_EBADFAMILY:
server_increment_failures(server, query->using_tcp);
return ares_requeue_query(query, now, status, ARES_TRUE, NULL);
/* Anything else is not retryable, likely ENOMEM */
default:
end_query(channel, server, query, status, NULL);
return status;
}
}
/* Write the query */
status = ares_conn_query_write(conn, query, now);
switch (status) {
/* Good result, continue on */
case ARES_SUCCESS:
break;
case ARES_ENOMEM:
/* Not retryable */
end_query(channel, server, query, status, NULL);
return status;
/* These conditions are retryable as they are server-specific
 * error codes */
case ARES_ECONNREFUSED:
case ARES_EBADFAMILY:
handle_conn_error(conn, ARES_TRUE, status);
status = ares_requeue_query(query, now, status, ARES_TRUE, NULL);
/* ares_requeue_query() reports an ended query as ARES_ETIMEOUT; report
 * the connection failure instead */
if (status == ARES_ETIMEOUT) {
status = ARES_ECONNREFUSED;
}
return status;
default:
server_increment_failures(server, query->using_tcp);
status = ares_requeue_query(query, now, status, ARES_TRUE, NULL);
return status;
}
timeplus = ares_calc_query_timeout(query, server, now);
/* Keep track of queries bucketed by timeout, so we can process
 * timeout events quickly.
 */
ares_slist_node_destroy(query->node_queries_by_timeout);
query->ts = *now;
query->timeout = *now;
timeadd(&query->timeout, timeplus);
query->node_queries_by_timeout =
ares_slist_insert(channel->queries_by_timeout, query);
if (!query->node_queries_by_timeout) {
/* LCOV_EXCL_START: OutOfMemory */
end_query(channel, server, query, ARES_ENOMEM, NULL);
return ARES_ENOMEM;
/* LCOV_EXCL_STOP */
}
/* Keep track of queries bucketed by connection, so we can process errors
 * quickly. */
ares_llist_node_destroy(query->node_queries_to_conn);
query->node_queries_to_conn =
ares_llist_insert_last(conn->queries_to_conn, query);
if (query->node_queries_to_conn == NULL) {
/* LCOV_EXCL_START: OutOfMemory */
end_query(channel, server, query, ARES_ENOMEM, NULL);
return ARES_ENOMEM;
/* LCOV_EXCL_STOP */
}
query->conn = conn;
conn->total_queries++;
return ARES_SUCCESS;
}
/* Check that the question section of a response matches the question section
 * of the query that was sent.
 *
 * query - the outstanding query
 * arec  - the parsed response
 *
 * Returns ARES_TRUE only if both have the same number of questions and each
 * pair agrees in type, class and name; otherwise ARES_FALSE.
 */
static ares_bool_t same_questions(const ares_query_t      *query,
                                  const ares_dns_record_t *arec)
{
  const ares_dns_record_t *qrec    = query->query;
  const ares_channel_t    *channel = query->channel;
  size_t                   idx;

  if (ares_dns_record_query_cnt(qrec) != ares_dns_record_query_cnt(arec)) {
    return ARES_FALSE;
  }

  for (idx = 0; idx < ares_dns_record_query_cnt(qrec); idx++) {
    const char         *sent_name = NULL;
    const char         *recv_name = NULL;
    ares_dns_rec_type_t sent_type;
    ares_dns_rec_type_t recv_type;
    ares_dns_class_t    sent_class;
    ares_dns_class_t    recv_class;
    ares_bool_t         names_match;

    if (ares_dns_record_query_get(qrec, idx, &sent_name, &sent_type,
                                  &sent_class) != ARES_SUCCESS ||
        sent_name == NULL) {
      return ARES_FALSE;
    }

    if (ares_dns_record_query_get(arec, idx, &recv_name, &recv_type,
                                  &recv_class) != ARES_SUCCESS ||
        recv_name == NULL) {
      return ARES_FALSE;
    }

    if (sent_type != recv_type || sent_class != recv_class) {
      return ARES_FALSE;
    }

    if ((channel->flags & ARES_FLAG_DNS0x20) && !query->using_tcp) {
      /* NOTE: for DNS 0x20, part of the protection is to use a case-sensitive
       *       comparison of the DNS query name.  This expects the upstream DNS
       *       server to preserve the case of the name in the response packet.
       *       https://datatracker.ietf.org/doc/html/draft-vixie-dnsext-dns0x20-00
       */
      names_match = ares_streq(sent_name, recv_name);
    } else {
      /* without DNS0x20 use case-insensitive matching */
      names_match = ares_strcaseeq(sent_name, recv_name);
    }

    if (!names_match) {
      return ARES_FALSE;
    }
  }

  return ARES_TRUE;
}
/* Unlink a query from every container that references it: its connection
 * and timeout list, the channel's query-id table, and the list of all
 * queries. */
static void ares_detach_query(ares_query_t *query)
{
  ares_channel_t *channel = query->channel;

  /* Connection and timeout tracking */
  ares_query_remove_from_conn(query);

  /* Lookup by query id */
  ares_htable_szvp_remove(channel->queries_by_qid, query->qid);

  /* List of all queries */
  ares_llist_node_destroy(query->node_all_queries);
  query->node_all_queries = NULL;
}
/* Finish a query: record metrics, deliver the result to the query's
 * callback, free the query, and then notify if the channel's queue is empty.
 *
 * channel - channel the query belongs to
 * server  - server the result is attributed to for metrics; callers in this
 *           file pass NULL when no specific server applies
 * query   - query being completed; it is freed by this call and must not be
 *           used afterwards
 * status  - final status passed to the callback
 * dnsrec  - DNS response passed to the callback, or NULL
 */
static void end_query(ares_channel_t *channel, ares_server_t *server,
ares_query_t *query, ares_status_t status,
const ares_dns_record_t *dnsrec)
{
ares_metrics_record(query, server, status, dnsrec);
/* Invoke the callback. */
query->callback(query->arg, status, query->timeouts, dnsrec);
ares_free_query(query);
/* Check and notify if no other queries are enqueued on the channel. This
 * must come after the callback and freeing the query for 2 reasons.
 * 1) The callback itself may enqueue a new query
 * 2) Technically the current query isn't detached until it is free()'d.
 */
ares_queue_notify_empty(channel);
}
/* Detach a query from all tracking structures and release its memory,
 * including the DNS record it owns. */
void ares_free_query(ares_query_t *query)
{
  ares_dns_record_t *record;

  ares_detach_query(query);

  record = query->query;

  /* Clear the callback details first, to help catch bugs */
  query->callback = NULL;
  query->arg      = NULL;

  /* Release the request record, then the query itself */
  ares_dns_record_destroy(record);
  ares_free(query);
}