blob: 29fa65176daaa82112e0826e69b61f4a6f27ecff [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001
2/* Copyright 1998 by the Massachusetts Institute of Technology.
3 * Copyright (C) 2004-2016 by Daniel Stenberg
4 *
5 * Permission to use, copy, modify, and distribute this
6 * software and its documentation for any purpose and without
7 * fee is hereby granted, provided that the above copyright
8 * notice appear in all copies and that both that copyright
9 * notice and this permission notice appear in supporting
10 * documentation, and that the name of M.I.T. not be used in
11 * advertising or publicity pertaining to distribution of the
12 * software without specific, written prior permission.
13 * M.I.T. makes no representations about the suitability of
14 * this software for any purpose. It is provided "as is"
15 * without express or implied warranty.
16 */
17
18#include "ares_setup.h"
19
20#ifdef HAVE_SYS_UIO_H
21# include <sys/uio.h>
22#endif
23#ifdef HAVE_NETINET_IN_H
24# include <netinet/in.h>
25#endif
26#ifdef HAVE_NETINET_TCP_H
27# include <netinet/tcp.h>
28#endif
29#ifdef HAVE_NETDB_H
30# include <netdb.h>
31#endif
32#ifdef HAVE_ARPA_NAMESER_H
33# include <arpa/nameser.h>
34#else
35# include "nameser.h"
36#endif
37#ifdef HAVE_ARPA_NAMESER_COMPAT_H
38# include <arpa/nameser_compat.h>
39#endif
40
41#ifdef HAVE_STRINGS_H
42# include <strings.h>
43#endif
44#ifdef HAVE_SYS_IOCTL_H
45# include <sys/ioctl.h>
46#endif
47#ifdef NETWARE
48# include <sys/filio.h>
49#endif
50
51#include <assert.h>
52#include <fcntl.h>
53
54#include "ares.h"
55#include "ares_dns.h"
56#include "ares_nowarn.h"
57#include "ares_private.h"
58
59
60static int try_again(int errnum);
61static void write_tcp_data(ares_channel channel, fd_set *write_fds,
62 ares_socket_t write_fd, struct timeval *now);
63static void read_tcp_data(ares_channel channel, fd_set *read_fds,
64 ares_socket_t read_fd, struct timeval *now);
65static void read_udp_packets(ares_channel channel, fd_set *read_fds,
66 ares_socket_t read_fd, struct timeval *now);
67static void advance_tcp_send_queue(ares_channel channel, int whichserver,
68 ssize_t num_bytes);
69static void process_timeouts(ares_channel channel, struct timeval *now);
70static void process_broken_connections(ares_channel channel,
71 struct timeval *now);
72static void process_answer(ares_channel channel, unsigned char *abuf,
73 int alen, int whichserver, int tcp,
74 struct timeval *now);
75static void handle_error(ares_channel channel, int whichserver,
76 struct timeval *now);
77static void skip_server(ares_channel channel, struct query *query,
78 int whichserver);
79static void next_server(ares_channel channel, struct query *query,
80 struct timeval *now);
81static int open_tcp_socket(ares_channel channel, struct server_state *server);
82static int open_udp_socket(ares_channel channel, struct server_state *server);
83static int same_questions(const unsigned char *qbuf, int qlen,
84 const unsigned char *abuf, int alen);
85static int same_address(struct sockaddr *sa, struct ares_addr *aa);
86static void end_query(ares_channel channel, struct query *query, int status,
87 unsigned char *abuf, int alen);
88
/* Report whether the deadline 'check' has been reached.
 *
 * Returns 1 when 'now' is at or after 'check', and 0 when 'now' is still
 * earlier. Whole seconds are compared first; the microsecond fields only
 * matter when the seconds are equal.
 */
int ares__timedout(struct timeval *now,
                   struct timeval *check)
{
  const long delta_sec = (now->tv_sec - check->tv_sec);

  /* A difference in whole seconds settles the question on its own. */
  if (delta_sec != 0)
    return (delta_sec > 0) ? 1 : 0;

  /* Same second: fall back to the sub-second part. */
  return ((now->tv_usec - check->tv_usec) >= 0) ? 1 : 0;
}
103
/* Advance the timeval 'now' in place by 'millisecs' milliseconds.
 *
 * The value is split into whole seconds and a microsecond remainder; at most
 * one carry from tv_usec into tv_sec is performed afterwards.
 */
static void timeadd(struct timeval *now, int millisecs)
{
  const int whole_secs = millisecs / 1000;
  const int extra_usecs = (millisecs % 1000) * 1000;

  now->tv_sec += whole_secs;
  now->tv_usec += extra_usecs;

  /* Normalise: keep tv_usec below one second. */
  if (now->tv_usec >= 1000000) {
    now->tv_usec -= 1000000;
    now->tv_sec += 1;
  }
}
115
116/*
117 * generic process function
118 */
119static void processfds(ares_channel channel,
120 fd_set *read_fds, ares_socket_t read_fd,
121 fd_set *write_fds, ares_socket_t write_fd)
122{
123 struct timeval now = ares__tvnow();
124
125 write_tcp_data(channel, write_fds, write_fd, &now);
126 read_tcp_data(channel, read_fds, read_fd, &now);
127 read_udp_packets(channel, read_fds, read_fd, &now);
128 process_timeouts(channel, &now);
129 process_broken_connections(channel, &now);
130}
131
132/* Something interesting happened on the wire, or there was a timeout.
133 * See what's up and respond accordingly.
134 */
135void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds)
136{
137 processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD);
138}
139
140/* Something interesting happened on the wire, or there was a timeout.
141 * See what's up and respond accordingly.
142 */
143void ares_process_fd(ares_channel channel,
144 ares_socket_t read_fd, /* use ARES_SOCKET_BAD or valid
145 file descriptors */
146 ares_socket_t write_fd)
147{
148 processfds(channel, NULL, read_fd, NULL, write_fd);
149}
150
151
/* Tell whether 'errnum' merely means "not ready yet, retry later".
 *
 * Returns 1 for EWOULDBLOCK and for EAGAIN, 0 for every other value. Both
 * names are checked because some systems (the original note cites HP-UX and
 * its send(2) manual page) may report either one. The EAGAIN test is only
 * compiled when its value differs from EWOULDBLOCK.
 */
static int try_again(int errnum)
{
#if !defined EWOULDBLOCK && !defined EAGAIN
#error "Neither EWOULDBLOCK nor EAGAIN defined"
#endif

#ifdef EWOULDBLOCK
  if (errnum == EWOULDBLOCK)
    return 1;
#endif
#if defined EAGAIN && EAGAIN != EWOULDBLOCK
  if (errnum == EAGAIN)
    return 1;
#endif

  return 0;
}
177
178/* If any TCP sockets select true for writing, write out queued data
179 * we have for them.
180 */
181static void write_tcp_data(ares_channel channel,
182 fd_set *write_fds,
183 ares_socket_t write_fd,
184 struct timeval *now)
185{
186 struct server_state *server;
187 struct send_request *sendreq;
188 struct iovec *vec;
189 int i;
190 ssize_t scount;
191 ssize_t wcount;
192 size_t n;
193
194 if(!write_fds && (write_fd == ARES_SOCKET_BAD))
195 /* no possible action */
196 return;
197
198 for (i = 0; i < channel->nservers; i++)
199 {
200 /* Make sure server has data to send and is selected in write_fds or
201 write_fd. */
202 server = &channel->servers[i];
203 if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD ||
204 server->is_broken)
205 continue;
206
207 if(write_fds) {
208 if(!FD_ISSET(server->tcp_socket, write_fds))
209 continue;
210 }
211 else {
212 if(server->tcp_socket != write_fd)
213 continue;
214 }
215
216 if(write_fds)
217 /* If there's an error and we close this socket, then open
218 * another with the same fd to talk to another server, then we
219 * don't want to think that it was the new socket that was
220 * ready. This is not disastrous, but is likely to result in
221 * extra system calls and confusion. */
222 FD_CLR(server->tcp_socket, write_fds);
223
224 /* Count the number of send queue items. */
225 n = 0;
226 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
227 n++;
228
229 /* Allocate iovecs so we can send all our data at once. */
230 vec = ares_malloc(n * sizeof(struct iovec));
231 if (vec)
232 {
233 /* Fill in the iovecs and send. */
234 n = 0;
235 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
236 {
237 vec[n].iov_base = (char *) sendreq->data;
238 vec[n].iov_len = sendreq->len;
239 n++;
240 }
241 wcount = (ssize_t)writev(server->tcp_socket, vec, (int)n);
242 ares_free(vec);
243 if (wcount < 0)
244 {
245 if (!try_again(SOCKERRNO))
246 handle_error(channel, i, now);
247 continue;
248 }
249
250 /* Advance the send queue by as many bytes as we sent. */
251 advance_tcp_send_queue(channel, i, wcount);
252 }
253 else
254 {
255 /* Can't allocate iovecs; just send the first request. */
256 sendreq = server->qhead;
257
258 scount = swrite(server->tcp_socket, sendreq->data, sendreq->len);
259 if (scount < 0)
260 {
261 if (!try_again(SOCKERRNO))
262 handle_error(channel, i, now);
263 continue;
264 }
265
266 /* Advance the send queue by as many bytes as we sent. */
267 advance_tcp_send_queue(channel, i, scount);
268 }
269 }
270}
271
272/* Consume the given number of bytes from the head of the TCP send queue. */
273static void advance_tcp_send_queue(ares_channel channel, int whichserver,
274 ssize_t num_bytes)
275{
276 struct send_request *sendreq;
277 struct server_state *server = &channel->servers[whichserver];
278 while (num_bytes > 0) {
279 sendreq = server->qhead;
280 if ((size_t)num_bytes >= sendreq->len) {
281 num_bytes -= sendreq->len;
282 server->qhead = sendreq->next;
283 if (sendreq->data_storage)
284 ares_free(sendreq->data_storage);
285 ares_free(sendreq);
286 if (server->qhead == NULL) {
287 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0);
288 server->qtail = NULL;
289
290 /* qhead is NULL so we cannot continue this loop */
291 break;
292 }
293 }
294 else {
295 sendreq->data += num_bytes;
296 sendreq->len -= num_bytes;
297 num_bytes = 0;
298 }
299 }
300}
301
302/* If any TCP socket selects true for reading, read some data,
303 * allocate a buffer if we finish reading the length word, and process
304 * a packet if we finish reading one.
305 */
306static void read_tcp_data(ares_channel channel, fd_set *read_fds,
307 ares_socket_t read_fd, struct timeval *now)
308{
309 struct server_state *server;
310 int i;
311 ssize_t count;
312
313 if(!read_fds && (read_fd == ARES_SOCKET_BAD))
314 /* no possible action */
315 return;
316
317 for (i = 0; i < channel->nservers; i++)
318 {
319 /* Make sure the server has a socket and is selected in read_fds. */
320 server = &channel->servers[i];
321 if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken)
322 continue;
323
324 if(read_fds) {
325 if(!FD_ISSET(server->tcp_socket, read_fds))
326 continue;
327 }
328 else {
329 if(server->tcp_socket != read_fd)
330 continue;
331 }
332
333 if(read_fds)
334 /* If there's an error and we close this socket, then open another
335 * with the same fd to talk to another server, then we don't want to
336 * think that it was the new socket that was ready. This is not
337 * disastrous, but is likely to result in extra system calls and
338 * confusion. */
339 FD_CLR(server->tcp_socket, read_fds);
340
341 if (server->tcp_lenbuf_pos != 2)
342 {
343 /* We haven't yet read a length word, so read that (or
344 * what's left to read of it).
345 */
346 count = sread(server->tcp_socket,
347 server->tcp_lenbuf + server->tcp_lenbuf_pos,
348 2 - server->tcp_lenbuf_pos);
349 if (count <= 0)
350 {
351 if (!(count == -1 && try_again(SOCKERRNO)))
352 handle_error(channel, i, now);
353 continue;
354 }
355
356 server->tcp_lenbuf_pos += (int)count;
357 if (server->tcp_lenbuf_pos == 2)
358 {
359 /* We finished reading the length word. Decode the
360 * length and allocate a buffer for the data.
361 */
362 server->tcp_length = server->tcp_lenbuf[0] << 8
363 | server->tcp_lenbuf[1];
364 server->tcp_buffer = ares_malloc(server->tcp_length);
365 if (!server->tcp_buffer) {
366 handle_error(channel, i, now);
367 return; /* bail out on malloc failure. TODO: make this
368 function return error codes */
369 }
370 server->tcp_buffer_pos = 0;
371 }
372 }
373 else
374 {
375 /* Read data into the allocated buffer. */
376 count = sread(server->tcp_socket,
377 server->tcp_buffer + server->tcp_buffer_pos,
378 server->tcp_length - server->tcp_buffer_pos);
379 if (count <= 0)
380 {
381 if (!(count == -1 && try_again(SOCKERRNO)))
382 handle_error(channel, i, now);
383 continue;
384 }
385
386 server->tcp_buffer_pos += (int)count;
387 if (server->tcp_buffer_pos == server->tcp_length)
388 {
389 /* We finished reading this answer; process it and
390 * prepare to read another length word.
391 */
392 process_answer(channel, server->tcp_buffer, server->tcp_length,
393 i, 1, now);
394 ares_free(server->tcp_buffer);
395 server->tcp_buffer = NULL;
396 server->tcp_lenbuf_pos = 0;
397 server->tcp_buffer_pos = 0;
398 }
399 }
400 }
401}
402
403/* If any UDP sockets select true for reading, process them. */
404static void read_udp_packets(ares_channel channel, fd_set *read_fds,
405 ares_socket_t read_fd, struct timeval *now)
406{
407 struct server_state *server;
408 int i;
409 ssize_t count;
410 unsigned char buf[MAXENDSSZ + 1];
411#ifdef HAVE_RECVFROM
412 ares_socklen_t fromlen;
413 union {
414 struct sockaddr sa;
415 struct sockaddr_in sa4;
416 struct sockaddr_in6 sa6;
417 } from;
418#endif
419
420 if(!read_fds && (read_fd == ARES_SOCKET_BAD))
421 /* no possible action */
422 return;
423
424 for (i = 0; i < channel->nservers; i++)
425 {
426 /* Make sure the server has a socket and is selected in read_fds. */
427 server = &channel->servers[i];
428
429 if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken)
430 continue;
431
432 if(read_fds) {
433 if(!FD_ISSET(server->udp_socket, read_fds))
434 continue;
435 }
436 else {
437 if(server->udp_socket != read_fd)
438 continue;
439 }
440
441 if(read_fds)
442 /* If there's an error and we close this socket, then open
443 * another with the same fd to talk to another server, then we
444 * don't want to think that it was the new socket that was
445 * ready. This is not disastrous, but is likely to result in
446 * extra system calls and confusion. */
447 FD_CLR(server->udp_socket, read_fds);
448
449 /* To reduce event loop overhead, read and process as many
450 * packets as we can. */
451 do {
452 if (server->udp_socket == ARES_SOCKET_BAD)
453 count = 0;
454
455 else {
456#ifdef HAVE_RECVFROM
457 if (server->addr.family == AF_INET)
458 fromlen = sizeof(from.sa4);
459 else
460 fromlen = sizeof(from.sa6);
461 count = (ssize_t)recvfrom(server->udp_socket, (void *)buf,
462 sizeof(buf), 0, &from.sa, &fromlen);
463#else
464 count = sread(server->udp_socket, buf, sizeof(buf));
465#endif
466 }
467
468 if (count == -1 && try_again(SOCKERRNO))
469 continue;
470 else if (count <= 0)
471 handle_error(channel, i, now);
472#ifdef HAVE_RECVFROM
473 else if (!same_address(&from.sa, &server->addr))
474 /* The address the response comes from does not match the address we
475 * sent the request to. Someone may be attempting to perform a cache
476 * poisoning attack. */
477 break;
478#endif
479 else
480 process_answer(channel, buf, (int)count, i, 0, now);
481 } while (count > 0);
482 }
483}
484
485/* If any queries have timed out, note the timeout and move them on. */
486static void process_timeouts(ares_channel channel, struct timeval *now)
487{
488 time_t t; /* the time of the timeouts we're processing */
489 struct query *query;
490 struct list_node* list_head;
491 struct list_node* list_node;
492
493 /* Process all the timeouts that have fired since the last time we processed
494 * timeouts. If things are going well, then we'll have hundreds/thousands of
495 * queries that fall into future buckets, and only a handful of requests
496 * that fall into the "now" bucket, so this should be quite quick.
497 */
498 for (t = channel->last_timeout_processed; t <= now->tv_sec; t++)
499 {
500 list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]);
501 for (list_node = list_head->next; list_node != list_head; )
502 {
503 query = list_node->data;
504 list_node = list_node->next; /* in case the query gets deleted */
505 if (query->timeout.tv_sec && ares__timedout(now, &query->timeout))
506 {
507 query->error_status = ARES_ETIMEOUT;
508 ++query->timeouts;
509 next_server(channel, query, now);
510 }
511 }
512 }
513 channel->last_timeout_processed = now->tv_sec;
514}
515
516/* Handle an answer from a server. */
517static void process_answer(ares_channel channel, unsigned char *abuf,
518 int alen, int whichserver, int tcp,
519 struct timeval *now)
520{
521 int tc, rcode, packetsz;
522 unsigned short id;
523 struct query *query;
524 struct list_node* list_head;
525 struct list_node* list_node;
526
527 /* If there's no room in the answer for a header, we can't do much
528 * with it. */
529 if (alen < HFIXEDSZ)
530 return;
531
532 /* Grab the query ID, truncate bit, and response code from the packet. */
533 id = DNS_HEADER_QID(abuf);
534 tc = DNS_HEADER_TC(abuf);
535 rcode = DNS_HEADER_RCODE(abuf);
536
537 /* Find the query corresponding to this packet. The queries are
538 * hashed/bucketed by query id, so this lookup should be quick. Note that
539 * both the query id and the questions must be the same; when the query id
540 * wraps around we can have multiple outstanding queries with the same query
541 * id, so we need to check both the id and question.
542 */
543 query = NULL;
544 list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]);
545 for (list_node = list_head->next; list_node != list_head;
546 list_node = list_node->next)
547 {
548 struct query *q = list_node->data;
549 if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen))
550 {
551 query = q;
552 break;
553 }
554 }
555 if (!query)
556 return;
557
558 packetsz = PACKETSZ;
559 /* If we use EDNS and server answers with one of these RCODES, the protocol
560 * extension is not understood by the responder. We must retry the query
561 * without EDNS enabled.
562 */
563 if (channel->flags & ARES_FLAG_EDNS)
564 {
565 packetsz = channel->ednspsz;
566 if (rcode == NOTIMP || rcode == FORMERR || rcode == SERVFAIL)
567 {
568 int qlen = (query->tcplen - 2) - EDNSFIXEDSZ;
569 channel->flags ^= ARES_FLAG_EDNS;
570 query->tcplen -= EDNSFIXEDSZ;
571 query->qlen -= EDNSFIXEDSZ;
572 query->tcpbuf[0] = (unsigned char)((qlen >> 8) & 0xff);
573 query->tcpbuf[1] = (unsigned char)(qlen & 0xff);
574 DNS_HEADER_SET_ARCOUNT(query->tcpbuf + 2, 0);
575 query->tcpbuf = ares_realloc(query->tcpbuf, query->tcplen);
576 query->qbuf = query->tcpbuf + 2;
577 ares__send_query(channel, query, now);
578 return;
579 }
580 }
581
582 /* If we got a truncated UDP packet and are not ignoring truncation,
583 * don't accept the packet, and switch the query to TCP if we hadn't
584 * done so already.
585 */
586 if ((tc || alen > packetsz) && !tcp && !(channel->flags & ARES_FLAG_IGNTC))
587 {
588 if (!query->using_tcp)
589 {
590 query->using_tcp = 1;
591 ares__send_query(channel, query, now);
592 }
593 return;
594 }
595
596 /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we
597 * are ignoring truncation.
598 */
599 if (alen > packetsz && !tcp)
600 alen = packetsz;
601
602 /* If we aren't passing through all error packets, discard packets
603 * with SERVFAIL, NOTIMP, or REFUSED response codes.
604 */
605 if (!(channel->flags & ARES_FLAG_NOCHECKRESP))
606 {
607 if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED)
608 {
609 skip_server(channel, query, whichserver);
610 if (query->server == whichserver)
611 next_server(channel, query, now);
612 return;
613 }
614 }
615
616 end_query(channel, query, ARES_SUCCESS, abuf, alen);
617}
618
619/* Close all the connections that are no longer usable. */
620static void process_broken_connections(ares_channel channel,
621 struct timeval *now)
622{
623 int i;
624 for (i = 0; i < channel->nservers; i++)
625 {
626 struct server_state *server = &channel->servers[i];
627 if (server->is_broken)
628 {
629 handle_error(channel, i, now);
630 }
631 }
632}
633
634/* Swap the contents of two lists */
635static void swap_lists(struct list_node* head_a,
636 struct list_node* head_b)
637{
638 int is_a_empty = ares__is_list_empty(head_a);
639 int is_b_empty = ares__is_list_empty(head_b);
640 struct list_node old_a = *head_a;
641 struct list_node old_b = *head_b;
642
643 if (is_a_empty) {
644 ares__init_list_head(head_b);
645 } else {
646 *head_b = old_a;
647 old_a.next->prev = head_b;
648 old_a.prev->next = head_b;
649 }
650 if (is_b_empty) {
651 ares__init_list_head(head_a);
652 } else {
653 *head_a = old_b;
654 old_b.next->prev = head_a;
655 old_b.prev->next = head_a;
656 }
657}
658
659static void handle_error(ares_channel channel, int whichserver,
660 struct timeval *now)
661{
662 struct server_state *server;
663 struct query *query;
664 struct list_node list_head;
665 struct list_node* list_node;
666
667 server = &channel->servers[whichserver];
668
669 /* Reset communications with this server. */
670 ares__close_sockets(channel, server);
671
672 /* Tell all queries talking to this server to move on and not try this
673 * server again. We steal the current list of queries that were in-flight to
674 * this server, since when we call next_server this can cause the queries to
675 * be re-sent to this server, which will re-insert these queries in that
676 * same server->queries_to_server list.
677 */
678 ares__init_list_head(&list_head);
679 swap_lists(&list_head, &(server->queries_to_server));
680 for (list_node = list_head.next; list_node != &list_head; )
681 {
682 query = list_node->data;
683 list_node = list_node->next; /* in case the query gets deleted */
684 assert(query->server == whichserver);
685 skip_server(channel, query, whichserver);
686 next_server(channel, query, now);
687 }
688 /* Each query should have removed itself from our temporary list as
689 * it re-sent itself or finished up...
690 */
691 assert(ares__is_list_empty(&list_head));
692}
693
694static void skip_server(ares_channel channel, struct query *query,
695 int whichserver)
696{
697 /* The given server gave us problems with this query, so if we have the
698 * luxury of using other servers, then let's skip the potentially broken
699 * server and just use the others. If we only have one server and we need to
700 * retry then we should just go ahead and re-use that server, since it's our
701 * only hope; perhaps we just got unlucky, and retrying will work (eg, the
702 * server timed out our TCP connection just as we were sending another
703 * request).
704 */
705 if (channel->nservers > 1)
706 {
707 query->server_info[whichserver].skip_server = 1;
708 }
709}
710
711static void next_server(ares_channel channel, struct query *query,
712 struct timeval *now)
713{
714 /* We need to try each server channel->tries times. We have channel->nservers
715 * servers to try. In total, we need to do channel->nservers * channel->tries
716 * attempts. Use query->try to remember how many times we already attempted
717 * this query. Use modular arithmetic to find the next server to try. */
718 while (++(query->try_count) < (channel->nservers * channel->tries))
719 {
720 struct server_state *server;
721
722 /* Move on to the next server. */
723 query->server = (query->server + 1) % channel->nservers;
724 server = &channel->servers[query->server];
725
726 /* We don't want to use this server if (1) we decided this connection is
727 * broken, and thus about to be closed, (2) we've decided to skip this
728 * server because of earlier errors we encountered, or (3) we already
729 * sent this query over this exact connection.
730 */
731 if (!server->is_broken &&
732 !query->server_info[query->server].skip_server &&
733 !(query->using_tcp &&
734 (query->server_info[query->server].tcp_connection_generation ==
735 server->tcp_connection_generation)))
736 {
737 ares__send_query(channel, query, now);
738 return;
739 }
740
741 /* You might think that with TCP we only need one try. However, even
742 * when using TCP, servers can time-out our connection just as we're
743 * sending a request, or close our connection because they die, or never
744 * send us a reply because they get wedged or tickle a bug that drops
745 * our request.
746 */
747 }
748
749 /* If we are here, all attempts to perform query failed. */
750 end_query(channel, query, query->error_status, NULL, 0);
751}
752
753void ares__send_query(ares_channel channel, struct query *query,
754 struct timeval *now)
755{
756 struct send_request *sendreq;
757 struct server_state *server;
758 int timeplus;
759
760 server = &channel->servers[query->server];
761 if (query->using_tcp)
762 {
763 /* Make sure the TCP socket for this server is set up and queue
764 * a send request.
765 */
766 if (server->tcp_socket == ARES_SOCKET_BAD)
767 {
768 if (open_tcp_socket(channel, server) == -1)
769 {
770 skip_server(channel, query, query->server);
771 next_server(channel, query, now);
772 return;
773 }
774 }
775 sendreq = ares_malloc(sizeof(struct send_request));
776 if (!sendreq)
777 {
778 end_query(channel, query, ARES_ENOMEM, NULL, 0);
779 return;
780 }
781 memset(sendreq, 0, sizeof(struct send_request));
782 /* To make the common case fast, we avoid copies by using the query's
783 * tcpbuf for as long as the query is alive. In the rare case where the
784 * query ends while it's queued for transmission, then we give the
785 * sendreq its own copy of the request packet and put it in
786 * sendreq->data_storage.
787 */
788 sendreq->data_storage = NULL;
789 sendreq->data = query->tcpbuf;
790 sendreq->len = query->tcplen;
791 sendreq->owner_query = query;
792 sendreq->next = NULL;
793 if (server->qtail)
794 server->qtail->next = sendreq;
795 else
796 {
797 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1);
798 server->qhead = sendreq;
799 }
800 server->qtail = sendreq;
801 query->server_info[query->server].tcp_connection_generation =
802 server->tcp_connection_generation;
803 }
804 else
805 {
806 if (server->udp_socket == ARES_SOCKET_BAD)
807 {
808 if (open_udp_socket(channel, server) == -1)
809 {
810 skip_server(channel, query, query->server);
811 next_server(channel, query, now);
812 return;
813 }
814 }
815 if (swrite(server->udp_socket, query->qbuf, query->qlen) == -1)
816 {
817 /* FIXME: Handle EAGAIN here since it likely can happen. */
818 skip_server(channel, query, query->server);
819 next_server(channel, query, now);
820 return;
821 }
822 }
823 timeplus = channel->timeout << (query->try_count / channel->nservers);
824 timeplus = (timeplus * (9 + (rand () & 7))) / 16;
825 query->timeout = *now;
826 timeadd(&query->timeout, timeplus);
827 /* Keep track of queries bucketed by timeout, so we can process
828 * timeout events quickly.
829 */
830 ares__remove_from_list(&(query->queries_by_timeout));
831 ares__insert_in_list(
832 &(query->queries_by_timeout),
833 &(channel->queries_by_timeout[query->timeout.tv_sec %
834 ARES_TIMEOUT_TABLE_SIZE]));
835
836 /* Keep track of queries bucketed by server, so we can process server
837 * errors quickly.
838 */
839 ares__remove_from_list(&(query->queries_to_server));
840 ares__insert_in_list(&(query->queries_to_server),
841 &(server->queries_to_server));
842}
843
844/*
845 * setsocknonblock sets the given socket to either blocking or non-blocking
846 * mode based on the 'nonblock' boolean argument. This function is highly
847 * portable.
848 */
849static int setsocknonblock(ares_socket_t sockfd, /* operate on this */
850 int nonblock /* TRUE or FALSE */)
851{
852#if defined(USE_BLOCKING_SOCKETS)
853
854 return 0; /* returns success */
855
856#elif defined(HAVE_FCNTL_O_NONBLOCK)
857
858 /* most recent unix versions */
859 int flags;
860 flags = fcntl(sockfd, F_GETFL, 0);
861 if (FALSE != nonblock)
862 return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK);
863 else
864 return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK)); /* LCOV_EXCL_LINE */
865
866#elif defined(HAVE_IOCTL_FIONBIO)
867
868 /* older unix versions */
869 int flags = nonblock ? 1 : 0;
870 return ioctl(sockfd, FIONBIO, &flags);
871
872#elif defined(HAVE_IOCTLSOCKET_FIONBIO)
873
874#ifdef WATT32
875 char flags = nonblock ? 1 : 0;
876#else
877 /* Windows */
878 unsigned long flags = nonblock ? 1UL : 0UL;
879#endif
880 return ioctlsocket(sockfd, FIONBIO, &flags);
881
882#elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO)
883
884 /* Amiga */
885 long flags = nonblock ? 1L : 0L;
886 return IoctlSocket(sockfd, FIONBIO, flags);
887
888#elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK)
889
890 /* BeOS */
891 long b = nonblock ? 1L : 0L;
892 return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b));
893
894#else
895# error "no non-blocking method was found/used/set"
896#endif
897}
898#define SO_BINDTODEVICE 1
899static int configure_socket(ares_socket_t s, int family, ares_channel channel)
900{
901 union {
902 struct sockaddr sa;
903 struct sockaddr_in sa4;
904 struct sockaddr_in6 sa6;
905 } local;
906
907 (void)setsocknonblock(s, TRUE);
908
909#if defined(FD_CLOEXEC) && !defined(MSDOS)
910 /* Configure the socket fd as close-on-exec. */
911 if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1)
912 return -1; /* LCOV_EXCL_LINE */
913#endif
914
915 /* Set the socket's send and receive buffer sizes. */
916 if ((channel->socket_send_buffer_size > 0) &&
917 setsockopt(s, SOL_SOCKET, SO_SNDBUF,
918 (void *)&channel->socket_send_buffer_size,
919 sizeof(channel->socket_send_buffer_size)) == -1)
920 return -1;
921
922 if ((channel->socket_receive_buffer_size > 0) &&
923 setsockopt(s, SOL_SOCKET, SO_RCVBUF,
924 (void *)&channel->socket_receive_buffer_size,
925 sizeof(channel->socket_receive_buffer_size)) == -1)
926 return -1;
927fprintf(stderr, "configure_socket before\n");
928#ifdef SO_BINDTODEVICE
929 if (channel->local_dev_name[0]) {
930 if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
931 channel->local_dev_name, sizeof(channel->local_dev_name)) == -1) {
932 /* Only root can do this, and usually not fatal if it doesn't work, so */
933 /* just continue on. */
934 fprintf(stderr, "configure_socket failed\n");
935 }
936 }
937#endif
938fprintf(stderr, "configure_socket good\n");
939 if (family == AF_INET) {
940 if (channel->local_ip4) {
941 memset(&local.sa4, 0, sizeof(local.sa4));
942 local.sa4.sin_family = AF_INET;
943 local.sa4.sin_addr.s_addr = htonl(channel->local_ip4);
944 if (bind(s, &local.sa, sizeof(local.sa4)) < 0)
945 return -1;
946 }
947 }
948 else if (family == AF_INET6) {
949 if (memcmp(channel->local_ip6, &ares_in6addr_any,
950 sizeof(channel->local_ip6)) != 0) {
951 memset(&local.sa6, 0, sizeof(local.sa6));
952 local.sa6.sin6_family = AF_INET6;
953 memcpy(&local.sa6.sin6_addr, channel->local_ip6,
954 sizeof(channel->local_ip6));
955 if (bind(s, &local.sa, sizeof(local.sa6)) < 0)
956 return -1;
957 }
958 }
959
960 return 0;
961}
962
963static int open_tcp_socket(ares_channel channel, struct server_state *server)
964{
965 ares_socket_t s;
966 int opt;
967 ares_socklen_t salen;
968 union {
969 struct sockaddr_in sa4;
970 struct sockaddr_in6 sa6;
971 } saddr;
972 struct sockaddr *sa;
973
974 switch (server->addr.family)
975 {
976 case AF_INET:
977 sa = (void *)&saddr.sa4;
978 salen = sizeof(saddr.sa4);
979 memset(sa, 0, salen);
980 saddr.sa4.sin_family = AF_INET;
981 if (server->addr.tcp_port) {
982 saddr.sa4.sin_port = aresx_sitous(server->addr.tcp_port);
983 } else {
984 saddr.sa4.sin_port = aresx_sitous(channel->tcp_port);
985 }
986 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
987 sizeof(server->addr.addrV4));
988 break;
989 case AF_INET6:
990 sa = (void *)&saddr.sa6;
991 salen = sizeof(saddr.sa6);
992 memset(sa, 0, salen);
993 saddr.sa6.sin6_family = AF_INET6;
994 if (server->addr.tcp_port) {
995 saddr.sa6.sin6_port = aresx_sitous(server->addr.tcp_port);
996 } else {
997 saddr.sa6.sin6_port = aresx_sitous(channel->tcp_port);
998 }
999 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
1000 sizeof(server->addr.addrV6));
1001 break;
1002 default:
1003 return -1; /* LCOV_EXCL_LINE */
1004 }
1005
1006 /* Acquire a socket. */
1007 s = socket(server->addr.family, SOCK_STREAM, 0);
1008 if (s == ARES_SOCKET_BAD)
1009 return -1;
1010
1011 /* Configure it. */
1012 if (configure_socket(s, server->addr.family, channel) < 0)
1013 {
1014 sclose(s);
1015 return -1;
1016 }
1017
1018#ifdef TCP_NODELAY
1019 /*
1020 * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not
1021 * in configure_socket). In general, in DNS lookups we're pretty much
1022 * interested in firing off a single request and then waiting for a reply,
1023 * so batching isn't very interesting.
1024 */
1025 opt = 1;
1026 if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
1027 (void *)&opt, sizeof(opt)) == -1)
1028 {
1029 sclose(s);
1030 return -1;
1031 }
1032#endif
1033
1034 if (channel->sock_config_cb)
1035 {
1036 int err = channel->sock_config_cb(s, SOCK_STREAM,
1037 channel->sock_config_cb_data);
1038 if (err < 0)
1039 {
1040 sclose(s);
1041 return err;
1042 }
1043 }
1044
1045 /* Connect to the server. */
1046 if (connect(s, sa, salen) == -1)
1047 {
1048 int err = SOCKERRNO;
1049
1050 if (err != EINPROGRESS && err != EWOULDBLOCK)
1051 {
1052 sclose(s);
1053 return -1;
1054 }
1055 }
1056
1057 if (channel->sock_create_cb)
1058 {
1059 int err = channel->sock_create_cb(s, SOCK_STREAM,
1060 channel->sock_create_cb_data);
1061 if (err < 0)
1062 {
1063 sclose(s);
1064 return err;
1065 }
1066 }
1067
1068 SOCK_STATE_CALLBACK(channel, s, 1, 0);
1069 server->tcp_buffer_pos = 0;
1070 server->tcp_socket = s;
1071 server->tcp_connection_generation = ++channel->tcp_connection_generation;
1072 return 0;
1073}
1074
1075static int open_udp_socket(ares_channel channel, struct server_state *server)
1076{
1077 ares_socket_t s;
1078 ares_socklen_t salen;
1079 union {
1080 struct sockaddr_in sa4;
1081 struct sockaddr_in6 sa6;
1082 } saddr;
1083 struct sockaddr *sa;
1084
1085 switch (server->addr.family)
1086 {
1087 case AF_INET:
1088 sa = (void *)&saddr.sa4;
1089 salen = sizeof(saddr.sa4);
1090 memset(sa, 0, salen);
1091 saddr.sa4.sin_family = AF_INET;
1092 if (server->addr.udp_port) {
1093 saddr.sa4.sin_port = aresx_sitous(server->addr.udp_port);
1094 } else {
1095 saddr.sa4.sin_port = aresx_sitous(channel->udp_port);
1096 }
1097 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
1098 sizeof(server->addr.addrV4));
1099 break;
1100 case AF_INET6:
1101 sa = (void *)&saddr.sa6;
1102 salen = sizeof(saddr.sa6);
1103 memset(sa, 0, salen);
1104 saddr.sa6.sin6_family = AF_INET6;
1105 if (server->addr.udp_port) {
1106 saddr.sa6.sin6_port = aresx_sitous(server->addr.udp_port);
1107 } else {
1108 saddr.sa6.sin6_port = aresx_sitous(channel->udp_port);
1109 }
1110 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
1111 sizeof(server->addr.addrV6));
1112 break;
1113 default:
1114 return -1; /* LCOV_EXCL_LINE */
1115 }
1116
1117 /* Acquire a socket. */
1118 s = socket(server->addr.family, SOCK_DGRAM, 0);
1119 if (s == ARES_SOCKET_BAD)
1120 return -1;
1121
1122 /* Set the socket non-blocking. */
1123 if (configure_socket(s, server->addr.family, channel) < 0)
1124 {
1125 sclose(s);
1126 return -1;
1127 }
1128
1129 if (channel->sock_config_cb)
1130 {
1131 int err = channel->sock_config_cb(s, SOCK_DGRAM,
1132 channel->sock_config_cb_data);
1133 if (err < 0)
1134 {
1135 sclose(s);
1136 return err;
1137 }
1138 }
1139
1140 /* Connect to the server. */
1141 if (connect(s, sa, salen) == -1)
1142 {
1143 int err = SOCKERRNO;
1144
1145 if (err != EINPROGRESS && err != EWOULDBLOCK)
1146 {
1147 sclose(s);
1148 return -1;
1149 }
1150 }
1151
1152 if (channel->sock_create_cb)
1153 {
1154 int err = channel->sock_create_cb(s, SOCK_DGRAM,
1155 channel->sock_create_cb_data);
1156 if (err < 0)
1157 {
1158 sclose(s);
1159 return err;
1160 }
1161 }
1162
1163 SOCK_STATE_CALLBACK(channel, s, 1, 0);
1164
1165 server->udp_socket = s;
1166 return 0;
1167}
1168
1169static int same_questions(const unsigned char *qbuf, int qlen,
1170 const unsigned char *abuf, int alen)
1171{
1172 struct {
1173 const unsigned char *p;
1174 int qdcount;
1175 char *name;
1176 long namelen;
1177 int type;
1178 int dnsclass;
1179 } q, a;
1180 int i, j;
1181
1182 if (qlen < HFIXEDSZ || alen < HFIXEDSZ)
1183 return 0;
1184
1185 /* Extract qdcount from the request and reply buffers and compare them. */
1186 q.qdcount = DNS_HEADER_QDCOUNT(qbuf);
1187 a.qdcount = DNS_HEADER_QDCOUNT(abuf);
1188 if (q.qdcount != a.qdcount)
1189 return 0;
1190
1191 /* For each question in qbuf, find it in abuf. */
1192 q.p = qbuf + HFIXEDSZ;
1193 for (i = 0; i < q.qdcount; i++)
1194 {
1195 /* Decode the question in the query. */
1196 if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen)
1197 != ARES_SUCCESS)
1198 return 0;
1199 q.p += q.namelen;
1200 if (q.p + QFIXEDSZ > qbuf + qlen)
1201 {
1202 ares_free(q.name);
1203 return 0;
1204 }
1205 q.type = DNS_QUESTION_TYPE(q.p);
1206 q.dnsclass = DNS_QUESTION_CLASS(q.p);
1207 q.p += QFIXEDSZ;
1208
1209 /* Search for this question in the answer. */
1210 a.p = abuf + HFIXEDSZ;
1211 for (j = 0; j < a.qdcount; j++)
1212 {
1213 /* Decode the question in the answer. */
1214 if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen)
1215 != ARES_SUCCESS)
1216 {
1217 ares_free(q.name);
1218 return 0;
1219 }
1220 a.p += a.namelen;
1221 if (a.p + QFIXEDSZ > abuf + alen)
1222 {
1223 ares_free(q.name);
1224 ares_free(a.name);
1225 return 0;
1226 }
1227 a.type = DNS_QUESTION_TYPE(a.p);
1228 a.dnsclass = DNS_QUESTION_CLASS(a.p);
1229 a.p += QFIXEDSZ;
1230
1231 /* Compare the decoded questions. */
1232 if (strcasecmp(q.name, a.name) == 0 && q.type == a.type
1233 && q.dnsclass == a.dnsclass)
1234 {
1235 ares_free(a.name);
1236 break;
1237 }
1238 ares_free(a.name);
1239 }
1240
1241 ares_free(q.name);
1242 if (j == a.qdcount)
1243 return 0;
1244 }
1245 return 1;
1246}
1247
1248static int same_address(struct sockaddr *sa, struct ares_addr *aa)
1249{
1250 void *addr1;
1251 void *addr2;
1252
1253 if (sa->sa_family == aa->family)
1254 {
1255 switch (aa->family)
1256 {
1257 case AF_INET:
1258 addr1 = &aa->addrV4;
1259 addr2 = &((struct sockaddr_in *)sa)->sin_addr;
1260 if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0)
1261 return 1; /* match */
1262 break;
1263 case AF_INET6:
1264 addr1 = &aa->addrV6;
1265 addr2 = &((struct sockaddr_in6 *)sa)->sin6_addr;
1266 if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0)
1267 return 1; /* match */
1268 break;
1269 default:
1270 break; /* LCOV_EXCL_LINE */
1271 }
1272 }
1273 return 0; /* different */
1274}
1275
1276static void end_query (ares_channel channel, struct query *query, int status,
1277 unsigned char *abuf, int alen)
1278{
1279 int i;
1280
1281 /* First we check to see if this query ended while one of our send
1282 * queues still has pointers to it.
1283 */
1284 for (i = 0; i < channel->nservers; i++)
1285 {
1286 struct server_state *server = &channel->servers[i];
1287 struct send_request *sendreq;
1288 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
1289 if (sendreq->owner_query == query)
1290 {
1291 sendreq->owner_query = NULL;
1292 assert(sendreq->data_storage == NULL);
1293 if (status == ARES_SUCCESS)
1294 {
1295 /* We got a reply for this query, but this queued sendreq
1296 * points into this soon-to-be-gone query's tcpbuf. Probably
1297 * this means we timed out and queued the query for
1298 * retransmission, then received a response before actually
1299 * retransmitting. This is perfectly fine, so we want to keep
1300 * the connection running smoothly if we can. But in the worst
1301 * case we may have sent only some prefix of the query, with
1302 * some suffix of the query left to send. Also, the buffer may
1303 * be queued on multiple queues. To prevent dangling pointers
1304 * to the query's tcpbuf and handle these cases, we just give
1305 * such sendreqs their own copy of the query packet.
1306 */
1307 sendreq->data_storage = ares_malloc(sendreq->len);
1308 if (sendreq->data_storage != NULL)
1309 {
1310 memcpy(sendreq->data_storage, sendreq->data, sendreq->len);
1311 sendreq->data = sendreq->data_storage;
1312 }
1313 }
1314 if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL))
1315 {
1316 /* We encountered an error (probably a timeout, suggesting the
1317 * DNS server we're talking to is probably unreachable,
1318 * wedged, or severely overloaded) or we couldn't copy the
1319 * request, so mark the connection as broken. When we get to
1320 * process_broken_connections() we'll close the connection and
1321 * try to re-send requests to another server.
1322 */
1323 server->is_broken = 1;
1324 /* Just to be paranoid, zero out this sendreq... */
1325 sendreq->data = NULL;
1326 sendreq->len = 0;
1327 }
1328 }
1329 }
1330
1331 /* Invoke the callback */
1332 query->callback(query->arg, status, query->timeouts, abuf, alen);
1333 ares__free_query(query);
1334
1335 /* Simple cleanup policy: if no queries are remaining, close all network
1336 * sockets unless STAYOPEN is set.
1337 */
1338 if (!(channel->flags & ARES_FLAG_STAYOPEN) &&
1339 ares__is_list_empty(&(channel->all_queries)))
1340 {
1341 for (i = 0; i < channel->nservers; i++)
1342 ares__close_sockets(channel, &channel->servers[i]);
1343 }
1344}
1345
1346void ares__free_query(struct query *query)
1347{
1348 /* Remove the query from all the lists in which it is linked */
1349 ares__remove_from_list(&(query->queries_by_qid));
1350 ares__remove_from_list(&(query->queries_by_timeout));
1351 ares__remove_from_list(&(query->queries_to_server));
1352 ares__remove_from_list(&(query->all_queries));
1353 /* Zero out some important stuff, to help catch bugs */
1354 query->callback = NULL;
1355 query->arg = NULL;
1356 /* Deallocate the memory associated with the query */
1357 ares_free(query->tcpbuf);
1358 ares_free(query->server_info);
1359 ares_free(query);
1360}