Merge pull request #47593 from corhere/backport-23.0/libnet-resolver-nxdomain

[23.0 backport] libnet: Don't forward to upstream resolvers on internal nw
diff --git a/daemon/container_operations_unix.go b/daemon/container_operations_unix.go
index 561077b..d6c2564 100644
--- a/daemon/container_operations_unix.go
+++ b/daemon/container_operations_unix.go
@@ -384,6 +384,7 @@
 
 func (daemon *Daemon) setupPathsAndSandboxOptions(container *container.Container, sboxOptions *[]libnetwork.SandboxOption) error {
 	var err error
+	var originResolvConfPath string
 
 	// Set the correct paths for /etc/hosts and /etc/resolv.conf, based on the
 	// networking-mode of the container. Note that containers with "container"
@@ -397,8 +398,8 @@
 		*sboxOptions = append(
 			*sboxOptions,
 			libnetwork.OptionOriginHostsPath("/etc/hosts"),
-			libnetwork.OptionOriginResolvConfPath("/etc/resolv.conf"),
 		)
+		originResolvConfPath = "/etc/resolv.conf"
 	case container.HostConfig.NetworkMode.IsUserDefined():
 		// The container uses a user-defined network. We use the embedded DNS
 		// server for container name resolution and to act as a DNS forwarder
@@ -411,10 +412,7 @@
 		// If systemd-resolvd is used, the "upstream" DNS servers can be found in
 		// /run/systemd/resolve/resolv.conf. We do not query those DNS servers
 		// directly, as they can be dynamically reconfigured.
-		*sboxOptions = append(
-			*sboxOptions,
-			libnetwork.OptionOriginResolvConfPath("/etc/resolv.conf"),
-		)
+		originResolvConfPath = "/etc/resolv.conf"
 	default:
 		// For other situations, such as the default bridge network, container
 		// discovery / name resolution is handled through /etc/hosts, and no
@@ -427,12 +425,16 @@
 		// DNS servers on the host can be dynamically updated.
 		//
 		// Copy the host's resolv.conf for the container (/run/systemd/resolve/resolv.conf or /etc/resolv.conf)
-		*sboxOptions = append(
-			*sboxOptions,
-			libnetwork.OptionOriginResolvConfPath(daemon.configStore.GetResolvConf()),
-		)
+		originResolvConfPath = daemon.configStore.GetResolvConf()
 	}
 
+	// Allow tests to point at their own resolv.conf file.
+	if envPath := os.Getenv("DOCKER_TEST_RESOLV_CONF_PATH"); envPath != "" {
+		logrus.Infof("Using OriginResolvConfPath from env: %s", envPath)
+		originResolvConfPath = envPath
+	}
+	*sboxOptions = append(*sboxOptions, libnetwork.OptionOriginResolvConfPath(originResolvConfPath))
+
 	container.HostsPath, err = container.GetRootResourcePath("hosts")
 	if err != nil {
 		return err
diff --git a/integration/internal/container/container.go b/integration/internal/container/container.go
index 6559bd4..4ecd2ec 100644
--- a/integration/internal/container/container.go
+++ b/integration/internal/container/container.go
@@ -1,14 +1,18 @@
 package container
 
 import (
+	"bytes"
 	"context"
+	"errors"
 	"runtime"
+	"sync"
 	"testing"
 
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/api/types/network"
 	"github.com/docker/docker/client"
+	"github.com/docker/docker/pkg/stdcopy"
 	specs "github.com/opencontainers/image-spec/specs-go/v1"
 	"gotest.tools/v3/assert"
 )
@@ -71,3 +75,75 @@
 
 	return id
 }
+
+type RunResult struct {
+	ContainerID string
+	ExitCode    int
+	Stdout      *bytes.Buffer
+	Stderr      *bytes.Buffer
+}
+
+func RunAttach(ctx context.Context, t *testing.T, client client.APIClient, ops ...func(config *TestContainerConfig)) RunResult {
+	t.Helper()
+
+	ops = append(ops, func(c *TestContainerConfig) {
+		c.Config.AttachStdout = true
+		c.Config.AttachStderr = true
+	})
+	id := Create(ctx, t, client, ops...)
+
+	aresp, err := client.ContainerAttach(ctx, id, types.ContainerAttachOptions{
+		Stream: true,
+		Stdout: true,
+		Stderr: true,
+	})
+	assert.NilError(t, err)
+
+	err = client.ContainerStart(ctx, id, types.ContainerStartOptions{})
+	assert.NilError(t, err)
+
+	s, err := demultiplexStreams(ctx, aresp)
+	if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
+		assert.NilError(t, err)
+	}
+
+	// Inspect to get the exit code. A new context is used here to make sure that if the context passed as argument as
+	// reached timeout during the demultiplexStream call, we still return a RunResult.
+	resp, err := client.ContainerInspect(context.Background(), id)
+	assert.NilError(t, err)
+
+	return RunResult{ContainerID: id, ExitCode: resp.State.ExitCode, Stdout: &s.stdout, Stderr: &s.stderr}
+}
+
+type streams struct {
+	stdout, stderr bytes.Buffer
+}
+
+// demultiplexStreams starts a goroutine to demultiplex stdout and stderr from the types.HijackedResponse resp and
+// waits until either multiplexed stream reaches EOF or the context expires. It unconditionally closes resp and waits
+// until the demultiplexing goroutine has finished its work before returning.
+func demultiplexStreams(ctx context.Context, resp types.HijackedResponse) (streams, error) {
+	var s streams
+	outputDone := make(chan error, 1)
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		_, err := stdcopy.StdCopy(&s.stdout, &s.stderr, resp.Reader)
+		outputDone <- err
+		wg.Done()
+	}()
+
+	var err error
+	select {
+	case copyErr := <-outputDone:
+		err = copyErr
+		break
+	case <-ctx.Done():
+		err = ctx.Err()
+	}
+
+	resp.Close()
+	wg.Wait()
+	return s, err
+}
diff --git a/integration/internal/network/network.go b/integration/internal/network/network.go
index 5a682ce..3989a4f 100644
--- a/integration/internal/network/network.go
+++ b/integration/internal/network/network.go
@@ -33,3 +33,10 @@
 	assert.NilError(t, err)
 	return name
 }
+
+func RemoveNoError(ctx context.Context, t *testing.T, apiClient client.APIClient, name string) {
+	t.Helper()
+
+	err := apiClient.NetworkRemove(ctx, name)
+	assert.NilError(t, err)
+}
diff --git a/integration/networking/main_test.go b/integration/networking/main_test.go
new file mode 100644
index 0000000..df2934e
--- /dev/null
+++ b/integration/networking/main_test.go
@@ -0,0 +1,34 @@
+package networking
+
+import (
+	"context"
+	"os"
+	"testing"
+
+	"github.com/docker/docker/testutil/environment"
+)
+
+var testEnv *environment.Execution
+
+func TestMain(m *testing.M) {
+	var err error
+	testEnv, err = environment.New()
+	if err != nil {
+		panic(err)
+	}
+
+	err = environment.EnsureFrozenImagesLinux(testEnv)
+	if err != nil {
+		panic(err)
+	}
+
+	testEnv.Print()
+	code := m.Run()
+	os.Exit(code)
+}
+
+func setupTest(t *testing.T) context.Context {
+	environment.ProtectAll(t, testEnv)
+	t.Cleanup(func() { testEnv.Clean(t) })
+	return context.Background()
+}
diff --git a/integration/networking/resolvconf_test.go b/integration/networking/resolvconf_test.go
new file mode 100644
index 0000000..0dd1991
--- /dev/null
+++ b/integration/networking/resolvconf_test.go
@@ -0,0 +1,142 @@
+package networking
+
+import (
+	"net"
+	"os"
+	"testing"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/integration/internal/container"
+	"github.com/docker/docker/integration/internal/network"
+	"github.com/docker/docker/testutil/daemon"
+	"github.com/miekg/dns"
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+	"gotest.tools/v3/skip"
+)
+
+// writeTempResolvConf writes a resolv.conf that only contains a single
+// nameserver line, with address addr.
+// It returns the name of the temp file.
+func writeTempResolvConf(t *testing.T, addr string) string {
+	t.Helper()
+	// Not using t.TempDir() here because in rootless mode, while the temporary
+	// directory gets mode 0777, it's a subdir of an 0700 directory owned by root.
+	// So, it's not accessible by the daemon.
+	f, err := os.CreateTemp("", "resolv.conf")
+	assert.NilError(t, err)
+	t.Cleanup(func() { os.Remove(f.Name()) })
+	err = f.Chmod(0644)
+	assert.NilError(t, err)
+	f.Write([]byte("nameserver " + addr + "\n"))
+	return f.Name()
+}
+
+const dnsRespAddr = "10.11.12.13"
+
+// startDaftDNS starts and returns a really, really daft DNS server that only
+// responds to type-A requests, and always with address dnsRespAddr.
+func startDaftDNS(t *testing.T, addr string) *dns.Server {
+	serveDNS := func(w dns.ResponseWriter, query *dns.Msg) {
+		if query.Question[0].Qtype == dns.TypeA {
+			resp := &dns.Msg{}
+			resp.SetReply(query)
+			answer := &dns.A{
+				Hdr: dns.RR_Header{
+					Name:   query.Question[0].Name,
+					Rrtype: dns.TypeA,
+					Class:  dns.ClassINET,
+					Ttl:    600,
+				},
+			}
+			answer.A = net.ParseIP(dnsRespAddr)
+			resp.Answer = append(resp.Answer, answer)
+			_ = w.WriteMsg(resp)
+		}
+	}
+
+	conn, err := net.ListenUDP("udp", &net.UDPAddr{
+		IP:   net.ParseIP(addr),
+		Port: 53,
+	})
+	assert.NilError(t, err)
+
+	server := &dns.Server{Handler: dns.HandlerFunc(serveDNS), PacketConn: conn}
+	go func() {
+		_ = server.ActivateAndServe()
+	}()
+
+	return server
+}
+
+// Check that when a container is connected to an internal network, DNS
+// requests sent to daemon's internal DNS resolver are not forwarded to
+// an upstream resolver listening on a localhost address.
+// (Assumes the host does not already have a DNS server on 127.0.0.1.)
+func TestInternalNetworkDNS(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType == "windows", "No resolv.conf on Windows")
+	skip.If(t, testEnv.IsRootless, "Can't use resolver on host in rootless mode")
+	ctx := setupTest(t)
+
+	// Start a DNS server on the loopback interface.
+	server := startDaftDNS(t, "127.0.0.1")
+	defer server.Shutdown()
+
+	// Set up a temp resolv.conf pointing at that DNS server, and a daemon using it.
+	tmpFileName := writeTempResolvConf(t, "127.0.0.1")
+	d := daemon.New(t, daemon.WithEnvVars("DOCKER_TEST_RESOLV_CONF_PATH="+tmpFileName))
+	d.StartWithBusybox(t, "--experimental", "--ip6tables")
+	defer d.Stop(t)
+
+	c := d.NewClientT(t)
+	defer c.Close()
+
+	intNetName := "intnet"
+	network.CreateNoError(ctx, t, c, intNetName,
+		network.WithDriver("bridge"),
+		network.WithInternal(),
+	)
+	defer network.RemoveNoError(ctx, t, c, intNetName)
+
+	extNetName := "extnet"
+	network.CreateNoError(ctx, t, c, extNetName,
+		network.WithDriver("bridge"),
+	)
+	defer network.RemoveNoError(ctx, t, c, extNetName)
+
+	// Create a container, initially with external connectivity.
+	// Expect the external DNS server to respond to a request from the container.
+	ctrId := container.Run(ctx, t, c, container.WithNetworkMode(extNetName))
+	defer c.ContainerRemove(ctx, ctrId, types.ContainerRemoveOptions{Force: true})
+	res, err := container.Exec(ctx, c, ctrId, []string{"nslookup", "test.example"})
+	assert.NilError(t, err)
+	assert.Check(t, is.Equal(res.ExitCode, 0))
+	assert.Check(t, is.Contains(res.Stdout(), dnsRespAddr))
+
+	// Connect the container to the internal network as well.
+	// External DNS should still be used.
+	err = c.NetworkConnect(ctx, intNetName, ctrId, nil)
+	assert.NilError(t, err)
+	res, err = container.Exec(ctx, c, ctrId, []string{"nslookup", "test.example"})
+	assert.NilError(t, err)
+	assert.Check(t, is.Equal(res.ExitCode, 0))
+	assert.Check(t, is.Contains(res.Stdout(), dnsRespAddr))
+
+	// Disconnect from the external network.
+	// Expect no access to the external DNS.
+	err = c.NetworkDisconnect(ctx, extNetName, ctrId, true)
+	assert.NilError(t, err)
+	res, err = container.Exec(ctx, c, ctrId, []string{"nslookup", "test.example"})
+	assert.NilError(t, err)
+	assert.Check(t, is.Equal(res.ExitCode, 1))
+	assert.Check(t, is.Contains(res.Stdout(), "SERVFAIL"))
+
+	// Reconnect the external network.
+	// Check that the external DNS server is used again.
+	err = c.NetworkConnect(ctx, extNetName, ctrId, nil)
+	assert.NilError(t, err)
+	res, err = container.Exec(ctx, c, ctrId, []string{"nslookup", "test.example"})
+	assert.NilError(t, err)
+	assert.Check(t, is.Equal(res.ExitCode, 0))
+	assert.Check(t, is.Contains(res.Stdout(), dnsRespAddr))
+}
diff --git a/libnetwork/default_gateway.go b/libnetwork/default_gateway.go
index 1c022a4..0298538 100644
--- a/libnetwork/default_gateway.go
+++ b/libnetwork/default_gateway.go
@@ -186,3 +186,27 @@
 	}
 	return nil
 }
+
+// hasExternalConnectivity returns true if the sandbox is connected to any
+// endpoint which provides external connectivity.
+//
+// This function is only necessary on branches without
+// https://github.com/moby/moby/pull/46603. With that PR applied, this function
+// would be equivalent to sb.getGatewayEndpoint() != nil.
+func (sb *sandbox) hasExternalConnectivity() bool {
+	for _, ep := range sb.getConnectedEndpoints() {
+		n := ep.getNetwork()
+		switch n.Type() {
+		case "null", "host":
+			continue
+		case "bridge":
+			if n.Internal() {
+				continue
+			}
+		}
+		if len(ep.Gateway()) != 0 {
+			return true
+		}
+	}
+	return false
+}
diff --git a/libnetwork/endpoint.go b/libnetwork/endpoint.go
index bd77eb9..e9b67c4 100644
--- a/libnetwork/endpoint.go
+++ b/libnetwork/endpoint.go
@@ -544,8 +544,12 @@
 		return sb.setupDefaultGW()
 	}
 
-	moveExtConn := sb.getGatewayEndpoint() != extEp
+	// Enable upstream forwarding if the sandbox gained external connectivity.
+	if sb.resolver != nil {
+		sb.resolver.SetForwardingPolicy(sb.hasExternalConnectivity())
+	}
 
+	moveExtConn := sb.getGatewayEndpoint() != extEp
 	if moveExtConn {
 		if extEp != nil {
 			logrus.Debugf("Revoking external connectivity on endpoint %s (%s)", extEp.Name(), extEp.ID())
@@ -777,6 +781,11 @@
 		return sb.setupDefaultGW()
 	}
 
+	// Disable upstream forwarding if the sandbox lost external connectivity.
+	if sb.resolver != nil {
+		sb.resolver.SetForwardingPolicy(sb.hasExternalConnectivity())
+	}
+
 	// New endpoint providing external connectivity for the sandbox
 	extEp = sb.getGatewayEndpoint()
 	if moveExtConn && extEp != nil {
diff --git a/libnetwork/resolver.go b/libnetwork/resolver.go
index 64459757..5801b4f 100644
--- a/libnetwork/resolver.go
+++ b/libnetwork/resolver.go
@@ -6,6 +6,7 @@
 	"net"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/docker/docker/libnetwork/types"
@@ -30,6 +31,9 @@
 	// SetExtServers configures the external nameservers the resolver
 	// should use to forward queries
 	SetExtServers([]extDNSEntry)
+	// SetForwardingPolicy re-configures the embedded DNS resolver to either
+	// enable or disable forwarding DNS queries to external servers.
+	SetForwardingPolicy(policy bool)
 	// ResolverOptions returns resolv.conf options that should be set
 	ResolverOptions() []string
 }
@@ -89,21 +93,23 @@
 	tStamp        time.Time
 	queryLock     sync.Mutex
 	listenAddress string
-	proxyDNS      bool
+	proxyDNS      atomic.Bool
 	resolverKey   string
 	startCh       chan struct{}
 }
 
 // NewResolver creates a new instance of the Resolver
 func NewResolver(address string, proxyDNS bool, resolverKey string, backend DNSBackend) Resolver {
-	return &resolver{
+	r := &resolver{
 		backend:       backend,
-		proxyDNS:      proxyDNS,
 		listenAddress: address,
 		resolverKey:   resolverKey,
 		err:           fmt.Errorf("setup not done yet"),
 		startCh:       make(chan struct{}, 1),
 	}
+	r.proxyDNS.Store(proxyDNS)
+
+	return r
 }
 
 func (r *resolver) SetupFunc(port int) func() {
@@ -196,6 +202,10 @@
 	}
 }
 
+func (r *resolver) SetForwardingPolicy(policy bool) {
+	r.proxyDNS.Store(policy)
+}
+
 func (r *resolver) NameServer() string {
 	return r.listenAddress
 }
@@ -407,7 +417,7 @@
 	if resp == nil {
 		// If the backend doesn't support proxying dns request
 		// fail the response
-		if !r.proxyDNS {
+		if !r.proxyDNS.Load() {
 			resp = new(dns.Msg)
 			resp.SetRcode(query, dns.RcodeServerFailure)
 			if err := w.WriteMsg(resp); err != nil {
diff --git a/libnetwork/sandbox_dns_unix.go b/libnetwork/sandbox_dns_unix.go
index d6a1aca..63d5090 100644
--- a/libnetwork/sandbox_dns_unix.go
+++ b/libnetwork/sandbox_dns_unix.go
@@ -27,7 +27,11 @@
 func (sb *sandbox) startResolver(restore bool) {
 	sb.resolverOnce.Do(func() {
 		var err error
-		sb.resolver = NewResolver(resolverIPSandbox, true, sb.Key(), sb)
+		// The resolver is started with proxyDNS=false if the sandbox does not currently
+		// have a gateway. So, if the Sandbox is only connected to an 'internal' network,
+		// it will not forward DNS requests to external resolvers. The resolver's
+		// proxyDNS setting is then updated as network Endpoints are added/removed.
+		sb.resolver = NewResolver(resolverIPSandbox, sb.hasExternalConnectivity(), sb.Key(), sb)
 		defer func() {
 			if err != nil {
 				sb.resolver = nil