daemon/containerfs_linux.go - third_party/github.com/moby/moby - Git at Google

 package daemon // import "github.com/docker/docker/daemon"

 import (
 	"context"
 	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
 	"runtime"
 	"strings"

 	"github.com/containerd/log"
 	"github.com/hashicorp/go-multierror"
 	"github.com/moby/sys/mount"
 	"github.com/moby/sys/symlink"
 	"golang.org/x/sys/unix"

 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/internal/compatcontext"
 	"github.com/docker/docker/internal/mounttree"
 	"github.com/docker/docker/internal/unshare"
 	"github.com/docker/docker/pkg/fileutils"
 )

 // future is a unit of work handed to the worker goroutine of a
 // containerFSView through its todo channel.
 type future struct {
 	// fn is the function the worker invokes.
 	fn  func() error
 	// res, when non-nil, receives the value returned by fn once fn has
 	// finished. A nil res means nobody is waiting for the result.
 	res chan<- error
 }

 // containerFSView allows functions to be run in the context of a container's
 // filesystem. Inside these functions, the root directory is the container root
 // for all native OS filesystem APIs, including, but not limited to, the [os]
 // and [golang.org/x/sys/unix] packages. The view of the container's filesystem
 // is live and read-write. Each view has its own private set of tmpfs mounts.
 // Any files written under a tmpfs mount are not visible to processes inside the
 // container nor any other view of the container's filesystem, and vice versa.
 //
 // Each view has its own current working directory which is initialized to the
 // root of the container filesystem and can be changed with [os.Chdir]. Changes
 // to the current directory persist across successive [*containerFSView.RunInFS]
 // and [*containerFSView.GoInFS] calls.
 //
 // Multiple views of the same container filesystem can coexist at the same time.
 // Only one function can be running in a particular filesystem view at any given
 // time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
 // block while another function is running. If more than one call is blocked
 // concurrently, the order they are unblocked is undefined.
 type containerFSView struct {
 	// d is the daemon that mounted the container; Close uses it to unmount
 	// the container again and to log volume events.
 	d    *Daemon
 	// ctr is the container whose filesystem this view exposes.
 	ctr  *container.Container
 	// todo carries the functions to run inside the view. It is unbuffered,
 	// and closing it (done by Close) makes the worker goroutine exit.
 	todo chan future
 	// done is closed by the worker goroutine when it returns.
 	done chan error
 }

 // openContainerFS opens a new view of the container's filesystem.
 //
 // The container root and its volumes are first mounted in the daemon's own
 // mount namespace. A worker is then started through unshare.Go with
 // CLONE_NEWNS; its setup step bind-mounts every volume below the container
 // root and switches root into container.BaseFS. After that the worker serves
 // the functions submitted through the returned view until the view is closed.
 //
 // On failure everything mounted so far is unmounted again and a nil view is
 // returned together with the error.
 func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
 	ctx := context.TODO()

 	if mountErr := daemon.Mount(container); mountErr != nil {
 		return nil, mountErr
 	}
 	defer func() {
 		// Only undo the root mount when opening the view failed; on
 		// success the mount is released by Close.
 		if err == nil {
 			return
 		}
 		_ = daemon.Unmount(container)
 	}()

 	mounts, cleanup, err := daemon.setupMounts(ctx, container)
 	if err != nil {
 		return nil, err
 	}
 	defer func() {
 		detached := compatcontext.WithoutCancel(ctx)
 		cleanup(detached)
 		if err != nil {
 			_ = container.UnmountVolumes(detached, daemon.LogVolumeEvent)
 		}
 	}()

 	// Everything that has to happen in the initial mount namespace is done.
 	// From here on the work happens in a private mount namespace, where the
 	// volumes are bound into this view of the container FS.
 	todo := make(chan future)
 	done := make(chan error)

 	// prepare populates the new mount namespace and enters the container
 	// root.
 	prepare := func() error {
 		if err := mount.MakeRSlave("/"); err != nil {
 			return err
 		}
 		for _, m := range mounts {
 			target, err := container.GetResourcePath(m.Destination)
 			if err != nil {
 				return err
 			}

 			// The mountpoint has to be of the same kind (file or
 			// directory) as the source being bound onto it.
 			srcInfo, err := os.Stat(m.Source)
 			if err != nil {
 				return err
 			}
 			if err := fileutils.CreateIfNotExists(target, srcInfo.IsDir()); err != nil {
 				return err
 			}

 			// Reject mutually exclusive mount options.
 			switch {
 			case m.Writable && m.ReadOnlyNonRecursive:
 				return errors.New("options conflict: Writable && ReadOnlyNonRecursive")
 			case m.Writable && m.ReadOnlyForceRecursive:
 				return errors.New("options conflict: Writable && ReadOnlyForceRecursive")
 			case m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive:
 				return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
 			}

 			bindMode, writeMode := "rbind", "ro"
 			if m.NonRecursive {
 				bindMode = "bind"
 			}
 			if m.Writable {
 				writeMode = "rw"
 			}

 			// These are temporary mounts made outside the container
 			// and they will soon be lazily unmounted. Because they
 			// are rbind mounts, unmounting shared submounts would
 			// propagate; with the daemon in the host namespace and a
 			// shared "/", that would unmount the original mount too.
 			// Hence everything is made rprivate here. The volume's
 			// own propagation setting is deliberately ignored: it
 			// only applies to mounts made inside the container.
 			opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
 			if err := mount.Mount(m.Source, target, "", opts); err != nil {
 				return err
 			}

 			if m.Writable || m.ReadOnlyNonRecursive {
 				continue
 			}
 			// Read-only mount: try to extend read-only to all
 			// submounts. Failure is fatal only when recursion was
 			// explicitly forced.
 			if err := makeMountRRO(target); err != nil {
 				if m.ReadOnlyForceRecursive {
 					return err
 				}
 				log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", target)
 			}
 		}

 		return mounttree.SwitchRoot(container.BaseFS)
 	}

 	// serve runs queued functions one at a time until todo is closed.
 	serve := func() {
 		defer close(done)

 		for job := range todo {
 			result := job.fn()
 			if job.res == nil {
 				continue
 			}
 			job.res <- result
 		}

 		// Returning from here ends the thread, and with it the mount
 		// namespace and all of the volume bind-mounts.
 	}

 	if err = unshare.Go(unix.CLONE_NEWNS, prepare, serve); err != nil {
 		return nil, err
 	}

 	vw := &containerFSView{d: daemon, ctr: container, todo: todo, done: done}
 	// Safety net: release the mounts if the view is garbage collected
 	// without having been closed.
 	runtime.SetFinalizer(vw, (*containerFSView).Close)
 	return vw, nil
 }

 // RunInFS runs fn inside the container filesystem, waits for it to finish and
 // returns whatever fn returned.
 //
 // If ctx is done before fn could be handed over to the view, fn is not run
 // and ctx.Err() is returned. Once fn has been handed over, RunInFS waits for
 // its result regardless of ctx.
 //
 // The container filesystem is only visible to functions called in the same
 // goroutine as fn. Goroutines started from fn will see the host's filesystem.
 func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
 	result := make(chan error)
 	job := future{fn: fn, res: result}

 	select {
 	case <-ctx.Done():
 		return ctx.Err()
 	case vw.todo <- job:
 		return <-result
 	}
 }

 // GoInFS hands fn over to the view to be run inside the container FS. It
 // returns nil as soon as fn has been accepted, without waiting for fn to
 // return. If ctx is done before fn could be handed over, fn is not run and
 // ctx.Err() is returned.
 //
 // The container filesystem is only visible to functions called in the same
 // goroutine as fn. Goroutines started from fn will see the host's filesystem.
 func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
 	// Adapt fn to the signature expected by future. Leaving res unset
 	// tells the worker that nobody is waiting for a result.
 	job := future{
 		fn: func() error {
 			fn()
 			return nil
 		},
 	}

 	select {
 	case <-ctx.Done():
 		return ctx.Err()
 	case vw.todo <- job:
 		return nil
 	}
 }

 // Close shuts the view down: it stops accepting work, waits for the worker to
 // finish whatever is in flight, and then unmounts the container's volumes and
 // root filesystem. All errors encountered along the way are combined into the
 // returned error; nil is returned if there were none.
 func (vw *containerFSView) Close() error {
 	// The view is being closed explicitly, so the finalizer is not needed.
 	runtime.SetFinalizer(vw, nil)

 	// Closing todo ends the worker's loop; done is closed once it returns.
 	close(vw.todo)
 	workerErr := <-vw.done

 	volumesErr := vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent)
 	rootErr := vw.d.Unmount(vw.ctr)

 	return multierror.Append(nil, workerErr, volumesErr, rootErr).ErrorOrNil()
 }

 // Stat returns the metadata for path, which is interpreted relative to the
 // current working directory of vw inside the container filesystem view.
 //
 // The metadata describes path itself, without following a final symlink.
 // When path is a symlink, LinkTarget additionally holds its resolved target,
 // evaluated within the container root.
 func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
 	var result *types.ContainerPathStat

 	inspect := func() error {
 		info, err := os.Lstat(path)
 		if err != nil {
 			return err
 		}

 		linkTarget := ""
 		if isSymlink := info.Mode()&os.ModeSymlink != 0; isSymlink {
 			// Resolve every symlink along path down to the final
 			// target, or as far as possible if a link is broken.
 			linkTarget, err = symlink.FollowSymlinkInScope(path, "/")
 			if err != nil {
 				return err
 			}
 		}

 		result = &types.ContainerPathStat{
 			Name:       filepath.Base(path),
 			Size:       info.Size(),
 			Mode:       info.Mode(),
 			Mtime:      info.ModTime(),
 			LinkTarget: linkTarget,
 		}
 		return nil
 	}

 	err := vw.RunInFS(ctx, inspect)
 	return result, err
 }

 // makeMountRRO makes the mount at dest recursively read-only by setting
 // MOUNT_ATTR_RDONLY with AT_RECURSIVE. The call is retried for as long as it
 // is interrupted (EINTR); any other failure is returned wrapped with the
 // destination path.
 func makeMountRRO(dest string) error {
 	attr := unix.MountAttr{Attr_set: unix.MOUNT_ATTR_RDONLY}

 	for {
 		err := unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, &attr)
 		switch {
 		case err == nil:
 			return nil
 		case errors.Is(err, unix.EINTR):
 			// Interrupted before completing: try again.
 			continue
 		default:
 			return fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err)
 		}
 	}
 }
	package daemon // import "github.com/docker/docker/daemon"

	import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"strings"

	"github.com/containerd/log"
	"github.com/hashicorp/go-multierror"
	"github.com/moby/sys/mount"
	"github.com/moby/sys/symlink"
	"golang.org/x/sys/unix"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/container"
	"github.com/docker/docker/internal/compatcontext"
	"github.com/docker/docker/internal/mounttree"
	"github.com/docker/docker/internal/unshare"
	"github.com/docker/docker/pkg/fileutils"
	)

	// future is a unit of work handed to the worker goroutine of a
	// containerFSView through its todo channel.
	type future struct {
	// fn is the function the worker invokes.
	fn func() error
	// res, when non-nil, receives the value returned by fn once fn has
	// finished. A nil res means nobody is waiting for the result.
	res chan<- error
	}

	// containerFSView allows functions to be run in the context of a container's
	// filesystem. Inside these functions, the root directory is the container root
	// for all native OS filesystem APIs, including, but not limited to, the [os]
	// and [golang.org/x/sys/unix] packages. The view of the container's filesystem
	// is live and read-write. Each view has its own private set of tmpfs mounts.
	// Any files written under a tmpfs mount are not visible to processes inside the
	// container nor any other view of the container's filesystem, and vice versa.
	//
	// Each view has its own current working directory which is initialized to the
	// root of the container filesystem and can be changed with [os.Chdir]. Changes
	// to the current directory persist across successive [*containerFSView.RunInFS]
	// and [*containerFSView.GoInFS] calls.
	//
	// Multiple views of the same container filesystem can coexist at the same time.
	// Only one function can be running in a particular filesystem view at any given
	// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
	// block while another function is running. If more than one call is blocked
	// concurrently, the order they are unblocked is undefined.
	type containerFSView struct {
	// d is the daemon that mounted the container; Close uses it to unmount
	// the container again and to log volume events.
	d *Daemon
	// ctr is the container whose filesystem this view exposes.
	ctr *container.Container
	// todo carries the functions to run inside the view. It is unbuffered,
	// and closing it (done by Close) makes the worker goroutine exit.
	todo chan future
	// done is closed by the worker goroutine when it returns.
	done chan error
	}

	// openContainerFS opens a new view of the container's filesystem.
	//
	// The container root and its volumes are first mounted in the daemon's own
	// mount namespace. A worker is then started through unshare.Go with
	// CLONE_NEWNS; its setup step bind-mounts every volume below the container
	// root and switches root into container.BaseFS. After that the worker serves
	// the functions submitted through the returned view until the view is closed.
	//
	// On failure everything mounted so far is unmounted again and a nil view is
	// returned together with the error.
	//
	// Both the receiver and the container are taken by pointer: they are stored
	// in the view's *Daemon and *container.Container fields.
	func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
		ctx := context.TODO()

		if mountErr := daemon.Mount(container); mountErr != nil {
			return nil, mountErr
		}
		defer func() {
			// Only undo the root mount when opening the view failed; on
			// success the mount is released by Close.
			if err != nil {
				_ = daemon.Unmount(container)
			}
		}()

		mounts, cleanup, err := daemon.setupMounts(ctx, container)
		if err != nil {
			return nil, err
		}
		defer func() {
			ctx := compatcontext.WithoutCancel(ctx)
			cleanup(ctx)
			if err != nil {
				_ = container.UnmountVolumes(ctx, daemon.LogVolumeEvent)
			}
		}()

		// Setup in initial mount namespace complete. We're ready to unshare the
		// mount namespace and bind the volume mounts into that private view of
		// the container FS.
		todo := make(chan future)
		done := make(chan error)
		err = unshare.Go(unix.CLONE_NEWNS,
			func() error {
				if err := mount.MakeRSlave("/"); err != nil {
					return err
				}
				for _, m := range mounts {
					dest, err := container.GetResourcePath(m.Destination)
					if err != nil {
						return err
					}

					// The mountpoint has to be of the same kind (file
					// or directory) as the source being bound onto it.
					var stat os.FileInfo
					stat, err = os.Stat(m.Source)
					if err != nil {
						return err
					}
					if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
						return err
					}

					bindMode := "rbind"
					if m.NonRecursive {
						bindMode = "bind"
					}
					writeMode := "ro"
					if m.Writable {
						writeMode = "rw"
						if m.ReadOnlyNonRecursive {
							return errors.New("options conflict: Writable && ReadOnlyNonRecursive")
						}
						if m.ReadOnlyForceRecursive {
							return errors.New("options conflict: Writable && ReadOnlyForceRecursive")
						}
					}
					if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive {
						return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
					}

					// openContainerFS() is called for temporary mounts
					// outside the container. Soon these will be unmounted
					// with lazy unmount option and given we have mounted
					// them rbind, all the submounts will propagate if these
					// are shared. If daemon is running in host namespace
					// and has / as shared then these unmounts will
					// propagate and unmount original mount as well. So make
					// all these mounts rprivate. Do not use propagation
					// property of volume as that should apply only when
					// mounting happens inside the container.
					opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
					if err := mount.Mount(m.Source, dest, "", opts); err != nil {
						return err
					}

					// Read-only mount: try to extend read-only to all
					// submounts. Failure is fatal only when recursion was
					// explicitly forced.
					if !m.Writable && !m.ReadOnlyNonRecursive {
						if err := makeMountRRO(dest); err != nil {
							if m.ReadOnlyForceRecursive {
								return err
							}
							log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", dest)
						}
					}
				}

				return mounttree.SwitchRoot(container.BaseFS)
			},
			func() {
				defer close(done)

				// Run queued functions one at a time until todo is closed.
				for it := range todo {
					err := it.fn()
					if it.res != nil {
						it.res <- err
					}
				}

				// The thread will terminate when this goroutine returns, taking the
				// mount namespace and all the volume bind-mounts with it.
			},
		)
		if err != nil {
			return nil, err
		}
		vw := &containerFSView{
			d:    daemon,
			ctr:  container,
			todo: todo,
			done: done,
		}
		// Safety net: release the mounts if the view is garbage collected
		// without having been closed.
		runtime.SetFinalizer(vw, (*containerFSView).Close)
		return vw, nil
	}

	// RunInFS runs fn inside the container filesystem, waits for it to finish and
	// returns whatever fn returned.
	//
	// If ctx is done before fn could be handed over to the view, fn is not run
	// and ctx.Err() is returned. Once fn has been handed over, RunInFS waits for
	// its result regardless of ctx.
	//
	// The container filesystem is only visible to functions called in the same
	// goroutine as fn. Goroutines started from fn will see the host's filesystem.
	func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
		result := make(chan error)
		job := future{fn: fn, res: result}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case vw.todo <- job:
			return <-result
		}
	}

	// GoInFS hands fn over to the view to be run inside the container FS. It
	// returns nil as soon as fn has been accepted, without waiting for fn to
	// return. If ctx is done before fn could be handed over, fn is not run and
	// ctx.Err() is returned.
	//
	// The container filesystem is only visible to functions called in the same
	// goroutine as fn. Goroutines started from fn will see the host's filesystem.
	func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
		// Adapt fn to the signature expected by future. Leaving res unset
		// tells the worker that nobody is waiting for a result.
		job := future{
			fn: func() error {
				fn()
				return nil
			},
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case vw.todo <- job:
			return nil
		}
	}

	// Close shuts the view down: it stops accepting work, waits for the worker to
	// finish whatever is in flight, and then unmounts the container's volumes and
	// root filesystem. All errors encountered along the way are combined into the
	// returned error; nil is returned if there were none.
	func (vw *containerFSView) Close() error {
		// The view is being closed explicitly, so the finalizer is not needed.
		runtime.SetFinalizer(vw, nil)

		// Closing todo ends the worker's loop; done is closed once it returns.
		close(vw.todo)
		workerErr := <-vw.done

		volumesErr := vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent)
		rootErr := vw.d.Unmount(vw.ctr)

		return multierror.Append(nil, workerErr, volumesErr, rootErr).ErrorOrNil()
	}

	// Stat returns the metadata for path, relative to the current working directory
	// of vw inside the container filesystem view.
	//
	// The metadata describes path itself, without following a final symlink.
	// When path is a symlink, LinkTarget additionally holds its resolved target,
	// evaluated within the container root.
	//
	// The result is returned by pointer, matching the *types.ContainerPathStat
	// built in the body, and the receiver is a pointer like every other method of
	// containerFSView.
	func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
		var stat *types.ContainerPathStat
		err := vw.RunInFS(ctx, func() error {
			lstat, err := os.Lstat(path)
			if err != nil {
				return err
			}
			var target string
			if lstat.Mode()&os.ModeSymlink != 0 {
				// Fully evaluate symlinks along path to the ultimate
				// target, or as much as possible with broken links.
				target, err = symlink.FollowSymlinkInScope(path, "/")
				if err != nil {
					return err
				}
			}
			stat = &types.ContainerPathStat{
				Name:       filepath.Base(path),
				Size:       lstat.Size(),
				Mode:       lstat.Mode(),
				Mtime:      lstat.ModTime(),
				LinkTarget: target,
			}
			return nil
		})
		return stat, err
	}

	// makeMountRRO makes the mount at dest recursively read-only by setting
	// MOUNT_ATTR_RDONLY with AT_RECURSIVE. The call is retried for as long as it
	// is interrupted (EINTR); any other failure is returned wrapped with the
	// destination path.
	func makeMountRRO(dest string) error {
		attr := unix.MountAttr{Attr_set: unix.MOUNT_ATTR_RDONLY}

		for {
			err := unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, &attr)
			switch {
			case err == nil:
				return nil
			case errors.Is(err, unix.EINTR):
				// Interrupted before completing: try again.
				continue
			default:
				return fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err)
			}
		}
	}