| package daemon // import "github.com/docker/docker/daemon" |
| |
| import ( |
| "context" |
| "os" |
| "path/filepath" |
| "runtime" |
| "strings" |
| |
| "github.com/hashicorp/go-multierror" |
| "github.com/moby/sys/mount" |
| "github.com/moby/sys/symlink" |
| "golang.org/x/sys/unix" |
| |
| "github.com/docker/docker/api/types" |
| "github.com/docker/docker/container" |
| "github.com/docker/docker/internal/mounttree" |
| "github.com/docker/docker/internal/unshare" |
| "github.com/docker/docker/pkg/fileutils" |
| ) |
| |
| type future struct { |
| fn func() error |
| res chan<- error |
| } |
| |
| // containerFSView allows functions to be run in the context of a container's |
| // filesystem. Inside these functions, the root directory is the container root |
| // for all native OS filesystem APIs, including, but not limited to, the [os] |
| // and [golang.org/x/sys/unix] packages. The view of the container's filesystem |
| // is live and read-write. Each view has its own private set of tmpfs mounts. |
| // Any files written under a tmpfs mount are not visible to processes inside the |
| // container nor any other view of the container's filesystem, and vice versa. |
| // |
| // Each view has its own current working directory which is initialized to the |
| // root of the container filesystem and can be changed with [os.Chdir]. Changes |
| // to the current directory persist across successive [*containerFSView.RunInFS] |
| // and [*containerFSView.GoInFS] calls. |
| // |
| // Multiple views of the same container filesystem can coexist at the same time. |
| // Only one function can be running in a particular filesystem view at any given |
| // time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will |
| // block while another function is running. If more than one call is blocked |
| // concurrently, the order they are unblocked is undefined. |
| type containerFSView struct { |
| d *Daemon |
| ctr *container.Container |
| todo chan future |
| done chan error |
| } |
| |
| // openContainerFS opens a new view of the container's filesystem. |
| func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) { |
| if err := daemon.Mount(container); err != nil { |
| return nil, err |
| } |
| defer func() { |
| if err != nil { |
| _ = daemon.Unmount(container) |
| } |
| }() |
| |
| mounts, err := daemon.setupMounts(container) |
| if err != nil { |
| return nil, err |
| } |
| defer func() { |
| if err != nil { |
| _ = container.UnmountVolumes(daemon.LogVolumeEvent) |
| } |
| }() |
| |
| // Setup in initial mount namespace complete. We're ready to unshare the |
| // mount namespace and bind the volume mounts into that private view of |
| // the container FS. |
| todo := make(chan future) |
| done := make(chan error) |
| err = unshare.Go(unix.CLONE_NEWNS, |
| func() error { |
| if err := mount.MakeRSlave("/"); err != nil { |
| return err |
| } |
| for _, m := range mounts { |
| dest, err := container.GetResourcePath(m.Destination) |
| if err != nil { |
| return err |
| } |
| |
| var stat os.FileInfo |
| stat, err = os.Stat(m.Source) |
| if err != nil { |
| return err |
| } |
| if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { |
| return err |
| } |
| |
| bindMode := "rbind" |
| if m.NonRecursive { |
| bindMode = "bind" |
| } |
| writeMode := "ro" |
| if m.Writable { |
| writeMode = "rw" |
| } |
| |
| // openContainerFS() is called for temporary mounts |
| // outside the container. Soon these will be unmounted |
| // with lazy unmount option and given we have mounted |
| // them rbind, all the submounts will propagate if these |
| // are shared. If daemon is running in host namespace |
| // and has / as shared then these unmounts will |
| // propagate and unmount original mount as well. So make |
| // all these mounts rprivate. Do not use propagation |
| // property of volume as that should apply only when |
| // mounting happens inside the container. |
| opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") |
| if err := mount.Mount(m.Source, dest, "", opts); err != nil { |
| return err |
| } |
| } |
| |
| return mounttree.SwitchRoot(container.BaseFS) |
| }, |
| func() { |
| defer close(done) |
| |
| for it := range todo { |
| err := it.fn() |
| if it.res != nil { |
| it.res <- err |
| } |
| } |
| |
| // The thread will terminate when this goroutine returns, taking the |
| // mount namespace and all the volume bind-mounts with it. |
| }, |
| ) |
| if err != nil { |
| return nil, err |
| } |
| vw := &containerFSView{ |
| d: daemon, |
| ctr: container, |
| todo: todo, |
| done: done, |
| } |
| runtime.SetFinalizer(vw, (*containerFSView).Close) |
| return vw, nil |
| } |
| |
| // RunInFS synchronously runs fn in the context of the container filesytem and |
| // passes through its return value. |
| // |
| // The container filesystem is only visible to functions called in the same |
| // goroutine as fn. Goroutines started from fn will see the host's filesystem. |
| func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error { |
| res := make(chan error) |
| select { |
| case vw.todo <- future{fn: fn, res: res}: |
| case <-ctx.Done(): |
| return ctx.Err() |
| } |
| return <-res |
| } |
| |
| // GoInFS starts fn in the container FS. It blocks until fn is started but does |
| // not wait until fn returns. An error is returned if ctx is canceled before fn |
| // has been started. |
| // |
| // The container filesystem is only visible to functions called in the same |
| // goroutine as fn. Goroutines started from fn will see the host's filesystem. |
| func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error { |
| select { |
| case vw.todo <- future{fn: func() error { fn(); return nil }}: |
| return nil |
| case <-ctx.Done(): |
| return ctx.Err() |
| } |
| } |
| |
| // Close waits until any in-flight operations complete and frees all |
| // resources associated with vw. |
| func (vw *containerFSView) Close() error { |
| runtime.SetFinalizer(vw, nil) |
| close(vw.todo) |
| err := multierror.Append(nil, <-vw.done) |
| err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent)) |
| err = multierror.Append(err, vw.d.Unmount(vw.ctr)) |
| return err.ErrorOrNil() |
| } |
| |
| // Stat returns the metadata for path, relative to the current working directory |
| // of vw inside the container filesystem view. |
| func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) { |
| var stat *types.ContainerPathStat |
| err := vw.RunInFS(ctx, func() error { |
| lstat, err := os.Lstat(path) |
| if err != nil { |
| return err |
| } |
| var target string |
| if lstat.Mode()&os.ModeSymlink != 0 { |
| // Fully evaluate symlinks along path to the ultimate |
| // target, or as much as possible with broken links. |
| target, err = symlink.FollowSymlinkInScope(path, "/") |
| if err != nil { |
| return err |
| } |
| } |
| stat = &types.ContainerPathStat{ |
| Name: filepath.Base(path), |
| Size: lstat.Size(), |
| Mode: lstat.Mode(), |
| Mtime: lstat.ModTime(), |
| LinkTarget: target, |
| } |
| return nil |
| }) |
| return stat, err |
| } |