| /* |
| Copyright The containerd Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| package mount |
| |
| import ( |
| "errors" |
| "fmt" |
| "os" |
| "os/exec" |
| "path" |
| "runtime" |
| "strings" |
| "time" |
| |
| "github.com/containerd/containerd/pkg/userns" |
| "golang.org/x/sys/unix" |
| ) |
| |
| var ( |
| pagesize = 4096 |
| allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"} |
| ) |
| |
| func init() { |
| pagesize = os.Getpagesize() |
| } |
| |
| // Mount to the provided target path. |
| // |
| // If m.Type starts with "fuse." or "fuse3.", "mount.fuse" or "mount.fuse3" |
| // helper binary is called. |
| func (m *Mount) mount(target string) (err error) { |
| for _, helperBinary := range allowedHelperBinaries { |
| // helperBinary = "mount.fuse", typePrefix = "fuse." |
| typePrefix := strings.TrimPrefix(helperBinary, "mount.") + "." |
| if strings.HasPrefix(m.Type, typePrefix) { |
| return m.mountWithHelper(helperBinary, typePrefix, target) |
| } |
| } |
| var ( |
| chdir string |
| options = m.Options |
| ) |
| |
| // avoid hitting one page limit of mount argument buffer |
| // |
| // NOTE: 512 is a buffer during pagesize check. |
| if m.Type == "overlay" && optionsSize(options) >= pagesize-512 { |
| chdir, options = compactLowerdirOption(options) |
| } |
| |
| flags, data, losetup := parseMountOptions(options) |
| |
| // propagation types. |
| const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE |
| |
| // Ensure propagation type change flags aren't included in other calls. |
| oflags := flags &^ ptypes |
| |
| var loopParams LoopParams |
| if losetup { |
| loopParams = LoopParams{ |
| Readonly: oflags&unix.MS_RDONLY == unix.MS_RDONLY, |
| Autoclear: true, |
| } |
| loopParams.Direct, data = hasDirectIO(data) |
| } |
| |
| dataInStr := strings.Join(data, ",") |
| if len(dataInStr) > pagesize { |
| return errors.New("mount options is too long") |
| } |
| |
| // In the case of remounting with changed data (data != ""), need to call mount (moby/moby#34077). |
| if flags&unix.MS_REMOUNT == 0 || dataInStr != "" { |
| // Initial call applying all non-propagation flags for mount |
| // or remount with changed data |
| source := m.Source |
| if losetup { |
| loFile, err := setupLoop(m.Source, loopParams) |
| if err != nil { |
| return err |
| } |
| defer loFile.Close() |
| |
| // Mount the loop device instead |
| source = loFile.Name() |
| } |
| if err := mountAt(chdir, source, target, m.Type, uintptr(oflags), dataInStr); err != nil { |
| return err |
| } |
| } |
| |
| if flags&ptypes != 0 { |
| // Change the propagation type. |
| const pflags = ptypes | unix.MS_REC | unix.MS_SILENT |
| if err := unix.Mount("", target, "", uintptr(flags&pflags), ""); err != nil { |
| return err |
| } |
| } |
| |
| const broflags = unix.MS_BIND | unix.MS_RDONLY |
| if oflags&broflags == broflags { |
| // Preserve CL_UNPRIVILEGED "locked" flags of the |
| // bind mount target when we remount to make the bind readonly. |
| // This is necessary to ensure that |
| // bind-mounting "with options" will not fail with user namespaces, due to |
| // kernel restrictions that require user namespace mounts to preserve |
| // CL_UNPRIVILEGED locked flags. |
| var unprivFlags int |
| if userns.RunningInUserNS() { |
| unprivFlags, err = getUnprivilegedMountFlags(target) |
| if err != nil { |
| return err |
| } |
| } |
| // Remount the bind to apply read only. |
| return unix.Mount("", target, "", uintptr(oflags|unprivFlags|unix.MS_REMOUNT), "") |
| } |
| return nil |
| } |
| |
| // Get the set of mount flags that are set on the mount that contains the given |
| // path and are locked by CL_UNPRIVILEGED. |
| // |
| // From https://github.com/moby/moby/blob/v23.0.1/daemon/oci_linux.go#L430-L460 |
| func getUnprivilegedMountFlags(path string) (int, error) { |
| var statfs unix.Statfs_t |
| if err := unix.Statfs(path, &statfs); err != nil { |
| return 0, err |
| } |
| |
| // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. |
| unprivilegedFlags := []int{ |
| unix.MS_RDONLY, |
| unix.MS_NODEV, |
| unix.MS_NOEXEC, |
| unix.MS_NOSUID, |
| unix.MS_NOATIME, |
| unix.MS_RELATIME, |
| unix.MS_NODIRATIME, |
| } |
| |
| var flags int |
| for flag := range unprivilegedFlags { |
| if int(statfs.Flags)&flag == flag { |
| flags |= flag |
| } |
| } |
| |
| return flags, nil |
| } |
| |
| // Unmount the provided mount path with the flags |
| func Unmount(target string, flags int) error { |
| if err := unmount(target, flags); err != nil && err != unix.EINVAL { |
| return err |
| } |
| return nil |
| } |
| |
| // fuseSuperMagic is defined in statfs(2) |
| const fuseSuperMagic = 0x65735546 |
| |
| func isFUSE(dir string) bool { |
| var st unix.Statfs_t |
| if err := unix.Statfs(dir, &st); err != nil { |
| return false |
| } |
| return st.Type == fuseSuperMagic |
| } |
| |
| // unmountFUSE attempts to unmount using fusermount/fusermount3 helper binary. |
| // |
| // For FUSE mounts, using these helper binaries is preferred, see: |
| // https://github.com/containerd/containerd/pull/3765#discussion_r342083514 |
| func unmountFUSE(target string) error { |
| var err error |
| for _, helperBinary := range []string{"fusermount3", "fusermount"} { |
| cmd := exec.Command(helperBinary, "-u", target) |
| err = cmd.Run() |
| if err == nil { |
| return nil |
| } |
| } |
| return err |
| } |
| |
| func unmount(target string, flags int) error { |
| if isFUSE(target) { |
| if err := unmountFUSE(target); err == nil { |
| return nil |
| } |
| } |
| for i := 0; i < 50; i++ { |
| if err := unix.Unmount(target, flags); err != nil { |
| switch err { |
| case unix.EBUSY: |
| time.Sleep(50 * time.Millisecond) |
| continue |
| default: |
| return err |
| } |
| } |
| return nil |
| } |
| return fmt.Errorf("failed to unmount target %s: %w", target, unix.EBUSY) |
| } |
| |
| // UnmountAll repeatedly unmounts the given mount point until there |
| // are no mounts remaining (EINVAL is returned by mount), which is |
| // useful for undoing a stack of mounts on the same mount point. |
| // UnmountAll all is noop when the first argument is an empty string. |
| // This is done when the containerd client did not specify any rootfs |
| // mounts (e.g. because the rootfs is managed outside containerd) |
| // UnmountAll is noop when the mount path does not exist. |
| func UnmountAll(mount string, flags int) error { |
| if mount == "" { |
| return nil |
| } |
| if _, err := os.Stat(mount); os.IsNotExist(err) { |
| return nil |
| } |
| |
| for { |
| if err := unmount(mount, flags); err != nil { |
| // EINVAL is returned if the target is not a |
| // mount point, indicating that we are |
| // done. It can also indicate a few other |
| // things (such as invalid flags) which we |
| // unfortunately end up squelching here too. |
| if err == unix.EINVAL { |
| return nil |
| } |
| return err |
| } |
| } |
| } |
| |
| // parseMountOptions takes fstab style mount options and parses them for |
| // use with a standard mount() syscall |
| func parseMountOptions(options []string) (int, []string, bool) { |
| var ( |
| flag int |
| losetup bool |
| data []string |
| ) |
| loopOpt := "loop" |
| flags := map[string]struct { |
| clear bool |
| flag int |
| }{ |
| "async": {true, unix.MS_SYNCHRONOUS}, |
| "atime": {true, unix.MS_NOATIME}, |
| "bind": {false, unix.MS_BIND}, |
| "defaults": {false, 0}, |
| "dev": {true, unix.MS_NODEV}, |
| "diratime": {true, unix.MS_NODIRATIME}, |
| "dirsync": {false, unix.MS_DIRSYNC}, |
| "exec": {true, unix.MS_NOEXEC}, |
| "mand": {false, unix.MS_MANDLOCK}, |
| "noatime": {false, unix.MS_NOATIME}, |
| "nodev": {false, unix.MS_NODEV}, |
| "nodiratime": {false, unix.MS_NODIRATIME}, |
| "noexec": {false, unix.MS_NOEXEC}, |
| "nomand": {true, unix.MS_MANDLOCK}, |
| "norelatime": {true, unix.MS_RELATIME}, |
| "nostrictatime": {true, unix.MS_STRICTATIME}, |
| "nosuid": {false, unix.MS_NOSUID}, |
| "rbind": {false, unix.MS_BIND | unix.MS_REC}, |
| "relatime": {false, unix.MS_RELATIME}, |
| "remount": {false, unix.MS_REMOUNT}, |
| "ro": {false, unix.MS_RDONLY}, |
| "rw": {true, unix.MS_RDONLY}, |
| "strictatime": {false, unix.MS_STRICTATIME}, |
| "suid": {true, unix.MS_NOSUID}, |
| "sync": {false, unix.MS_SYNCHRONOUS}, |
| } |
| for _, o := range options { |
| // If the option does not exist in the flags table or the flag |
| // is not supported on the platform, |
| // then it is a data value for a specific fs type |
| if f, exists := flags[o]; exists && f.flag != 0 { |
| if f.clear { |
| flag &^= f.flag |
| } else { |
| flag |= f.flag |
| } |
| } else if o == loopOpt { |
| losetup = true |
| } else { |
| data = append(data, o) |
| } |
| } |
| return flag, data, losetup |
| } |
| |
| func hasDirectIO(opts []string) (bool, []string) { |
| for idx, opt := range opts { |
| if opt == "direct-io" { |
| return true, append(opts[:idx], opts[idx+1:]...) |
| } |
| } |
| return false, opts |
| } |
| |
| // compactLowerdirOption updates overlay lowdir option and returns the common |
| // dir among all the lowdirs. |
| func compactLowerdirOption(opts []string) (string, []string) { |
| idx, dirs := findOverlayLowerdirs(opts) |
| if idx == -1 || len(dirs) == 1 { |
| // no need to compact if there is only one lowerdir |
| return "", opts |
| } |
| |
| // find out common dir |
| commondir := longestCommonPrefix(dirs) |
| if commondir == "" { |
| return "", opts |
| } |
| |
| // NOTE: the snapshot id is based on digits. |
| // in order to avoid to get snapshots/x, should be back to parent dir. |
| // however, there is assumption that the common dir is ${root}/io.containerd.v1.overlayfs/snapshots. |
| commondir = path.Dir(commondir) |
| if commondir == "/" { |
| return "", opts |
| } |
| commondir = commondir + "/" |
| |
| newdirs := make([]string, 0, len(dirs)) |
| for _, dir := range dirs { |
| newdirs = append(newdirs, dir[len(commondir):]) |
| } |
| |
| newopts := copyOptions(opts) |
| newopts = append(newopts[:idx], newopts[idx+1:]...) |
| newopts = append(newopts, fmt.Sprintf("lowerdir=%s", strings.Join(newdirs, ":"))) |
| return commondir, newopts |
| } |
| |
| // findOverlayLowerdirs returns the index of lowerdir in mount's options and |
| // all the lowerdir target. |
| func findOverlayLowerdirs(opts []string) (int, []string) { |
| var ( |
| idx = -1 |
| prefix = "lowerdir=" |
| ) |
| |
| for i, opt := range opts { |
| if strings.HasPrefix(opt, prefix) { |
| idx = i |
| break |
| } |
| } |
| |
| if idx == -1 { |
| return -1, nil |
| } |
| return idx, strings.Split(opts[idx][len(prefix):], ":") |
| } |
| |
| // longestCommonPrefix finds the longest common prefix in the string slice. |
| func longestCommonPrefix(strs []string) string { |
| if len(strs) == 0 { |
| return "" |
| } else if len(strs) == 1 { |
| return strs[0] |
| } |
| |
| // find out the min/max value by alphabetical order |
| min, max := strs[0], strs[0] |
| for _, str := range strs[1:] { |
| if min > str { |
| min = str |
| } |
| if max < str { |
| max = str |
| } |
| } |
| |
| // find out the common part between min and max |
| for i := 0; i < len(min) && i < len(max); i++ { |
| if min[i] != max[i] { |
| return min[:i] |
| } |
| } |
| return min |
| } |
| |
| // copyOptions copies the options. |
| func copyOptions(opts []string) []string { |
| if len(opts) == 0 { |
| return nil |
| } |
| |
| acopy := make([]string, len(opts)) |
| copy(acopy, opts) |
| return acopy |
| } |
| |
| // optionsSize returns the byte size of options of mount. |
| func optionsSize(opts []string) int { |
| size := 0 |
| for _, opt := range opts { |
| size += len(opt) |
| } |
| return size |
| } |
| |
| func mountAt(chdir string, source, target, fstype string, flags uintptr, data string) error { |
| if chdir == "" { |
| return unix.Mount(source, target, fstype, flags, data) |
| } |
| |
| ch := make(chan error, 1) |
| go func() { |
| runtime.LockOSThread() |
| |
| // Do not unlock this thread. |
| // If the thread is unlocked go will try to use it for other goroutines. |
| // However it is not possible to restore the thread state after CLONE_FS. |
| // |
| // Once the goroutine exits the thread should eventually be terminated by go. |
| |
| if err := unix.Unshare(unix.CLONE_FS); err != nil { |
| ch <- err |
| return |
| } |
| |
| if err := unix.Chdir(chdir); err != nil { |
| ch <- err |
| return |
| } |
| |
| ch <- unix.Mount(source, target, fstype, flags, data) |
| }() |
| return <-ch |
| } |
| |
| func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error { |
| // helperBinary: "mount.fuse3" |
| // target: "/foo/merged" |
| // m.Type: "fuse3.fuse-overlayfs" |
| // command: "mount.fuse3 overlay /foo/merged -o lowerdir=/foo/lower2:/foo/lower1,upperdir=/foo/upper,workdir=/foo/work -t fuse-overlayfs" |
| args := []string{m.Source, target} |
| for _, o := range m.Options { |
| args = append(args, "-o", o) |
| } |
| args = append(args, "-t", strings.TrimPrefix(m.Type, typePrefix)) |
| |
| infoBeforeMount, err := Lookup(target) |
| if err != nil { |
| return err |
| } |
| |
| // cmd.CombinedOutput() may intermittently return ECHILD because of our signal handling in shim. |
| // See #4387 and wait(2). |
| const retriesOnECHILD = 10 |
| for i := 0; i < retriesOnECHILD; i++ { |
| cmd := exec.Command(helperBinary, args...) |
| out, err := cmd.CombinedOutput() |
| if err == nil { |
| return nil |
| } |
| if !errors.Is(err, unix.ECHILD) { |
| return fmt.Errorf("mount helper [%s %v] failed: %q: %w", helperBinary, args, string(out), err) |
| } |
| // We got ECHILD, we are not sure whether the mount was successful. |
| // If the mount ID has changed, we are sure we got some new mount, but still not sure it is fully completed. |
| // So we attempt to unmount the new mount before retrying. |
| infoAfterMount, err := Lookup(target) |
| if err != nil { |
| return err |
| } |
| if infoAfterMount.ID != infoBeforeMount.ID { |
| _ = unmount(target, 0) |
| } |
| } |
| return fmt.Errorf("mount helper [%s %v] failed with ECHILD (retired %d times)", helperBinary, args, retriesOnECHILD) |
| } |