internal/unshare/unshare_linux.go - third_party/github.com/moby/moby - Git at Google

 //go:build go1.10
 // +build go1.10

 package unshare // import "github.com/docker/docker/internal/unshare"

 import (
 	"fmt"
 	"os"
 	"runtime"

 	"golang.org/x/sys/unix"
 )

 func init() {
 	// The startup thread of a process is special in a few different ways.
 	// Most pertinent to the discussion at hand, any per-thread kernel state
 	// reflected in the /proc/[pid]/ directory for a process is taken from
 	// the state of the startup thread. Same goes for /proc/self/; it shows
 	// the state of the current process' startup thread, no matter which
 	// thread the files are being opened from. For most programs this is a
 	// distinction without a difference as the kernel state, such as the
 	// mount namespace and current working directory, is shared among (and
 	// kept synchronized across) all threads of a process. But things start
 	// to break down once threads start unsharing and modifying parts of
 	// their kernel state.
 	//
 	// The Go runtime schedules goroutines to execute on the startup thread,
 	// same as any other. How this could be problematic is best illustrated
 	// with a concrete example. Consider what happens if a call to
 	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
 	// onto the startup thread. The thread's mount namespace will be
 	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
 	// will then describe the mount tree of the unshared namespace, not the
 	// namespace of any other thread. It will remain this way until the
 	// process exits. (The startup thread is special in another way: exiting
 	// it puts the process into a "non-waitable zombie" state. To avoid this
 	// fate, the Go runtime parks the thread instead of exiting if a
 	// goroutine returns while locked to the startup thread. More
 	// information can be found in the Go runtime sources:
 	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
 	// package reads from /proc/self/mountinfo, so will read the mount tree
 	// for the wrong namespace if the startup thread has had its mount
 	// namespace unshared! The /proc/thread-self/ directory, introduced in
 	// Linux 3.17, is one potential solution to this problem, but every
 	// package which opens files in /proc/self/ would need to be updated,
 	// and fallbacks to /proc/self/task/[tid]/ would be required to support
 	// older kernels. Overlooking any reference to /proc/self/ would
 	// manifest as stochastically-reproducible bugs, so this is far from an
 	// ideal solution.
 	//
 	// Reading from /proc/self/ would not be a problem if we could prevent
 	// the per-thread state of the startup thread from being modified
 	// nondeterministically in the first place. We can accomplish this
 	// simply by locking the main() function to the startup thread! Doing so
 	// excludes any other goroutine from being scheduled on the thread.
 	runtime.LockOSThread()
 }

 // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
 // reversed using setns(2). The values are the basenames of the corresponding
 // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
 // state.
 var reversibleSetnsFlags = map[int]string{
 	unix.CLONE_NEWCGROUP: "cgroup",
 	unix.CLONE_NEWNET:    "net",
 	unix.CLONE_NEWUTS:    "uts",
 	unix.CLONE_NEWPID:    "pid",
 	unix.CLONE_NEWTIME:   "time",

 	// The following CLONE_NEW* flags are not included because they imply
 	// another, irreversible flag when used with unshare(2).
 	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
 	//  - unix.CLONE_NEWNS:   implies CLONE_FS
 	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
 }

 // Go calls the given functions in a new goroutine, locked to an OS thread,
 // which has had the parts of its execution state disassociated from the rest of
 // the current process using [unshare(2)]. It blocks until the new goroutine has
 // started and setupfn has returned. fn is only called if setupfn returns nil. A
 // nil setupfn or fn is equivalent to passing a no-op function.
 //
 // The disassociated execution state and any changes made to it are only visible
 // to the goroutine which the functions are called in. Any other goroutines,
 // including ones started from the function, will see the same execution state
 // as the rest of the process.
 //
 // The acceptable flags are documented in the [unshare(2)] Linux man-page.
 // The corresponding CLONE_* constants are defined in package [unix].
 //
 // # Warning
 //
 // This function may terminate the thread which the new goroutine executed on
 // after fn returns, which could cause subprocesses started with the
 // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
 // termination. Any subprocess started before this function is called may be
 // affected, in addition to any subprocesses started inside setupfn or fn.
 // There are more details at https://go.dev/issue/27505.
 //
 // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
 func Go(flags int, setupfn func() error, fn func()) error {
 	started := make(chan error)

 	maskedFlags := flags
 	for f := range reversibleSetnsFlags {
 		maskedFlags &^= f
 	}
 	isReversible := maskedFlags == 0

 	go func() {
 		// Prepare to manipulate per-thread kernel state.
 		runtime.LockOSThread()

 		// Not all changes to the execution state can be reverted.
 		// If an irreversible change to the execution state is made, our
 		// only recourse is to have the tampered thread terminated by
 		// returning from this function while the goroutine remains
 		// wired to the thread. The Go runtime will terminate the thread
 		// and replace it with a fresh one as needed.

 		if isReversible {
 			defer func() {
 				if isReversible {
 					// All execution state has been restored without error.
 					// The thread is once again fungible.
 					runtime.UnlockOSThread()
 				}
 			}()
 			tid := unix.Gettid()
 			for f, ns := range reversibleSetnsFlags {
 				if flags&f != f {
 					continue
 				}
 				// The /proc/thread-self directory was added in Linux 3.17.
 				// We are not using it to maximize compatibility.
 				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
 				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
 				if err != nil {
 					started <- &os.PathError{Op: "open", Path: pth, Err: err}
 					return
 				}
 				defer func() {
 					if isReversible {
 						if err := unix.Setns(fd, 0); err != nil {
 							isReversible = false
 						}
 					}
 					_ = unix.Close(fd)
 				}()
 			}
 		}

 		// Threads are implemented under Linux as processes which share
 		// a virtual memory space. Therefore in a multithreaded process
 		// unshare(2) disassociates parts of the calling thread's
 		// context from the thread it was clone(2)'d from.
 		if err := unix.Unshare(flags); err != nil {
 			started <- os.NewSyscallError("unshare", err)
 			return
 		}

 		if setupfn != nil {
 			if err := setupfn(); err != nil {
 				started <- err
 				return
 			}
 		}
 		close(started)

 		if fn != nil {
 			fn()
 		}
 	}()

 	return <-started
 }
	//go:build go1.10
	// +build go1.10

	package unshare // import "github.com/docker/docker/internal/unshare"

	import (
	"fmt"
	"os"
	"runtime"

	"golang.org/x/sys/unix"
	)

	func init() {
	// The startup thread of a process is special in a few different ways.
	// Most pertinent to the discussion at hand, any per-thread kernel state
	// reflected in the /proc/[pid]/ directory for a process is taken from
	// the state of the startup thread. Same goes for /proc/self/; it shows
	// the state of the current process' startup thread, no matter which
	// thread the files are being opened from. For most programs this is a
	// distinction without a difference as the kernel state, such as the
	// mount namespace and current working directory, is shared among (and
	// kept synchronized across) all threads of a process. But things start
	// to break down once threads start unsharing and modifying parts of
	// their kernel state.
	//
	// The Go runtime schedules goroutines to execute on the startup thread,
	// same as any other. How this could be problematic is best illustrated
	// with a concrete example. Consider what happens if a call to
	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
	// onto the startup thread. The thread's mount namespace will be
	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
	// will then describe the mount tree of the unshared namespace, not the
	// namespace of any other thread. It will remain this way until the
	// process exits. (The startup thread is special in another way: exiting
	// it puts the process into a "non-waitable zombie" state. To avoid this
	// fate, the Go runtime parks the thread instead of exiting if a
	// goroutine returns while locked to the startup thread. More
	// information can be found in the Go runtime sources:
	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
	// package reads from /proc/self/mountinfo, so will read the mount tree
	// for the wrong namespace if the startup thread has had its mount
	// namespace unshared! The /proc/thread-self/ directory, introduced in
	// Linux 3.17, is one potential solution to this problem, but every
	// package which opens files in /proc/self/ would need to be updated,
	// and fallbacks to /proc/self/task/[tid]/ would be required to support
	// older kernels. Overlooking any reference to /proc/self/ would
	// manifest as stochastically-reproducible bugs, so this is far from an
	// ideal solution.
	//
	// Reading from /proc/self/ would not be a problem if we could prevent
	// the per-thread state of the startup thread from being modified
	// nondeterministically in the first place. We can accomplish this
	// simply by locking the main() function to the startup thread! Doing so
	// excludes any other goroutine from being scheduled on the thread.
	runtime.LockOSThread()
	}

	// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
	// reversed using setns(2). The values are the basenames of the corresponding
	// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
	// state.
	var reversibleSetnsFlags = map[int]string{
	unix.CLONE_NEWCGROUP: "cgroup",
	unix.CLONE_NEWNET: "net",
	unix.CLONE_NEWUTS: "uts",
	unix.CLONE_NEWPID: "pid",
	unix.CLONE_NEWTIME: "time",

	// The following CLONE_NEW* flags are not included because they imply
	// another, irreversible flag when used with unshare(2).
	// - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM
	// - unix.CLONE_NEWNS: implies CLONE_FS
	// - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
	}

	// Go calls the given functions in a new goroutine, locked to an OS thread,
	// which has had the parts of its execution state disassociated from the rest of
	// the current process using [unshare(2)]. It blocks until the new goroutine has
	// started and setupfn has returned. fn is only called if setupfn returns nil. A
	// nil setupfn or fn is equivalent to passing a no-op function.
	//
	// The disassociated execution state and any changes made to it are only visible
	// to the goroutine which the functions are called in. Any other goroutines,
	// including ones started from the function, will see the same execution state
	// as the rest of the process.
	//
	// The acceptable flags are documented in the [unshare(2)] Linux man-page.
	// The corresponding CLONE_* constants are defined in package [unix].
	//
	// # Warning
	//
	// This function may terminate the thread which the new goroutine executed on
	// after fn returns, which could cause subprocesses started with the
	// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
	// termination. Any subprocess started before this function is called may be
	// affected, in addition to any subprocesses started inside setupfn or fn.
	// There are more details at https://go.dev/issue/27505.
	//
	// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
	func Go(flags int, setupfn func() error, fn func()) error {
	started := make(chan error)

	maskedFlags := flags
	for f := range reversibleSetnsFlags {
	maskedFlags &^= f
	}
	isReversible := maskedFlags == 0

	go func() {
	// Prepare to manipulate per-thread kernel state.
	runtime.LockOSThread()

	// Not all changes to the execution state can be reverted.
	// If an irreversible change to the execution state is made, our
	// only recourse is to have the tampered thread terminated by
	// returning from this function while the goroutine remains
	// wired to the thread. The Go runtime will terminate the thread
	// and replace it with a fresh one as needed.

	if isReversible {
	defer func() {
	if isReversible {
	// All execution state has been restored without error.
	// The thread is once again fungible.
	runtime.UnlockOSThread()
	}
	}()
	tid := unix.Gettid()
	for f, ns := range reversibleSetnsFlags {
	if flags&f != f {
	continue
	}
	// The /proc/thread-self directory was added in Linux 3.17.
	// We are not using it to maximize compatibility.
	pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
	fd, err := unix.Open(pth, unix.O_RDONLY\|unix.O_CLOEXEC, 0)
	if err != nil {
	started <- &os.PathError{Op: "open", Path: pth, Err: err}
	return
	}
	defer func() {
	if isReversible {
	if err := unix.Setns(fd, 0); err != nil {
	isReversible = false
	}
	}
	_ = unix.Close(fd)
	}()
	}
	}

	// Threads are implemented under Linux as processes which share
	// a virtual memory space. Therefore in a multithreaded process
	// unshare(2) disassociates parts of the calling thread's
	// context from the thread it was clone(2)'d from.
	if err := unix.Unshare(flags); err != nil {
	started <- os.NewSyscallError("unshare", err)
	return
	}

	if setupfn != nil {
	if err := setupfn(); err != nil {
	started <- err
	return
	}
	}
	close(started)

	if fn != nil {
	fn()
	}
	}()

	return <-started
	}