Enable symbolization of data objects with the llvm symbolizer (#598)

* Compute the correct base offset for non-executable runtime mappings.

For user space executables, find the segment associated with the current mapping
and use its header in the call to GetBase, instead of using the program header
for the segment that includes the .text section.

This change is needed for all local symbolizers that handle ELF files.

* Enable the llvm-symbolizer to symbolize data addresses.

For data addresses, we need to issue "DATA <binary_name> <address>" commands,
while for code addresses, we issue "CODE <binary_name> <address>" commands.

Right now, the frames for data addresses include the symbol start address and
size in the filename field.
diff --git a/internal/binutils/addr2liner_llvm.go b/internal/binutils/addr2liner_llvm.go
index 43a6144..24c48e6 100644
--- a/internal/binutils/addr2liner_llvm.go
+++ b/internal/binutils/addr2liner_llvm.go
@@ -43,10 +43,12 @@
 	cmd *exec.Cmd
 	in  io.WriteCloser
 	out *bufio.Reader
+	// llvm-symbolizer requires the symbol type, CODE or DATA, for symbolization.
+	symType string
 }
 
 func (a *llvmSymbolizerJob) write(s string) error {
-	_, err := fmt.Fprint(a.in, s+"\n")
+	_, err := fmt.Fprintln(a.in, a.symType, s)
 	return err
 }
 
@@ -68,13 +70,17 @@
 // information about the given executable file. If file is a shared
 // library, base should be the address at which it was mapped in the
 // program under consideration.
-func newLLVMSymbolizer(cmd, file string, base uint64) (*llvmSymbolizer, error) {
+func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymbolizer, error) {
 	if cmd == "" {
 		cmd = defaultLLVMSymbolizer
 	}
 
 	j := &llvmSymbolizerJob{
-		cmd: exec.Command(cmd, "-inlining", "-demangle=false"),
+		cmd:     exec.Command(cmd, "-inlining", "-demangle=false"),
+		symType: "CODE",
+	}
+	if isData {
+		j.symType = "DATA"
 	}
 
 	var err error
@@ -123,7 +129,11 @@
 	}
 
 	linenumber := 0
-	if fileline == "??:0" {
+	// The llvm-symbolizer outputs the <file_name>:<line_number>:<column_number>.
+	// When it cannot identify the source code location, it outputs "??:0:0".
+	// Older versions output just the filename and line number, so we check for
+	// both conditions here.
+	if fileline == "??:0" || fileline == "??:0:0" {
 		fileline = ""
 	} else {
 		switch split := strings.Split(fileline, ":"); len(split) {
diff --git a/internal/binutils/binutils.go b/internal/binutils/binutils.go
index 2695487..576a6ee 100644
--- a/internal/binutils/binutils.go
+++ b/internal/binutils/binutils.go
@@ -450,7 +450,23 @@
 		}
 	}
 
-	base, err := elfexec.GetBase(&ef.FileHeader, elfexec.FindTextProgHeader(ef), stextOffset, start, limit, offset)
+	var ph *elf.ProgHeader
+	// For user space executables, find the actual program segment that is
+	// associated with the given mapping. Skip this search if limit <= start.
+	// We cannot use just a check on the start address of the mapping to tell if
+	// it's a kernel / .ko module mapping, because with quipper address remapping
+	// enabled, the address would be in the lower half of the address space.
+	if stextOffset == nil && start < limit && limit < (uint64(1)<<63) {
+		ph, err = elfexec.FindProgHeaderForMapping(ef, offset, limit-start)
+		if err != nil {
+			return nil, fmt.Errorf("failed to find program header for file %q, mapping pgoff %x, memsz=%x: %v", name, offset, limit-start, err)
+		}
+	} else {
+		// For the kernel, find the program segment that includes the .text section.
+		ph = elfexec.FindTextProgHeader(ef)
+	}
+
+	base, err := elfexec.GetBase(&ef.FileHeader, ph, stextOffset, start, limit, offset)
 	if err != nil {
 		return nil, fmt.Errorf("could not identify base for %s: %v", name, err)
 	}
@@ -461,10 +477,11 @@
 			buildID = fmt.Sprintf("%x", id)
 		}
 	}
+	isData := ph != nil && ph.Flags&elf.PF_X == 0
 	if b.fast || (!b.addr2lineFound && !b.llvmSymbolizerFound) {
-		return &fileNM{file: file{b, name, base, buildID}}, nil
+		return &fileNM{file: file{b, name, base, buildID, isData}}, nil
 	}
-	return &fileAddr2Line{file: file{b, name, base, buildID}}, nil
+	return &fileAddr2Line{file: file{b, name, base, buildID, isData}}, nil
 }
 
 func (b *binrep) openPE(name string, start, limit, offset uint64) (plugin.ObjFile, error) {
@@ -500,6 +517,7 @@
 	name    string
 	base    uint64
 	buildID string
+	isData  bool
 }
 
 func (f *file) Name() string {
@@ -575,7 +593,7 @@
 }
 
 func (f *fileAddr2Line) init() {
-	if llvmSymbolizer, err := newLLVMSymbolizer(f.b.llvmSymbolizer, f.name, f.base); err == nil {
+	if llvmSymbolizer, err := newLLVMSymbolizer(f.b.llvmSymbolizer, f.name, f.base, f.isData); err == nil {
 		f.llvmSymbolizer = llvmSymbolizer
 		return
 	}
diff --git a/internal/binutils/binutils_test.go b/internal/binutils/binutils_test.go
index 9d77736..64de938 100644
--- a/internal/binutils/binutils_test.go
+++ b/internal/binutils/binutils_test.go
@@ -336,7 +336,6 @@
 		start, limit, offset uint64
 		addr                 uint64
 	}{
-		{"fake mapping", 0, math.MaxUint64, 0, 0x40052d},
 		{"fixed load address", 0x400000, 0x4006fc, 0, 0x40052d},
 		// True user-mode ASLR binaries are ET_DYN rather than ET_EXEC so this case
 		// is a bit artificial except that it approximates the
@@ -448,33 +447,38 @@
 	}
 
 	cmd := filepath.Join("testdata", "fake-llvm-symbolizer")
-	symbolizer, err := newLLVMSymbolizer(cmd, "foo", 0)
-	if err != nil {
-		t.Fatalf("newLLVMSymbolizer: unexpected error %v", err)
-	}
-	defer symbolizer.rw.close()
-
 	for _, c := range []struct {
 		addr   uint64
+		isData bool
 		frames []plugin.Frame
 	}{
-		{0x10, []plugin.Frame{
+		{0x10, false, []plugin.Frame{
 			{Func: "Inlined_0x10", File: "foo.h", Line: 0},
 			{Func: "Func_0x10", File: "foo.c", Line: 2},
 		}},
-		{0x20, []plugin.Frame{
-			{Func: "Inlined_0x20", File: "foo.h", Line: 0},
-			{Func: "Func_0x20", File: "foo.c", Line: 2},
+		{0x20, true, []plugin.Frame{
+			{Func: "foo_0x20", File: "0x20 8"},
 		}},
 	} {
-		frames, err := symbolizer.addrInfo(c.addr)
-		if err != nil {
-			t.Errorf("LLVM: unexpected error %v", err)
-			continue
+		desc := fmt.Sprintf("Code %x", c.addr)
+		if c.isData {
+			desc = fmt.Sprintf("Data %x", c.addr)
 		}
-		if !reflect.DeepEqual(frames, c.frames) {
-			t.Errorf("LLVM: expect %v; got %v\n", c.frames, frames)
-		}
+		t.Run(desc, func(t *testing.T) {
+			symbolizer, err := newLLVMSymbolizer(cmd, "foo", 0, c.isData)
+			if err != nil {
+				t.Fatalf("newLLVMSymbolizer: unexpected error %v", err)
+			}
+			defer symbolizer.rw.close()
+
+			frames, err := symbolizer.addrInfo(c.addr)
+			if err != nil {
+				t.Fatalf("LLVM: unexpected error %v", err)
+			}
+			if !reflect.DeepEqual(frames, c.frames) {
+				t.Errorf("LLVM: expect %v; got %v\n", c.frames, frames)
+			}
+		})
 	}
 }
 
@@ -641,3 +645,39 @@
 		}
 	}
 }
+
+func TestOpenELF(t *testing.T) {
+	// The exe_linux_64 has two loadable program headers:
+	//  LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000
+	//                 0x00000000000006fc 0x00000000000006fc  R E    0x200000
+	//  LOAD           0x0000000000000e10 0x0000000000600e10 0x0000000000600e10
+	//                 0x0000000000000230 0x0000000000000238  RW     0x200000
+	name := filepath.Join("testdata", "exe_linux_64")
+
+	for _, tc := range []struct {
+		desc                 string
+		start, limit, offset uint64
+		wantError            bool
+		wantBase             uint64
+	}{
+		{"exec mapping", 0x5400000, 0x5401000, 0, false, 0x5000000},
+		{"short data mapping", 0x5600e00, 0x5602000, 0xe00, false, 0x5000000},
+		{"page aligned data mapping", 0x5600000, 0x5602000, 0, false, 0x5000000},
+		{"no matching segment", 0x5600000, 0x5602000, 0x2000, true, 0},
+		{"multiple matching segments, wrong size", 0x5600000, 0x5603000, 0, true, 0},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			b := binrep{}
+			o, err := b.openELF(name, tc.start, tc.limit, tc.offset)
+			if (err != nil) != tc.wantError {
+				t.Errorf("got error %v, want any error=%v", err, tc.wantError)
+			}
+			if err != nil {
+				return
+			}
+			if got := o.Base(); got != tc.wantBase {
+				t.Errorf("got base %x; want %x\n", got, tc.wantBase)
+			}
+		})
+	}
+}
diff --git a/internal/binutils/testdata/fake-llvm-symbolizer b/internal/binutils/testdata/fake-llvm-symbolizer
index 596713c..a3b4546 100755
--- a/internal/binutils/testdata/fake-llvm-symbolizer
+++ b/internal/binutils/testdata/fake-llvm-symbolizer
@@ -23,12 +23,21 @@
   # line has form:
   #    filename 0xaddr
   # Emit dummy output that matches llvm-symbolizer output format.
-  set -- $line
-  fname=$1
-  addr=$2
-  echo "Inlined_$addr"
-  echo "$fname.h"
-  echo "Func_$addr"
-  echo "$fname.c:2"
-  echo
+  set -- ${line}
+  kind=$1
+  fname=$2
+  addr=$3
+  case ${kind} in
+  CODE)
+    echo "Inlined_${addr}"
+    echo "${fname}.h"
+    echo "Func_${addr}"
+    echo "${fname}.c:2:1"
+    echo;;
+  DATA)
+    echo "${fname}_${addr}"
+    echo "${addr} 8"
+    echo;;
+  *) echo ${kind} ${fname} ${addr};;
+  esac
 done
diff --git a/internal/elfexec/elfexec.go b/internal/elfexec/elfexec.go
index d520765..b60357d 100644
--- a/internal/elfexec/elfexec.go
+++ b/internal/elfexec/elfexec.go
@@ -283,3 +283,83 @@
 	}
 	return nil
 }
+
+// FindProgHeaderForMapping returns the loadable program segment header that is
+// fully contained in the runtime mapping with file offset pgoff and memory size
+// memsz, or an error if the segment cannot be determined.
+func FindProgHeaderForMapping(f *elf.File, pgoff, memsz uint64) (*elf.ProgHeader, error) {
+	var headers []*elf.ProgHeader
+	loadables := 0
+	for _, p := range f.Progs {
+		if p.Type == elf.PT_LOAD && pgoff <= p.Off && p.Off+p.Memsz <= pgoff+memsz {
+			headers = append(headers, &p.ProgHeader)
+		}
+		if p.Type == elf.PT_LOAD {
+			loadables++
+		}
+	}
+	if len(headers) == 1 {
+		return headers[0], nil
+	}
+	// Some ELF files don't contain any program segments, e.g. .ko loadable kernel
+	// modules. Don't return an error in such cases.
+	if loadables == 0 {
+		return nil, nil
+	}
+	if len(headers) == 0 {
+		return nil, fmt.Errorf("no program header matches file offset %x and memory size %x", pgoff, memsz)
+	}
+
+	// Segments are mapped page aligned. In some cases, segments may be smaller
+	// than a page, which causes the next segment to start at a file offset that
+	// is logically on the same page if we were to align file offsets by page.
+	// Example:
+	//  LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000
+	//                 0x00000000000006fc 0x00000000000006fc  R E    0x200000
+	//  LOAD           0x0000000000000e10 0x0000000000600e10 0x0000000000600e10
+	//                 0x0000000000000230 0x0000000000000238  RW     0x200000
+	//
+	// In this case, perf records the following mappings for this executable:
+	// 0 0 [0xc0]: PERF_RECORD_MMAP2 87867/87867: [0x400000(0x1000) @ 0 00:3c 512041 0]: r-xp exename
+	// 0 0 [0xc0]: PERF_RECORD_MMAP2 87867/87867: [0x600000(0x2000) @ 0 00:3c 512041 0]: rw-p exename
+	//
+	// Both mappings have file offset 0. The first mapping is one page length and
+	// it can include only the first loadable segment. Due to page alignment, the
+	// second mapping starts also at file offset 0, and it spans two pages. It can
+	// include both the first and the second loadable segments. We must return the
+	// correct program header to compute the correct base offset.
+	//
+	// We cannot use the mapping protections to distinguish between segments,
+	// because protections are not passed through to this function.
+	// We cannot use the start address to differentiate between segments, because
+	// with ASLR, the mapping start address can be any value.
+	//
+	// We use a heuristic to compute the minimum mapping size required for a
+	// segment, assuming mappings are 4k page aligned, and return the segment that
+	// matches the given mapping size.
+	const pageSize = 4096
+
+	// The memory size based heuristic makes sense only if the mapping size is a
+	// multiple of 4k page size.
+	if memsz%pageSize != 0 {
+		return nil, fmt.Errorf("mapping size = %x and %d segments match the passed in mapping", memsz, len(headers))
+	}
+
+	// Return an error if no segment, or multiple segments match the size, so we can debug.
+	var ph *elf.ProgHeader
+	pageMask := ^uint64(pageSize - 1)
+	for _, h := range headers {
+		wantSize := (h.Vaddr+h.Memsz+pageSize-1)&pageMask - (h.Vaddr & pageMask)
+		if wantSize != memsz {
+			continue
+		}
+		if ph != nil {
+			return nil, fmt.Errorf("found second program header (%#v) that matches memsz %x, first program header is %#v", *h, memsz, *ph)
+		}
+		ph = h
+	}
+	if ph == nil {
+		return nil, fmt.Errorf("found %d matching program headers, but none matches mapping size %x", len(headers), memsz)
+	}
+	return ph, nil
+}
diff --git a/internal/elfexec/elfexec_test.go b/internal/elfexec/elfexec_test.go
index ff95c36..6312b1c 100644
--- a/internal/elfexec/elfexec_test.go
+++ b/internal/elfexec/elfexec_test.go
@@ -16,6 +16,7 @@
 
 import (
 	"debug/elf"
+	"reflect"
 	"testing"
 )
 
@@ -100,3 +101,110 @@
 func uint64p(n uint64) *uint64 {
 	return &n
 }
+
+func TestFindProgHeaderForMapping(t *testing.T) {
+	smallELFFile := elf.File{
+		Progs: []*elf.Prog{
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_PHDR, Flags: elf.PF_R | elf.PF_X, Off: 0x40, Vaddr: 0x400040, Paddr: 0x400040, Filesz: 0x1f8, Memsz: 0x1f8, Align: 8}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_INTERP, Flags: elf.PF_R, Off: 0x238, Vaddr: 0x400238, Paddr: 0x400238, Filesz: 0x1c, Memsz: 0x1c, Align: 1}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_X, Off: 0, Vaddr: 0x400000, Paddr: 0x400000, Filesz: 0x6fc, Memsz: 0x6fc, Align: 0x200000}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0xe10, Vaddr: 0x600e10, Paddr: 0x600e10, Filesz: 0x230, Memsz: 0x238, Align: 0x200000}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_DYNAMIC, Flags: elf.PF_R | elf.PF_W, Off: 0xe28, Vaddr: 0x600e28, Paddr: 0x600e28, Filesz: 0x1d0, Memsz: 0x1d0, Align: 8}},
+		},
+	}
+	largeELFFile := elf.File{
+		Progs: []*elf.Prog{
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_PHDR, Flags: elf.PF_R, Off: 0x40, Vaddr: 0x40, Paddr: 0x40, Filesz: 0x268, Memsz: 0x268, Align: 8}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_INTERP, Flags: elf.PF_R, Off: 0x2a8, Vaddr: 0x2a8, Paddr: 0x2a8, Filesz: 0x28, Memsz: 0x28, Align: 1}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_X, Off: 0, Vaddr: 0, Paddr: 0, Filesz: 0x2ec5d2c0, Memsz: 0x2ec5d2c0, Align: 0x200000}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0x2ec5d2c0, Vaddr: 0x2ee5d2c0, Paddr: 0x2ee5d2c0, Filesz: 0x1361118, Memsz: 0x1361150, Align: 0x200000}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0x2ffbe440, Vaddr: 0x303be440, Paddr: 0x303be440, Filesz: 0x4637c0, Memsz: 0xc91610, Align: 0x200000}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_TLS, Flags: elf.PF_R, Off: 0x2ec5d2c0, Vaddr: 0x2ee5d2c0, Paddr: 0x2ee5d2c0, Filesz: 0x120, Memsz: 0x103f8, Align: 0x40}},
+			{ProgHeader: elf.ProgHeader{Type: elf.PT_DYNAMIC, Flags: elf.PF_R | elf.PF_W, Off: 0x2ffbc9e0, Vaddr: 0x301bc9e0, Paddr: 0x301bc9e0, Filesz: 0x1f0, Memsz: 0x1f0, Align: 8}},
+		},
+	}
+	for _, tc := range []struct {
+		desc         string
+		file         *elf.File
+		pgoff, memsz uint64
+		wantError    bool
+		want         *elf.ProgHeader
+	}{
+		{
+			desc:  "no prog headers ELF file",
+			file:  &elf.File{},
+			pgoff: 0,
+			memsz: 0x1000,
+			want:  nil,
+		},
+		{
+			desc:  "small ELF file / executable mapping",
+			file:  &smallELFFile,
+			pgoff: 0,
+			memsz: 0x1000,
+			want:  &elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_X, Off: 0, Vaddr: 0x400000, Paddr: 0x400000, Filesz: 0x6fc, Memsz: 0x6fc, Align: 0x200000},
+		},
+		{
+			desc:  "small ELF file / page aligned data mapping disambiguation",
+			file:  &smallELFFile,
+			pgoff: 0,
+			memsz: 0x2000,
+			want:  &elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0xe10, Vaddr: 0x600e10, Paddr: 0x600e10, Filesz: 0x230, Memsz: 0x238, Align: 0x200000},
+		},
+		{
+			desc:      "small ELF file / no matching segment",
+			file:      &smallELFFile,
+			pgoff:     0x1000,
+			memsz:     0x1000,
+			wantError: true,
+		},
+		{
+			desc:      "small ELF file / multiple matching segments, but incorrect size ",
+			file:      &smallELFFile,
+			pgoff:     0,
+			memsz:     0x3000,
+			wantError: true,
+		},
+		{
+			desc:  "large ELF file / executable mapping",
+			file:  &largeELFFile,
+			pgoff: 0,
+			memsz: 0x2ec5e000,
+			want:  &elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_X, Off: 0, Vaddr: 0, Paddr: 0, Filesz: 0x2ec5d2c0, Memsz: 0x2ec5d2c0, Align: 0x200000},
+		},
+		{
+			desc:  "large ELF file / first data mapping",
+			file:  &largeELFFile,
+			pgoff: 0x2ec5d000,
+			memsz: 0x1362000,
+			want:  &elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0x2ec5d2c0, Vaddr: 0x2ee5d2c0, Paddr: 0x2ee5d2c0, Filesz: 0x1361118, Memsz: 0x1361150, Align: 0x200000},
+		},
+		{
+			desc:      "large ELF file / split mapping doesn't match",
+			file:      &largeELFFile,
+			pgoff:     0x2ffbe000,
+			memsz:     0xb11000,
+			wantError: true,
+		},
+		{
+			desc:  "large ELF file / combined mapping matches second data mapping",
+			file:  &largeELFFile,
+			pgoff: 0x2ffbe000,
+			memsz: 0xb11000 + 0x181000,
+			want:  &elf.ProgHeader{Type: elf.PT_LOAD, Flags: elf.PF_R | elf.PF_W, Off: 0x2ffbe440, Vaddr: 0x303be440, Paddr: 0x303be440, Filesz: 0x4637c0, Memsz: 0xc91610, Align: 0x200000},
+		},
+	} {
+		t.Run(tc.desc, func(t *testing.T) {
+			got, err := FindProgHeaderForMapping(tc.file, tc.pgoff, tc.memsz)
+			if (err != nil) != tc.wantError {
+				t.Errorf("got error %v, want any error=%v", err, tc.wantError)
+			}
+			if err != nil {
+				return
+			}
+			if !reflect.DeepEqual(got, tc.want) {
+				t.Errorf("got program header %#v; want %#v", got, tc.want)
+			}
+		})
+	}
+}