[bloatalyzer] Refactoring data structure

Group the output by segment, then file, then symbol, so that it is easier to filter and manipulate.

Change-Id: Iebc2bcb13e429f60de4122c8eb9cb3d23c3538ba
diff --git a/bloaty/bloaty.go b/bloaty/bloaty.go
index 77ca2cc..6177af8 100644
--- a/bloaty/bloaty.go
+++ b/bloaty/bloaty.go
@@ -10,27 +10,17 @@
 	"fmt"
 	"os"
 	"os/exec"
+	"sort"
 	"strings"
 	"sync"
 )
 
 type bloatyOutput struct {
-	key  string
-	data Symbol
+	data row
 	file string
 	err  error
 }
 
-// Symbol represents all data about one symbol in the produced Bloaty output.
-type Symbol struct {
-	Name        string         `json:"Name"`
-	File        string         `json:"File"`
-	Segs        map[string]int `json:"Segs"`
-	TotalVmsz   uint64         `json:"TotalVmsz"`
-	TotalFilesz uint64         `json:"TotalFilesz"`
-	Binaries    []string       `json:"Binaries"`
-}
-
 // TODO(jakehehrlich): Add reading ids.txt to elflib, since there are now three
 // different tools that each need to read it for mostly unrelated reasons
 func getFiles(idsPath string) ([]string, error) {
@@ -83,20 +73,10 @@
 	}()
 
 	for r := range rows {
-		data := bloatyOutput{
-			key: r.Symbol + ":" + r.File,
-			data: Symbol{
-				Name:        r.Symbol,
-				File:        r.File,
-				Segs:        make(map[string]int),
-				TotalVmsz:   r.Vmsz,
-				TotalFilesz: r.Filesz,
-				Binaries:    append([]string{}, file),
-			},
+		out <- bloatyOutput{
+			data: r,
 			file: file,
 		}
-		data.data.Segs[r.Seg] += 1
-		out <- data
 	}
 
 	if err := cmd.Wait(); err != nil {
@@ -106,28 +86,108 @@
 
 }
 
-func updateSymbol(sym, newSym Symbol, file string) Symbol {
-	sym.Name = newSym.Name
-	sym.File = newSym.File
-	// TODO: Filtering by section would allow some of these symbols to be ignored
-	// or considered on a more useful global level.
-	for seg, count := range newSym.Segs {
-		sym.Segs[seg] += count
-	}
-	sym.TotalVmsz += newSym.TotalVmsz
-	sym.TotalFilesz += newSym.TotalFilesz
+func updateSymbol(newSym *row, file string, sym *Symbol) {
+	sym.Name = newSym.Symbol
+	sym.Vmsz += newSym.Vmsz
+	sym.Filesz += newSym.Filesz
 	sym.Binaries = append(sym.Binaries, file)
-	return sym
 }
 
-func RunBloaty(bloatyPath, idsPath string) (map[string]Symbol, error) {
+// addRowToOutput merges r (reported for binary file) into output[segment][file][symbol].
+func addRowToOutput(r *row, file string, output map[string]*Segment) {
+	seg, ok := output[r.Seg]
+	if !ok {
+		seg = &Segment{make(map[string]*File)}
+		output[r.Seg] = seg
+	}
+	f, ok := seg.Files[r.File]
+	if !ok {
+		f = &File{Symbols: make(map[string]*Symbol)}
+		seg.Files[r.File] = f
+	}
+	f.TotalFilesz += r.Filesz // keep the file's per-segment total in step with its symbols
+	if _, ok := f.Symbols[r.Symbol]; !ok {
+		f.Symbols[r.Symbol] = &Symbol{}
+	}
+	updateSymbol(r, file, f.Symbols[r.Symbol])
+}
+
+// getTopN trims *output in place: it keeps the topFiles largest files (ranked by
+// fileSizes) and the topSyms largest symbols of each kept file (ranked by Filesz),
+// summing what it drops into "all small files"/"all small syms"; 0 means no limit.
+func getTopN(fileSizes map[string]uint64, topFiles, topSyms uint64, output *map[string]*Segment) {
+	// If both topFiles and topSyms are 0, bail early because we're returning everything.
+	if topFiles == 0 && topSyms == 0 {
+		return
+	}
+	type sortedFile struct {
+		name string
+		size uint64
+	}
+
+	// smallFiles is the set of files that did not make the topFiles cut.
+	smallFiles := make(map[string]bool)
+	if topFiles > 0 && topFiles < uint64(len(fileSizes)) {
+		all := make([]sortedFile, 0, len(fileSizes))
+		for name, size := range fileSizes {
+			all = append(all, sortedFile{name, size})
+		}
+		sort.Slice(all, func(i, j int) bool {
+			return all[i].size < all[j].size
+		})
+		for _, d := range all[:uint64(len(all))-topFiles] {
+			smallFiles[d.name] = true
+		}
+	}
+
+	for _, segData := range *output {
+		smallFilesSize := uint64(0)
+		for file, fileData := range segData.Files {
+			smallSyms := Symbol{Name: "all small syms"}
+			if smallFiles[file] {
+				// Add only this segment's share of the file: fileSizes holds the
+				// file's total over every segment and would be counted in each one.
+				for _, sym := range fileData.Symbols {
+					smallFilesSize += sym.Filesz
+				}
+				delete(segData.Files, file)
+			} else if topSyms > 0 && topSyms < uint64(len(fileData.Symbols)) {
+				var all []*Symbol
+				for _, sym := range fileData.Symbols {
+					all = append(all, sym)
+				}
+				sort.Slice(all, func(i, j int) bool {
+					return all[i].Filesz < all[j].Filesz
+				})
+				for _, d := range all[:uint64(len(all))-topSyms] {
+					if sym, exists := fileData.Symbols[d.Name]; exists {
+						smallSyms.Vmsz += sym.Vmsz
+						smallSyms.Filesz += sym.Filesz
+						delete(fileData.Symbols, d.Name)
+					}
+				}
+			}
+			if topSyms > 0 {
+				fileData.Symbols["all small syms"] = &smallSyms
+			}
+		}
+		if topFiles > 0 {
+			segData.Files["all small files"] = &File{TotalFilesz: smallFilesSize}
+		}
+	}
+}
+
+// RunBloaty runs bloaty on all files in ids.txt, and returns a mapping of the
+// symbols and files by segment.
+func RunBloaty(bloatyPath, idsPath string, topFiles, topSyms uint64) (map[string]*Segment, error) {
 	files, err := getFiles(idsPath)
 	if err != nil {
 		return nil, err
 	}
 
 	var wg sync.WaitGroup
-	output := make(map[string]Symbol)
+	output := make(map[string]*Segment)
+	fileSizes := make(map[string]uint64)
 	data := make(chan bloatyOutput)
 
 	for _, file := range files {
@@ -145,15 +205,13 @@
 
 	for d := range data {
 		if d.err != nil {
-			fmt.Printf("error: %v", d.err)
+			fmt.Printf("%v", d.err)
 			continue
 		}
-		if sym, ok := output[d.key]; !ok {
-			output[d.key] = d.data
-		} else {
-			output[d.key] = updateSymbol(sym, d.data, d.file)
-		}
+		addRowToOutput(&d.data, d.file, output)
+		fileSizes[d.data.File] += d.data.Filesz
 	}
 
+	getTopN(fileSizes, topFiles, topSyms, &output)
 	return output, nil
 }
diff --git a/bloaty/bloaty_test.go b/bloaty/bloaty_test.go
index 0506324..2acf8cd 100644
--- a/bloaty/bloaty_test.go
+++ b/bloaty/bloaty_test.go
@@ -49,3 +49,237 @@
 		}
 	}
 }
+
+// TestGetTopNFiles checks that topFiles=1 keeps only the largest file and folds the rest into "all small files".
+func TestGetTopNFiles(t *testing.T) {
+	input := map[string]*Segment{
+		"LOAD [R]": {
+			Files: map[string]*File{
+				"file.c": {
+					TotalFilesz: 14,
+					Symbols: map[string]*Symbol{
+						"ecm_bind": {
+							Name:     "ecm_bind",
+							Vmsz:     14,
+							Filesz:   14,
+							Binaries: []string{"lib.so"},
+						},
+					},
+				},
+				"different.c": {
+					TotalFilesz: 5,
+					Symbols: map[string]*Symbol{
+						"ecm_bind": {
+							Name:     "ecm_bind",
+							Vmsz:     23,
+							Filesz:   5,
+							Binaries: []string{"lib.so"},
+						},
+					},
+				},
+			},
+		},
+		"LOAD [RX]": {
+			Files: map[string]*File{
+				"other.c": {
+					TotalFilesz: 1,
+					Symbols: map[string]*Symbol{
+						"ecm_bind": {
+							Name:     "ecm_bind",
+							Vmsz:     1,
+							Filesz:   1,
+							Binaries: []string{"lib.so"},
+						},
+					},
+				},
+				"test.c": {
+					TotalFilesz: 4,
+					Symbols: map[string]*Symbol{
+						"test": {
+							Name:     "test",
+							Vmsz:     4,
+							Filesz:   4,
+							Binaries: []string{"lib.so"},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	fileSizes := map[string]uint64{
+		"file.c":      14,
+		"different.c": 5,
+		"other.c":     1,
+		"test.c":      4,
+	}
+	getTopN(fileSizes, 1, 0, &input)
+	if len(input) != 2 {
+		t.Fatalf("In TestGetTopN, len is wrong: \n%+v", input)
+	}
+
+	if _, ok := input["LOAD [R]"]; !ok {
+		t.Fatalf("In TestGetTopN, missing LOAD [R]: \n%+v", input)
+	}
+
+	if len(input["LOAD [R]"].Files) != 2 {
+		t.Fatalf("In TestGetTopN, len is wrong: \n%+v", input["LOAD [R]"].Files)
+	}
+
+	if val, ok := input["LOAD [R]"].Files["file.c"]; !ok {
+		t.Fatalf("In TestGetTopN, missing file.c: \n%+v", input["LOAD [R]"].Files)
+	} else if val.TotalFilesz != 14 {
+		t.Fatalf("In TestGetTopN, filesz is wrong: \n%+v", val)
+	}
+
+	if val, ok := input["LOAD [R]"].Files["all small files"]; !ok {
+		t.Fatalf("In TestGetTopN, missing all small files: \n%+v", input["LOAD [R]"].Files)
+	} else if val.TotalFilesz != 5 {
+		t.Fatalf("In TestGetTopN, filesz is wrong: \n%+v", val)
+	}
+
+	if _, ok := input["LOAD [RX]"]; !ok {
+		t.Fatalf("In TestGetTopN, missing LOAD [RX]: \n%+v", input)
+	}
+
+	// Both LOAD [RX] files are below the cut, so only the placeholder remains.
+	if len(input["LOAD [RX]"].Files) != 1 {
+		t.Fatalf("In TestGetTopN, len is wrong: \n%+v", input["LOAD [RX]"].Files)
+	}
+
+	if val, ok := input["LOAD [RX]"].Files["all small files"]; !ok {
+		t.Fatalf("In TestGetTopN, missing all small files: \n%+v", input["LOAD [RX]"].Files)
+	} else if val.TotalFilesz != 5 {
+		t.Fatalf("In TestGetTopN, filesz is wrong: \n%+v", val)
+	}
+}
+
+// TestGetTopNSymbols checks that topSyms=1 keeps the largest symbol and folds the rest into "all small syms".
+func TestGetTopNSymbols(t *testing.T) {
+	input := map[string]*Segment{
+		"LOAD [R]": {
+			Files: map[string]*File{
+				"file.c": {
+					TotalFilesz: 14,
+					Symbols: map[string]*Symbol{
+						"ecm_bind": {
+							Name:     "ecm_bind",
+							Vmsz:     14,
+							Filesz:   14,
+							Binaries: []string{"lib.so"},
+						},
+						"test": {
+							Name:     "test",
+							Vmsz:     23,
+							Filesz:   5,
+							Binaries: []string{"lib.so"},
+						},
+						"other": {
+							Name:     "other",
+							Vmsz:     5,
+							Filesz:   5,
+							Binaries: []string{"lib.so"},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	fileSizes := map[string]uint64{
+		"file.c":      14,
+		"different.c": 5,
+		"other.c":     1,
+		"test.c":      4,
+	}
+	getTopN(fileSizes, 0, 1, &input)
+	if len(input) != 1 {
+		t.Fatalf("In TestGetTopNSymbols, len is wrong: \n%+v", input)
+	}
+
+	if _, ok := input["LOAD [R]"]; !ok {
+		t.Fatalf("In TestGetTopNSymbols, missing LOAD [R]: \n%+v", input)
+	}
+
+	if len(input["LOAD [R]"].Files) != 1 {
+		t.Fatalf("In TestGetTopNSymbols, len is wrong: \n%+v", input["LOAD [R]"].Files)
+	}
+
+	if val, ok := input["LOAD [R]"].Files["file.c"]; !ok {
+		t.Fatalf("In TestGetTopNSymbols, missing file.c: \n%+v", input["LOAD [R]"].Files)
+	} else if val.TotalFilesz != 14 {
+		t.Fatalf("In TestGetTopNSymbols, filesz is wrong: \n%+v", val)
+	}
+
+	if len(input["LOAD [R]"].Files["file.c"].Symbols) != 2 {
+		t.Fatalf("In TestGetTopNSymbols, len is wrong: \n%+v", input["LOAD [R]"].Files["file.c"].Symbols)
+	}
+
+	if val, ok := input["LOAD [R]"].Files["file.c"].Symbols["ecm_bind"]; !ok {
+		t.Fatalf("In TestGetTopNSymbols, missing ecm_bind: \n%+v", input["LOAD [R]"].Files)
+	} else if val.Filesz != 14 {
+		t.Fatalf("In TestGetTopNSymbols, filesz is wrong: \n%+v", val)
+	}
+
+	if val, ok := input["LOAD [R]"].Files["file.c"].Symbols["all small syms"]; !ok {
+		t.Fatalf("In TestGetTopNSymbols, missing all small syms: \n%+v", input["LOAD [R]"].Files["file.c"].Symbols)
+	} else if val.Filesz != 10 {
+		t.Fatalf("In TestGetTopNSymbols, filesz is wrong: \n%+v", val)
+	}
+}
+
+func TestAddRowToOutput(t *testing.T) {
+	rows := []row{
+		{"ecm_bind", "other.c", "LOAD [RX]", 7, 7},
+		{"test", "other.c", "LOAD [RX]", 12, 2},
+		{"ecm_bind", "other.c", "LOAD [R]", 23, 5},
+		{"ecm_bind", "file.c", "LOAD [R]", 3, 3},
+	}
+
+	actual := make(map[string]*Segment)
+	for _, row := range rows {
+		addRowToOutput(&row, row.File, actual)
+	}
+
+	// {"ecm_bind", "other.c", "LOAD [RX]", 7, 7},
+	if _, ok := actual["LOAD [RX]"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, got \n%+v", actual)
+	}
+	if _, ok := actual["LOAD [RX]"].Files["other.c"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[RX] other.c")
+	}
+	if val, ok := actual["LOAD [RX]"].Files["other.c"].Symbols["ecm_bind"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[RX] other.c ecm_bind")
+	} else if val.Name != "ecm_bind" || val.Vmsz != 7 || val.Filesz != 7 {
+		t.Fatalf("In TestAddRowToOutput, got \n%+v", val)
+	}
+
+	// {"test", "other.c", "LOAD [RX]", 12, 2},
+	if val, ok := actual["LOAD [RX]"].Files["other.c"].Symbols["test"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[RX] other.c test")
+	} else if val.Name != "test" || val.Vmsz != 12 || val.Filesz != 2 {
+		t.Fatalf("In TestAddRowToOutput, got \n%+v", val)
+	}
+
+	// {"ecm_bind", "other.c", "LOAD [R]", 23, 5},
+	if _, ok := actual["LOAD [R]"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[R]")
+	}
+	if _, ok := actual["LOAD [R]"].Files["other.c"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[R]:other.c")
+	}
+	if val, ok := actual["LOAD [R]"].Files["other.c"].Symbols["ecm_bind"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[R] other.c ecm_bind")
+	} else if val.Name != "ecm_bind" || val.Vmsz != 23 || val.Filesz != 5 {
+		t.Fatalf("In TestAddRowToOutput, got \n%+v", val)
+	}
+
+	// {"ecm_bind", "file.c", "LOAD [R]", 3, 3},
+	if _, ok := actual["LOAD [R]"].Files["file.c"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[R] file.c")
+	}
+	if val, ok := actual["LOAD [R]"].Files["file.c"].Symbols["ecm_bind"]; !ok {
+		t.Fatalf("In TestAddRowToOutput, missing LOAD[R] file.c ecm_bind")
+	} else if val.Name != "ecm_bind" || val.Vmsz != 3 || val.Filesz != 3 {
+		t.Fatalf("In TestAddRowToOutput, got \n%+v", val)
+	}
+}
diff --git a/bloaty/symbols.go b/bloaty/symbols.go
index 566a9b9..afb33b7 100644
--- a/bloaty/symbols.go
+++ b/bloaty/symbols.go
@@ -19,6 +19,25 @@
 	Filesz uint64
 }
 
+// Symbol represents data about one symbol in the produced Bloaty output.
+type Symbol struct {
+	Name     string   `json:"Name"`
+	Vmsz     uint64   `json:"Vmsz"`
+	Filesz   uint64   `json:"Filesz"`
+	Binaries []string `json:"Binaries"`
+}
+
+// File represents all data about one file in the produced Bloaty output
+type File struct {
+	Symbols     map[string]*Symbol `json:"Symbols"`
+	TotalFilesz uint64             `json:"TotalFilesz"`
+}
+
+// Segment represents all data about one segment in the produced Bloaty output, keyed by file name.
+type Segment struct {
+	Files map[string]*File `json:"Files"`
+}
+
 func parseRow(rawRow []string) (row, bool, error) {
 	var out row
 	if len(rawRow) != 5 {
diff --git a/cmd/bloatalyzer/main.go b/cmd/bloatalyzer/main.go
index b7d97dd..f17d0f1 100644
--- a/cmd/bloatalyzer/main.go
+++ b/cmd/bloatalyzer/main.go
@@ -19,12 +19,16 @@
 	bloatyPath string
 	idsPath    string
 	output     string
+	topFiles   uint64
+	topSyms    uint64
 )
 
 func init() {
 	flag.StringVar(&bloatyPath, "b", "", "path to bloaty executable")
 	flag.StringVar(&idsPath, "i", "", "path to ids.txt")
 	flag.StringVar(&output, "o", "", "output path")
+	flag.Uint64Var(&topFiles, "top-files", 0, "max number of files to keep")
+	flag.Uint64Var(&topSyms, "top-syms", 0, "max number of symbols to keep per file")
 }
 
 func main() {
@@ -43,7 +47,7 @@
 		logger.Fatalf(ctx, "%s", "must provide path to ids.txt file.")
 	}
 
-	data, err := bloaty.RunBloaty(bloatyPath, idsPath)
+	data, err := bloaty.RunBloaty(bloatyPath, idsPath, topFiles, topSyms)
 	if err != nil {
 		logger.Fatalf(ctx, "%v", err)
 	}