build/assembly/scripts/bootfs_files_by_size.py - fuchsia - Git at Google

 #!/usr/bin/env fuchsia-vendored-python
 # Copyright 2025 The Fuchsia Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Utility to print out the sizes of all bootfs files and identify them

 This script takes a pair of assembly_builder_forensics.json and zbi.json files
 from a given assembled system and uses them to generate CSV output that lists
 all the bootfs files by size (descending), identifying each blob in the bootfs
 packages by their path in the package.  When a blob is present in multiple
 packages, its package name is listed as 'multiple'.

 Some blobs are generated by the assembly process, and as such, aren't listed
 in the assembly_builder_forensics.json file by merkle.  These files are given a
 path of 'assembly_generated' and a package of 'unknown'.

 """

 import argparse
 import json
 import sys
 from dataclasses import dataclass
 from pathlib import Path


 @dataclass
 class BootfsFile:
     bootfs_path: str
     size: int
     package_path: str | None
     packages: list[str]


 def get_zbi_bootfs(path: Path) -> list[dict[str, str | int]]:
     """Get all the bootfs files from zbi.json

     Returns a list of the files in the bootfs, as the 'contents' list from the zbi.json format.
     This is a list of dicts with the following fields:
       - name: the path of the file in bootfs
       - offset: (bytes) the offset within the uncompressed bootfs image where the file starts
       - length: (bytes) the length of the file's data
       - size: (bytes) the space within bootfs that the file takes up.  This will be in multiples of
               the alignment used by bootfs (4KiB).
     """
     with open(path) as zbi_json_file:
         zbi_json = json.load(zbi_json_file)
         for entry in zbi_json:
             if entry["type"] == "BOOTFS":
                 return entry["contents"]
     raise ValueError("Unable to find bootfs contents in zbi.json")


 @dataclass
 class PackagePath:
     """Where a file lives inside one particular package."""

     # Name of the package that contains the file.
     package: str
     # Path of the file within that package.
     path: str


 @dataclass
 class BootfsBlob:
     """Every PackagePath at which one file (identified by its content) was found."""

     # One entry per (package, path) location of the blob.
     paths: list[PackagePath]


 def get_bootfs_package_blobs(path: Path) -> dict[str, list[PackagePath]]:
     """Get blob info for all packages in bootfs

     Args:
       path: location of the assembly_builder_forensics.json file to read.

     Returns:
       A dict keyed by blob merkle.  Each value lists every PackagePath (package name plus the
       path within that package) at which the blob appears, in the order found in the file.
     """
     # JSON is UTF-8 by specification, so don't depend on the locale's default encoding.
     with open(path, encoding="utf-8") as forensics_file:
         forensics_json = json.load(forensics_file)

     blobs: dict[str, list[PackagePath]] = {}
     for package, info in forensics_json["packages"]["inner"].items():
         for blob in info["manifest"]["blobs"]:
             # The same merkle can show up in several packages (or at several paths), so
             # accumulate every location instead of overwriting.
             blobs.setdefault(blob["merkle"], []).append(
                 PackagePath(package, blob["path"])
             )
     return blobs


 def main() -> int:
     """Print a CSV listing of every bootfs file, largest first.

     Reads the files named by the --forensics and --zbi-json arguments, matches each
     "blob/<merkle>" bootfs entry against the package blobs in the forensics file, and writes one
     CSV row per bootfs file to stdout with the columns: path, size, paths, packages.

     Returns:
       0, which the caller uses as the process exit code.
     """
     # Imported here because only main() produces CSV output.
     import csv

     parser = argparse.ArgumentParser(
         description="List the bootfs files of an assembled system by size, as CSV"
     )
     # Both inputs are mandatory: without them there is nothing to read, so let argparse report
     # the missing argument instead of failing later inside open().
     parser.add_argument(
         "--forensics",
         type=Path,
         required=True,
         help="path to an assembly_builder_forensics.json file",
     )
     parser.add_argument(
         "--zbi-json",
         type=Path,
         required=True,
         help="path to the zbi.json file",
     )
     args = parser.parse_args()

     zbi_bootfs_entries = get_zbi_bootfs(args.zbi_json)
     bootfs_package_blobs = get_bootfs_package_blobs(args.forensics)

     blob_prefix = "blob/"
     bootfs_files: list[BootfsFile] = []

     for entry in zbi_bootfs_entries:
         name = str(entry["name"])
         size = int(entry["size"])

         if not name.startswith(blob_prefix):
             # A bare file in bootfs: just list it by path and size.
             bootfs_files.append(BootfsFile(name, size, None, []))
             continue

         # The bootfs file is a package blob, so look it up by merkle.
         merkle = name[len(blob_prefix) :]
         locations = bootfs_package_blobs.get(merkle)
         if not locations:
             # The assembly_builder_forensics.json file doesn't have information about blobs
             # that are created during product assembly, so mark these as "assembly_generated".
             bootfs_files.append(
                 BootfsFile(name, size, "assembly_generated", ["unknown"])
             )
             continue

         # If the blob is at different paths in different packages, report just one (this is
         # fairly rare, so the loss of information is not a big worry).  Take the smallest path
         # rather than an arbitrary set element so the output is identical from run to run.
         package_path = min(location.path for location in locations)
         # De-duplicate the package names so that a blob found at several paths inside a single
         # package is still attributed to that package instead of being reported as 'multiple'.
         packages = sorted({location.package for location in locations})
         bootfs_files.append(BootfsFile(name, size, package_path, packages))

     # Output CSV data about each file, with its package(s) and (one) path for identification
     # if it is a package blob.  csv.writer quotes any field that contains a comma or a quote,
     # which keeps the columns aligned for unusual paths; "\n" matches what print() emitted.
     writer = csv.writer(sys.stdout, lineterminator="\n")
     writer.writerow(["path", "size", "paths", "packages"])
     for file in sorted(bootfs_files, key=lambda x: x.size, reverse=True):
         if file.package_path:
             # If the blob is in multiple packages, then just use 'multiple' instead of listing
             # them all.  Most blobs that are in multiple packages are libs that are more
             # readily identified by their path in the package than by their package name.
             package = (
                 file.packages[0] if len(file.packages) == 1 else "multiple"
             )
             writer.writerow(
                 [file.bootfs_path, file.size, file.package_path, package]
             )
         else:
             writer.writerow([file.bootfs_path, file.size, "", ""])

     return 0


 if __name__ == "__main__":
     # Hand main()'s return value to the interpreter as the process exit status.
     exit_status = main()
     sys.exit(exit_status)
	#!/usr/bin/env fuchsia-vendored-python
	# Copyright 2025 The Fuchsia Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Utility to print out the sizes of all bootfs files and identify them

	This script takes a pair of assembly_builder_forensics.json and zbi.json files
	from a given assembled system and uses them to generate CSV output that lists
	all the bootfs files by size (descending), identifying each blob in the bootfs
	packages by their path in the package. When a blob is present in multiple
	packages, its package name is listed as 'multiple'.

	Some blobs are generated by the assembly process, and as such, aren't listed
	in the assembly_builder_forensics.json file by merkle. These files are given a
	path of 'assembly_generated' and a package of 'unknown'.

	"""

	import argparse
	import json
	import sys
	from dataclasses import dataclass
	from pathlib import Path


	@dataclass
	class BootfsFile:
	bootfs_path: str
	size: int
	package_path: str \| None
	packages: list[str]


	def get_zbi_bootfs(path: Path) -> list[dict[str, str \| int]]:
	"""Get all the bootfs files from zbi.json

	Returns a list of the files in the bootfs, as the 'contents' list from the zbi.json format.
	This is a list of dicts with the following fields:
	- name: the path of the file in bootfs
	- offset: (bytes) the offset within the uncompressed bootfs image where the file starts
	- length: (bytes) the length of the file's data
	- size: (bytes) the space within bootfs that the file takes up. This will be in multiples of
	the alignment used by bootfs (4KiB).
	"""
	with open(path) as zbi_json_file:
	zbi_json = json.load(zbi_json_file)
	for entry in zbi_json:
	if entry["type"] == "BOOTFS":
	return entry["contents"]
	raise ValueError("Unable to find bootfs contents in zbi.json")


	@dataclass
	class PackagePath:
	    """Where a file lives inside one particular package."""

	    # Name of the package that contains the file.
	    package: str
	    # Path of the file within that package.
	    path: str


	@dataclass
	class BootfsBlob:
	    """Every PackagePath at which one file (identified by its content) was found."""

	    # One entry per (package, path) location of the blob.
	    paths: list[PackagePath]


	def get_bootfs_package_blobs(path: Path) -> dict[str, list[PackagePath]]:
	    """Get blob info for all packages in bootfs

	    Args:
	      path: location of the assembly_builder_forensics.json file to read.

	    Returns:
	      A dict keyed by blob merkle. Each value lists every PackagePath (package name plus the
	      path within that package) at which the blob appears, in the order found in the file.
	    """
	    # JSON is UTF-8 by specification, so don't depend on the locale's default encoding.
	    with open(path, encoding="utf-8") as forensics_file:
	        forensics_json = json.load(forensics_file)

	    blobs: dict[str, list[PackagePath]] = {}
	    for package, info in forensics_json["packages"]["inner"].items():
	        for blob in info["manifest"]["blobs"]:
	            # The same merkle can show up in several packages (or at several paths), so
	            # accumulate every location instead of overwriting.
	            blobs.setdefault(blob["merkle"], []).append(
	                PackagePath(package, blob["path"])
	            )
	    return blobs


	def main() -> int:
	    """Print a CSV listing of every bootfs file, largest first.

	    Reads the files named by the --forensics and --zbi-json arguments, matches each
	    "blob/<merkle>" bootfs entry against the package blobs in the forensics file, and writes one
	    CSV row per bootfs file to stdout with the columns: path, size, paths, packages.

	    Returns:
	      0, which the caller uses as the process exit code.
	    """
	    # Imported here because only main() produces CSV output.
	    import csv

	    parser = argparse.ArgumentParser(
	        description="List the bootfs files of an assembled system by size, as CSV"
	    )
	    # Both inputs are mandatory: without them there is nothing to read, so let argparse report
	    # the missing argument instead of failing later inside open().
	    parser.add_argument(
	        "--forensics",
	        type=Path,
	        required=True,
	        help="path to an assembly_builder_forensics.json file",
	    )
	    parser.add_argument(
	        "--zbi-json",
	        type=Path,
	        required=True,
	        help="path to the zbi.json file",
	    )
	    args = parser.parse_args()

	    zbi_bootfs_entries = get_zbi_bootfs(args.zbi_json)
	    bootfs_package_blobs = get_bootfs_package_blobs(args.forensics)

	    blob_prefix = "blob/"
	    bootfs_files: list[BootfsFile] = []

	    for entry in zbi_bootfs_entries:
	        name = str(entry["name"])
	        size = int(entry["size"])

	        if not name.startswith(blob_prefix):
	            # A bare file in bootfs: just list it by path and size.
	            bootfs_files.append(BootfsFile(name, size, None, []))
	            continue

	        # The bootfs file is a package blob, so look it up by merkle.
	        merkle = name[len(blob_prefix) :]
	        locations = bootfs_package_blobs.get(merkle)
	        if not locations:
	            # The assembly_builder_forensics.json file doesn't have information about blobs
	            # that are created during product assembly, so mark these as "assembly_generated".
	            bootfs_files.append(
	                BootfsFile(name, size, "assembly_generated", ["unknown"])
	            )
	            continue

	        # If the blob is at different paths in different packages, report just one (this is
	        # fairly rare, so the loss of information is not a big worry). Take the smallest path
	        # rather than an arbitrary set element so the output is identical from run to run.
	        package_path = min(location.path for location in locations)
	        # De-duplicate the package names so that a blob found at several paths inside a single
	        # package is still attributed to that package instead of being reported as 'multiple'.
	        packages = sorted({location.package for location in locations})
	        bootfs_files.append(BootfsFile(name, size, package_path, packages))

	    # Output CSV data about each file, with its package(s) and (one) path for identification
	    # if it is a package blob. csv.writer quotes any field that contains a comma or a quote,
	    # which keeps the columns aligned for unusual paths; "\n" matches what print() emitted.
	    writer = csv.writer(sys.stdout, lineterminator="\n")
	    writer.writerow(["path", "size", "paths", "packages"])
	    for file in sorted(bootfs_files, key=lambda x: x.size, reverse=True):
	        if file.package_path:
	            # If the blob is in multiple packages, then just use 'multiple' instead of listing
	            # them all. Most blobs that are in multiple packages are libs that are more
	            # readily identified by their path in the package than by their package name.
	            package = (
	                file.packages[0] if len(file.packages) == 1 else "multiple"
	            )
	            writer.writerow(
	                [file.bootfs_path, file.size, file.package_path, package]
	            )
	        else:
	            writer.writerow([file.bootfs_path, file.size, "", ""])

	    return 0


	if __name__ == "__main__":
	    # Hand main()'s return value to the interpreter as the process exit status.
	    sys.exit(main())