# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""IR2Vec Triplet Generator

Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
files: entity2id.txt, relation2id.txt, and train2id.txt.

Usage:
    python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
"""

import argparse
import logging
import os
import random
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Set, Tuple

# Configuration
OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
DEFAULT_MAX_WORKERS = 100
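# Workers are threads that spend most of their time waiting on external
# opt / llvm-ir2vec processes, so a default well above the CPU count is reasonable.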

logger = logging.getLogger(__name__)


# TODO: Change this to a dataclass with slots
# when Python 3.10+ is the minimum version
# https://docs.python.org/3/library/dataclasses.html#dataclasses.dataclass
class TripletResult:
    """Result from processing a single LLVM IR file"""

    __slots__ = ["triplets", "max_relation"]

    def __init__(self, triplets: Set[str], max_relation: int):
        self.triplets = triplets
        self.max_relation = max_relation


class IR2VecTripletGenerator:
    """Main class for generating IR2Vec triplets"""

    def __init__(
        self,
        llvm_build_dir: Path,
        num_optimizations: int,
        output_dir: Path,
        max_workers: int = DEFAULT_MAX_WORKERS,
    ):
        self.llvm_build_dir = llvm_build_dir
        self.num_optimizations = num_optimizations
        self.output_dir = output_dir
        self.max_workers = max_workers

        # Tool paths
        self.opt_binary = llvm_build_dir / "bin" / "opt"
        self.ir2vec_binary = llvm_build_dir / "bin" / "llvm-ir2vec"

        self._validate_setup()

        # Create output directory if it doesn't exist
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _validate_setup(self):
        """Validate that all required tools and paths exist"""
        if not self.llvm_build_dir.exists():
            raise FileNotFoundError(
                f"LLVM build directory not found: {self.llvm_build_dir}"
            )

        if not os.path.isfile(self.opt_binary) or not os.access(
            self.opt_binary, os.X_OK
        ):
            raise FileNotFoundError(
                f"opt binary not found or not executable: {self.opt_binary}"
            )

        if not os.path.isfile(self.ir2vec_binary) or not os.access(
            self.ir2vec_binary, os.X_OK
        ):
            raise FileNotFoundError(
                f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
            )

        if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
            raise ValueError(
                f"Number of optimizations must be between 1 and {len(OPT_LEVELS)}, "
                f"got {self.num_optimizations}"
            )

    def _select_optimization_levels(self) -> List[str]:
        """Select unique random optimization levels"""
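        # random.sample draws without replacement, so each selected level is
        # applied at most once per file.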
        return random.sample(OPT_LEVELS, self.num_optimizations)

    def _process_single_file(self, input_file: Path) -> TripletResult:
        """Process a single LLVM IR file with multiple optimization levels"""
        all_triplets = set()
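        # Relation IDs 0 (Type) and 1 (Next) always exist, so the running
        # maximum starts at 1.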
        max_relation = 1
        opt_levels = self._select_optimization_levels()

        for opt_level in opt_levels:
            triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
            if triplets:
                all_triplets.update(triplets)
                max_relation = max(max_relation, file_max_relation)
                logger.debug(
                    f"Generated {len(triplets)} triplets for {input_file} with {opt_level}"
                )

        return TripletResult(all_triplets, max_relation)

    def _run_pipeline(self, input_file: Path, opt_level: str) -> Tuple[Set[str], int]:
        """Run opt | llvm-ir2vec pipeline using subprocess pipes."""
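        # Roughly equivalent to the shell pipeline:
        #   opt -<level> <input.ll> -o - | llvm-ir2vec triplets - -o -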
        try:
            # Run opt first. Its stderr is discarded: it is never read here, and
            # leaving it as an unread pipe could block opt if it produced enough
            # diagnostics to fill the pipe buffer.
            opt_proc = subprocess.Popen(
                [self.opt_binary, f"-{opt_level}", str(input_file), "-o", "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
            )

            # Run llvm-ir2vec with opt's output as input
            ir2vec_proc = subprocess.Popen(
                [self.ir2vec_binary, "triplets", "-", "-o", "-"],
                stdin=opt_proc.stdout,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )

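            # Close the parent's copy of opt's stdout so that opt receives a
            # broken pipe (instead of blocking) if llvm-ir2vec exits early.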
            opt_proc.stdout.close()
            stdout, stderr = ir2vec_proc.communicate()
            opt_proc.wait()

            # Treat a failure of either process as "no triplets" for this
            # optimization level, but leave a trace in the log.
            if opt_proc.returncode != 0 or ir2vec_proc.returncode != 0:
                logger.warning(
                    f"Pipeline failed for {input_file} with -{opt_level} "
                    f"(opt exit {opt_proc.returncode}, llvm-ir2vec exit "
                    f"{ir2vec_proc.returncode}): {stderr.strip()}"
                )
                return set(), 1

            return self._parse_triplet_output(stdout)
        except (subprocess.SubprocessError, OSError) as e:
            logger.warning(f"Pipeline error for {input_file} with -{opt_level}: {e}")
            return set(), 1

    def _parse_triplet_output(self, output: str) -> Tuple[Set[str], int]:
        """Parse triplet output and extract the max relation.

        The output is an optional "MAX_RELATION=<n>" metadata line followed by
        one triplet per line.
        """
        if not output.strip():
            return set(), 1

        lines = output.strip().split("\n")
        max_relation = 1

        # Extract max relation from metadata line
        if lines and lines[0].startswith("MAX_RELATION="):
            max_relation = int(lines[0].split("=")[1])
            lines = lines[1:]

        # Remove duplicate triplets by converting to a set
        return set(lines), max_relation

    def generate_triplets(self, file_list: Path) -> None:
        """Main method to generate triplets from a list of LLVM IR files"""
        input_files = self._read_file_list(file_list)
        logger.info(
            f"Processing {len(input_files)} files with {self.num_optimizations} "
            f"optimization levels using {self.max_workers} workers"
        )

        all_triplets = set()
        global_max_relation = 1
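        # A single set deduplicates triplets across all files and optimization levels.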

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
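            # Map each future back to its input file so that failures can be
            # attributed to a specific file in the log.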
            future_to_file = {
                executor.submit(self._process_single_file, file): file
                for file in input_files
            }

            for future in as_completed(future_to_file):
                try:
                    result = future.result()
                    all_triplets.update(result.triplets)
                    global_max_relation = max(global_max_relation, result.max_relation)
                except (subprocess.SubprocessError, OSError, ValueError) as e:
                    file_path = future_to_file[future]
                    logger.error(f"Error processing {file_path}: {e}")

        self._generate_output_files(all_triplets, global_max_relation)
        logger.info("Processing completed successfully")

    def _read_file_list(self, file_list: Path) -> List[Path]:
        """Read and validate the list of input files"""
        input_files = []
        with open(file_list, "r") as f:
            for line_num, line in enumerate(f, 1):
                if line := line.strip():
                    file_path = Path(line)
                    if file_path.exists():
                        input_files.append(file_path)
                    else:
                        logger.warning(f"File not found (line {line_num}): {file_path}")

        if not input_files:
            raise ValueError("No valid input files found")
        return input_files

    def _generate_output_files(self, all_triplets: Set[str], max_relation: int) -> None:
        """Generate the final output files"""
        logger.info(f"Generating output files with {len(all_triplets)} unique triplets")

        # Write all output files -- train2id.txt, entity2id.txt, relation2id.txt
        train2id_file = self.output_dir / "train2id.txt"
        entity2id_file = self.output_dir / "entity2id.txt"
        relation2id_file = self.output_dir / "relation2id.txt"

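        # train2id.txt: the first line is the triplet count, followed by one
        # triplet per line as emitted by llvm-ir2vec.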
        with open(train2id_file, "w") as f:
            f.write(f"{len(all_triplets)}\n")
            f.writelines(f"{triplet}\n" for triplet in all_triplets)

        self._generate_entity2id(entity2id_file)
        self._generate_relation2id(relation2id_file, max_relation)

    def _generate_entity2id(self, output_file: Path) -> None:
        """Generate entity2id.txt using llvm-ir2vec"""
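        # The entity vocabulary comes from llvm-ir2vec itself, so no input IR
        # is passed here.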
        subprocess.run(
            [str(self.ir2vec_binary), "entities", "-o", str(output_file)],
            check=True,
            capture_output=True,
        )

    def _generate_relation2id(self, output_file: Path, max_relation: int) -> None:
        """Generate relation2id.txt from max relation"""
        max_relation = max(max_relation, 1)  # At least Type and Next relations
        num_relations = max_relation + 1

        with open(output_file, "w") as f:
            f.write(f"{num_relations}\n")
            f.write("Type\t0\n")
            f.write("Next\t1\n")
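            # Relation IDs 2 and up correspond to argument positions:
            # Arg0 -> 2, Arg1 -> 3, and so on.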
            f.writelines(f"Arg{i-2}\t{i}\n" for i in range(2, num_relations))


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(
        description="Generate IR2Vec triplets from LLVM IR files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "llvm_build_dir", type=Path, help="Path to LLVM build directory"
    )
    parser.add_argument(
        "num_optimizations",
        type=int,
        help="Number of optimization levels to apply (1-6)",
    )
    parser.add_argument(
        "ll_file_list",
        type=Path,
        help="File containing list of LLVM IR files to process",
    )
    parser.add_argument(
        "output_dir", type=Path, help="Output directory for generated files"
    )
    parser.add_argument(
        "-j",
        "--max-workers",
        type=int,
        default=DEFAULT_MAX_WORKERS,
        help=f"Maximum number of parallel workers (default: {DEFAULT_MAX_WORKERS})",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable debug logging"
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Suppress all output except errors"
    )

    args = parser.parse_args()

    # Configure logging
    level = (
        logging.ERROR
        if args.quiet
        else (logging.DEBUG if args.verbose else logging.INFO)
    )
    logging.basicConfig(
        level=level,
        format="[%(asctime)s] %(levelname)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    generator = IR2VecTripletGenerator(
        args.llvm_build_dir,
        args.num_optimizations,
        args.output_dir,
        args.max_workers,
    )
    generator.generate_triplets(args.ll_file_list)


if __name__ == "__main__":
    main()