| #!/usr/bin/env python3 |
| # pyright: strict |
| |
| # Copyright 2024 Free Software Foundation, Inc. |
| |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 3 of the License, or |
| # (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| |
| # Due to the pyelftools dependency, this script requires Python version |
| # 3.10 or greater to run. |
| |
| """A utility to convert ELF files with DWARF info to Dwarf::assemble code. |
| |
| Usage: |
| python ./asm_to_dwarf_assembler.py <path/to/elf/file> |
| |
| Dependencies: |
| Python >= 3.10 |
| pyelftools >= 0.31 |
| |
| Notes: |
| - Line tables are not currently supported. |
| - Non-contiguous subprograms are not currently supported. |
| - If you want to use $srcfile or similar, you must edit the references to the |
| file name manually, including DW_AT_name attributes on compile units. |
| - If run with binaries generated by make check-gdb, it may include an |
| additional compile_unit before and after the actual compile units. This is |
| an artifact of the normal compilation process, as these CUs are indeed in |
| the generated DWARF in some cases. |
| """ |
| |
| import errno |
| import re |
| import sys |
| from copy import copy |
| from dataclasses import dataclass |
| from datetime import datetime |
| from functools import cache |
| from io import BytesIO, IOBase |
| from logging import getLogger |
| from typing import Annotated, Optional |
| |
| from elftools.dwarf.compileunit import CompileUnit as RawCompileUnit |
| from elftools.dwarf.die import DIE as RawDIE |
| from elftools.dwarf.die import AttributeValue |
| from elftools.elf.elffile import ELFFile |
| |
# Module-level logger, named after the path of this script.
logger = getLogger(__file__)


# Attribute forms that carry DWARF expressions / raw blocks.
# While decoding these isn't supported, their detection is important: when an
# attribute uses one of these forms, its form is replaced with SPECIAL_expr
# and its value with a "{ MANUAL: Fill expr list }" placeholder that must be
# completed by hand.
EXPR_ATTRIBUTE_FORMS = [
    "DW_FORM_exprloc",
    "DW_FORM_block",
    "DW_FORM_block1",
    "DW_FORM_block2",
    "DW_FORM_block4",
]


# Opening and closing brace characters used when assembling the output.
# Naming them is a workaround for my editor not to freak out over unclosed
# braces inside string literals.
lbrace, rbrace = "{", "}"
| |
| |
@cache
def get_indent_str(indent_count: int) -> str:
    """Get whitespace string to prepend to another for indenting.

    One tab stands for two indentation levels, so a single level is half a
    tab stop: every pair of levels becomes a tab, and a remaining odd level
    is written as four spaces.  This keeps each level the same visual width
    (0, 4, 8, 12, ... columns with 8-column tabs).

    Results are memoized, since the same few levels are requested for
    every emitted line.
    """
    tab_count, odd_level = divmod(indent_count, 2)
    return "\t" * tab_count + "    " * odd_level
| |
| |
def indent(line: str, indent_count: int) -> str:
    """Return line prefixed with the whitespace for indent_count levels."""
    prefix = get_indent_str(indent_count)
    return prefix + line
| |
| |
def labelify_str(s: str) -> str:
    """Turn s into a string that is usable as a label name.

    Every "*" is spelled out as the word "ptr".  After that, each maximal
    run of underscores and non-"word" characters becomes a single "_".
    """
    spelled_out = s.replace("*", "ptr")

    # Replacing each non-word character by "_" and then squeezing repeated
    # underscores is the same as squeezing every run of "_"/non-word
    # characters into one "_", which a single substitution can do.
    return re.sub(r"[\W_]+", "_", spelled_out)
| |
| |
| class DWARFAttribute: |
| """Storage unit for a single DWARF attribute. |
| |
| All its values are strings that are usually passed on |
| directly to format. The exceptions to this are attributes |
| with int values with DW_FORM_ref4 or DW_FORM_ref_addr form. |
| Their values are interpreted as the global offset of the DIE |
| being referenced, which are looked up dynamically to fetch |
| their labels. |
| """ |
| |
| def __init__( |
| self, |
| die_offset: int, |
| name: str, |
| value: str | bytes | int | bool, |
| form=None, |
| ): |
| self.die_offset = die_offset |
| self.name = name |
| self.value = value |
| self.form = form |
| |
| def _format_expr_value(self) -> str: |
| self.form = "SPECIAL_expr" |
| return "{ MANUAL: Fill expr list }" |
| |
| def _needs_escaping(self, str_value: str) -> bool: |
| charset = set(str_value) |
| return bool(charset.intersection({"{", "}", " ", "\t"})) |
| |
| def _format_str(self, str_value: str) -> str: |
| if self._needs_escaping(str_value): |
| escaped_str = str(str_value) |
| # Replace single escape (which is itself escaped because of regex) |
| # with a double escape (which doesn't mean anything to regex so |
| # it doesn't need escaping). |
| escaped_str = re.sub(r"\\", r"\\", escaped_str) |
| escaped_str = re.sub("([{}])", r"\\\1", escaped_str) |
| return "{" + escaped_str + "}" |
| else: |
| return str_value |
| |
| def _format_value( |
| self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0 |
| ) -> str: |
| if self.form in EXPR_ATTRIBUTE_FORMS: |
| return self._format_expr_value() |
| elif isinstance(self.value, bool): |
| return str(int(self.value)) |
| elif isinstance(self.value, int): |
| if self.form == "DW_FORM_ref4": |
| # ref4-style referencing label. |
| die = offset_die_lookup[self.value] |
| return ":$" + die.tcl_label |
| elif self.form == "DW_FORM_ref_addr": |
| # ref_addr-style referencing label. |
| die = offset_die_lookup[self.value] |
| return "%$" + die.tcl_label |
| else: |
| return str(self.value) |
| elif isinstance(self.value, bytes): |
| return self._format_str(self.value.decode("ascii")) |
| elif isinstance(self.value, str): |
| return self._format_str(self.value) |
| else: |
| raise NotImplementedError(f"Unknown data type: {type(self.value)}") |
| |
| def format( |
| self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0 |
| ) -> str: |
| """Format the attribute in the form {name value form}. |
| |
| If form is DW_FORM_exprloc or DW_FORM_block, see next section on |
| DWARFOperations. |
| |
| If it isn't, value is formatted as follows: |
| If bool, use "1" if True, "0" if False. |
| If int: |
| If form is DW_FORM_ref4, use ":$label" where label is the |
| tcl_label of the DWARFDIE at offset "value". |
| If form is DW_FORM_ref_addr, use "%$label" where label is |
| the tcl_label of the DWARFDIE at offset "value". |
| Else, use value directly. |
| If bytes, use value.decode("ascii") |
| If str, use value directly. |
| Any other type results in a NotImplementedError being raised. |
| |
| Regarding DW_FORM_exprloc and DW_FORM_block: |
| The form is replaced with SPECIAL_expr. |
| The entries in the value are interpreted and decoded using the |
| dwarf_operations dictionary, and replaced with their names where |
| applicable. |
| """ |
| s = lbrace |
| if isinstance(self.name, int): |
| s += "DW_AT_" + hex(self.name) |
| else: |
| s += self.name |
| s += " " |
| s += self._format_value(offset_die_lookup) |
| |
| # Only explicitly state form if it's not a reference. |
| if self.form not in [None, "DW_FORM_ref4", "DW_FORM_ref_addr"]: |
| s += " " + self.form |
| |
| s += rbrace |
| return indent(s, indent_count) |
| |
| |
class DWARFDIE:
    """This script's parsed version of a RawDIE."""

    def __init__(
        self,
        offset: int,
        tag: str,
        attrs: dict[str, "DWARFAttribute"],
        tcl_label: Optional[str] = None,
    ):
        """Create a DIE without children.

        attrs is shallow-copied, so adding or removing entries in the
        caller's dict afterwards does not affect this DIE.  Children are
        added later by appending to self.children.
        """
        self.offset: Annotated[int, "Global offset of the DIE."] = offset
        self.tag: Annotated[str, "DWARF tag for this DIE."] = tag
        self.attrs: Annotated[
            dict[str, "DWARFAttribute"], "Dict of attributes for this DIE."
        ] = copy(attrs)
        self.children: Annotated[
            list["DWARFDIE"], "List of child DIEs of this DIE."
        ] = []
        # The label is optional: it stays None until one is assigned.
        self.tcl_label: Annotated[
            Optional[str],
            "Label used by the Tcl code to reference this DIE, if any. These "
            'take the form of "label: " before the actual DIE definition.',
        ] = tcl_label

    def format_lines(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> list[str]:
        """Get the list of lines that represent this DIE in Dwarf assembler.

        offset_die_lookup is handed to the attributes so they can resolve
        references to other DIEs.  Every line is indented by indent_count
        levels, attributes and children by one level more.
        """
        # First line, with the label prepended if it's set.
        label_prefix = (self.tcl_label + ": ") if self.tcl_label else ""
        die_lines: list[str] = [
            indent(label_prefix + self.tag + " " + lbrace, indent_count)
        ]

        # Format attributes, if any.
        if self.attrs:
            die_lines.extend(
                attr.format(offset_die_lookup, indent_count=indent_count + 1)
                for attr in self.attrs.values()
            )
            die_lines.append(indent(rbrace, indent_count))
        else:
            # Don't create a new line, just append and immediately close the
            # brace on the last line.
            die_lines[-1] += rbrace

        # Format children, if any.
        if self.children:
            # Only open a new brace if there are any children for the
            # current DIE.
            die_lines[-1] += " " + lbrace
            for child in self.children:
                die_lines.extend(
                    child.format_lines(
                        offset_die_lookup, indent_count=indent_count + 1
                    )
                )
            die_lines.append(indent(rbrace, indent_count))

        return die_lines

    def format(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> str:
        """Join result from format_lines into a single str."""
        return "\n".join(self.format_lines(offset_die_lookup, indent_count))

    def name(self) -> Optional[str]:
        """Get DW_AT_name as a str, or None if the DIE has no such attribute.

        A bytes value is decoded as ASCII (raising UnicodeDecodeError if
        it is not ASCII).  Any other value is converted with str(), since
        attribute values are not guaranteed to be bytes.
        """
        name_attr = self.attrs.get("DW_AT_name")
        if name_attr is None:
            return None

        value = name_attr.value
        if isinstance(value, bytes):
            return value.decode("ascii")
        return str(value)

    def type_name(self) -> str:
        """Name of Dwarf tag, with the "DW_TAG_" prefix removed."""
        return self.tag.removeprefix("DW_TAG_")
| |
| |
class DWARFCompileUnit(DWARFDIE):
    """DIE subclass for compile units.

    Dwarf::assemble gives CUs a special format, so they cannot be written
    like any other DIE.

    A regular DIE would look like:
        DW_TAG_compile_unit {
            <attributes>
        } {
            <children>
        }

    whereas a CU must be wrapped in a call to the cu proc:
        cu { <cu_special_vars> } {
            DW_TAG_compile_unit {
                <attributes>
            } {
                <children>
            }
        }
    """

    # Default value for parameter is_64 defined in dwarf.exp line 1553.
    # It is turned into 0/1 when the Dwarf::assemble code is emitted.
    default_is_64 = False

    # Default value for parameter dwarf_version defined in dwarf.exp line 1552.
    default_dwarf_version = 4

    # The is_fission parameter (dwarf.exp line 1556) is not handled: it is
    # not known where to fetch that information from.

    # Tag that signifies a DIE is a compile unit.
    compile_unit_tag = "DW_TAG_compile_unit"

    def __init__(
        self,
        raw_die: RawDIE,
        raw_cu: RawCompileUnit,
        attrs: dict[str, DWARFAttribute],
    ):
        """Set up the DIE plus the values needed for the cu proc call.

        On top of what DWARFDIE stores, the instance gets:
        - dwarf_version: int
            The "version" entry of the CU header, or
            DWARFCompileUnit.default_dwarf_version if there is none.
        - addr_size: Optional[int]
            The "address_size" entry of the CU header, or None if there
            is none.
        - is_64_bit: bool
            True when raw_cu.dwarf_format() is 64.

        They configure the first parameter of the cu proc (whose body
        holds the compile_unit call inside Dwarf::assemble).
        """
        super().__init__(raw_die.offset, DWARFCompileUnit.compile_unit_tag, attrs)
        self.raw_cu = raw_cu

        cu_header = raw_cu.header
        self.dwarf_version: int = cu_header.get(
            "version", DWARFCompileUnit.default_dwarf_version
        )
        self.addr_size: Optional[int] = cu_header.get("address_size")
        self.is_64_bit: bool = raw_cu.dwarf_format() == 64

        # Neither fission nor CU labels are implemented: it is not known
        # where pyelftools exposes that information.

    def format_lines(
        self,
        offset_die_lookup: dict[int, DWARFDIE],
        indent_count: int = 0,
    ) -> list[str]:
        """Return the regular DIE lines wrapped in the cu proc call."""
        return [
            self._get_header(indent_count),
            *super().format_lines(offset_die_lookup, indent_count + 1),
            indent(rbrace, indent_count),
        ]

    def _get_header(self, indent_count: int = 0) -> str:
        """Assemble the first line of the surrounding 'cu {} {}' proc call."""
        options: list[str] = []

        # Only options that differ from the proc's defaults are listed.
        if self.is_64_bit != DWARFCompileUnit.default_is_64:
            # int() converts from True/False to 1/0.
            options.extend(("is_64", str(int(self.is_64_bit))))

        if self.dwarf_version != DWARFCompileUnit.default_dwarf_version:
            options.extend(("version", str(self.dwarf_version)))

        if self.addr_size is not None:
            options.extend(("addr_size", str(self.addr_size)))

        # The "fission" and "label" options are not emitted, as neither
        # is implemented.

        # Joining an empty list yields "", leaving just "cu {} {".
        opening = indent("cu " + lbrace, indent_count)
        return opening + " ".join(options) + rbrace + " " + lbrace
| |
| |
class DWARFParser:
    """Converter from pyelftools's DWARF representation to this script's."""

    def __init__(self, elf_file: IOBase):
        """Init parser with file opened in binary mode.

        The file's whole content is read into memory here, so the file
        can be closed after this function is called.  All compile units
        are parsed and the necessary labels are created right away.
        """
        self.raw_data = BytesIO(elf_file.read())
        self.elf_data = ELFFile(self.raw_data)
        self.dwarf_info = self.elf_data.get_dwarf_info()
        self.offset_to_die: dict[int, DWARFDIE] = {}
        self.label_to_die: dict[str, DWARFDIE] = {}
        self.referenced_offsets: Annotated[
            set[int], "The set of all offsets that were referenced by some DIE."
        ] = set()
        self.raw_cu_list: list[RawCompileUnit] = []
        self.top_level_dies: list[DWARFDIE] = []
        self.subprograms: Annotated[
            list[DWARFDIE], "Subprogram DIEs that have a name, in parse order."
        ] = []
        self.taken_labels: set[str] = set()

        self._read_all_cus()
        self._create_necessary_labels()

    def _read_all_cus(self) -> None:
        """Populate self.raw_cu_list with all CUs in self.dwarf_info."""
        for cu in self.dwarf_info.iter_CUs():
            self._read_cu(cu)

    def _read_cu(self, raw_cu: RawCompileUnit) -> None:
        """Read a compile unit into self.raw_cu_list and parse its DIEs."""
        self.raw_cu_list.append(raw_cu)
        for raw_die in raw_cu.iter_DIEs():
            if not raw_die.is_null():
                self._parse_die(raw_cu, raw_die)

    def _parse_die(self, die_cu: RawCompileUnit, raw_die: RawDIE) -> DWARFDIE:
        """Process a single DIE and add it to offset_to_die.

        Look for DW_FORM_ref4 and DW_FORM_ref_addr form attributes and
        replace their values with the global offset of the referenced DIE,
        adding that offset to self.referenced_offsets.  This will be used
        later to assign and use labels only for DIEs that need them.

        In case the DIE is a top-level DIE, add it to self.top_level_dies.
        Otherwise, append it to the children of its parent.

        In case the DIE is a subprogram that has a name, add it to
        self.subprograms and call self._use_vars_for_low_and_high_pc_attr
        with it.  A subprogram without a name is left untouched, because
        the Tcl variable names are derived from the name.
        """
        processed_attrs: dict[str, DWARFAttribute] = {}
        attr_value: AttributeValue
        for attr_name, attr_value in raw_die.attributes.items():
            actual_value = attr_value.value
            if attr_value.form in ("DW_FORM_ref4", "DW_FORM_ref_addr"):
                referenced_die = raw_die.get_DIE_from_attribute(attr_name)
                actual_value = referenced_die.offset
                self.referenced_offsets.add(referenced_die.offset)

            processed_attrs[attr_name] = DWARFAttribute(
                raw_die.offset, attr_name, actual_value, attr_value.form
            )

        if raw_die.tag == DWARFCompileUnit.compile_unit_tag:
            processed_die = DWARFCompileUnit(raw_die, die_cu, processed_attrs)
        else:
            processed_die = DWARFDIE(raw_die.offset, raw_die.tag, processed_attrs, None)

        raw_parent = raw_die.get_parent()
        if raw_parent is None:
            # Top level DIE
            self.top_level_dies.append(processed_die)
        else:
            # Setting the parent here assumes the parent was already processed
            # prior to this DIE being found.
            # As far as I'm aware, this is always true in DWARF.
            processed_parent = self.offset_to_die[raw_parent.offset]
            processed_parent.children.append(processed_die)

        # Without a name there is nothing to build "$<name>_start" or
        # "get_func_info <name>" from; the result would be "$None_start"
        # and "get_func_info None".
        if processed_die.tag == "DW_TAG_subprogram" and processed_die.name():
            self.subprograms.append(processed_die)
            self._use_vars_for_low_and_high_pc_attr(processed_die)

        self.offset_to_die[processed_die.offset] = processed_die
        return processed_die

    def _create_necessary_labels(self) -> None:
        """Create labels for DIEs that were referenced by others.

        Offsets are visited in increasing order, so that when several
        DIEs want the same label, the "_2", "_3", ... suffixes follow the
        order of the DIEs instead of the arbitrary order of a set.
        """
        for offset in sorted(self.referenced_offsets):
            die = self.offset_to_die[offset]
            self._create_label_for_die(die)

    def _use_vars_for_low_and_high_pc_attr(self, subprogram: DWARFDIE) -> None:
        """Replace existing PC attributes with Tcl variables.

        If DW_AT_low_pc exists for this DIE, replace its value with an
        access to the variable whose name is given by
        self.subprogram_start_var(subprogram).

        If DW_AT_high_pc exists for this DIE, replace its value with an
        access to the variable whose name is given by
        self.subprogram_end_var(subprogram).

        Only the values are replaced; the forms are kept.  The subprogram
        is expected to have a name.
        """
        low_pc_attr = subprogram.attrs.get("DW_AT_low_pc")
        if low_pc_attr is not None:
            low_pc_attr.value = self.subprogram_start_var(subprogram)

        high_pc_attr = subprogram.attrs.get("DW_AT_high_pc")
        if high_pc_attr is not None:
            high_pc_attr.value = self.subprogram_end_var(subprogram)

    def _create_label_for_die(self, die: DWARFDIE) -> None:
        """Set tcl_label to a unique string among other DIEs for this parser.

        As a first attempt, use labelify_str(die.name()).  If the DIE does
        not have a name, use labelify_str(die.type_name()).

        If the chosen initial label is already taken, try again appending
        "_2".  While the attempt is still taken, try again replacing it
        with "_3", then "_4", and so on.

        This function also creates an entry on self.label_to_die and
        records the label in self.taken_labels.  It does nothing if the
        DIE already has a label.
        """
        if die.tcl_label is not None:
            return

        label = labelify_str(die.name() or die.type_name())

        # Deduplicate label in case of collision
        if label in self.taken_labels:
            suffix_nr = 2

            # Walrus operator to prevent writing the assembled label
            # f-string twice, which would be frail in case one of them
            # needs to be changed and the other is forgotten.
            while (new_label := f"{label}_{suffix_nr}") in self.taken_labels:
                suffix_nr += 1
            label = new_label

        die.tcl_label = label
        self.label_to_die[label] = die
        self.taken_labels.add(label)

    def subprogram_start_var(self, subprogram: DWARFDIE) -> str:
        """Access to the Tcl variable that holds the low PC for a subprogram.

        The subprogram is expected to have a name.
        """
        return f"${subprogram.name()}_start"

    def subprogram_end_var(self, subprogram: DWARFDIE) -> str:
        """Access to the Tcl variable that holds the high PC for a subprogram.

        The subprogram is expected to have a name.
        """
        return f"${subprogram.name()}_end"

    def all_labels(self) -> set[str]:
        """Get a copy of the set of all labels known to the parser so far."""
        return copy(self.taken_labels)
| |
| |
class DWARFAssemblerGenerator:
    """Class that generates Dwarf::assemble code out of a DWARFParser."""

    def __init__(self, dwarf_parser: DWARFParser, output=sys.stdout):
        """Remember the parser to read from and the stream to write to.

        output is a text stream; everything is written to it with print.
        """
        self.dwarf_parser = dwarf_parser
        self.output = output

    def emit(self, line: str, indent_count: int) -> None:
        """Print a single line indented indent_count times to self.output.

        If line is empty, it will always print an empty line, even with nonzero
        indent_count.
        """
        if line:
            line = get_indent_str(indent_count) + line
        print(line, file=self.output)

    def generate_die(self, die: DWARFDIE, indent_count: int) -> None:
        """Generate the lines that represent a DIE."""
        # The DIE formats and indents all of its lines by itself, so the
        # resulting (possibly multi-line) text is emitted without any
        # additional indentation.
        die_lines = die.format(self.dwarf_parser.offset_to_die, indent_count)
        self.emit(die_lines, 0)

    def generate(self) -> None:
        """Write the whole Dwarf::assemble call to self.output.

        The body consists of the global declaration, a declare_labels
        line if there are any labels, one get_func_info line per named
        subprogram, and finally every top-level DIE.
        """
        indent_count = 0

        self.emit("Dwarf::assemble $asm_file {", indent_count)

        # Begin Dwarf::assemble body.
        indent_count += 1
        self.emit("global srcdir subdir srcfile", indent_count)

        # The labels come as a set of strings, whose iteration order can
        # differ from one run to the next.  Sorting them makes the output
        # reproducible.
        all_labels = sorted(self.dwarf_parser.all_labels())
        if all_labels:
            self.emit("declare_labels " + " ".join(all_labels), indent_count)

        self.emit("", 0)
        for subprogram in self.dwarf_parser.subprograms:
            subprogram_name = subprogram.name()
            if not subprogram_name:
                # There is no function name to look up; emitting the line
                # anyway would produce "get_func_info None".
                continue
            self.emit(f"get_func_info {subprogram_name}", indent_count)

        for die in self.dwarf_parser.top_level_dies:
            self.generate_die(die, indent_count)

        # TODO: line table, if it's within scope (it probably isn't).

        # End Dwarf::assemble body.
        indent_count -= 1
        self.emit(rbrace, indent_count)
| |
| |
def main(argv):
    """Convert the ELF file named by argv[1], printing the result.

    Exits with errno.EOPNOTSUPP after printing the usage if no file name
    was given, and with errno.ENODATA after printing the error if the
    file could not be opened or parsed.
    """
    if len(argv) < 2:
        print(
            "Usage:",
            "python ./asm_to_dwarf_assembler.py <path/to/elf/file>",
            sep="\n",
            file=sys.stderr,
        )
        sys.exit(errno.EOPNOTSUPP)
    elf_path = argv[1]

    try:
        with open(elf_path, "rb") as elf_file:
            parser = DWARFParser(elf_file)
    except Exception as error:
        # Top-level boundary: whatever went wrong is reported to the user
        # and turned into an exit status.
        print(
            "Error parsing ELF file. Does it contain DWARF information?",
            str(error),
            sep="\n",
            file=sys.stderr,
        )
        sys.exit(errno.ENODATA)

    DWARFAssemblerGenerator(parser).generate()


if __name__ == "__main__":
    main(sys.argv)