blob: 58fb6cbe549499eb6dc109206cbfdf9b171c6ec3 [file] [log] [blame]
#!/usr/bin/env python3
# pyright: strict
# Copyright 2024 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Due to the pyelftools dependency, this script requires Python version
# 3.10 or greater to run.
"""A utility to convert ELF files with DWARF info to Dwarf::assemble code.
Usage:
python ./asm_to_dwarf_assembler.py <path/to/elf/file>
Dependencies:
Python >= 3.10
pyelftools >= 0.31
Notes:
- Line tables are not currently supported.
- Non-contiguous subprograms are not currently supported.
- If you want to use $srcfile or similar, you must edit the references to the
file name manually, including DW_AT_name attributes on compile units.
- If run with binaries generated by make check-gdb, it may include an
additional compile_unit before and after the actual compile units. This is
an artifact of the normal compilation process, as these CUs are indeed in
the generated DWARF in some cases.
"""
import errno
import re
import sys
from copy import copy
from dataclasses import dataclass
from datetime import datetime
from functools import cache
from io import BytesIO, IOBase
from logging import getLogger
from typing import Annotated, Optional
from elftools.dwarf.compileunit import CompileUnit as RawCompileUnit
from elftools.dwarf.die import DIE as RawDIE
from elftools.dwarf.die import AttributeValue
from elftools.elf.elffile import ELFFile
# Module-level logger, named after the path of this script.
logger = getLogger(__file__)

# Attribute forms that carry a DWARF expression or block.  Their contents are
# not supported by this script, but they still have to be recognised so the
# form can be replaced with SPECIAL_expr and the placeholder {MANUAL} expr
# list written instead of the value.
EXPR_ATTRIBUTE_FORMS = [
    "DW_FORM_exprloc", "DW_FORM_block",
    "DW_FORM_block1", "DW_FORM_block2", "DW_FORM_block4",
]

# Brace characters kept in named variables so that editors do not get
# confused by seemingly unclosed braces in the code below.
lbrace = "{"
rbrace = "}"
@cache
def get_indent_str(indent_count: int) -> str:
    """Return the whitespace prefix used for indent_count indent levels.

    Each complete pair of levels contributes one tab character; an odd
    leftover level appends the pad string after the tabs.  Results are
    memoized per indent_count.
    """
    tab_count, leftover = divmod(indent_count, 2)
    # NOTE(review): the odd-level pad is a single space as found here; if this
    # file's whitespace was collapsed on the way in, a wider pad may have been
    # intended -- confirm against the upstream copy.
    pad = " " if leftover == 1 else ""
    return tab_count * "\t" + pad
def indent(line: str, indent_count: int) -> str:
    """Return line prefixed with the whitespace for indent_count levels."""
    prefix = get_indent_str(indent_count)
    return prefix + line
def labelify_str(s: str) -> str:
    """Turn s into a string that is usable as a label name.

    Three rewrites are applied in order: every "*" becomes the word "ptr",
    every character outside the regex "word" class becomes "_", and every
    run of two or more "_" is squeezed into a single "_".
    """
    without_stars = s.replace("*", "ptr")
    word_chars_only = re.sub(r"\W", "_", without_stars)
    return re.sub(r"__+", "_", word_chars_only)
class DWARFAttribute:
"""Storage unit for a single DWARF attribute.
All its values are strings that are usually passed on
directly to format. The exceptions to this are attributes
with int values with DW_FORM_ref4 or DW_FORM_ref_addr form.
Their values are interpreted as the global offset of the DIE
being referenced, which are looked up dynamically to fetch
their labels.
"""
def __init__(
self,
die_offset: int,
name: str,
value: str | bytes | int | bool,
form=None,
):
self.die_offset = die_offset
self.name = name
self.value = value
self.form = form
def _format_expr_value(self) -> str:
self.form = "SPECIAL_expr"
return "{ MANUAL: Fill expr list }"
def _needs_escaping(self, str_value: str) -> bool:
charset = set(str_value)
return bool(charset.intersection({"{", "}", " ", "\t"}))
def _format_str(self, str_value: str) -> str:
if self._needs_escaping(str_value):
escaped_str = str(str_value)
# Replace single escape (which is itself escaped because of regex)
# with a double escape (which doesn't mean anything to regex so
# it doesn't need escaping).
escaped_str = re.sub(r"\\", r"\\", escaped_str)
escaped_str = re.sub("([{}])", r"\\\1", escaped_str)
return "{" + escaped_str + "}"
else:
return str_value
def _format_value(
self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
) -> str:
if self.form in EXPR_ATTRIBUTE_FORMS:
return self._format_expr_value()
elif isinstance(self.value, bool):
return str(int(self.value))
elif isinstance(self.value, int):
if self.form == "DW_FORM_ref4":
# ref4-style referencing label.
die = offset_die_lookup[self.value]
return ":$" + die.tcl_label
elif self.form == "DW_FORM_ref_addr":
# ref_addr-style referencing label.
die = offset_die_lookup[self.value]
return "%$" + die.tcl_label
else:
return str(self.value)
elif isinstance(self.value, bytes):
return self._format_str(self.value.decode("ascii"))
elif isinstance(self.value, str):
return self._format_str(self.value)
else:
raise NotImplementedError(f"Unknown data type: {type(self.value)}")
def format(
self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
) -> str:
"""Format the attribute in the form {name value form}.
If form is DW_FORM_exprloc or DW_FORM_block, see next section on
DWARFOperations.
If it isn't, value is formatted as follows:
If bool, use "1" if True, "0" if False.
If int:
If form is DW_FORM_ref4, use ":$label" where label is the
tcl_label of the DWARFDIE at offset "value".
If form is DW_FORM_ref_addr, use "%$label" where label is
the tcl_label of the DWARFDIE at offset "value".
Else, use value directly.
If bytes, use value.decode("ascii")
If str, use value directly.
Any other type results in a NotImplementedError being raised.
Regarding DW_FORM_exprloc and DW_FORM_block:
The form is replaced with SPECIAL_expr.
The entries in the value are interpreted and decoded using the
dwarf_operations dictionary, and replaced with their names where
applicable.
"""
s = lbrace
if isinstance(self.name, int):
s += "DW_AT_" + hex(self.name)
else:
s += self.name
s += " "
s += self._format_value(offset_die_lookup)
# Only explicitly state form if it's not a reference.
if self.form not in [None, "DW_FORM_ref4", "DW_FORM_ref_addr"]:
s += " " + self.form
s += rbrace
return indent(s, indent_count)
class DWARFDIE:
    """This script's parsed version of a RawDIE."""

    def __init__(
        self,
        offset: int,
        tag: str,
        attrs: dict[str, "DWARFAttribute"],
        tcl_label: Optional[str] = None,
    ):
        """Create a DIE without children.

        attrs is shallow-copied, so adding or removing entries in the
        caller's dict later does not affect this DIE.
        """
        self.offset: Annotated[int, "Global offset of the DIE."] = offset
        self.tag: Annotated[str, "DWARF tag for this DIE."] = tag
        self.attrs: Annotated[
            dict[str, "DWARFAttribute"], "Dict of attributes for this DIE."
        ] = copy(attrs)
        self.children: Annotated[
            list["DWARFDIE"], "List of child DIEs of this DIE."
        ] = []
        self.tcl_label: Annotated[
            Optional[str],
            "Label used by the Tcl code to reference this DIE, if any. These "
            'take the form of "label: " before the actual DIE definition.',
        ] = tcl_label

    def format_lines(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> list[str]:
        """Get the list of lines that represent this DIE in Dwarf assembler.

        The first line holds the optional "label: " prefix, the tag and the
        opening brace of the attribute list.  Attributes follow one per
        line, one level deeper.  A second braced section holding the
        formatted children is only written when there are children.
        """
        # Prepend label to first line, if it's set.
        label_prefix = self.tcl_label + ": " if self.tcl_label else ""
        die_lines = [indent(label_prefix + self.tag + " " + lbrace, indent_count)]

        if self.attrs:
            die_lines.extend(
                attr.format(offset_die_lookup, indent_count=indent_count + 1)
                for attr in self.attrs.values()
            )
            die_lines.append(indent(rbrace, indent_count))
        else:
            # Don't create a new line, just close the brace right away on
            # the first line.
            die_lines[-1] += rbrace

        if self.children:
            # Only open a new brace if there are any children for the
            # current DIE.
            die_lines[-1] += " " + lbrace
            for child in self.children:
                die_lines.extend(
                    child.format_lines(
                        offset_die_lookup, indent_count=indent_count + 1
                    )
                )
            die_lines.append(indent(rbrace, indent_count))
        return die_lines

    def format(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> str:
        """Join result from format_lines into a single str."""
        return "\n".join(self.format_lines(offset_die_lookup, indent_count))

    def name(self) -> Optional[str]:
        """Get DW_AT_name as a str, or None if the DIE has no such attribute.

        A bytes value is decoded as ASCII.  Any other value is converted
        with str, so a name that is already a str is returned as is instead
        of failing on a missing decode method.
        """
        name_attr = self.attrs.get("DW_AT_name")
        if name_attr is None:
            return None
        value = name_attr.value
        if isinstance(value, bytes):
            return value.decode("ascii")
        return str(value)

    def type_name(self) -> str:
        """Name of Dwarf tag, with the "DW_TAG_" prefix removed."""
        # removeprefix only touches the start of the tag, which is what the
        # contract above asks for.
        return self.tag.removeprefix("DW_TAG_")
class DWARFCompileUnit(DWARFDIE):
    """DIE subclass for compile units.

    Dwarf::assemble does not take a CU as a bare DIE such as:
        DW_TAG_compile_unit {
            <attributes>
        } {
            <children>
        }
    It expects that DIE to be nested in a call to the cu proc:
        cu { <cu_special_vars> } {
            DW_TAG_compile_unit {
                <attributes>
            } {
                <children>
            }
        }
    This class adds that surrounding call when formatting.
    """

    # Default value for parameter is_64 defined in dwarf.exp line 1553.
    # It is turned into 0/1 when Dwarf::assemble code is emitted.
    default_is_64 = False

    # Default value for parameter dwarf_version defined in dwarf.exp
    # line 1552.
    default_dwarf_version = 4

    # The is_fission parameter (default defined in dwarf.exp line 1556, which
    # would be False here) has no class attribute yet; see __init__.

    # Tag that signifies a DIE is a compile unit.
    compile_unit_tag = "DW_TAG_compile_unit"

    def __init__(
        self,
        raw_die: RawDIE,
        raw_cu: RawCompileUnit,
        attrs: dict[str, DWARFAttribute],
    ):
        """Initialize the DIE part plus the values needed by the cu proc.

        On top of what DWARFDIE stores, the instance keeps:
        - raw_cu: the pyelftools compile unit this DIE came from.
        - dwarf_version: int
            The "version" entry of the CU header, or
            DWARFCompileUnit.default_dwarf_version if the header has none.
        - addr_size: Optional[int]
            The "address_size" entry of the CU header, or None if absent.
        - is_64_bit: bool
            True when raw_cu.dwarf_format() is 64.
        The last three feed the first parameter of the cu proc (which
        contains calls to the compile_unit proc in the body of
        Dwarf::assemble).
        """
        super().__init__(raw_die.offset, DWARFCompileUnit.compile_unit_tag, attrs)
        self.raw_cu = raw_cu
        cu_header = raw_cu.header
        self.dwarf_version: int = cu_header.get(
            "version", DWARFCompileUnit.default_dwarf_version
        )
        self.addr_size: Optional[int] = cu_header.get("address_size")
        self.is_64_bit: bool = raw_cu.dwarf_format() == 64
        # Not implemented: an is_fission flag (it is unknown where to fetch
        # this information from) and a cu_label (it has not been found where
        # pyelftools exposes it).

    def format_lines(
        self,
        offset_die_lookup: dict[int, DWARFDIE],
        indent_count: int = 0,
    ) -> list[str]:
        """Get the lines of the DIE wrapped in the 'cu {} {}' proc call."""
        return [
            self._get_header(indent_count),
            *super().format_lines(offset_die_lookup, indent_count + 1),
            indent(rbrace, indent_count),
        ]

    def _get_header(self, indent_count: int = 0) -> str:
        """Assemble the first line of the surrounding 'cu {} {}' proc call.

        is_64 and version are only listed when they differ from the class
        defaults; addr_size is listed whenever it is known.
        """
        params: list[str] = []
        if self.is_64_bit != DWARFCompileUnit.default_is_64:
            # Written as 1/0 rather than True/False.
            params.extend(("is_64", str(int(self.is_64_bit))))
        if self.dwarf_version != DWARFCompileUnit.default_dwarf_version:
            params.extend(("version", str(self.dwarf_version)))
        if self.addr_size is not None:
            params.extend(("addr_size", str(self.addr_size)))
        # The "fission" and "label" parameters are not emitted, as neither
        # is_fission nor cu_label is implemented (see __init__).
        opening = indent("cu " + lbrace, indent_count)
        return opening + " ".join(params) + rbrace + " " + lbrace
class DWARFParser:
    """Converter from pyelftools's DWARF representation to this script's."""

    def __init__(self, elf_file: IOBase):
        """Init parser with file opened in binary mode.

        The whole file is read into memory here, so the file can be closed
        after this function is called.  All compile units are parsed and
        labels are created for every DIE referenced by another one.
        """
        self.raw_data = BytesIO(elf_file.read())
        self.elf_data = ELFFile(self.raw_data)
        self.dwarf_info = self.elf_data.get_dwarf_info()
        self.offset_to_die: dict[int, "DWARFDIE"] = {}
        self.label_to_die: dict[str, "DWARFDIE"] = {}
        self.referenced_offsets: Annotated[
            set[int], "The set of all offsets that were referenced by some DIE."
        ] = set()
        self.raw_cu_list: list["RawCompileUnit"] = []
        self.top_level_dies: list["DWARFDIE"] = []
        self.subprograms: list["DWARFDIE"] = []
        self.taken_labels: set[str] = set()
        self._read_all_cus()
        self._create_necessary_labels()

    def _read_all_cus(self) -> None:
        """Read every CU in self.dwarf_info, in iteration order."""
        for cu in self.dwarf_info.iter_CUs():
            self._read_cu(cu)

    def _read_cu(self, raw_cu: "RawCompileUnit") -> None:
        """Record raw_cu in self.raw_cu_list and parse its non-null DIEs."""
        self.raw_cu_list.append(raw_cu)
        for raw_die in raw_cu.iter_DIEs():
            if not raw_die.is_null():
                self._parse_die(raw_cu, raw_die)

    def _parse_die(self, die_cu: "RawCompileUnit", raw_die: "RawDIE") -> "DWARFDIE":
        """Process a single DIE and add it to offset_to_die.

        Look for DW_FORM_ref4 and DW_FORM_ref_addr form attributes and
        replace their value with the global offset of the referenced DIE,
        adding that offset to self.referenced_offsets.  This is used later
        to assign and use labels only for DIEs that need them.

        In case the DIE is a top-level DIE, add it to self.top_level_dies;
        otherwise add it to the children of its (already parsed) parent.
        In case the DIE is a subprogram, add it to self.subprograms and call
        self._use_vars_for_low_and_high_pc_attr with it.

        Returns the processed DIE.
        """
        processed_attrs: dict[str, "DWARFAttribute"] = {}
        for attr_name, attr_value in raw_die.attributes.items():
            actual_value = attr_value.value
            if attr_value.form in ("DW_FORM_ref4", "DW_FORM_ref_addr"):
                referenced_die = raw_die.get_DIE_from_attribute(attr_name)
                actual_value = referenced_die.offset
                self.referenced_offsets.add(referenced_die.offset)
            processed_attrs[attr_name] = DWARFAttribute(
                raw_die.offset, attr_name, actual_value, attr_value.form
            )

        if raw_die.tag == DWARFCompileUnit.compile_unit_tag:
            processed_die = DWARFCompileUnit(raw_die, die_cu, processed_attrs)
        else:
            processed_die = DWARFDIE(raw_die.offset, raw_die.tag, processed_attrs, None)

        raw_parent = raw_die.get_parent()
        if raw_parent is None:
            # Top level DIE
            self.top_level_dies.append(processed_die)
        else:
            # Setting the parent here assumes the parent was already
            # processed prior to this DIE being found.
            # As far as I'm aware, this is always true in DWARF.
            self.offset_to_die[raw_parent.offset].children.append(processed_die)

        if processed_die.tag == "DW_TAG_subprogram":
            self.subprograms.append(processed_die)
            self._use_vars_for_low_and_high_pc_attr(processed_die)
        self.offset_to_die[processed_die.offset] = processed_die
        return processed_die

    def _create_necessary_labels(self) -> None:
        """Create labels for the DIEs that were referenced by others.

        Offsets are visited in increasing order, so when several DIEs want
        the same label, the numeric suffixes follow the order of the DIEs in
        the file rather than the iteration order of a set.
        """
        for offset in sorted(self.referenced_offsets):
            self._create_label_for_die(self.offset_to_die[offset])

    def _use_vars_for_low_and_high_pc_attr(self, subprogram: "DWARFDIE") -> None:
        """Replace existing PC attributes with Tcl variables.

        If DW_AT_low_pc exists for this DIE, replace its value with the
        variable given by self.subprogram_start_var(subprogram).
        If DW_AT_high_pc exists for this DIE, replace its value with the
        variable given by self.subprogram_end_var(subprogram).

        A subprogram without a name is left untouched: the variable names
        are built from the name, and would otherwise come out as
        "$None_start" and "$None_end".
        """
        if not subprogram.name():
            return
        var_makers = (
            ("DW_AT_low_pc", self.subprogram_start_var),
            ("DW_AT_high_pc", self.subprogram_end_var),
        )
        for attr_name, make_var in var_makers:
            if attr_name in subprogram.attrs:
                subprogram.attrs[attr_name].value = make_var(subprogram)

    def _create_label_for_die(self, die: "DWARFDIE") -> None:
        """Set tcl_label to a unique string among other DIEs for this parser.

        As a first attempt, use labelify_str(die.name()).  If the DIE does
        not have a name, use labelify_str(die.type_name()).
        If the chosen initial label is already taken, try again appending
        "_2".  While the attempt is still taken, try again replacing it with
        "_3", then "_4", and so on.
        This function also creates an entry on self.label_to_die.  DIEs that
        already have a label are left as they are.
        """
        if die.tcl_label is not None:
            return
        label = labelify_str(die.name() or die.type_name())
        # Deduplicate label in case of collision.
        if label in self.taken_labels:
            suffix_nr = 2
            # The walrus operator keeps the suffixed label spelled out in a
            # single place.
            while (new_label := f"{label}_{suffix_nr}") in self.taken_labels:
                suffix_nr += 1
            label = new_label
        die.tcl_label = label
        self.label_to_die[label] = die
        self.taken_labels.add(label)

    def subprogram_start_var(self, subprogram: "DWARFDIE") -> str:
        """Name of the Tcl variable that holds the low PC for a subprogram."""
        return f"${subprogram.name()}_start"

    def subprogram_end_var(self, subprogram: "DWARFDIE") -> str:
        """Name of the Tcl variable that holds the high PC for a subprogram."""
        return f"${subprogram.name()}_end"

    def all_labels(self) -> set[str]:
        """Get a copy of the set of all labels known to the parser so far."""
        return copy(self.taken_labels)
class DWARFAssemblerGenerator:
"""Class that generates Dwarf::assemble code out of a DWARFParser."""
def __init__(self, dwarf_parser: DWARFParser, output=sys.stdout):
self.dwarf_parser = dwarf_parser
self.output = output
def emit(self, line: str, indent_count: int) -> None:
"""Print a single line indented indent_count times to self.output.
If line is empty, it will always print an empty line, even with nonzero
indent_count.
"""
if line:
line = get_indent_str(indent_count) + line
print(line, file=self.output)
def generate_die(self, die: DWARFDIE, indent_count: int):
"""Generate the lines that represent a DIE."""
die_lines = die.format(self.dwarf_parser.offset_to_die, indent_count)
self.emit(die_lines, 0)
def generate(self):
indent_count = 0
self.emit("Dwarf::assemble $asm_file {", indent_count)
# Begin Dwarf::assemble body.
indent_count += 1
self.emit("global srcdir subdir srcfile", indent_count)
all_labels = self.dwarf_parser.all_labels()
if all_labels:
self.emit("declare_labels " + " ".join(all_labels), indent_count)
self.emit("", 0)
for subprogram in self.dwarf_parser.subprograms:
self.emit(f"get_func_info {subprogram.name()}", indent_count)
for die in self.dwarf_parser.top_level_dies:
self.generate_die(die, indent_count)
# TODO: line table, if it's within scope (it probably isn't).
# End Dwarf::assemble body.
indent_count -= 1
self.emit(rbrace, indent_count)
def main(argv):
    """Convert the ELF file named by argv[1] and print the result.

    Exits with errno.EOPNOTSUPP after printing the usage text if argv has no
    file name, and with errno.ENODATA after printing the error if opening or
    parsing the file raises.
    """
    try:
        elf_path = argv[1]
    except IndexError:
        usage_lines = (
            "Usage:",
            "python ./asm_to_dwarf_assembler.py <path/to/elf/file>",
        )
        for usage_line in usage_lines:
            print(usage_line, file=sys.stderr)
        sys.exit(errno.EOPNOTSUPP)

    try:
        with open(elf_path, "rb") as elf_stream:
            dwarf_parser = DWARFParser(elf_stream)
    except Exception as error:
        message = "Error parsing ELF file. Does it contain DWARF information?"
        print(message, file=sys.stderr)
        print(str(error), file=sys.stderr)
        sys.exit(errno.ENODATA)

    DWARFAssemblerGenerator(dwarf_parser).generate()


# Only run the conversion when executed as a script.
if __name__ == "__main__":
    main(sys.argv)