blob: 092ae6972a50985f9248622d697cd762fda6a790 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2017 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from contextlib import contextmanager
from collections import namedtuple
import mmap
import os
import struct
import uuid
# Standard ELF constants.

# Magic bytes compared against the start of a file to recognise ELF.
ELFMAG = '\x7fELF'

# Index of the file-class byte in the file, and the values it is matched
# against (32-bit vs. 64-bit structure layouts).
EI_CLASS = 4
ELFCLASS32 = 1
ELFCLASS64 = 2

# Index of the data-encoding byte in the file, and the values it is matched
# against (little-endian vs. big-endian).
EI_DATA = 5
ELFDATA2LSB = 1
ELFDATA2MSB = 2

# Ehdr.e_machine values for the CPUs this module knows about.
EM_386 = 3
EM_ARM = 40
EM_X86_64 = 62
EM_AARCH64 = 183

# Phdr.p_type values.
PT_LOAD = 1
PT_DYNAMIC = 2
PT_INTERP = 3
PT_NOTE = 4

# Dyn.d_tag values.
DT_NEEDED = 1
DT_STRTAB = 5
DT_SONAME = 14

# Note type which, together with the name 'GNU\0', marks a build ID note.
NT_GNU_BUILD_ID = 3

# Shdr.sh_type value of a symbol table section.
SHT_SYMTAB = 2
class elf_note(namedtuple('elf_note', ['name', 'type', 'desc'])):
    """One ELF note: its name string, integer type and descriptor bytes."""

    def ident(self):
        """Return the (name_string, type_integer) pair identifying the note."""
        name, note_type, _ = self
        return (name, note_type)

    def is_build_id(self):
        """Return True if this is a GNU build ID note."""
        return ('GNU\0', NT_GNU_BUILD_ID) == self.ident()

    def build_id_hex(self):
        """Return the descriptor as lowercase hex digits.

        Returns None when this note is not a build ID note.
        """
        if not self.is_build_id():
            return None
        hex_pairs = ['%02x' % ord(byte) for byte in self.desc]
        return ''.join(hex_pairs)

    def __repr__(self):
        # Show only the descriptor's length; its contents are raw bytes.
        summary = (self.name, self.type, len(self.desc))
        return 'elf_note(%r, %#x, <%d bytes>)' % summary
def gen_elf():
    """Generate the accessor table entries for every supported ELF format.

    Yields ((class_char, data_char), elf) pairs, one for each combination of
    ELFCLASS32/ELFCLASS64 and ELFDATA2LSB/ELFDATA2MSB.  The key characters
    are `chr()` of the corresponding constants.  Each `elf` value is a
    namedtuple with one `elf_accessor` per structure described below
    (e.g. `elf.Ehdr`, `elf.Phdr`).
    """
    # { 'Struct1': (ELFCLASS32 fields, ELFCLASS64 fields),
    #   'Struct2': fields_same_for_both, ... }
    # Format characters are `struct` codes.  They are always combined with
    # an explicit byte-order prefix, so standard sizes apply.
    elf_types = {
        'Ehdr': ([
            ('e_ident', '16s'),
            ('e_type', 'H'),
            ('e_machine', 'H'),
            ('e_version', 'I'),
            ('e_entry', 'I'),
            ('e_phoff', 'I'),
            ('e_shoff', 'I'),
            ('e_flags', 'I'),
            ('e_ehsize', 'H'),
            ('e_phentsize', 'H'),
            ('e_phnum', 'H'),
            ('e_shentsize', 'H'),
            ('e_shnum', 'H'),
            ('e_shstrndx', 'H'),
        ], [
            ('e_ident', '16s'),
            ('e_type', 'H'),
            ('e_machine', 'H'),
            ('e_version', 'I'),
            ('e_entry', 'Q'),
            ('e_phoff', 'Q'),
            ('e_shoff', 'Q'),
            ('e_flags', 'I'),
            ('e_ehsize', 'H'),
            ('e_phentsize', 'H'),
            ('e_phnum', 'H'),
            ('e_shentsize', 'H'),
            ('e_shnum', 'H'),
            ('e_shstrndx', 'H'),
        ]),
        'Phdr': ([
            ('p_type', 'I'),
            ('p_offset', 'I'),
            ('p_vaddr', 'I'),
            ('p_paddr', 'I'),
            ('p_filesz', 'I'),
            ('p_memsz', 'I'),
            ('p_flags', 'I'),
            ('p_align', 'I'),
        ], [
            ('p_type', 'I'),
            ('p_flags', 'I'),
            ('p_offset', 'Q'),
            ('p_vaddr', 'Q'),
            ('p_paddr', 'Q'),
            ('p_filesz', 'Q'),
            ('p_memsz', 'Q'),
            ('p_align', 'Q'),
        ]),
        'Shdr': ([
            ('sh_name', 'L'),
            ('sh_type', 'L'),
            ('sh_flags', 'L'),
            ('sh_addr', 'L'),
            ('sh_offset', 'L'),
            ('sh_size', 'L'),
            ('sh_link', 'L'),
            ('sh_info', 'L'),
            ('sh_addralign', 'L'),
            ('sh_entsize', 'L'),
        ], [
            ('sh_name', 'L'),
            ('sh_type', 'L'),
            ('sh_flags', 'Q'),
            ('sh_addr', 'Q'),
            ('sh_offset', 'Q'),
            ('sh_size', 'Q'),
            ('sh_link', 'L'),
            ('sh_info', 'L'),
            ('sh_addralign', 'Q'),
            ('sh_entsize', 'Q'),
        ]),
        'Dyn': ([
            ('d_tag', 'i'),
            ('d_val', 'I'),
        ], [
            ('d_tag', 'q'),
            ('d_val', 'Q'),
        ]),
        'Nhdr': [
            ('n_namesz', 'I'),
            ('n_descsz', 'I'),
            ('n_type', 'I'),
        ],
        'dwarf2_line_header': [
            ('unit_length', 'L'),
            ('version', 'H'),
            ('header_length', 'L'),
            ('minimum_instruction_length', 'B'),
            ('default_is_stmt', 'B'),
            ('line_base', 'b'),
            ('line_range', 'B'),
            ('opcode_base', 'B'),
        ],
        'dwarf4_line_header': [
            ('unit_length', 'L'),
            ('version', 'H'),
            ('header_length', 'L'),
            ('minimum_instruction_length', 'B'),
            ('maximum_operations_per_instruction', 'B'),
            ('default_is_stmt', 'B'),
            ('line_base', 'b'),
            # line_range is an unsigned byte, exactly as in the DWARF 2
            # layout above; only line_base is signed.
            ('line_range', 'B'),
            ('opcode_base', 'B'),
        ],
    }

    # There is an accessor for each struct, e.g. Ehdr.
    # Ehdr.read is a function like Struct.unpack_from.
    # Ehdr.size is the size of the struct.
    elf_accessor = namedtuple('elf_accessor',
                              ['size', 'read', 'write', 'pack'])

    # All the accessors for a format (class, byte-order) form one elf,
    # e.g. use elf.Ehdr and elf.Phdr.  The same name list drives both the
    # field order of `elf` and the order accessors are generated in.
    struct_names = list(elf_types)
    elf = namedtuple('elf', struct_names)

    def make_accessor(record_type, decoder):
        # Bind record_type/decoder per call so each lambda keeps its own.
        return elf_accessor(
            size=decoder.size,
            read=lambda buffer, offset=0: record_type._make(
                decoder.unpack_from(buffer, offset)),
            write=lambda buffer, offset, x: decoder.pack_into(
                buffer, offset, *x),
            pack=lambda x: decoder.pack(*x))

    def gen_accessors(is64, struct_byte_order):
        for name in struct_names:
            fields = elf_types[name]
            # A tuple holds separate 32-bit and 64-bit layouts; a plain list
            # is shared by both classes.
            if isinstance(fields, tuple):
                fields = fields[1 if is64 else 0]
            record_type = namedtuple(
                name, [field_name for field_name, _ in fields])
            decoder = struct.Struct(
                struct_byte_order + ''.join(fmt for _, fmt in fields))
            yield make_accessor(record_type, decoder)

    for elfclass, is64 in [(ELFCLASS32, False), (ELFCLASS64, True)]:
        for elf_bo, struct_bo in [(ELFDATA2LSB, '<'), (ELFDATA2MSB, '>')]:
            yield ((chr(elfclass), chr(elf_bo)),
                   elf(*gen_accessors(is64, struct_bo)))
# Accessors for every supported ELF format, keyed by the pair of
# single-character strings (class, data encoding) that gen_elf generates.
# e.g. ELF[file[EI_CLASS], file[EI_DATA]].Ehdr.read(file).e_phnum
ELF = dict(gen_elf())
def get_elf_accessor(file):
    """Return the `ELF` table entry matching the format of `file`.

    `file` is the file's contents (a string or an mmap object).  Returns
    None when the contents do not start with the ELF magic, when they are
    too short to hold the class and data bytes, or when those bytes do not
    name a supported format.
    """
    if file[:len(ELFMAG)] != ELFMAG:
        return None
    # If it looks like an ELF file, whip out the decoder ring.  Slices are
    # used rather than indexing so that a truncated file produces an empty
    # string (and thus no match) instead of raising IndexError.
    ident = (file[EI_CLASS:EI_CLASS + 1], file[EI_DATA:EI_DATA + 1])
    return ELF.get(ident)
def gen_phdrs(file, elf, ehdr):
    """Yield each program header of `file`, decoded with `elf.Phdr`.

    Reads `ehdr.e_phnum` consecutive entries of `elf.Phdr.size` bytes,
    starting at file offset `ehdr.e_phoff`.
    """
    phdr_type = elf.Phdr
    for index in range(ehdr.e_phnum):
        yield phdr_type.read(file, ehdr.e_phoff + index * phdr_type.size)
def gen_shdrs(file, elf, ehdr):
    """Yield each section header of `file`, decoded with `elf.Shdr`.

    Reads `ehdr.e_shnum` consecutive entries of `elf.Shdr.size` bytes,
    starting at file offset `ehdr.e_shoff`.
    """
    shdr_type = elf.Shdr
    for index in range(ehdr.e_shnum):
        yield shdr_type.read(file, ehdr.e_shoff + index * shdr_type.size)
# One CPU architecture, named in each scheme this module deals with.
cpu = namedtuple(
    'cpu',
    (
        'e_machine',  # ELF e_machine int
        'llvm',       # LLVM triple CPU component
        'gn',         # GN target_cpu
    ))
# Lookup table from an ELF e_machine value to its `cpu` description.
ELF_MACHINE_TO_CPU = dict(
    (entry.e_machine, entry)
    for entry in (
        cpu(EM_386, 'i386', 'x86'),
        cpu(EM_ARM, 'arm', 'arm'),
        cpu(EM_X86_64, 'x86_64', 'x64'),
        cpu(EM_AARCH64, 'aarch64', 'arm64'),
    ))
@contextmanager
def mmapper(filename):
    """A context manager that yields (fd, file_contents) given a file name.

    `file_contents` is a read-only mmap of the whole file, or the empty
    string when the file is empty.  The mmap and file objects are closed at
    the end of the 'with' statement, so neither `fd` nor the mapping may be
    used after it.  The file is also closed if mapping it fails.
    """
    with open(filename, 'rb') as fileobj:
        fd = fileobj.fileno()
        if os.fstat(fd).st_size == 0:
            # mmap can't handle empty files.
            yield fd, ''
        else:
            mmapobj = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
            try:
                yield fd, mmapobj
            finally:
                mmapobj.close()
def makedirs(dirs):
    """Create directory `dirs` and any missing parents.

    An EEXIST error from os.makedirs is ignored; any other OSError is
    propagated to the caller.
    """
    # `os.errno` is not a documented attribute of `os` (and is missing from
    # newer Python versions), so use the errno module itself.
    import errno
    try:
        os.makedirs(dirs)
    except OSError as e:
        if e.errno != errno.EEXIST:
            # Bare raise keeps the original traceback.
            raise
# elf_info objects are only created by `get_elf_info` or the `copy` or
# `rename` methods.
class elf_info(
        namedtuple('elf_info', [
            'filename',
            'cpu',       # cpu tuple
            'notes',     # list of elf_note: selected notes
            'build_id',  # string: lowercase hex
            'stripped',  # bool: Has no symbols or .debug_* sections
            'interp',    # string or None: PT_INTERP (without \0)
            'soname',    # string or None: DT_SONAME
            'needed',    # collection of strings: DT_NEEDED
        ])):
    """Summary of one ELF file.

    Besides the tuple fields, the creator attaches two instance attributes:
    `elf` (the format accessors) and a `get_sources` callable that replaces
    the placeholder method below.
    """

    def rename(self, filename):
        """Return a copy that refers to the same file by another name."""
        assert os.path.samefile(self.filename, filename)
        # Copy the tuple.
        clone = type(self)(filename, *self[1:])
        # Copy the lazy state.
        clone.elf = self.elf
        if self.get_sources == clone.get_sources:
            raise Exception("uninitialized elf_info object!")
        clone.get_sources = self.get_sources
        return clone

    def copy(self):
        """Return a copy of this object under the same file name."""
        return self.rename(self.filename)

    # This is replaced with a closure by the creator in get_elf_info.
    def get_sources(self):
        raise Exception("uninitialized elf_info object!")

    def strip(self, stripped_filename):
        """Write stripped output to the given file unless it already exists
        with identical contents.  Returns True iff the file was changed."""
        with mmapper(self.filename) as (fd, file):
            accessor = self.elf
            ehdr = accessor.Ehdr.read(file)
            # The stripped file carries no section header table.
            stripped_ehdr = ehdr._replace(e_shoff=0, e_shnum=0, e_shstrndx=0)
            # Keep everything up to the end of the last PT_LOAD segment.
            load_ends = [phdr.p_offset + phdr.p_filesz
                         for phdr in gen_phdrs(file, accessor, ehdr)
                         if phdr.p_type == PT_LOAD]
            stripped_size = max(load_ends)
            phdrs_end = ehdr.e_phoff + (ehdr.e_phnum * ehdr.e_phentsize)
            assert phdrs_end <= stripped_size

            # The output is the rewritten header followed by the rest of
            # the retained part of the original file.
            chunks = (accessor.Ehdr.pack(stripped_ehdr),
                      file[accessor.Ehdr.size:stripped_size])

            if os.path.exists(stripped_filename):
                old_size = os.path.getsize(stripped_filename)
                if old_size == sum(len(chunk) for chunk in chunks):
                    with open(stripped_filename, 'rb') as old_file:
                        if all(old_file.read(len(chunk)) == chunk
                               for chunk in chunks):
                            return False
                os.remove(stripped_filename)

            # Create the new file with the same mode as the original.
            mode = os.fstat(fd).st_mode & 0o777
            new_fd = os.open(stripped_filename,
                             os.O_WRONLY | os.O_CREAT | os.O_EXCL,
                             mode)
            with os.fdopen(new_fd, 'wb') as stripped_file:
                for chunk in chunks:
                    stripped_file.write(chunk)
            return True
class _elf_reader(object):
    """Decoder for the contents of one ELF file (helper for get_elf_info).

    `file` is the file's contents, normally an mmap.  It is only usable
    while that mapping is open, so an instance must not be used after the
    `mmapper` block that produced `file` has exited.
    """

    def __init__(self, filename, file, elf, ehdr, phdrs):
        self.filename = filename  # Only used in diagnostics.
        self.file = file
        self.elf = elf            # Accessors from get_elf_accessor.
        self.ehdr = ehdr          # Decoded ELF header.
        self.phdrs = phdrs        # List of decoded program headers.

    @staticmethod
    def round_up_to(size):
        # Note names and descriptors are padded to 4-byte boundaries.
        return ((size + 3) // 4) * 4

    # Yields an elf_note for each note in any PT_NOTE segment.
    def gen_notes(self):
        data = self.file
        nhdr_type = self.elf.Nhdr
        for phdr in self.phdrs:
            if phdr.p_type != PT_NOTE:
                continue
            pos = phdr.p_offset
            limit = phdr.p_offset + phdr.p_filesz
            while pos < limit:
                nhdr = nhdr_type.read(data, pos)
                pos += nhdr_type.size
                name = data[pos:pos + nhdr.n_namesz]
                pos += self.round_up_to(nhdr.n_namesz)
                desc = data[pos:pos + nhdr.n_descsz]
                pos += self.round_up_to(nhdr.n_descsz)
                yield elf_note(name, nhdr.n_type, desc)

    # Yields (shdr, name) for every section except the first (index 0).
    def gen_sections(self):
        shdrs = list(gen_shdrs(self.file, self.elf, self.ehdr))
        if not shdrs:
            return
        strtab_shdr = shdrs[self.ehdr.e_shstrndx]
        for shdr in shdrs[1:]:
            assert shdr.sh_name < strtab_shdr.sh_size, (
                "%s: invalid sh_name" % self.filename)
            yield (shdr,
                   self.extract_C_string(strtab_shdr.sh_offset + shdr.sh_name))

    # Generates '\0'-terminated strings starting at the given offset,
    # until an empty string.
    def gen_strings(self, start):
        data = self.file
        while True:
            end = data.find('\0', start)
            assert end >= start, (
                "%s: Unterminated string at %#x" % (self.filename, start))
            if start == end:
                break
            yield data[start:end]
            start = end + 1

    # Returns the single '\0'-terminated string at the given offset.
    def extract_C_string(self, start):
        for string in self.gen_strings(start):
            return string
        return ''

    # Returns a string of hex digits (or None).
    def get_build_id(self):
        build_id = None
        for note in self.gen_notes():
            # Note that the last build_id note needs to be used due to TO-442.
            possible_build_id = note.build_id_hex()
            if possible_build_id:
                build_id = possible_build_id
        return build_id

    # Returns a list of elf_note objects.
    def get_matching_notes(self, match_notes):
        if isinstance(match_notes, bool):
            return list(self.gen_notes()) if match_notes else []
        # If not a bool, it's an iterable of ident pairs.
        return [note for note in self.gen_notes()
                if note.ident() in match_notes]

    # Returns a string (without trailing '\0'), or None.
    def get_interp(self):
        # PT_INTERP points directly to a string in the file.
        for phdr in self.phdrs:
            if phdr.p_type != PT_INTERP:
                continue
            interp = self.file[phdr.p_offset:phdr.p_offset + phdr.p_filesz]
            if interp[-1:] == '\0':
                interp = interp[:-1]
            return interp
        return None

    # Returns (soname string or None, set of DT_NEEDED strings).
    def get_soname_and_needed(self):
        data, dyn_type, phdrs = self.file, self.elf.Dyn, self.phdrs
        # PT_DYNAMIC points to the list of ElfNN_Dyn tags.
        for dynamic in phdrs:
            if dynamic.p_type != PT_DYNAMIC:
                continue
            dyn = []
            dyn_offset = 0
            while dyn_offset < dynamic.p_filesz:
                dyn.append(dyn_type.read(data, dynamic.p_offset + dyn_offset))
                dyn_offset += dyn_type.size
            # DT_STRTAB points to the string table's vaddr (.dynstr).
            # Exactly one such entry is required.
            [strtab_vaddr] = [dt.d_val for dt in dyn if dt.d_tag == DT_STRTAB]
            # Find the PT_LOAD containing the vaddr to compute the file offset.
            [strtab_offset] = [
                strtab_vaddr - phdr.p_vaddr + phdr.p_offset
                for phdr in phdrs
                if (phdr.p_type == PT_LOAD and
                    phdr.p_vaddr <= strtab_vaddr and
                    strtab_vaddr - phdr.p_vaddr < phdr.p_filesz)
            ]

            # Each DT_NEEDED or DT_SONAME points to a string in the .dynstr
            # table.  This is lazy so only the first DT_SONAME is decoded.
            def gen_dt_strings(tag):
                return (self.extract_C_string(strtab_offset + dt.d_val)
                        for dt in dyn if dt.d_tag == tag)

            soname = None
            for soname in gen_dt_strings(DT_SONAME):
                break
            return soname, set(gen_dt_strings(DT_NEEDED))
        return None, set()

    # Returns True if there is no symbol table and no .debug_* section.
    def get_stripped(self):
        return all(
            shdr.sh_type != SHT_SYMTAB and not name.startswith('.debug_')
            for shdr, name in self.gen_sections())

    # Returns a cpu tuple, or None for an unknown e_machine.
    def get_cpu(self):
        return ELF_MACHINE_TO_CPU.get(self.ehdr.e_machine)

    # Given the file position of a CU header (starting with the
    # beginning of the .debug_line section), return the position
    # of the include_directories portion and the position of the
    # next CU header.
    def read_line_header(self, pos):
        # Decode DWARF .debug_line per-CU header.
        hdr_type = self.elf.dwarf2_line_header
        hdr = hdr_type.read(self.file, pos)
        assert hdr.unit_length < 0xfffffff0, (
            "%s: 64-bit DWARF" % self.filename)
        assert hdr.version in [2, 3, 4], (
            "%s: DWARF .debug_line version %r" %
            (self.filename, hdr.version))
        if hdr.version == 4:
            # Version 4 has an extra field, so decode again with its layout.
            hdr_type = self.elf.dwarf4_line_header
            hdr = hdr_type.read(self.file, pos)
        return (pos + hdr_type.size + hdr.opcode_base - 1,
                pos + 4 + hdr.unit_length)

    # Decode include_directories portion of DWARF .debug_line format.
    def read_include_dirs(self, pos):
        include_dirs = list(self.gen_strings(pos))
        # Skip each string with its terminator, plus the final empty string.
        pos += sum(len(directory) + 1 for directory in include_dirs) + 1
        return pos, include_dirs

    # Decode file_paths portion of DWARF .debug_line format.
    def gen_file_paths(self, start, limit):
        data = self.file
        while start < limit:
            end = data.find('\0', start, limit)
            assert end >= start, (
                "%s: Unterminated string at %#x" % (self.filename, start))
            if start == end:
                break
            name = data[start:end]
            start = end + 1
            # Decode 3 ULEB128s to advance start, but only use the first.
            for i in range(3):
                value = 0
                bits = 0
                while start < limit:
                    byte = ord(data[start])
                    start += 1
                    value |= (byte & 0x7f) << bits
                    if (byte & 0x80) == 0:
                        break
                    bits += 7
                if i == 0:
                    include_idx = value
            # Ignore the fake file names the compiler leaks into the DWARF.
            if name not in ['<stdin>', '<command-line>']:
                yield name, include_idx

    # Yields the normalized path of each source file named in .debug_line.
    def gen_source_files(self):
        for shdr, section_name in self.gen_sections():
            if section_name != '.debug_line':
                continue
            next_cu = shdr.sh_offset
            while next_cu < shdr.sh_offset + shdr.sh_size:
                pos, next_cu = self.read_line_header(next_cu)
                pos, include_dirs = self.read_include_dirs(pos)
                assert pos <= next_cu
                # 0 means relative to DW_AT_comp_dir, which should be ".".
                # Indices into the actual table start at 1.
                include_dirs.insert(0, '')
                # Decode file_paths and apply include directories.
                for name, i in self.gen_file_paths(pos, next_cu):
                    yield os.path.normpath(
                        os.path.join(include_dirs[i], name))


def get_elf_info(filename, match_notes=False):
    """Return an elf_info describing the named file, or None if not ELF.

    `match_notes` selects what goes into the `notes` field: False for no
    notes, True for every note, or otherwise a container of
    (name, type) ident pairs to match.

    The returned object's `get_sources()` maps the file again on first use
    to decode its .debug_line section, and caches the resulting set of
    source file names.
    """
    # Map in the whole file's contents and use it as a string.
    with mmapper(filename) as (_, file):
        elf = get_elf_accessor(file)
        if elf is None:
            return None
        # ELF header leads to program headers.
        ehdr = elf.Ehdr.read(file)
        assert ehdr.e_phentsize == elf.Phdr.size, (
            "%s: invalid e_phentsize" % filename)
        phdrs = list(gen_phdrs(file, elf, ehdr))
        reader = _elf_reader(filename, file, elf, ehdr, phdrs)
        info = elf_info(filename,
                        reader.get_cpu(),
                        reader.get_matching_notes(match_notes),
                        reader.get_build_id(),
                        reader.get_stripped(),
                        reader.get_interp(),
                        *reader.get_soname_and_needed())

    # This closure becomes the elf_info object's `get_sources` method.
    def lazy_get_sources():
        # The mapping used above was closed when its `with` block exited,
        # so map the file again for the duration of the scan.
        with mmapper(filename) as (_, contents):
            sources_reader = _elf_reader(filename, contents, elf, ehdr, phdrs)
            # Run the generator and cache its results as a set.
            sources_cache = set(sources_reader.gen_source_files())
        # Replace the method to just return the cached set next time.
        info.get_sources = lambda: sources_cache
        return sources_cache

    info.elf = elf
    info.get_sources = lazy_get_sources
    return info
# Module public API: the names exported by `from <module> import *`.
__all__ = ['cpu', 'elf_info', 'elf_note', 'get_elf_accessor', 'get_elf_info']
def test_main_strip(filenames):
    """Manual test: strip each named ELF file and report the size change.

    The stripped copy is written next to the original with an '.ei-strip'
    suffix.  Files that are not ELF are reported and skipped.
    """
    for filename in filenames:
        info = get_elf_info(filename)
        if info is None:
            # get_elf_info returns None for anything that is not ELF.
            print('%s: not an ELF file' % filename)
            continue
        print(info)
        stripped_filename = info.filename + '.ei-strip'
        info.strip(stripped_filename)
        print('\t%s: %u -> %u' % (stripped_filename,
                                  os.stat(filename).st_size,
                                  os.stat(stripped_filename).st_size))
def test_main_get_info(filenames):
    """Manual test: print the elf_info and source files of each named file.

    Files that are not ELF are reported and skipped.
    """
    for filename in filenames:
        info = get_elf_info(filename)
        if info is None:
            # get_elf_info returns None for anything that is not ELF.
            print('%s: not an ELF file' % filename)
            continue
        print(info)
        for source in info.get_sources():
            print('\t' + source)
# For manual testing:
#   <script> FILE...         print information about each file
#   <script> -strip FILE...  write a stripped copy of each file
if __name__ == "__main__":
    import sys
    args = sys.argv[1:]
    # Check for an empty argument list before looking at the first one.
    if args and args[0] == '-strip':
        test_main_strip(args[1:])
    else:
        test_main_get_info(args)