diff --git a/pyproject.toml b/pyproject.toml index 4ba5bb4..0e1e648 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,11 @@ requires = ["setuptools>=45", "wheel"] build-backend = "setuptools.build_meta" +[tool.pytest.ini_options] +python_files = ["test*.py"] +testpaths = ["tests"] +addopts = ["-p", "no:cacheprovider"] + [tool.ruff] # Exclude a variety of commonly ignored directories. exclude = [ @@ -75,4 +80,4 @@ skip-magic-trailing-comma = false line-ending = "auto" [tool.ruff.lint.isort] -known-first-party = ["smda"] \ No newline at end of file +known-first-party = ["smda"] diff --git a/smda/Disassembler.py b/smda/Disassembler.py index 91da5d6..c2b6641 100644 --- a/smda/Disassembler.py +++ b/smda/Disassembler.py @@ -80,9 +80,23 @@ def _populateBinaryInfo(self, loader, file_path=""): binary_info.base_addr = loader.getBaseAddress() binary_info.bitness = loader.getBitness() binary_info.architecture = loader.getArchitecture() + binary_info.abi = loader.getAbi() binary_info.code_areas = loader.getCodeAreas() return binary_info + def _ensureHashes(self, binary_info): + if binary_info.sha256 and binary_info.sha1 and binary_info.md5: + return + data = binary_info.raw_data if binary_info.raw_data else binary_info.binary + if data is None: + return + if not binary_info.sha256: + binary_info.sha256 = hashlib.sha256(data).hexdigest() + if not binary_info.sha1: + binary_info.sha1 = hashlib.sha1(data).hexdigest() + if not binary_info.md5: + binary_info.md5 = hashlib.md5(data).hexdigest() + def disassembleFile(self, file_path, pdb_path=""): start = datetime.datetime.now(datetime.timezone.utc) try: @@ -162,6 +176,7 @@ def disassembleBuffer( def _disassemble(self, binary_info, timeout=0): self._start_time = datetime.datetime.now(datetime.timezone.utc) self._timeout = timeout + self._ensureHashes(binary_info) if self.disassembler: self.disassembly = self.disassembler.analyzeBuffer(binary_info, self._callbackAnalysisTimeout) return SmdaReport(self.disassembly, config=self.config) diff --git a/smda/common/BinaryInfo.py b/smda/common/BinaryInfo.py index 7cce0d8..b65a72d 100644 --- a/smda/common/BinaryInfo.py +++ b/smda/common/BinaryInfo.py @@ -1,4 +1,3 @@ -import hashlib import logging import lief @@ -37,10 +36,8 @@ def __init__(self, binary): self.binary = binary self.raw_data = binary self.binary_size = len(binary) - self.sha256 = hashlib.sha256(binary).hexdigest() - self.sha1 = hashlib.sha1(binary).hexdigest() - self.md5 = hashlib.md5(binary).hexdigest() self._lief_binary = None + self.abi = "" def getBinaryData(self): """Safely retrieves binary data from either raw_data or a file path.""" diff --git a/smda/common/SmdaReport.py b/smda/common/SmdaReport.py index 84378d4..c41246b 100644 --- a/smda/common/SmdaReport.py +++ b/smda/common/SmdaReport.py @@ -19,6 +19,7 @@ class SmdaReport: architecture = None + abi = None base_addr = None binary_size = None binweight = None @@ -58,6 +59,7 @@ class SmdaReport: def __init__(self, disassembly=None, config=None, buffer=None): if disassembly is not None: self.architecture = disassembly.binary_info.architecture + self.abi = disassembly.binary_info.abi self.base_addr = disassembly.binary_info.base_addr self.binary_size = disassembly.binary_info.binary_size self.binweight = 0 @@ -223,6 +225,7 @@ def fromFile(cls, file_path): def fromDict(cls, report_dict) -> Optional["SmdaReport"]: smda_report = cls(None) smda_report.architecture = report_dict["architecture"] + smda_report.abi = report_dict.get("abi", "") smda_report.base_addr = report_dict["base_addr"] smda_report.binary_size = report_dict["binary_size"] smda_report.bitness = report_dict["bitness"] @@ -261,6 +264,7 @@ def fromDict(cls, report_dict) -> Optional["SmdaReport"]: smda_report.timestamp = datetime.datetime.strptime(report_dict["timestamp"], "%Y-%m-%dT%H-%M-%S") binary_info = BinaryInfo(b"") binary_info.architecture = smda_report.architecture + binary_info.abi = smda_report.abi binary_info.base_addr = smda_report.base_addr binary_info.binary_size = smda_report.binary_size binary_info.oep = smda_report.oep @@ -287,6 +291,7 @@ def toDict(self) -> dict: transformed_code_sections.append(("", 0, 0)) return { "architecture": self.architecture, + "abi": self.abi, "base_addr": self.base_addr, "binary_size": self.binary_size, "bitness": self.bitness, diff --git a/smda/utility/DelphiKbFileLoader.py b/smda/utility/DelphiKbFileLoader.py index 7569a13..6174aa2 100644 --- a/smda/utility/DelphiKbFileLoader.py +++ b/smda/utility/DelphiKbFileLoader.py @@ -25,3 +25,11 @@ def getBitness(binary): @staticmethod def getCodeAreas(binary): return [] + + @staticmethod + def getArchitecture(binary): + return "intel" + + @staticmethod + def getAbi(binary): + return "" diff --git a/smda/utility/ElfFileLoader.py b/smda/utility/ElfFileLoader.py index 447d94a..48a17f5 100644 --- a/smda/utility/ElfFileLoader.py +++ b/smda/utility/ElfFileLoader.py @@ -198,10 +198,20 @@ def mapBinary(binary): LOGGER.debug("ELF: final mapped size: 0x%x", len(mapped_binary)) return bytes(mapped_binary) + @staticmethod + def getAbi(binary): + abi = "" + try: + elffile = lief.parse(binary) + if elffile: + abi = elffile.header.identity_os_abi.name + except lief.bad_file as exc: + LOGGER.warning("Failed to determine ELF ABI: %s", exc) + return abi + @staticmethod def getArchitecture(binary): architecture = "intel" - # TODO 20250205 determine ABI based on this: https://lief.re/doc/latest/formats/elf/python.html#header return architecture @staticmethod diff --git a/smda/utility/FileLoader.py b/smda/utility/FileLoader.py index 6794606..47e08b9 100644 --- a/smda/utility/FileLoader.py +++ b/smda/utility/FileLoader.py @@ -13,6 +13,7 @@ class FileLoader: _raw_data = b"" _base_addr = 0 _bitness = 0 + _abi = "" _architecture = "" _code_areas = [] file_loaders = [PeFileLoader, ElfFileLoader, MachoFileLoader, DelphiKbFileLoader] @@ -40,6 +41,7 @@ def _loadFile(self, buffer=None): self._bitness = loader.getBitness(self._raw_data) self._code_areas = loader.getCodeAreas(self._raw_data) self._architecture = loader.getArchitecture(self._raw_data) + self._abi = loader.getAbi(self._raw_data) break else: self._data = self._raw_data @@ -53,6 +55,9 @@ def getRawData(self): def getBaseAddress(self): return self._base_addr + def getAbi(self): + return self._abi + def getArchitecture(self): return self._architecture diff --git a/smda/utility/MachoFileLoader.py b/smda/utility/MachoFileLoader.py index 69c3ec2..404e0e2 100644 --- a/smda/utility/MachoFileLoader.py +++ b/smda/utility/MachoFileLoader.py @@ -158,6 +158,10 @@ def mapBinary(binary): LOGGER.debug("MachO: final mapped size: 0x%x", len(mapped_binary)) return bytes(mapped_binary) + @staticmethod + def getAbi(binary): + return "" + @staticmethod def getArchitecture(binary): # TODO add machine types whenever we add more architectures diff --git a/smda/utility/PeFileLoader.py b/smda/utility/PeFileLoader.py index f390b86..a64ca8e 100644 --- a/smda/utility/PeFileLoader.py +++ b/smda/utility/PeFileLoader.py @@ -127,6 +127,10 @@ def getOEP(binary): oep_rva = struct.unpack("I", binary[pe_offset + 0x28 : pe_offset + 0x2C])[0] return oep_rva + @staticmethod + def getAbi(binary): + return "" + @staticmethod def getArchitecture(binary): architecture = "intel" diff --git a/tests/testFileFormatParsers.py b/tests/testFileFormatParsers.py index 804aa98..7e37f2a 100644 --- a/tests/testFileFormatParsers.py +++ b/tests/testFileFormatParsers.py @@ -76,6 +76,7 @@ def testElfParsingWithBashlite(self): binary_info.file_path = "" binary_info.base_addr = loader.getBaseAddress() binary_info.bitness = loader.getBitness() + binary_info.abi = loader.getAbi() binary_info.code_areas = loader.getCodeAreas() binary_info.oep = binary_info.getOep() controlled_disassembly = disasm._disassemble(binary_info) @@ -83,6 +84,7 @@ def testElfParsingWithBashlite(self): bashlite_unmapped_disassembly = disasm.disassembleUnmappedBuffer(bashlite_binary) assert bashlite_unmapped_disassembly.num_functions == 177 assert len([f.function_name for f in bashlite_unmapped_disassembly.getFunctions() if f.function_name]) == 174 + assert binary_info.abi == "SYSTEMV" # test section extraction sections = {name: (start, end) for name, start, end in binary_info.getSections()} assert len(sections) > 0 diff --git a/tests/testIntegration.py b/tests/testIntegration.py index 29c7e60..fbddc28 100644 --- a/tests/testIntegration.py +++ b/tests/testIntegration.py @@ -115,7 +115,7 @@ def testCutwailMarshalling(self): assert report_as_dict["status"] == "ok" assert report_as_dict["base_addr"] == 0x4000000 assert report_as_dict["statistics"]["num_instructions"] == 1611 - assert report_as_dict["sha256"] == "a348a0ddfab135d152b684d561a3215ab6c472570facd3d75aa2c7ee845a8e2b" + assert report_as_dict["sha256"] == "46686681e2be012ce26219eec1e765f8f2db9fc7a33ca802482050cef189334f" # compare our manual file loading with unmapped buffer assert self.cutwail_disassembly.num_instructions == self.cutwail_unmapped_disassembly.num_instructions SmdaReport.fromDict(report_as_dict)