Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
638d032
Refactor FunctionAnalysisState for Dalvik
r0ny123 Apr 8, 2026
ebfe007
Update smda/dalvik/DalvikDisassembler.py
r0ny123 Apr 8, 2026
39e92dd
Update smda/utility/DexFileLoader.py
r0ny123 Apr 8, 2026
c4e2b9f
fix(dalvik): address code review - remove invalid header bounds check…
r0ny123 Apr 8, 2026
ef65e93
feat(dalvik): implement complete Dalvik opcode table (0x00-0xFF)
r0ny123 Apr 8, 2026
d09fc1d
refactor(dalvik): address 6 code review comments for robust disassembly
r0ny123 Apr 8, 2026
ab05959
fix(dalvik): revert LIEF DEX parse buffer type from bytes back to list
r0ny123 Apr 8, 2026
6220b40
fix(dalvik): fix insns_size extraction and core compat
r0ny123 Apr 8, 2026
dab84e9
test(dalvik): add integration tests and raw buffer fixes
r0ny123 Apr 8, 2026
2ab6055
Apply suggestions from code review
r0ny123 Apr 8, 2026
d42f105
fix(dalvik): simplify in-memory LIEF buffer parsing to always use list()
r0ny123 Apr 9, 2026
ef7c282
feat(dalvik): harden DEX decoding and analysis coverage
r0ny123 Apr 10, 2026
4b16f8f
fix(dalvik): stabilize CFG recovery and add verbose diagnostics
r0ny123 Apr 12, 2026
c86acd0
feat(dalvik): correctness, hardening, and format coverage improvements
r0ny123 Apr 13, 2026
db82d2a
style: apply ruff format to fix CI formatting check
r0ny123 Apr 13, 2026
35e8469
fix(dalvik): harden CFG validation and docs
r0ny123 Apr 13, 2026
d8da253
fix(dalvik): clean up raw dex cli output
r0ny123 Apr 13, 2026
dd43bf7
Update smda/common/SmdaInstruction.py
r0ny123 Apr 13, 2026
53e9a08
fix(dalvik): improve cli diagnostics
r0ny123 Apr 14, 2026
e73fa8f
Refactor: Address PR review comments regarding imports
r0ny123 Apr 15, 2026
fdba45d
Fix: Lint issues identified by ruff
r0ny123 Apr 15, 2026
fdc1de2
Style: Reformat smda/Disassembler.py with ruff
r0ny123 Apr 15, 2026
ee66eff
Update smda/Disassembler.py
r0ny123 Apr 15, 2026
7f65c5b
Refactor: Address review feedback on Dalvik PR
r0ny123 May 7, 2026
47d70a5
Refactor: Rename DexReferenceResolver methods, fix autodetect backend
r0ny123 May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# SMDA

SMDA is a minimalist recursive disassembler library that is optimized for accurate Control Flow Graph (CFG) recovery from memory dumps.
It is based on [Capstone](http://www.capstone-engine.org/) and currently supports x86/x64 Intel machine code.
As input, arbitrary memory dumps (ideally with known base address) can be processed.
It is based on [Capstone](http://www.capstone-engine.org/) and currently supports x86/x64 Intel machine code, experimental CIL (.NET) disassembly, and Dalvik bytecode from raw DEX files.
As input, arbitrary memory dumps (ideally with known base address) can be processed, and raw DEX files can be analyzed directly.
The output is a collection of functions, basic blocks, and instructions with their respective edges between blocks and functions (in/out).
Optionally, references to the Windows API can be inferred by using the ApiScout method.

Expand Down Expand Up @@ -48,6 +48,8 @@ There is also a demo script:

* analyze.py -- example usage: perform disassembly on a file or memory dump and optionally store results in JSON to a given output path.

For Dalvik, the current scope is raw single-DEX inputs. APK, multi-dex container handling, and ODEX/VDEX/CDEX runtime-artifact analysis are not yet first-class workflows in SMDA.

The code should be fully compatible with Python 3.8+.
Further explanations of the inner workings will follow in separate publications and will be referenced here.

Expand Down
94 changes: 77 additions & 17 deletions analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import os
import re
import sys
import textwrap

from smda.Disassembler import Disassembler
from smda.SmdaConfig import SmdaConfig
from smda.utility.DexFileLoader import DexFileLoader


def parseBaseAddrFromArgs(args):
def parseBaseAddrFromArgs(args, silent=False):
if args.base_addr:
parsed_base_addr = int(args.base_addr, 16) if args.base_addr.startswith("0x") else int(args.base_addr)
logging.info("using provided base address: 0x%08x", parsed_base_addr)
Expand All @@ -18,24 +20,66 @@ def parseBaseAddrFromArgs(args):
baddr_match = re.search(re.compile("_0x(?P<base_addr>[0-9a-fA-F]{8,16})"), args.input_path)
if baddr_match:
parsed_base_addr = int(baddr_match.group("base_addr"), 16)
logging.info(
"Parsed base address from file name: 0x%08x",
parsed_base_addr,
)
logging.info("Parsed base address from file name: 0x%08x", parsed_base_addr)
return parsed_base_addr
logging.warning("No base address recognized, using 0.")
if not silent:
logging.warning("No base address recognized, using 0.")
return 0


def parseOepFromArgs(args):
def parseOepFromArgs(args, silent=False):
if args.oep and args.oep != "":
parsed_oep = int(args.oep, 16) if args.oep.startswith("0x") else int(args.oep)
logging.info("using provided OEP(RVA): 0x%08x", parsed_oep)
return parsed_oep
logging.warning("No OEP recognized, skipping.")
if not silent:
logging.warning("No OEP recognized, skipping.")
return None


def _printDalvikSummary(report, output_path, input_filename):
    """Print a compact overview of a finished Dalvik/DEX analysis to stdout.

    Args:
        report: analysis report object. This function reads ``binary_size``,
            ``version``, ``bitness``, ``architecture``, ``execution_time`` and
            ``statistics`` from it and iterates over ``getFunctions()``.
        output_path: directory the caller stores the report in. The "Saved"
            line is only printed when this names an existing directory; the
            file itself is not written here.
        input_filename: name shown in the header and used to build the
            ``.smda`` file name of the "Saved" line.
    """
    total_bytes = report.binary_size or 0
    # One MiB and above is reported in MB, everything smaller in KB.
    if total_bytes >= 1024 * 1024:
        size_str = f"{total_bytes / 1024 / 1024:.1f} MB"
    else:
        size_str = f"{total_bytes / 1024:.1f} KB"

    dex_version = report.version or "?"
    bitness_str = ""
    if report.bitness:
        bitness_str = f".{report.bitness}bit"
    stats = report.statistics

    # Walk all functions once, tallying heuristic tags and string references.
    tag_totals = {}
    string_ref_total = 0
    for function in report.getFunctions():
        metadata = function.architecture_metadata or {}
        for tag in metadata.get("heuristics", []):
            if tag in tag_totals:
                tag_totals[tag] += 1
            else:
                tag_totals[tag] = 1
        string_ref_total += len(function.stringrefs or {})

    print(f"[*] File: {input_filename} ({size_str})")
    print(f"[*] Architecture: {report.architecture}{bitness_str}")
    print(f"[*] Format: Dalvik DEX v{dex_version}")
    print(f"[*] Time: {report.execution_time:.3f}s")
    print(f"[*] Functions: {stats.num_functions:,}")
    print(f"[*] CFG: {stats.num_basic_blocks:,} blocks / {stats.num_instructions:,} instructions")
    print(f"[*] Refs: api={stats.num_api_calls:,} strings={string_ref_total:,}")

    if tag_totals:
        # Most frequent tag first; the stable sort keeps first-seen order for ties.
        ranked = sorted(tag_totals.items(), key=lambda item: item[1], reverse=True)
        joined = " ".join(f"{name}={count}" for name, count in ranked)
        wrapped = textwrap.wrap(joined, width=52, break_long_words=False, break_on_hyphens=False)
        if not wrapped:
            wrapped = [""]
        first_line, *continuation = wrapped
        print(f"[!] Heuristics: {first_line}")
        for extra_line in continuation:
            print(f" {extra_line}")

    if output_path and os.path.isdir(output_path):
        saved_file = os.path.join(output_path, input_filename + ".smda")
        print(f"[+] Saved: {saved_file}")


def _getInteractiveStream(stream):
    """Return *stream*, switched to ``backslashreplace`` error handling when possible.

    Objects exposing a ``reconfigure`` attribute are reconfigured in place;
    anything else is handed back untouched.
    """
    if not hasattr(stream, "reconfigure"):
        return stream
    stream.reconfigure(errors="backslashreplace")
    return stream


def readFileContent(file_path):
file_content = b""
with open(file_path, "rb") as fin:
Expand Down Expand Up @@ -66,7 +110,7 @@ def readFileContent(file_path):
"--architecture",
type=str,
default="",
help="Use the disassembler for the following architecture if available (default:auto, options: [intel, cil]).",
help="Use the disassembler for the following architecture if available (default:auto, options: [intel, cil, dalvik]).",
)
PARSER.add_argument(
"-a",
Expand Down Expand Up @@ -120,11 +164,18 @@ def readFileContent(file_path):

# optionally create and set up a config, e.g. when using ApiScout profiles for WinAPI import usage discovery
config = SmdaConfig()
if ARGS.verbose:
config.LOG_LEVEL = logging.DEBUG
if ARGS.strings:
config.WITH_STRINGS = True
logging.basicConfig(level=config.LOG_LEVEL, format=config.LOG_FORMAT)
if ARGS.verbose:
config.LOG_LEVEL = logging.DEBUG
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s.%(msecs)03d %(levelname).1s %(name)s: %(message)s",
datefmt="%H:%M:%S",
stream=_getInteractiveStream(sys.stdout),
)
else:
logging.basicConfig(level=config.LOG_LEVEL, format=config.LOG_FORMAT)
SMDA_REPORT = None
INPUT_FILENAME = ""
BITNESS = ARGS.bitness if (ARGS.bitness in [32, 64]) else None
Expand All @@ -136,15 +187,24 @@ def readFileContent(file_path):
SMDA_REPORT = DISASSEMBLER.disassembleFile(ARGS.input_path, pdb_path=ARGS.pdb_path)
else:
BUFFER = readFileContent(ARGS.input_path)
BASE_ADDR = parseBaseAddrFromArgs(ARGS)
OEP = parseOepFromArgs(ARGS)
treat_as_dalvik = ARGS.architecture in {"", "dalvik"} and DexFileLoader.isCompatible(BUFFER)
if treat_as_dalvik:
BASE_ADDR = DexFileLoader.getBaseAddress(BUFFER)
OEP = None
else:
BASE_ADDR = parseBaseAddrFromArgs(ARGS)
OEP = parseOepFromArgs(ARGS)
config.API_COLLECTION_FILES = {
"win_7": os.sep.join([config.PROJECT_ROOT, "data", "apiscout_win7_prof-n_sp1.json"])
}
DISASSEMBLER = Disassembler(config, backend=ARGS.architecture)
SMDA_REPORT = DISASSEMBLER.disassembleBuffer(BUFFER, BASE_ADDR, BITNESS, oep=OEP)
SMDA_REPORT.filename = os.path.basename(ARGS.input_path)
print(SMDA_REPORT)
if SMDA_REPORT and os.path.isdir(ARGS.output_path):
with open(ARGS.output_path + os.sep + INPUT_FILENAME + ".smda", "w") as fout:
if SMDA_REPORT.architecture == "dalvik":
_printDalvikSummary(SMDA_REPORT, ARGS.output_path, INPUT_FILENAME)
else:
print(SMDA_REPORT)
if SMDA_REPORT and ARGS.output_path and os.path.isdir(ARGS.output_path):
output_file = os.path.join(ARGS.output_path, INPUT_FILENAME + ".smda")
with open(output_file, "w") as fout:
json.dump(SMDA_REPORT.toDict(), fout, indent=1, sort_keys=True)
43 changes: 41 additions & 2 deletions smda/Disassembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from smda.common.BinaryInfo import BinaryInfo
from smda.common.labelprovider.GoLabelProvider import GoSymbolProvider
from smda.common.SmdaReport import SmdaReport
from smda.dalvik.DalvikDisassembler import DalvikDisassembler
from smda.ida.IdaExporter import IdaExporter
from smda.intel.IntelDisassembler import IntelDisassembler
from smda.SmdaConfig import SmdaConfig
from smda.utility.DexFileLoader import DexFileLoader
from smda.utility.FileLoader import FileLoader
from smda.utility.MemoryFileLoader import MemoryFileLoader
from smda.utility.StringExtractor import extract_strings
Expand All @@ -27,10 +29,13 @@ def __init__(self, config=None, backend=None):
self.disassembler = IntelDisassembler(self.config)
elif backend == "cil":
self.disassembler = CilDisassembler(self.config)
elif backend == "dalvik":
self.disassembler = DalvikDisassembler(self.config)
elif backend == "IDA":
self.disassembler = IdaExporter(self.config)
self._start_time = None
self._timeout = 0
self._last_timeout_log_second = -1
# cache the last DisassemblyResult
self.disassembly = None

Expand All @@ -41,6 +46,8 @@ def initDisassembler(self, architecture="intel"):
self.disassembler = IntelDisassembler(self.config)
elif architecture == "cil":
self.disassembler = CilDisassembler(self.config)
elif architecture == "dalvik":
self.disassembler = DalvikDisassembler(self.config)

def _getDurationInSeconds(self, start_ts, end_ts):
return (end_ts - start_ts).seconds + ((end_ts - start_ts).microseconds / 1000000.0)
Expand All @@ -49,12 +56,33 @@ def _callbackAnalysisTimeout(self):
if not self._timeout:
return False
time_diff = datetime.datetime.now(datetime.timezone.utc) - self._start_time
LOGGER.debug("Current analysis callback time %s", (time_diff))
return time_diff.seconds >= self._timeout
elapsed_seconds = int(time_diff.total_seconds())
if elapsed_seconds >= self._timeout:
LOGGER.debug("Current analysis callback time %s", time_diff)
return True
# Log on 30s bucket transitions (not exact-second boundaries) so the message
# still fires when callback timing skips past 30/60/... whole seconds.
current_bucket = elapsed_seconds // 30
if current_bucket >= 1 and current_bucket != self._last_timeout_log_second:
self._last_timeout_log_second = current_bucket
LOGGER.debug("Current analysis callback time %s", time_diff)
return False

def _addStringsToReport(self, smda_report, buffer, mode=None):
smda_report.buffer = buffer
for smda_function in smda_report.getFunctions():
if smda_report.architecture == "dalvik":
if smda_function.stringrefs and isinstance(smda_function.stringrefs, dict):
smda_function.stringrefs = [
{
"string": string_value,
"ins_addr": referencing_addr,
"data_addr": None,
"type": "dex",
}
for referencing_addr, string_value in sorted(smda_function.stringrefs.items())
]
continue
function_strings = []
for string_result in extract_strings(smda_function, mode=mode):
string, referencing_addr, string_addr, string_type = string_result
Expand Down Expand Up @@ -150,6 +178,16 @@ def disassembleBuffer(
Disassemble a given buffer (file_content), with given base_addr.
Optionally specify bitness, the areas to which disassembly should be limited to (code_areas) and an entry point (oep)
"""
# Auto-detect DEX when the caller did not explicitly override architecture.
# disassembleUnmappedBuffer / disassembleFile already use FileLoader for detection;
# this path bypasses it, so we check the magic bytes manually here.
if architecture == "intel" and DexFileLoader.isCompatible(file_content):
architecture = "dalvik"
if bitness is None:
bitness = DexFileLoader.getBitness(file_content)
# initDisassembler caches by self.disassembler-is-None, so a backend
# picked at construction time would otherwise win against autodetect.
self.disassembler = None
binary_info = BinaryInfo(file_content)
binary_info.base_addr = base_addr
binary_info.bitness = bitness
Expand All @@ -176,6 +214,7 @@ def disassembleBuffer(
def _disassemble(self, binary_info, timeout=0):
self._start_time = datetime.datetime.now(datetime.timezone.utc)
self._timeout = timeout
self._last_timeout_log_second = -1
self._ensureHashes(binary_info)
if self.disassembler:
self.disassembly = self.disassembler.analyzeBuffer(binary_info, self._callbackAnalysisTimeout)
Expand Down
76 changes: 63 additions & 13 deletions smda/DisassemblyResult.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(self):
self.exported_functions = set()
self.failed_analysis_addr = []
self.function_borders = {}
self.function_metadata = {}
# stored as key: int(i.address) = (i.size, i.mnemonic, i.op_str)
self.instructions = {}
self.ins2fn = {}
Expand Down Expand Up @@ -127,6 +128,49 @@ def getMnemonic(self, instruction_addr):
return self.instructions[instruction_addr][0]
return ""

def _getContainingBlockStart(self, blocks, instruction_addr):
    """Return the start address of the first block that covers *instruction_addr*.

    A block spans from the address of its first instruction up to, but not
    including, the address of its last instruction plus that instruction's
    size (elements 0 and 1 of each instruction entry). Empty blocks are
    ignored. Returns None when no block covers the address.
    """
    matching_starts = (
        block[0][0]
        for block in blocks
        if block and block[0][0] <= instruction_addr < block[-1][0] + block[-1][1]
    )
    return next(matching_starts, None)

def _getExceptionSuccessors(self, func_addr, blocks):
    """Map block start addresses to the exception-handler blocks they may reach.

    Reads the ``try_ranges`` list stored under *func_addr* in
    ``self.function_metadata``. For every try range, each handler
    ``target_addr`` and the optional ``catch_all_addr`` is normalized to the
    start of the block containing it (or kept as-is when no block contains
    it). Every non-empty block overlapping the half-open interval
    ``[start_addr, end_addr)`` receives those targets as successors.

    Returns:
        dict of block start address -> set of handler addresses; empty when
        the function has no try ranges.
    """
    try_ranges = self.function_metadata.get(func_addr, {}).get("try_ranges", [])
    if not try_ranges:
        return {}
    successors_by_block = {}
    for try_range in try_ranges:
        # Collect handler targets; entries that are not dicts carry no address.
        handler_addrs = [
            handler.get("target_addr") for handler in try_range.get("handlers", []) if isinstance(handler, dict)
        ]
        handler_addrs = [addr for addr in handler_addrs if addr is not None]
        catch_all_addr = try_range.get("catch_all_addr")
        if catch_all_addr is not None:
            handler_addrs.append(catch_all_addr)
        if not handler_addrs:
            continue
        # Snap every target to the start of its containing block where possible.
        resolved_targets = set()
        for addr in handler_addrs:
            containing_start = self._getContainingBlockStart(blocks, addr)
            resolved_targets.add(addr if containing_start is None else containing_start)
        for block in blocks:
            if not block:
                continue
            first_addr = block[0][0]
            end_addr = block[-1][0] + block[-1][1]
            # Standard half-open interval overlap test between try range and block.
            if try_range["start_addr"] < end_addr and first_addr < try_range["end_addr"]:
                successors_by_block.setdefault(first_addr, set()).update(resolved_targets)
    return successors_by_block

def isCode(self, addr):
return addr in self.code_map

Expand Down Expand Up @@ -190,20 +234,26 @@ def removeDataRefs(self, addr_from, addr_to):
self.data_refs_to[addr_to] = refs_to

def getBlockRefs(self, func_addr):
"""blocks refs should stay within function context, thus kill all references outside function"""
block_refs = {}
ins_addrs = set()
for block in self.functions[func_addr]:
for ins in block:
ins_addr = ins[0]
ins_addrs.add(ins_addr)
for block in self.functions[func_addr]:
"""Return a normalized intra-function CFG keyed by block start."""
if func_addr not in self.functions:
return {}
blocks = [block for block in self.functions[func_addr] if block]
block_starts = {block[0][0] for block in blocks}
block_refs = {block_start: [] for block_start in sorted(block_starts)}
for block in blocks:
last_ins_addr = block[-1][0]
if last_ins_addr in self.code_refs_from:
verified_refs = sorted(ins_addrs.intersection(self.code_refs_from[last_ins_addr]))
if verified_refs:
block_refs[block[0][0]] = verified_refs
return block_refs
if last_ins_addr not in self.code_refs_from:
continue
verified_refs = sorted(block_starts.intersection(self.code_refs_from[last_ins_addr]))
if verified_refs:
block_refs[block[0][0]] = verified_refs
for block_start, successors in self._getExceptionSuccessors(func_addr, blocks).items():
merged_successors = set(block_refs.get(block_start, []))
merged_successors.update(successors)
block_refs[block_start] = sorted(merged_successors)
for successor in successors:
block_refs.setdefault(successor, [])
return {block_start: block_refs[block_start] for block_start in sorted(block_refs)}

def getInRefs(self, func_addr):
in_refs = []
Expand Down
2 changes: 2 additions & 0 deletions smda/common/BinaryInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,6 @@ def getHeaderBytes(self):
return self.raw_data[:0x400]
elif isinstance(lief_result, lief.ELF.Binary):
return self.raw_data[:0x40]
elif self.architecture == "dalvik" or self.raw_data[:4] == b"dex\n":
return self.raw_data[:0x70]
return None
Loading
Loading