From ab369212723e1110ae6e7ecab771cb7376934a0c Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Mon, 15 Jun 2026 18:53:24 +0200 Subject: [PATCH 1/7] fix(smb): use anonymous null session before Guest for share enumeration When no credentials are supplied, login() switched to the "Guest" account and only fell back to a true null session if Guest *failed*. On Samba with `map to guest = Bad User`, the Guest login succeeds but the SRVSVC NetrShareEnumAll RPC (listShares) is denied (STATUS_ACCESS_DENIED), so share enumeration silently returned nothing even though `smbclient -L -N` works. Prefer a true anonymous null session first, matching smbclient -N behaviour. Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/smb.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/man_spider/lib/smb.py b/man_spider/lib/smb.py index 6966fbe..59a4214 100644 --- a/man_spider/lib/smb.py +++ b/man_spider/lib/smb.py @@ -199,15 +199,19 @@ def login(self, refresh=False, first_try=True): if s in str(e): log.warning(f"{self.server}: {s}: {self.username}") - log.debug(f"{self.server}: Trying guest session") - self.username = "Guest" + # Prefer a true null/anonymous session first: on Samba, the SRVSVC + # share-enumeration RPC (listShares) is permitted for anonymous logons + # but denied for the "Guest" account, so null must be tried before Guest + # to match `smbclient -L -N` behaviour. 
+ log.debug(f"{self.server}: Trying null session") + self.username = "" self.password = "" self.domain = "" self.nthash = "" - guest_success = self.login(refresh=True, first_try=False) - if not guest_success: - log.debug(f"{self.server}: Switching to null session") - self.username = "" + null_success = self.login(refresh=True, first_try=False) + if not null_success: + log.debug(f"{self.server}: Switching to guest session") + self.username = "Guest" self.login(refresh=True, first_try=False) return False From 5d8d0112a8d5b551656f2de4c64520217a43779c Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Mon, 15 Jun 2026 18:58:35 +0200 Subject: [PATCH 2/7] feat: JSON Lines output, no-filter share listing, and Py3.14 console fix - feat(output): add `--json [FILE]` to emit results as JSON Lines (share, file, content_match records). With no FILE the JSONL goes to stdout and human-readable logs are redirected to stderr; with a FILE it is appended. - feat(cli): when no content/filename/extension filter is given, enumerate and print shares for each remote target (like `smbclient -L`) instead of erroring. - fix(logging): force the "fork" multiprocessing start method. Python 3.14 changed the Linux default to "forkserver", which re-imports modules in the worker and creates a new, unlistened log queue, so no console output appeared (only the log file). Forcing fork restores the shared queue; guarded so it is a no-op on older Pythons and degrades safely where fork is unavailable. 
Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/logger.py | 51 ++++++++++++++++++++++++++++++ man_spider/lib/parser/parser.py | 8 +++++ man_spider/lib/spiderling.py | 11 +++++++ man_spider/manspider.py | 55 +++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 2 deletions(-) diff --git a/man_spider/lib/logger.py b/man_spider/lib/logger.py index 570e466..b3222ba 100644 --- a/man_spider/lib/logger.py +++ b/man_spider/lib/logger.py @@ -1,3 +1,6 @@ +import os +import sys +import json import logging from copy import copy from sys import stdout @@ -7,6 +10,54 @@ from logging.handlers import QueueHandler, QueueListener +### JSON LINES OUTPUT ### + +# JSONL destination, configured once via set_json_output() before forking so that +# child/worker processes inherit it (we force the "fork" start method). +_json_output_path = None # file path, when writing JSONL to a file +_json_to_stdout = False # when True, JSONL goes to stdout and logs go to stderr + + +def set_json_output(path): + """ + Configure JSONL output. "path" is: + - None / "" -> disabled + - "-" -> write JSONL to stdout, and redirect human-readable logs to stderr + - any other value -> append JSONL to that file + """ + global _json_output_path, _json_to_stdout + if not path: + _json_output_path = None + _json_to_stdout = False + elif path == "-": + _json_output_path = None + _json_to_stdout = True + # keep stdout clean for JSONL: send all log records to stderr instead + console.setStream(sys.stderr) + else: + _json_output_path = str(path) + _json_to_stdout = False + + +def json_log(record): + """ + Emit a single record as one JSON line. No-op when --json wasn't specified. + Safe across processes: a short write() to an O_APPEND file, or a single + os.write() to stdout, is atomic on POSIX. 
+ """ + if not (_json_output_path or _json_to_stdout): + return + try: + line = json.dumps(record, default=str) + "\n" + if _json_to_stdout: + os.write(1, line.encode("utf-8")) + else: + with open(_json_output_path, "a", encoding="utf-8") as f: + f.write(line) + except Exception: + pass + + ### PRETTY COLORS ### diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index b4205d6..fe0a8e6 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -219,6 +219,14 @@ def extract_text(self, file, pretty_filename): for _filter, match_count in matches.items(): log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times')) + json_log( + { + "type": "content_match", + "file": str(pretty_filename), + "pattern": _filter.pattern, + "count": match_count, + } + ) # run grep for pretty output if not self.quiet: self.grep(binary_content, _filter.pattern) diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py index 658d8bc..b574a71 100644 --- a/man_spider/lib/spiderling.py +++ b/man_spider/lib/spiderling.py @@ -10,6 +10,7 @@ from man_spider.lib.file import * from man_spider.lib.util import * from man_spider.lib.errors import * +from man_spider.lib.logger import json_log from man_spider.lib.processpool import * @@ -129,6 +130,16 @@ def go(self): # otherwise, just save it elif not self.local: log.info(f"{self.target}: {file.share}\\{file.name} ({bytes_to_human(file.size)})") + json_log( + { + "type": "file", + "target": str(self.target), + "share": file.share, + "path": file.name, + "size": file.size, + "downloaded": not self.parent.no_download, + } + ) if not self.parent.no_download: self.save_file(file) diff --git a/man_spider/manspider.py b/man_spider/manspider.py index 19010c7..d1cc71a 100755 --- a/man_spider/manspider.py +++ b/man_spider/manspider.py @@ -29,9 +29,37 @@ def go(options): ) sleep(2) - # exit if no filters were specified + # if no filters were specified, 
just enumerate and print shares for each + # remote target (like `smbclient -L`) instead of spidering if not (options.filenames or options.extensions or options.exclude_extensions or options.content): - log.error("Please specify at least one of --filenames, --content, --extensions, or --exclude-extensions") + remote_targets = [t for t in options.targets if not isinstance(t, pathlib.PosixPath)] + if not remote_targets: + log.error("Please specify at least one of --filenames, --content, --extensions, or --exclude-extensions") + return + log.info("No filters specified; listing shares only") + for target in remote_targets: + smb_client = SMBClient( + target.host, + options.username, + options.password, + options.domain, + options.hash, + options.kerberos, + options.aes_key, + options.dc_ip, + port=target.port, + ) + if smb_client.login() is None: + log.warning(f"{target.host}: Could not connect") + continue + shares = smb_client.shares + if shares: + log.info(f"{target.host}: {len(shares)} shares:") + for share in shares: + log.info(f" {share}") + json_log({"type": "share", "target": target.host, "port": target.port, "share": share}) + else: + log.warning(f"{target.host}: No shares found (or enumeration denied)") return # exit if --maxdepth is invalid @@ -80,6 +108,16 @@ def load_content_wordlist(filepath, options): def main(): + # The logging setup (lib/logger.py) shares a multiprocessing Queue between the + # parent's QueueListener (console output) and the child worker's QueueHandler. + # Python 3.14 changed the default start method on Linux to "forkserver", which + # re-imports modules in the child and creates a *new*, unlistened queue -> no + # console output. Force "fork" so parent and child share the same queue. 
+ try: + multiprocessing.set_start_method("fork", force=True) + except (ValueError, RuntimeError): + pass + interrupted = False examples = """ @@ -226,6 +264,16 @@ def main(): metavar="DATE", help="only show files modified before this date (format: YYYY-MM-DD)", ) + parser.add_argument( + "--json", + dest="json", + nargs="?", + const="-", + default=None, + metavar="FILE", + help="write results (shares, matched/looted files) as JSON Lines; to FILE, " + "or to stdout if no FILE is given (in which case logs are sent to stderr)", + ) syntax_error = False try: @@ -241,6 +289,9 @@ def main(): if options.verbose: log.setLevel("DEBUG") + # configure JSON Lines output (must happen before forking the worker) + set_json_output(options.json) + if options.kerberos and "KRB5CCNAME" not in os.environ: log.error("KRB5CCNAME is not set in the environment") sys.exit(1) From 1ae807b376dda519d28b9352b86739f5707a7938 Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Tue, 16 Jun 2026 10:16:01 +0200 Subject: [PATCH 3/7] feat: always report enumerated shares (INFO + JSON) while spidering Previously the share list was only printed in no-filter mode. When a filter was supplied, shares were enumerated but only logged at debug level. Now the enumerated shares are reported once per target at INFO and emitted as JSON share records regardless of whether filters are active. 
Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/spiderling.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py index b574a71..7bf445f 100644 --- a/man_spider/lib/spiderling.py +++ b/man_spider/lib/spiderling.py @@ -60,6 +60,8 @@ def __init__(self, target, parent): try: self.parent = parent self.target = target + # ensures enumerated shares are logged/emitted only once per target + self._shares_logged = False # unless we're only searching local files, connect to target if type(self.target) == pathlib.PosixPath: @@ -221,8 +223,25 @@ def shares(self): # Keep track of shares we've already yielded to avoid duplicates yielded_shares = set() + enumerated_shares = self.smb_client.shares + + # always report the shares we found, even while spidering with filters + if not self._shares_logged: + self._shares_logged = True + if enumerated_shares: + log.info(f"{self.target}: {len(enumerated_shares)} shares: {', '.join(enumerated_shares)}") + for share in enumerated_shares: + json_log( + { + "type": "share", + "target": self.target.host, + "port": self.target.port, + "share": share, + } + ) + # First, yield enumerated shares that match filters - for share in self.smb_client.shares: + for share in enumerated_shares: if self.share_match(share): yielded_shares.add(share.lower()) yield share From 8c70cdd40600e3e25a6f153f30cc4feca0c133d4 Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Tue, 16 Jun 2026 10:23:16 +0200 Subject: [PATCH 4/7] feat(json): include target host and port in every record Make all JSON Lines records consistent: share, file, and content_match now each carry explicit "target" and "port" fields (plus share/path). The content_match record is emitted from the spiderling instead of the parser so the target/port/share context is available; the file record now uses the host/port directly rather than str(Target). 
Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/parser/parser.py | 8 -------- man_spider/lib/spiderling.py | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index fe0a8e6..b4205d6 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -219,14 +219,6 @@ def extract_text(self, file, pretty_filename): for _filter, match_count in matches.items(): log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times')) - json_log( - { - "type": "content_match", - "file": str(pretty_filename), - "pattern": _filter.pattern, - "count": match_count, - } - ) # run grep for pretty output if not self.quiet: self.grep(binary_content, _filter.pattern) diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py index 7bf445f..17fb73f 100644 --- a/man_spider/lib/spiderling.py +++ b/man_spider/lib/spiderling.py @@ -135,7 +135,8 @@ def go(self): json_log( { "type": "file", - "target": str(self.target), + "target": self.target.host, + "port": self.target.port, "share": file.share, "path": file.name, "size": file.size, @@ -194,6 +195,18 @@ def parse_file(self, file): try: if type(file) == RemoteFile: matches = self.parent.parser.parse_file(str(file.tmp_filename), pretty_filename=str(file)) + for _filter, match_count in matches.items(): + json_log( + { + "type": "content_match", + "target": self.target.host, + "port": self.target.port, + "share": file.share, + "path": file.name, + "pattern": _filter.pattern, + "count": match_count, + } + ) if matches and not self.parent.no_download: self.save_file(file) else: From 6b29f29d55b8a51f949ed6e21dfba737bc36ddf6 Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Tue, 16 Jun 2026 10:40:21 +0200 Subject: [PATCH 5/7] feat(json): include the actual matched strings in content_match records content_match records now carry a "matches" array with the exact substrings that 
matched each pattern (deduped, capped at 20 distinct samples per pattern per file, each truncated to 256 chars). The matched substrings are recovered from the regex span against the extracted text, so no extra scanning is needed. Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/parser/parser.py | 31 ++++++++++++++++++++++--------- man_spider/lib/spiderling.py | 5 +++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index b4205d6..fac0cd4 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -81,6 +81,10 @@ class FileParser: ".dylib", } + # limits for the matched strings captured for JSON output + max_match_samples = 20 # max distinct matched strings kept per pattern per file + match_sample_maxlen = 256 # max length of each captured matched string + def __init__(self, filters, quiet=False): self.init_content_filters(filters) self.quiet = quiet @@ -210,15 +214,24 @@ def extract_text(self, file, pretty_filename): except Exception: pass - # count the matches - for _filter, match in self.match(text_content): - try: - matches[_filter] += 1 - except KeyError: - matches[_filter] = 1 - - for _filter, match_count in matches.items(): - log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times')) + # count the matches and capture a sample of the actual matched strings + for _filter, span in self.match(text_content): + entry = matches.get(_filter) + if entry is None: + entry = {"count": 0, "samples": []} + matches[_filter] = entry + entry["count"] += 1 + if len(entry["samples"]) < self.max_match_samples: + sample = text_content[span[0]:span[1]][: self.match_sample_maxlen] + if sample not in entry["samples"]: + entry["samples"].append(sample) + + for _filter, match_data in matches.items(): + log.info( + ColoredFormatter.green( + f'{pretty_filename}: matched "{_filter.pattern}" {match_data["count"]:,} times' + ) + ) # 
run grep for pretty output if not self.quiet: self.grep(binary_content, _filter.pattern) diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py index 17fb73f..6184f11 100644 --- a/man_spider/lib/spiderling.py +++ b/man_spider/lib/spiderling.py @@ -195,7 +195,7 @@ def parse_file(self, file): try: if type(file) == RemoteFile: matches = self.parent.parser.parse_file(str(file.tmp_filename), pretty_filename=str(file)) - for _filter, match_count in matches.items(): + for _filter, match_data in matches.items(): json_log( { "type": "content_match", @@ -204,7 +204,8 @@ def parse_file(self, file): "share": file.share, "path": file.name, "pattern": _filter.pattern, - "count": match_count, + "count": match_data["count"], + "matches": match_data["samples"], } ) if matches and not self.parent.no_download: From c43831258d16c49e1c81453e504e6d5e0f0afe2a Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Tue, 16 Jun 2026 11:18:44 +0200 Subject: [PATCH 6/7] feat(json): include line/column location for each content match Each entry in a content_match record's "matches" array is now an object with the matched string plus its 1-based location in the extracted text: {"match": ..., "line": N, "column": N, "end_column": N}. Locations are derived from the regex span, so no extra scanning is required. 
Co-Authored-By: Claude Opus 4.8 --- man_spider/lib/parser/parser.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index fac0cd4..c66178e 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -214,7 +214,8 @@ def extract_text(self, file, pretty_filename): except Exception: pass - # count the matches and capture a sample of the actual matched strings + # count the matches and capture a sample of the actual matched strings, + # including where in the file each one was found (1-based line/column) for _filter, span in self.match(text_content): entry = matches.get(_filter) if entry is None: @@ -222,8 +223,13 @@ def extract_text(self, file, pretty_filename): matches[_filter] = entry entry["count"] += 1 if len(entry["samples"]) < self.max_match_samples: - sample = text_content[span[0]:span[1]][: self.match_sample_maxlen] - if sample not in entry["samples"]: + start, end = span + value = text_content[start:end][: self.match_sample_maxlen] + line = text_content.count("\n", 0, start) + 1 + column = start - text_content.rfind("\n", 0, start) + end_column = column + (end - start) + sample = {"match": value, "line": line, "column": column, "end_column": end_column} + if not any(s["match"] == value and s["line"] == line and s["column"] == column for s in entry["samples"]): entry["samples"].append(sample) for _filter, match_data in matches.items(): From e2fb6bdf39ad12b9ef49fb32066267adf7da261a Mon Sep 17 00:00:00 2001 From: Olivier Cervello Date: Tue, 16 Jun 2026 16:27:14 +0200 Subject: [PATCH 7/7] fix(cli): split whitespace in --extensions / --exclude-extensions values `--extensions "ini cfg"` (a single quoted argument) now behaves the same as `--extensions ini cfg` (two arguments). Previously the quoted form became the single extension ".ini cfg", causing the search to match nothing. Each value is now split on whitespace before normalization. 
Co-Authored-By: Claude Opus 4.8 --- man_spider/manspider.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/man_spider/manspider.py b/man_spider/manspider.py index d1cc71a..4b5b7a0 100755 --- a/man_spider/manspider.py +++ b/man_spider/manspider.py @@ -315,17 +315,20 @@ def main(): else: options.modified_before = None - # make sure extension formats are valid - for i, extension in enumerate(options.extensions): - if extension and not extension.startswith("."): - extension = f".{extension}" - options.extensions[i] = extension.lower() - - # make sure extension blacklist is valid - for i, extension in enumerate(options.exclude_extensions): - if not extension.startswith("."): - extension = f".{extension}" - options.exclude_extensions[i] = extension.lower() + # normalize extensions: split each entry on whitespace so that a quoted + # `--extensions "ini cfg"` behaves the same as `--extensions ini cfg`, + # then ensure a leading dot and lowercase + def normalize_extensions(extensions): + normalized = [] + for entry in extensions: + for extension in entry.split(): + if not extension.startswith("."): + extension = f".{extension}" + normalized.append(extension.lower()) + return normalized + + options.extensions = normalize_extensions(options.extensions) + options.exclude_extensions = normalize_extensions(options.exclude_extensions) # lowercase share names options.sharenames = [s.lower() for s in options.sharenames]