From ab369212723e1110ae6e7ecab771cb7376934a0c Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Mon, 15 Jun 2026 18:53:24 +0200
Subject: [PATCH 1/7] fix(smb): use anonymous null session before Guest for
 share enumeration

When no credentials are supplied, login() switched to the "Guest" account
and only fell back to a true null session if Guest *failed*. On Samba with
`map to guest = Bad User`, the Guest login succeeds but the SRVSVC
NetrShareEnumAll RPC (listShares) is denied (STATUS_ACCESS_DENIED), so share
enumeration silently returned nothing even though `smbclient -L -N` works.

Prefer a true anonymous null session first, matching smbclient -N behaviour.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/smb.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/man_spider/lib/smb.py b/man_spider/lib/smb.py
index 6966fbe..59a4214 100644
--- a/man_spider/lib/smb.py
+++ b/man_spider/lib/smb.py
@@ -199,15 +199,19 @@ def login(self, refresh=False, first_try=True):
                             if s in str(e):
                                 log.warning(f"{self.server}: {s}: {self.username}")
 
-                    log.debug(f"{self.server}: Trying guest session")
-                    self.username = "Guest"
+                    # Prefer a true null/anonymous session first: on Samba with
+                    # `map to guest = Bad User`, a "Guest" login succeeds but is denied the
+                    # SRVSVC share-enumeration RPC (listShares), which anonymous logons may
+                    # call, so null is tried before Guest to match `smbclient -L -N`.
+                    log.debug(f"{self.server}: Trying null session")
+                    self.username = ""
                     self.password = ""
                     self.domain = ""
                     self.nthash = ""
-                    guest_success = self.login(refresh=True, first_try=False)
-                    if not guest_success:
-                        log.debug(f"{self.server}: Switching to null session")
-                        self.username = ""
+                    null_success = self.login(refresh=True, first_try=False)
+                    if not null_success:
+                        log.debug(f"{self.server}: Switching to guest session")
+                        self.username = "Guest"
                         self.login(refresh=True, first_try=False)
 
             return False

From 5d8d0112a8d5b551656f2de4c64520217a43779c Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Mon, 15 Jun 2026 18:58:35 +0200
Subject: [PATCH 2/7] feat: JSON Lines output, no-filter share listing, and
 Py3.14 console fix

- feat(output): add `--json [FILE]` to emit results as JSON Lines (share,
  file, content_match records). With no FILE the JSONL goes to stdout and
  human-readable logs are redirected to stderr; with a FILE it is appended.
- feat(cli): when no content/filename/extension filter is given, enumerate and
  print shares for each remote target (like `smbclient -L`) instead of erroring.
- fix(logging): force the "fork" multiprocessing start method. Python 3.14
  changed the Linux default to "forkserver", which re-imports modules in the
  worker and creates a new, unlistened log queue, so no console output appeared
  (only the log file). Forcing fork restores the shared queue. Where fork was
  already the default (Linux on older Pythons) this is a no-op; the call is
  wrapped in try/except so it degrades safely where fork is unavailable.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/logger.py        | 51 ++++++++++++++++++++++++++++++
 man_spider/lib/parser/parser.py |  8 +++++
 man_spider/lib/spiderling.py    | 11 +++++++
 man_spider/manspider.py         | 55 +++++++++++++++++++++++++++++++--
 4 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/man_spider/lib/logger.py b/man_spider/lib/logger.py
index 570e466..b3222ba 100644
--- a/man_spider/lib/logger.py
+++ b/man_spider/lib/logger.py
@@ -1,3 +1,6 @@
+import os
+import sys
+import json
 import logging
 from copy import copy
 from sys import stdout
@@ -7,6 +10,54 @@
 from logging.handlers import QueueHandler, QueueListener
 
 
+### JSON LINES OUTPUT ###
+
+# JSONL destination, configured once via set_json_output() before forking so that
+# child/worker processes inherit it (we force the "fork" start method).
+_json_output_path = None  # file path, when writing JSONL to a file
+_json_to_stdout = False  # when True, JSONL goes to stdout and logs go to stderr
+
+
+def set_json_output(path):
+    """
+    Configure JSONL output. "path" is:
+      - None / ""   -> disabled
+      - "-"         -> write JSONL to stdout, and redirect human-readable logs to stderr
+      - <filename>  -> append JSONL to that file
+    """
+    global _json_output_path, _json_to_stdout
+    if not path:
+        _json_output_path = None
+        _json_to_stdout = False
+    elif path == "-":
+        _json_output_path = None
+        _json_to_stdout = True
+        # keep stdout clean for JSONL: send all log records to stderr instead
+        console.setStream(sys.stderr)
+    else:
+        _json_output_path = str(path)
+        _json_to_stdout = False
+
+
+def json_log(record):
+    """
+    Emit a single record as one JSON line. No-op when --json wasn't specified.
+    Best-effort across processes: each record is one short append-mode write()
+    or one os.write() to stdout; very long lines may still interleave.
+    """
+    if not (_json_output_path or _json_to_stdout):
+        return
+    try:
+        line = json.dumps(record, default=str) + "\n"
+        if _json_to_stdout:
+            os.write(1, line.encode("utf-8"))
+        else:
+            with open(_json_output_path, "a", encoding="utf-8") as f:
+                f.write(line)
+    except Exception:
+        pass
+
+
 ### PRETTY COLORS ###
 
 
diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index b4205d6..fe0a8e6 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -219,6 +219,14 @@ def extract_text(self, file, pretty_filename):
 
         for _filter, match_count in matches.items():
             log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times'))
+            json_log(
+                {
+                    "type": "content_match",
+                    "file": str(pretty_filename),
+                    "pattern": _filter.pattern,
+                    "count": match_count,
+                }
+            )
             # run grep for pretty output
             if not self.quiet:
                 self.grep(binary_content, _filter.pattern)
diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py
index 658d8bc..b574a71 100644
--- a/man_spider/lib/spiderling.py
+++ b/man_spider/lib/spiderling.py
@@ -10,6 +10,7 @@
 from man_spider.lib.file import *
 from man_spider.lib.util import *
 from man_spider.lib.errors import *
+from man_spider.lib.logger import json_log
 from man_spider.lib.processpool import *
 
 
@@ -129,6 +130,16 @@ def go(self):
                 # otherwise, just save it
                 elif not self.local:
                     log.info(f"{self.target}: {file.share}\\{file.name} ({bytes_to_human(file.size)})")
+                    json_log(
+                        {
+                            "type": "file",
+                            "target": str(self.target),
+                            "share": file.share,
+                            "path": file.name,
+                            "size": file.size,
+                            "downloaded": not self.parent.no_download,
+                        }
+                    )
                     if not self.parent.no_download:
                         self.save_file(file)
 
diff --git a/man_spider/manspider.py b/man_spider/manspider.py
index 19010c7..d1cc71a 100755
--- a/man_spider/manspider.py
+++ b/man_spider/manspider.py
@@ -29,9 +29,37 @@ def go(options):
             )
             sleep(2)
 
-        # exit if no filters were specified
+        # if no filters were specified, just enumerate and print shares for each
+        # remote target (like `smbclient -L`) instead of spidering
         if not (options.filenames or options.extensions or options.exclude_extensions or options.content):
-            log.error("Please specify at least one of --filenames, --content, --extensions, or --exclude-extensions")
+            remote_targets = [t for t in options.targets if not isinstance(t, pathlib.PosixPath)]
+            if not remote_targets:
+                log.error("Please specify at least one of --filenames, --content, --extensions, or --exclude-extensions")
+                return
+            log.info("No filters specified; listing shares only")
+            for target in remote_targets:
+                smb_client = SMBClient(
+                    target.host,
+                    options.username,
+                    options.password,
+                    options.domain,
+                    options.hash,
+                    options.kerberos,
+                    options.aes_key,
+                    options.dc_ip,
+                    port=target.port,
+                )
+                if smb_client.login() is None:
+                    log.warning(f"{target.host}: Could not connect")
+                    continue
+                shares = smb_client.shares
+                if shares:
+                    log.info(f"{target.host}: {len(shares)} shares:")
+                    for share in shares:
+                        log.info(f"  {share}")
+                        json_log({"type": "share", "target": target.host, "port": target.port, "share": share})
+                else:
+                    log.warning(f"{target.host}: No shares found (or enumeration denied)")
             return
 
         # exit if --maxdepth is invalid
@@ -80,6 +108,16 @@ def load_content_wordlist(filepath, options):
 
 def main():
 
+    # The logging setup (lib/logger.py) shares a multiprocessing Queue between the
+    # parent's QueueListener (console output) and the child worker's QueueHandler.
+    # Python 3.14 changed the default start method on Linux to "forkserver", which
+    # re-imports modules in the child and creates a *new*, unlistened queue -> no
+    # console output. Force "fork" so parent and child share the same queue.
+    try:
+        multiprocessing.set_start_method("fork", force=True)
+    except (ValueError, RuntimeError):
+        pass
+
     interrupted = False
 
     examples = """
@@ -226,6 +264,16 @@ def main():
         metavar="DATE",
         help="only show files modified before this date (format: YYYY-MM-DD)",
     )
+    parser.add_argument(
+        "--json",
+        dest="json",
+        nargs="?",
+        const="-",
+        default=None,
+        metavar="FILE",
+        help="write results (shares, matched/looted files) as JSON Lines; to FILE, "
+        "or to stdout if no FILE is given (in which case logs are sent to stderr)",
+    )
 
     syntax_error = False
     try:
@@ -241,6 +289,9 @@ def main():
         if options.verbose:
             log.setLevel("DEBUG")
 
+        # configure JSON Lines output (must happen before forking the worker)
+        set_json_output(options.json)
+
         if options.kerberos and "KRB5CCNAME" not in os.environ:
             log.error("KRB5CCNAME is not set in the environment")
             sys.exit(1)

From 1ae807b376dda519d28b9352b86739f5707a7938 Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Tue, 16 Jun 2026 10:16:01 +0200
Subject: [PATCH 3/7] feat: always report enumerated shares (INFO + JSON) while
 spidering

Previously the share list was only printed in no-filter mode. When a filter
was supplied, shares were enumerated but only logged at debug level. Now the
enumerated shares are reported once per target at INFO and emitted as JSON
share records regardless of whether filters are active.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/spiderling.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py
index b574a71..7bf445f 100644
--- a/man_spider/lib/spiderling.py
+++ b/man_spider/lib/spiderling.py
@@ -60,6 +60,8 @@ def __init__(self, target, parent):
         try:
             self.parent = parent
             self.target = target
+            # ensures enumerated shares are logged/emitted only once per target
+            self._shares_logged = False
 
             # unless we're only searching local files, connect to target
             if type(self.target) == pathlib.PosixPath:
@@ -221,8 +223,25 @@ def shares(self):
         # Keep track of shares we've already yielded to avoid duplicates
         yielded_shares = set()
 
+        enumerated_shares = self.smb_client.shares
+
+        # always report the shares we found, even while spidering with filters
+        if not self._shares_logged:
+            self._shares_logged = True
+            if enumerated_shares:
+                log.info(f"{self.target}: {len(enumerated_shares)} shares: {', '.join(enumerated_shares)}")
+                for share in enumerated_shares:
+                    json_log(
+                        {
+                            "type": "share",
+                            "target": self.target.host,
+                            "port": self.target.port,
+                            "share": share,
+                        }
+                    )
+
         # First, yield enumerated shares that match filters
-        for share in self.smb_client.shares:
+        for share in enumerated_shares:
             if self.share_match(share):
                 yielded_shares.add(share.lower())
                 yield share

From 8c70cdd40600e3e25a6f153f30cc4feca0c133d4 Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Tue, 16 Jun 2026 10:23:16 +0200
Subject: [PATCH 4/7] feat(json): include target host and port in every record

Make all JSON Lines records consistent: share, file, and content_match now
each carry explicit "target" and "port" fields (plus share/path). The
content_match record is emitted from the spiderling instead of the parser so
the target/port/share context is available; the file record now uses the
host/port directly rather than str(Target).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/parser/parser.py |  8 --------
 man_spider/lib/spiderling.py    | 15 ++++++++++++++-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index fe0a8e6..b4205d6 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -219,14 +219,6 @@ def extract_text(self, file, pretty_filename):
 
         for _filter, match_count in matches.items():
             log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times'))
-            json_log(
-                {
-                    "type": "content_match",
-                    "file": str(pretty_filename),
-                    "pattern": _filter.pattern,
-                    "count": match_count,
-                }
-            )
             # run grep for pretty output
             if not self.quiet:
                 self.grep(binary_content, _filter.pattern)
diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py
index 7bf445f..17fb73f 100644
--- a/man_spider/lib/spiderling.py
+++ b/man_spider/lib/spiderling.py
@@ -135,7 +135,8 @@ def go(self):
                     json_log(
                         {
                             "type": "file",
-                            "target": str(self.target),
+                            "target": self.target.host,
+                            "port": self.target.port,
                             "share": file.share,
                             "path": file.name,
                             "size": file.size,
@@ -194,6 +195,18 @@ def parse_file(self, file):
         try:
             if type(file) == RemoteFile:
                 matches = self.parent.parser.parse_file(str(file.tmp_filename), pretty_filename=str(file))
+                for _filter, match_count in matches.items():
+                    json_log(
+                        {
+                            "type": "content_match",
+                            "target": self.target.host,
+                            "port": self.target.port,
+                            "share": file.share,
+                            "path": file.name,
+                            "pattern": _filter.pattern,
+                            "count": match_count,
+                        }
+                    )
                 if matches and not self.parent.no_download:
                     self.save_file(file)
                 else:

From 6b29f29d55b8a51f949ed6e21dfba737bc36ddf6 Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Tue, 16 Jun 2026 10:40:21 +0200
Subject: [PATCH 5/7] feat(json): include the actual matched strings in
 content_match records

content_match records now carry a "matches" array with the exact substrings
that matched each pattern (deduped, capped at 20 distinct samples per pattern
per file, each truncated to 256 chars). The matched substrings are recovered
from the regex span against the extracted text, so no extra scanning is needed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/parser/parser.py | 31 ++++++++++++++++++++++---------
 man_spider/lib/spiderling.py    |  5 +++--
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index b4205d6..fac0cd4 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -81,6 +81,10 @@ class FileParser:
         ".dylib",
     }
 
+    # limits for the matched strings captured for JSON output
+    max_match_samples = 20  # max distinct matched strings kept per pattern per file
+    match_sample_maxlen = 256  # max length of each captured matched string
+
     def __init__(self, filters, quiet=False):
         self.init_content_filters(filters)
         self.quiet = quiet
@@ -210,15 +214,24 @@ def extract_text(self, file, pretty_filename):
         except Exception:
             pass
 
-        # count the matches
-        for _filter, match in self.match(text_content):
-            try:
-                matches[_filter] += 1
-            except KeyError:
-                matches[_filter] = 1
-
-        for _filter, match_count in matches.items():
-            log.info(ColoredFormatter.green(f'{pretty_filename}: matched "{_filter.pattern}" {match_count:,} times'))
+        # count the matches and capture a sample of the actual matched strings
+        for _filter, span in self.match(text_content):
+            entry = matches.get(_filter)
+            if entry is None:
+                entry = {"count": 0, "samples": []}
+                matches[_filter] = entry
+            entry["count"] += 1
+            if len(entry["samples"]) < self.max_match_samples:
+                sample = text_content[span[0]:span[1]][: self.match_sample_maxlen]
+                if sample not in entry["samples"]:
+                    entry["samples"].append(sample)
+
+        for _filter, match_data in matches.items():
+            log.info(
+                ColoredFormatter.green(
+                    f'{pretty_filename}: matched "{_filter.pattern}" {match_data["count"]:,} times'
+                )
+            )
             # run grep for pretty output
             if not self.quiet:
                 self.grep(binary_content, _filter.pattern)
diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py
index 17fb73f..6184f11 100644
--- a/man_spider/lib/spiderling.py
+++ b/man_spider/lib/spiderling.py
@@ -195,7 +195,7 @@ def parse_file(self, file):
         try:
             if type(file) == RemoteFile:
                 matches = self.parent.parser.parse_file(str(file.tmp_filename), pretty_filename=str(file))
-                for _filter, match_count in matches.items():
+                for _filter, match_data in matches.items():
                     json_log(
                         {
                             "type": "content_match",
@@ -204,7 +204,8 @@ def parse_file(self, file):
                             "share": file.share,
                             "path": file.name,
                             "pattern": _filter.pattern,
-                            "count": match_count,
+                            "count": match_data["count"],
+                            "matches": match_data["samples"],
                         }
                     )
                 if matches and not self.parent.no_download:

From c43831258d16c49e1c81453e504e6d5e0f0afe2a Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Tue, 16 Jun 2026 11:18:44 +0200
Subject: [PATCH 6/7] feat(json): include line/column location for each content
 match

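Each entry in a content_match record's "matches" array is now an object with
the matched string plus its 1-based location in the extracted text:
{"match": ..., "line": N, "column": N, "end_column": N}. Locations are
derived from the regex span by counting newlines before the match start, so
no additional regex pass is required. end_column is column plus the match
length, so it is only meaningful when the match stays on one line. Samples
are now deduplicated by (match, line, column) instead of by string alone.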

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/lib/parser/parser.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index fac0cd4..c66178e 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -214,7 +214,8 @@ def extract_text(self, file, pretty_filename):
         except Exception:
             pass
 
-        # count the matches and capture a sample of the actual matched strings
+        # count the matches and capture a sample of the actual matched strings,
+        # including where in the file each one was found (1-based line/column)
         for _filter, span in self.match(text_content):
             entry = matches.get(_filter)
             if entry is None:
@@ -222,8 +223,13 @@ def extract_text(self, file, pretty_filename):
                 matches[_filter] = entry
             entry["count"] += 1
             if len(entry["samples"]) < self.max_match_samples:
-                sample = text_content[span[0]:span[1]][: self.match_sample_maxlen]
-                if sample not in entry["samples"]:
+                start, end = span
+                value = text_content[start:end][: self.match_sample_maxlen]
+                line = text_content.count("\n", 0, start) + 1
+                column = start - text_content.rfind("\n", 0, start)
+                end_column = column + (end - start)
+                sample = {"match": value, "line": line, "column": column, "end_column": end_column}
+                if not any(s["match"] == value and s["line"] == line and s["column"] == column for s in entry["samples"]):
                     entry["samples"].append(sample)
 
         for _filter, match_data in matches.items():

From e2fb6bdf39ad12b9ef49fb32066267adf7da261a Mon Sep 17 00:00:00 2001
From: Olivier Cervello <ocervello@freelabz.com>
Date: Tue, 16 Jun 2026 16:27:14 +0200
Subject: [PATCH 7/7] fix(cli): split whitespace in --extensions /
 --exclude-extensions values

`--extensions "ini cfg"` (a single quoted argument) now behaves the same as
`--extensions ini cfg` (two arguments). Previously the quoted form became the
single extension ".ini cfg", causing the search to match nothing. Each value
is now split on whitespace before normalization.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 man_spider/manspider.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/man_spider/manspider.py b/man_spider/manspider.py
index d1cc71a..4b5b7a0 100755
--- a/man_spider/manspider.py
+++ b/man_spider/manspider.py
@@ -315,17 +315,20 @@ def main():
         else:
             options.modified_before = None
 
-        # make sure extension formats are valid
-        for i, extension in enumerate(options.extensions):
-            if extension and not extension.startswith("."):
-                extension = f".{extension}"
-            options.extensions[i] = extension.lower()
-
-        # make sure extension blacklist is valid
-        for i, extension in enumerate(options.exclude_extensions):
-            if not extension.startswith("."):
-                extension = f".{extension}"
-            options.exclude_extensions[i] = extension.lower()
+        # normalize extensions: split each entry on whitespace so that a quoted
+        # `--extensions "ini cfg"` behaves the same as `--extensions ini cfg`,
+        # then ensure a leading dot and lowercase
+        def normalize_extensions(extensions):
+            normalized = []
+            for entry in extensions:
+                for extension in entry.split():
+                    if not extension.startswith("."):
+                        extension = f".{extension}"
+                    normalized.append(extension.lower())
+            return normalized
+
+        options.extensions = normalize_extensions(options.extensions)
+        options.exclude_extensions = normalize_extensions(options.exclude_extensions)
 
         # lowercase share names
         options.sharenames = [s.lower() for s in options.sharenames]