-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: prevent uploading PyPI tokens in common places #19994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,9 +16,25 @@ | |
| # YARA rules directory | ||
| _RULES_DIR = Path(__file__).parent / "scanner_rules" | ||
|
|
||
| # Extensions to scan inside archives. Python source (.py) for source-level | ||
| # rules (e.g. pyarmor), and .pye for SourceDefender-encrypted files. | ||
| _SCAN_EXTENSIONS = {".py", ".pye"} | ||
| # Extensions to scan inside archives. | ||
| _SCAN_TARGETS = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: there might be a better way to express these targets? Maybe as globs, to make it clearer which ones are filenames versus suffixes?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking about globs (also to cover your second comment), but I decided it would be better to leave it for a separate change to better research what would be the best way for that. My personal preference would be scan every file under a size limit, but I think it requires more considerations
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense, thanks. I'll defer to whatever others think makes sense for an initial merge here. |
||
| # Python source for source-level rules (e.g. pyarmor) | ||
| ".py", | ||
| # .pye for SourceDefender-encrypted files | ||
| ".pye", | ||
| # Different textual files for common places where PyPI tokens are accidentally left. | ||
| ".md", | ||
| ".rst", | ||
| ".env", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: it might be good to match anything with
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes - see the previous comment. As a bit of context, the current list is based on where I saw tokens recently. |
||
| ".sh", | ||
| ".txt", | ||
| ".pypirc", | ||
| ".toml", | ||
| ".cfg", | ||
| ".conf", | ||
| "METADATA", | ||
| "PKG-INFO", | ||
| } | ||
|
|
||
| # Max size of individual file to scan inside archive (5 MiB) | ||
| _SCAN_MAX_FILE_SIZE = 5 * 1024 * 1024 | ||
|
|
@@ -78,8 +94,10 @@ def iter_zip_members(zfp: zipfile.ZipFile) -> typing.Iterator[tuple[str, int, by | |
| for entry in zfp.infolist(): | ||
| if entry.is_dir(): | ||
| continue | ||
| ext = Path(entry.filename).suffix.lower() | ||
| if ext not in _SCAN_EXTENSIONS: | ||
| path = Path(entry.filename) | ||
| ext = path.suffix.lower() | ||
| # Names like "METADATA", ".env" have empty suffix | ||
| if ext not in _SCAN_TARGETS and path.name not in _SCAN_TARGETS: | ||
| continue | ||
| data = zfp.read(entry.filename) | ||
| yield entry.filename, len(data), data | ||
|
|
@@ -90,8 +108,10 @@ def iter_tar_members(tar: tarfile.TarFile) -> typing.Iterator[tuple[str, int, by | |
| for member in tar.getmembers(): | ||
| if not member.isfile(): | ||
| continue | ||
| ext = Path(member.name).suffix.lower() | ||
| if ext not in _SCAN_EXTENSIONS: | ||
| path = Path(member.name) | ||
| ext = path.suffix.lower() | ||
| # Names like "PKG-INFO", ".env" have empty suffix | ||
| if ext not in _SCAN_TARGETS and path.name not in _SCAN_TARGETS: | ||
| continue | ||
| f = tar.extractfile(member) | ||
| if f is None: # pragma: no cover | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| rule secrets_pypi_token | ||
| { | ||
| meta: | ||
| description = "Detects PyPI API tokens exposed in source code." | ||
| author = "Kamil Mankowski" | ||
| message = "We have detected a PyPI API token exposed in the uploaded file. Publishing it would allow anyone to perform actions on your behalf. For your own security, please revoke the token immediately. See https://pypi.org/help/#compromised-token for additional help." | ||
|
|
||
| strings: | ||
| // Regex adapted from trufflehog's PyPI token detector | ||
| // Intentionally not derived from the official Token format definition to spare unnecessary matches. | ||
| // Pre-computed head ensures we match actual pypi.org tokens | ||
| // https://github.com/trufflesecurity/trufflehog/blob/main/pkg/detectors/pypi/pypi.go | ||
| $pypi_token = /pypi-AgEIcHlwaS5vcmcCJ[a-zA-Z0-9-_]{150,157}/ | ||
|
|
||
| // TODO: look if there are test tokens in use we should exclude | ||
| // $test_token = "pypi-AgEIcHlwaS5vcmcCJxxx" | ||
|
|
||
| condition: | ||
| $pypi_token | ||
| // If we want to allow some test-only tokens, we can use: | ||
| // and for all i in (1 .. #pypi_token) : ( | ||
| // not $test_token at @pypi_token[i] | ||
| // ) | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can imagine a future factory-style generator to create tokens for testing, so this paves that future nicely.