From b1d1f1959eafaada9292687083e60d35bbd7d6cf Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 5 May 2026 08:40:41 -0400 Subject: [PATCH] feat: add accbase target for ATAC-seq/DNase-seq discovery Add support for discovering chromatin accessibility datasets (ATAC-seq, scATAC-seq, DNase-seq) from GEO and uploading them to the accbase namespace in PEPhub. - Add "accbase" to valid targets in CLI - Add ACCBASE_FINDER_FILTER and ACCBASE_MAX_SIZE constants - Add accbase branches in queuer (assay-type filter) and uploader - Add accbase_uploader.yml and accbase_checker.yml GitHub Actions - Update README with accbase documentation --- .github/workflows/accbase_checker.yml | 35 ++++++++++++++++++++++++++ .github/workflows/accbase_uploader.yml | 34 +++++++++++++++++++++++++ README.md | 8 ++++++ geopephub/cli.py | 4 +-- geopephub/const.py | 4 +++ geopephub/metageo_pephub.py | 17 ++++++++++++- 6 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/accbase_checker.yml create mode 100644 .github/workflows/accbase_uploader.yml diff --git a/.github/workflows/accbase_checker.yml b/.github/workflows/accbase_checker.yml new file mode 100644 index 0000000..e625fad --- /dev/null +++ b/.github/workflows/accbase_checker.yml @@ -0,0 +1,35 @@ +name: Check failed cycles and samples for accbase + +on: + schedule: + # run at 13:00 UTC on odd-numbered days of the month, i.e. roughly every 2 days (offset from bedbase) + - cron: '0 13 1/2 * *' + workflow_dispatch: + +jobs: + check: + runs-on: ubuntu-latest + env: + POSTGRES_DB: ${{ secrets.POSTGRES_DB }} + POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }} + POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }} + POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }} + POSTGRES_USER: ${{ secrets.POSTGRES_USER }} + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Install package + run: python -m pip install . 
+ + - name: Check cycles 1-5 + run: | + for cycle in 1 2 3 4 5; do + echo "Checking cycle $cycle..." + geopephub run-checker --target accbase --period 2 --cycle-count $cycle + done diff --git a/.github/workflows/accbase_uploader.yml b/.github/workflows/accbase_uploader.yml new file mode 100644 index 0000000..b91fe4a --- /dev/null +++ b/.github/workflows/accbase_uploader.yml @@ -0,0 +1,34 @@ +name: Queue and upload Accbase projects + +on: + schedule: + # run at 12:00 UTC on odd-numbered days of the month, i.e. roughly every 2 days (offset from bedbase) + - cron: '0 12 1/2 * *' + workflow_dispatch: + +jobs: + upload: + runs-on: ubuntu-latest + env: + POSTGRES_DB: ${{ secrets.POSTGRES_DB }} + POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }} + POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }} + POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }} + POSTGRES_USER: ${{ secrets.POSTGRES_USER }} + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Install package + run: python -m pip install . + + - name: Add to queue + run: geopephub run-queuer --target accbase --period 2 + + - name: Upload to PEPhub + run: geopephub run-uploader --target accbase diff --git a/README.md b/README.md index 7a3fc3b..1a2abc9 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,14 @@ This repository contains `geopephub` CLI, that enables to automatic upload GEO projects to PEPhub based on date and scheduled automatic uploading using GitHub actions. Additionally, the CLI includes a download command, enabling users to retrieve projects from specified namespace directly from the PEPhub database. This feature is particularly helpful for downloading all GEO projects at once. 
+## Supported Targets + +The pipeline supports three targets: + +- **geo**: All GEO projects (default behavior, no filtering) +- **bedbase**: BED file projects - filters GEO for BED, narrowPeak, and broadPeak files, uploads to the `bedbase` namespace +- **accbase**: Chromatin accessibility projects - filters GEO for ATAC-seq, scATAC-seq, and DNase-seq assays, uploads to the `accbase` namespace + ## Installation To install `geopephub` use this command: ``` diff --git a/geopephub/cli.py b/geopephub/cli.py index 9c51a3d..562caa6 100644 --- a/geopephub/cli.py +++ b/geopephub/cli.py @@ -15,10 +15,10 @@ def validate_target(value: str): - valid_target = ["geo", "bedbase"] + valid_target = ["geo", "bedbase", "accbase"] if value.lower() not in valid_target: raise typer.BadParameter( - f"Invalid color '{value}'. Choose from: {', '.join(valid_target)}" + f"Invalid target '{value}'. Choose from: {', '.join(valid_target)}" ) return value.lower() diff --git a/geopephub/const.py b/geopephub/const.py index 4ba0174..b8b52e0 100644 --- a/geopephub/const.py +++ b/geopephub/const.py @@ -24,3 +24,7 @@ POSTGRES_DIALECT = "postgresql+psycopg" BEDBASE_MAX_SIZE = "500MB" + +# Accbase specific constants +ACCBASE_FINDER_FILTER = "((ATAC-seq) OR (scATAC-seq) OR (DNase-seq))" +ACCBASE_MAX_SIZE = "1GB" diff --git a/geopephub/metageo_pephub.py b/geopephub/metageo_pephub.py index 562bce6..dbc3754 100644 --- a/geopephub/metageo_pephub.py +++ b/geopephub/metageo_pephub.py @@ -11,7 +11,7 @@ import peppy -from geopephub.const import LAST_UPDATE_DATES, BEDBASE_MAX_SIZE +from geopephub.const import LAST_UPDATE_DATES, BEDBASE_MAX_SIZE, ACCBASE_FINDER_FILTER, ACCBASE_MAX_SIZE from geopephub.utils import get_agent, get_base_db_engine from geopephub.models import StatusModel, CycleModel from geopephub.utils import run_geofetch, add_link_to_description @@ -59,6 +59,11 @@ def add_to_queue_by_period( gse_list = geofetch.Finder( filters="((bed) OR narrowPeak) OR broadPeak" ).get_gse_by_date(start_date_str, 
today_date_str) + elif target == "accbase": + # get chromatin accessibility projects (ATAC-seq, scATAC-seq, DNase-seq) + gse_list = geofetch.Finder( + filters=ACCBASE_FINDER_FILTER + ).get_gse_by_date(start_date_str, today_date_str) elif target == "geo": gse_list = geofetch.Finder().get_gse_by_date(start_date_str, today_date_str) else: @@ -195,6 +200,14 @@ def _upload_gse_project( data_source="all", processed=True, ) + elif target == "accbase": + # For accbase, we want all files from ATAC-seq/DNase-seq projects + # No file extension filter - we filter by assay type in the Finder + geofetcher_obj = geofetch.Geofetcher( + filter_size=ACCBASE_MAX_SIZE, + data_source="all", + processed=True, + ) else: geofetcher_obj = geofetch.Geofetcher() total_nb = len(log_model_dict.keys()) @@ -254,6 +267,8 @@ def _upload_gse_project( gse_log.status_info = "pepdbagent" if target == "bedbase": tag = pep_tag + elif target == "accbase": + tag = pep_tag else: tag = "default" try: