diff --git a/codecov.yml b/codecov.yml index e356f9a6e..078ae0622 100644 --- a/codecov.yml +++ b/codecov.yml @@ -29,5 +29,4 @@ ignore: - '*wsgi.py' - '*/__init__.py' - '*/urls.py' - - 'versions/*' # this is internal only, and not mission critical - - 'wagtailimportexport/*' # this doesn't work, candidate for removal / offloading / rewriting \ No newline at end of file + - 'versions/*' # this is internal only, and not mission critical \ No newline at end of file diff --git a/docs/refresh-from-prod-runbook.md b/docs/refresh-from-prod-runbook.md new file mode 100644 index 000000000..afbd89fb4 --- /dev/null +++ b/docs/refresh-from-prod-runbook.md @@ -0,0 +1,298 @@ +# Refreshing a Non-Prod Environment from Prod + +This is the operational runbook for rebuilding dev or staging from a prod +snapshot. It's the "Option 1" path described in +[`wagtail-transfer-page-sync.md`](./wagtail-transfer-page-sync.md) — it brings +the target environment to identical data with prod, then aligns the +`wagtail_transfer_idmapping` UUIDs so future page transfers from prod work as +updates rather than creating duplicates. + +After completing this runbook on a target env, that env will be data-identical +to prod (with URLs rewritten to match the target host), and wagtail-transfer +will be able to keep it in sync with prod going forward. The CMS `auth_user` +table contains only internal OpenStax staff accounts, so user data is carried +across intentionally — this means engineers can log into dev/staging with +their prod credentials after the refresh. + +> **This procedure is destructive on the target environment.** Every existing +> row in the target DB and every file in the target S3 bucket will be +> overwritten. Make sure no editors are mid-edit on the target before +> starting. + +## When to do this + +- First time setting up wagtail-transfer across envs. +- Periodic refresh (we suggested quarterly in the page-sync doc) so dev and + staging don't drift indefinitely from prod. 
+- Recovery after a major content incident on a non-prod env. + +**Never run this against prod.** The fixup management command has a guard +that refuses to run when `ENVIRONMENT == 'prod'`, but you should also refuse +mentally — pg_dump and `aws s3 sync` don't have that guard. + +## Prerequisites + +Before you start, confirm: + +- [ ] **Access to prod DB** — credentials/SSH that let you run `pg_dump` + against the prod Postgres instance. +- [ ] **Access to target env DB** — credentials that let you drop/recreate + tables in dev or staging Postgres. +- [ ] **AWS credentials** with read on the prod media bucket and + read/write on the target env's media bucket. Cross-account access may + require a profile that can assume into both accounts. +- [ ] **The target environment is freezing writes.** Tell editors that + anything they edit on dev/staging right now will be lost. Pause cron + jobs / scheduled tasks on the target env. +- [ ] **Disk space** — Postgres dump for the prod CMS is on the order of + a few hundred MB to a few GB depending on revisions history. Wherever + you stage the dump file needs the room. +- [ ] **The PR with `wagtail-transfer` is deployed to all three envs.** + Specifically: the `wagtail_transfer_idmapping` table must exist on + both prod (source of preseed) and the target env (where we restore). + +## Step 1 — Preseed prod's IDMapping table + +This is the step that makes everything else worth doing. It assigns +deterministic UUIDs to every page, image, document, and auth user on prod, +storing them in `wagtail_transfer_idmapping`. When we dump and restore that +table onto the target env in step 3, the target env ends up with the same +UUIDs prod has, which is what lets wagtail-transfer recognize "the same +page" between the two. + +```bash +# on a prod app instance +./manage.py preseed_transfer_table auth wagtailcore wagtailimages.image wagtaildocs +``` + +This is **idempotent** — running it twice just no-ops on already-mapped +rows. 
Safe to re-run if interrupted. + +## Step 2 — Dump prod's database + +From a machine with prod DB access (an app instance, a bastion, wherever +your team typically does this): + +```bash +pg_dump \ + --no-owner --no-privileges \ + --exclude-table-data=django_session \ + --exclude-table-data='django_admin_log' \ + --format=custom \ + --file=/tmp/oscms_prod_$(date +%Y%m%d).dump \ + $PROD_DATABASE_URL +``` + +Notes on what's excluded: +- **`django_session`** — session cookies. Useless on a different host, and + bloats the dump. Only the rows are skipped (`--exclude-table-data`); the + empty table is still restored, because Django's session backend needs it + and `migrate` will not recreate a table whose migration is already + recorded as applied. Editors will need to re-login on the target env. +- **`django_admin_log`** — Django admin history. Not worth carrying over. + +Move the dump to wherever you'll restore from (laptop, target instance, S3 +staging bucket, etc.). It contains live prod data; treat it as sensitive +and delete it when you're done. + +## Step 3 — Restore onto the target environment + +> Skip ahead and run `./manage.py migrate` afterward if your prod and target +> branches happen to be at different migration revisions. Otherwise the +> restored schema may be ahead of the running app's models. + +```bash +# on the target env (dev or staging), against its DB: + +# Drop and recreate so we get a clean slate +dropdb $TARGET_DATABASE_NAME +createdb $TARGET_DATABASE_NAME + +# Restore from the dump +pg_restore \ + --no-owner --no-privileges \ + --dbname=$TARGET_DATABASE_NAME \ + /tmp/oscms_prod_YYYYMMDD.dump + +# Sanity check the sequences match the max IDs +psql $TARGET_DATABASE_URL -c "SELECT setval(pg_get_serial_sequence('wagtailcore_page', 'id'), MAX(id)) FROM wagtailcore_page;" +``` + +(That last `setval` is belt-and-suspenders — pg_dump usually handles +sequences correctly with the `custom` format, but it's been known to drift. +Repeat for any table you've seen `IntegrityError: duplicate key` on after a +prior restore.) + +## Step 4 — Sync the S3 media bucket + +Prod media (uploaded images, documents) lives in prod's S3 bucket. 
The DB +you just restored references those files by path; if the target env's +bucket doesn't have them, every image in the admin shows broken. + +```bash +# Read-only sweep — see what would change before doing it +aws s3 sync \ + s3://$PROD_MEDIA_BUCKET/ \ + s3://$TARGET_MEDIA_BUCKET/ \ + --dryrun --delete + +# When the dry run looks right +aws s3 sync \ + s3://$PROD_MEDIA_BUCKET/ \ + s3://$TARGET_MEDIA_BUCKET/ \ + --delete +``` + +Notes: +- `--delete` removes files from the target bucket that don't exist in prod. + That's what you want — anything uploaded to dev/staging that's not on + prod is about to be orphaned anyway when the DB no longer references it. +- If your buckets live in different AWS accounts, you'll need a profile + that can read from one and write to the other, or copy via an + intermediate bucket. Your AWS setup will dictate which. +- If your team uses bucket-level encryption keys (SSE-KMS), the copying + identity also needs `kms:Decrypt` on the source key and `kms:Encrypt` + on the destination key. + +## Step 5 — Run the fixup management command + +This is the part you don't want to forget. After step 3, the target env's +DB believes it *is* prod — Wagtail Site is `openstax.org`, all the URLs in +StreamField/RichText content reference `openstax.org`, and `auth_user` +contains real prod users. + +```bash +# on the target env + +# 1. Dry-run first. Read the output and confirm what it says it'll change. +./manage.py refresh_from_prod_fixup --target-host=dev.openstax.org + +# 2. 
Commit it +./manage.py refresh_from_prod_fixup \ + --target-host=dev.openstax.org \ + --commit +``` + +Use the appropriate `--target-host` for the env you're refreshing: +- dev: `dev.openstax.org` +- staging: `staging.openstax.org` + +User accounts come across with the dump/restore in step 3 and the fixup +command leaves them alone — the CMS `auth_user` table holds only internal +staff, so carrying it across means engineers can log into the refreshed +env with the same credentials they use on prod. + +The fixup also **empties Salesforce-synced data tables** (`Adopter`, +`AdoptionOpportunityRecord`, `School`, `Partner`, `ResourceDownload`, +`SavingsNumber`). Prod salesforce IDs point at prod's Salesforce instance, +which isn't what dev/staging is configured to talk to — so the next +scheduled run of `update_partners`, `update_schools`, etc. will repopulate +those tables with the target env's sandbox data. Salesforce config tables +(`SalesforceSettings`, `SalesforceForms`, the `*Mapping` models) are left +intact. + +> One thing to be aware of: any locally-uploaded media on Partner records +> (logos, images, videos) is referenced by row, so emptying the table +> orphans those files in S3 until the next Partner sync re-establishes +> the references. If a partner is missing imagery after the refresh, run +> `./manage.py update_partners` and confirm. + +After this, restart the target env's web workers so any in-memory caches +clear. + +## Step 6 — Verify + +Sanity checks to run on the target env: + +```bash +# Site hostname is right +psql $TARGET_DATABASE_URL -c "SELECT hostname FROM wagtailcore_site WHERE is_default_site;" + +# No leftover prod URLs in obvious places +psql $TARGET_DATABASE_URL -c "SELECT COUNT(*) FROM snippets_givebanner WHERE link_url LIKE 'https://openstax.org%';" + +# wagtail_transfer_idmapping is populated and traveled over from prod +psql $TARGET_DATABASE_URL -c "SELECT COUNT(*) FROM wagtail_transfer_idmapping;" +``` + +Then in the browser: +1. 
Log into the target env's admin as a superuser. +2. Open the home page in the page tree — pages from prod should be there. +3. Open a page that has hero images / StreamField content — confirm images + render and that any URLs in the content show the target host, not + `openstax.org`. +4. Open Wagtail's Import view (Settings → Import or wherever the menu item + is) and pick prod as the source. Browse the page tree. Pick a page that + has been edited on prod since the dump and import it — it should resolve + as an **update** rather than create a duplicate, because the + `wagtail_transfer_idmapping` UUIDs match. + +If the last check creates a duplicate instead of updating, step 1 (preseed +on prod) didn't happen or didn't include `wagtailcore` — go back and check. + +## Rollback / if it goes sideways + +The pg_dump/restore step is the destructive one. If you realize mid-step +that you needed something from the target DB that's now gone: + +- **Before step 3**: nothing's gone yet. You can stop without consequences. +- **After step 3, before step 5**: the target DB is now prod data. To + restore the previous target state, you need a pre-refresh backup of that + env. Take one *before* step 3 if there's anything on dev/staging worth + keeping (in-progress edits, etc.). + +For S3, `aws s3 sync --delete` removes target-only files. Same advice: +take a snapshot of the target bucket (or rely on bucket versioning if +enabled) before step 4 if you care about anything that lives only there. + +## Common pitfalls + +1. **Forgetting step 1 (preseed on prod).** If you skip preseed, the + target env's `wagtail_transfer_idmapping` will be empty for pages + created before this refresh. Future imports from prod will create + duplicates instead of updating. Catch: import a test page after step 6 + and confirm it updates rather than duplicates. + +2. 
**Schema drift between prod and target branches.** If prod is running + migration `0099` and the target branch is at `0095`, the restored + schema is ahead of the target's models. Symptoms: `manage.py migrate` + tries to undo migrations, or admin views throw `ProgrammingError`. + Resolution: get the target branch caught up to prod, OR cherry-pick the + relevant migrations onto the target branch before refreshing. + +3. **CloudFront / CDN caching.** The CDN may still be serving cached + responses from before the refresh. Issue an invalidation + (`/*` is the nuclear option; usually you only need a few paths) if + pages look wrong immediately after refresh and look right after a few + minutes. + +4. **Wagtail search index out of sync.** If your search backend is + Elasticsearch (rather than Postgres FTS), run + `./manage.py update_index` after step 5. + +5. **Cron jobs reactivating before you're ready.** Cron tasks that send + emails, sync to Salesforce, or otherwise reach outside the app should + stay paused until you've verified step 6. Otherwise you might email + prod users from your dev environment, etc. + +6. **Salesforce tables empty until the next sync.** The fixup empties + the synced tables on purpose (see Step 5), so the admin will show + "no partners," "no schools," etc. until the next scheduled sync runs. + If that's blocking verification, run the sync commands manually: + ``` + ./manage.py update_partners + ./manage.py update_schools + ./manage.py update_opportunities + ./manage.py update_resource_downloads + ``` + +## Reusing this for a new environment + +If you ever stand up a new environment (e.g., a `qa` env), this same +procedure works. Steps: + +1. Deploy the codebase to the new env. +2. Run `./manage.py migrate` so the schema is in place. +3. Add the new env's hostname and secret key to prod's + `WAGTAILTRANSFER_SOURCES_JSON` if you want prod to be able to pull + from it (usually you don't). +4. 
Run this runbook with `--target-host=qa.openstax.org`. + +After that, the new env is ready for normal wagtail-transfer use. diff --git a/docs/wagtail-transfer-page-sync.md b/docs/wagtail-transfer-page-sync.md new file mode 100644 index 000000000..fba624297 --- /dev/null +++ b/docs/wagtail-transfer-page-sync.md @@ -0,0 +1,174 @@ +# Syncing Existing Pages Across Environments — Future Plan + +## Status + +**Not implemented.** This document records the options we considered for joining +pages that exist independently on dev/staging/prod so they can be kept in sync +with `wagtail-transfer`, and the tradeoffs of each. It exists so the next person +picking this up has the context. + +The current `wagtail-transfer` integration (see `openstax/settings/base.py`) +handles two things well: + +- **New pages** — net-new content built on staging and pushed forward to prod. + The transfer creates the page on the destination and records the UUID mapping; + future transfers update it in place. +- **Snippets** — matched by natural key via `WAGTAILTRANSFER_LOOKUP_FIELDS` (e.g. + `Subject.name`, `Role.salesforce_name`). Editing a snippet on staging and + re-importing on prod updates the existing row rather than creating a duplicate. + +What it does **not** handle is pages that already exist on both sides with +different IDs, slugs, URL paths, and image references because they were created +independently at different times. Transferring those today produces a duplicate +on the destination instead of an update. + +## Why pages are hard + +`wagtail-transfer` matches objects by a UUID stored in the +`wagtail_transfer_idmapping` table on each instance. The mapping is established +either: + +1. The first time an object is transferred (UUID assigned at source, stored on + destination after import), or +2. By running `./manage.py preseed_transfer_table`, which deterministically + derives the same UUID on every instance for objects with the same primary + key in a given range. 
+ +If two environments created the "same" page independently, neither path applies +— the IDs differ, no transfer has ever joined them, and there is no natural +unique key to lean on. + +`WAGTAILTRANSFER_LOOKUP_FIELDS` is the escape hatch for snippets but is a bad +fit for pages: + +- Slugs are unique per parent, not site-wide. +- Slugs (and parents, and URL paths) already differ across our environments. +- The setting applies to the base `wagtailcore.page` model — it cannot be + scoped to one page type. A wrong match could promote, say, an old "About" + page onto a "Subjects" page slot. + +## Options + +### Option 1 — Make one environment the source of truth, dump/restore the others + +Pick one environment (almost certainly prod) as canonical. Dump its database, +restore over staging and dev, then run +`./manage.py preseed_transfer_table wagtailcore.page wagtailimages.image wagtaildocs` +on every environment. After that point, every page has a matching UUID +everywhere and `wagtail-transfer` works for any direction. + +- **Pros** + - Clean and finished — no ongoing reconciliation tax. + - Mirrors `wagtail-transfer`'s own "Example 1" rollout guidance. + - Easy to reason about: dev and staging are pristine reflections of prod + until edited. +- **Cons** + - Destructive on dev/staging — any unique in-progress work there is lost. + - One-shot operation that needs scheduling and team buy-in. + - Requires deciding how to reapply any in-flight edits that were on staging + but not prod (cherry-pick by hand, redo, or accept loss). +- **When to pick this**: the divergent content on non-prod envs is mostly + scratch work that can be redone or thrown away, and we want this resolved + in a single afternoon. + +### Option 2 — Manual per-page reconciliation via wagtail-transfer + +For each page that exists on both sides, pick the canonical version, delete +the duplicate on the other side, then transfer the canonical version. 
The +import creates a fresh idmapping row that links the two going forward. + +- **Pros** + - No bulk database surgery; nothing destructive at the schema level. + - Editor-driven; an editor can decide on a per-page basis which side wins. + - Adds redirects naturally (Wagtail records them on slug change). +- **Cons** + - Linear effort per page — does not scale beyond a handful. + - Deleting a page that has inbound links from other pages can break those + links until the new page lands. Order of operations matters. + - Easy to forget a page or to half-do the job. +- **When to pick this**: there are only a small number of overlapping pages, + and editors want explicit say in each one. + +### Option 3 — Custom `seed_page_uuids` management command driven by a mapping file + +Build a one-time command that reads a CSV (or YAML) of +`source_env, source_page_id, dest_env, dest_page_id` tuples and inserts +matching UUIDs into `wagtail_transfer_idmapping` on both sides. The mapping +file can be hand-curated, generated by a fuzzy match on titles, or some +combination. + +- **Pros** + - Non-destructive — neither side loses data. + - Repeatable: the mapping file is reviewable, version-controllable, and + can be re-run. + - Can be extended to images and documents using the same idea. +- **Cons** + - We have to build it, including the matching heuristic if we want to avoid + hand-curating the whole list. + - First sync after seeding becomes a "merge" — whichever side runs the + import overwrites the other's field values. Field-level conflict + resolution is not possible. + - Risk of bugs in the seeding script causing wrong joins. Hard to undo + once the wrong UUID is in production. +- **When to pick this**: there is a moderate-to-large set of overlapping + pages and editors are unwilling to lose either side's content. + +### Option 4 — Lookup fields on `wagtailcore.page` + +Add `WAGTAILTRANSFER_LOOKUP_FIELDS = {'wagtailcore.page': ['<natural_key_field>']}` so +pages get matched by some natural key. 
+ +- **Pros** + - Zero migration work — once configured, transfers Just Work. +- **Cons** + - We have no field that is genuinely unique across our pages. Slug is + per-parent; title is not unique; `url_path` would need to be kept aligned + by hand on both sides. + - The setting applies to every page type. A bad match has no safety net. + - **Not recommended** in our environment. +- **When to pick this**: only if we one day adopt a site-unique stable + identifier on pages (a custom UUID field we manage, for example). + +### Option 5 — Hybrid: governance now, alignment when needed + +Treat the "existing pages are divergent" problem as primarily a process +issue, not a tooling one. + +1. Declare prod the canonical source for already-shipped pages. +2. New content always flows staging → prod (works today). +3. When an existing page genuinely needs cross-env work, fall back to + Option 2 (manual reconciliation) for that one page. +4. Revisit Option 1 or Option 3 if the manual queue ever outgrows itself. + +- **Pros** + - No build effort up front. + - No destructive actions. + - Avoids over-engineering a problem we may rarely hit. +- **Cons** + - The divergence stays on the books until someone takes one of the other + options. + - Editors need to remember the rule. +- **When to pick this**: we don't actually know yet how often existing-page + sync is needed, and we'd rather not invest until we do. + +## Recommendation + +Start with **Option 5**. Run that way for a quarter and count how often +"existing page needs to move between envs" actually comes up. + +- If it almost never does → keep Option 5. Done. +- If it comes up for a small set of well-known pages → use Option 2 to clear + them as a one-time pass. +- If it comes up constantly → build Option 3, since Option 1 will be too + disruptive by that point and Option 4 still won't be safe. 
+ +## What to do before any of these + +Independent of which option we choose, these have to be in place first: + +- The current PR (`move-content-btw-envs`) merged and `wagtail-transfer` + configured per env (see the PR description). +- The `wagtail_transfer_idmapping` table populated for at least one + successful transfer end-to-end, so we have a known-good baseline. +- A decision on whether images should also be aligned. Today we deliberately + don't update images on import; revisit if Option 1 or Option 3 is chosen. diff --git a/openstax/settings/base.py b/openstax/settings/base.py index 6ede95ada..88ea34181 100644 --- a/openstax/settings/base.py +++ b/openstax/settings/base.py @@ -1,3 +1,4 @@ +import json import os import sys @@ -249,7 +250,7 @@ 'oxauth', 'webinars', 'donations', - 'wagtailimportexport', + 'wagtail_transfer', 'versions', 'oxmenus', # wagtail @@ -270,6 +271,100 @@ 'wagtail.contrib.settings', ] +#################### +# Wagtail Transfer # +#################### + +WAGTAILTRANSFER_SECRET_KEY = os.getenv('WAGTAILTRANSFER_SECRET_KEY', 'change-me-in-production') + +# Validate the secret key via Django's system check framework rather than at +# import time. This still flags misconfiguration on `manage.py check`, +# `runserver`, `migrate`, etc., but does NOT fire during `collectstatic` (which +# sets `requires_system_checks = []`). That matters because the AMI bake runs +# `collectstatic` before runtime secrets have been loaded from SSM. 
from django.core import checks as _django_checks  # noqa: E402


@_django_checks.register(_django_checks.Tags.security)
def _check_wagtail_transfer_secret_key(app_configs, **kwargs):
    """System check: reject the placeholder WAGTAILTRANSFER_SECRET_KEY.

    Returns a single ``openstax.E001`` error when ENVIRONMENT is anything
    other than 'local' or 'test' (a missing ENVIRONMENT is treated as
    'local') and the key still equals the built-in placeholder; otherwise
    returns an empty list.
    """
    # Imported lazily: this module *is* the settings module, so the settings
    # object can only be read once Django has finished loading it.
    from django.conf import settings as _settings
    if (
        getattr(_settings, 'ENVIRONMENT', 'local') not in ('local', 'test')
        and _settings.WAGTAILTRANSFER_SECRET_KEY == 'change-me-in-production'
    ):
        return [_django_checks.Error(
            "WAGTAILTRANSFER_SECRET_KEY is set to the insecure default placeholder.",
            hint="Set the WAGTAILTRANSFER_SECRET_KEY environment variable to a unique secure value.",
            id='openstax.E001',
        )]
    return []


# Sources this environment can pull content FROM.
#
# Preferred: set WAGTAILTRANSFER_SOURCES_JSON with a JSON object, e.g.
#   {"staging": {"BASE_URL": "https://staging.openstax.org/admin/wagtail-transfer/",
#                "SECRET_KEY": "<staging's WAGTAILTRANSFER_SECRET_KEY>"},
#    "prod": {"BASE_URL": "https://openstax.org/admin/wagtail-transfer/",
#             "SECRET_KEY": "<prod's WAGTAILTRANSFER_SECRET_KEY>"}}
# The SECRET_KEY for a source here must equal the WAGTAILTRANSFER_SECRET_KEY
# configured on that source's own environment.
#
# Fallback for a single source: WAGTAILTRANSFER_SOURCE_NAME/_URL/_KEY.
WAGTAILTRANSFER_SOURCES = {}

_transfer_sources_json = os.getenv('WAGTAILTRANSFER_SOURCES_JSON')
if _transfer_sources_json:
    try:
        WAGTAILTRANSFER_SOURCES = json.loads(_transfer_sources_json)
    except json.JSONDecodeError as e:
        # Chain the cause so the original parse position stays in the traceback.
        raise RuntimeError(f"WAGTAILTRANSFER_SOURCES_JSON is not valid JSON: {e}") from e
    # Valid JSON that is not an object (a list, string, number, null) would
    # otherwise fail on .items() below with an unhelpful AttributeError.
    if not isinstance(WAGTAILTRANSFER_SOURCES, dict):
        raise RuntimeError(
            "WAGTAILTRANSFER_SOURCES_JSON must be a JSON object mapping source names "
            f"to their configuration, not {type(WAGTAILTRANSFER_SOURCES).__name__}."
        )
    for _name, _cfg in WAGTAILTRANSFER_SOURCES.items():
        if not isinstance(_cfg, dict) or not _cfg.get('BASE_URL') or not _cfg.get('SECRET_KEY'):
            raise RuntimeError(
                f"WAGTAILTRANSFER_SOURCES_JSON source '{_name}' must have BASE_URL and SECRET_KEY."
            )
else:
    _transfer_source_name = os.getenv('WAGTAILTRANSFER_SOURCE_NAME')
    _transfer_source_url = os.getenv('WAGTAILTRANSFER_SOURCE_URL')
    _transfer_source_key = os.getenv('WAGTAILTRANSFER_SOURCE_KEY')
    _transfer_vars = {
        'WAGTAILTRANSFER_SOURCE_NAME': _transfer_source_name,
        'WAGTAILTRANSFER_SOURCE_URL': _transfer_source_url,
        'WAGTAILTRANSFER_SOURCE_KEY': _transfer_source_key,
    }
    # Empty strings count as "unset", same as a missing variable.
    _set_vars = {name for name, value in _transfer_vars.items() if value}
    if _set_vars and len(_set_vars) != len(_transfer_vars):
        # Underscore-prefixed like the other scratch names in this section.
        _missing = sorted(set(_transfer_vars) - _set_vars)
        raise RuntimeError(
            "Invalid Wagtail Transfer source configuration: "
            "the environment variables WAGTAILTRANSFER_SOURCE_NAME, "
            "WAGTAILTRANSFER_SOURCE_URL, and WAGTAILTRANSFER_SOURCE_KEY "
            "must either all be set or all be unset. "
            f"Currently missing: {', '.join(_missing)}."
        )
    if _transfer_source_name and _transfer_source_url and _transfer_source_key:
        WAGTAILTRANSFER_SOURCES[_transfer_source_name] = {
            'BASE_URL': _transfer_source_url,
            'SECRET_KEY': _transfer_source_key,
        }

# Match snippets across environments by their natural identifier instead of
# wagtail-transfer's auto UUID. Without this, an import would create duplicates
# of any snippet that was authored independently on each environment.
+WAGTAILTRANSFER_LOOKUP_FIELDS = { + 'snippets.subject': ['name'], + 'snippets.k12subject': ['name'], + 'snippets.role': ['salesforce_name'], + 'snippets.facultyresource': ['heading'], + 'snippets.studentresource': ['heading'], + 'snippets.newssource': ['name'], + 'snippets.sharedcontent': ['title'], + 'snippets.erratacontent': ['heading', 'book_state'], + 'snippets.blogcontenttype': ['content_type'], + 'snippets.blogcollection': ['name'], + 'snippets.webinarcollection': ['name'], + 'snippets.promotesnippet': ['name'], +} + ######## # Cron # ######## diff --git a/openstax/urls.py b/openstax/urls.py index 61f236e0f..7d375b8c9 100644 --- a/openstax/urls.py +++ b/openstax/urls.py @@ -5,6 +5,8 @@ from wagtail.admin import urls as wagtailadmin_urls from wagtailautocomplete.urls.admin import urlpatterns as autocomplete_admin_urls from wagtail import urls as wagtail_urls +from wagtail_transfer import urls as wagtailtransfer_urls +from . import wagtail_transfer_patches # noqa: F401 — applied on import from wagtail.documents import urls as wagtaildocs_urls from accounts import urls as accounts_urls @@ -19,6 +21,8 @@ urlpatterns = [ path('admin/autocomplete/', include(autocomplete_admin_urls)), + # Must come before the broader 'admin/' include below — Django matches in order. + path('admin/wagtail-transfer/', include(wagtailtransfer_urls)), path('admin/', include(wagtailadmin_urls)), path('django-admin/error/', throw_error, name='throw_error'), diff --git a/openstax/wagtail_transfer_patches.py b/openstax/wagtail_transfer_patches.py new file mode 100644 index 000000000..4231bdc6c --- /dev/null +++ b/openstax/wagtail_transfer_patches.py @@ -0,0 +1,23 @@ +""" +Runtime patches for wagtail-transfer 0.11. + +When importing pages, an Objective is occasionally constructed with a Page +subclass (e.g. pages.RootPage) instead of the base wagtailcore.Page. 
# Keep a handle on the stock constructor so the wrapper can delegate to it.
_original_init = Objective.__init__


def _patched_init(self, model, source_id, context, must_update=False):
    """Construct an Objective against the base model of ``model``.

    Passes ``model`` through ``get_base_model()`` and then calls the original
    ``Objective.__init__`` with the remaining arguments unchanged.
    """
    base_model = get_base_model(model)
    _original_init(self, base_model, source_id, context, must_update)


# Install the wrapper; takes effect when this module is first imported.
Objective.__init__ = _patched_init
DEFAULT_SOURCE_HOST = 'openstax.org'

# Apps whose models are never scanned for URLs by the rewrite pass.
SKIP_APP_LABELS = {
    'contenttypes',
    'sessions',
    'admin',
    'auth',
    'wagtail_transfer',
    # salesforce models are wiped wholesale by _truncate_salesforce_synced_data,
    # so there's no point doing URL rewriting in them first.
    'salesforce',
}

# Salesforce-synced data tables — emptied during fixup so the next scheduled
# sync repopulates with the target env's sandbox data. Config/mapping models
# (SalesforceSettings, SalesforceForms, *Mapping, MapBoxDataset) are NOT in
# this list and stay intact.
SALESFORCE_SYNCED_MODELS = [
    'salesforce.Adopter',
    'salesforce.AdoptionOpportunityRecord',
    'salesforce.School',
    'salesforce.Partner',
    'salesforce.ResourceDownload',
    'salesforce.SavingsNumber',
]


def _replacements(source_host, target_host):
    """Build (old, new) string pairs. Order: most-specific first.

    ``http://`` links are upgraded to ``https://`` on the target host;
    scheme-relative ``//host`` links stay scheme-relative.
    """
    return [
        (f'https://{source_host}', f'https://{target_host}'),
        (f'http://{source_host}', f'https://{target_host}'),
        (f'//{source_host}', f'//{target_host}'),
    ]


def _apply(value, replacements):
    """Apply all replacements to a string. Returns (new_value, changed?).

    Non-string values are returned untouched with ``changed`` False.

    The substitution is a single left-to-right pass: at each position the
    first pair in ``replacements`` whose ``old`` matches wins, and text that
    has just been substituted is never scanned again. Running the pairs as
    consecutive ``str.replace`` calls would re-scan earlier output, so a
    target such as ``openstax.org.dev.example`` (which itself contains
    ``//openstax.org``) would be rewritten twice within one call.
    """
    if not isinstance(value, str):
        return value, False

    # Local import: only this helper needs the regex module.
    import re

    # First pair wins for a duplicated `old`; empty `old` strings are ignored
    # because they would match at every position.
    mapping = {}
    for old, repl in replacements:
        if old:
            mapping.setdefault(old, repl)
    if not mapping:
        return value, False

    # Alternation order follows the listing order (dicts preserve insertion),
    # which is how "most-specific first" is honoured.
    pattern = re.compile('|'.join(re.escape(old) for old in mapping))
    new = pattern.sub(lambda match: mapping[match.group(0)], value)
    return new, new != value
class Command(BaseCommand):
    """Post-restore fixup for a non-prod database loaded from a prod dump.

    Inside one transaction it updates the default Wagtail Site hostname,
    rewrites source-host URLs in model fields and in each page's latest
    revision, and empties the Salesforce-synced tables. Without ``--commit``
    the transaction is rolled back, so the output is a preview only.
    """

    help = (
        "Fix up a non-prod database after restoring from a prod dump. "
        "Updates the Wagtail Site hostname and rewrites prod URLs in content. "
        "DRY RUN BY DEFAULT — pass --commit to apply."
    )

    def add_arguments(self, parser):
        """Register --target-host (required), --source-host and --commit."""
        parser.add_argument(
            '--target-host',
            required=True,
            help="Hostname for this environment, e.g. dev.openstax.org",
        )
        parser.add_argument(
            '--source-host',
            default=DEFAULT_SOURCE_HOST,
            help=f"Hostname being replaced. Default: {DEFAULT_SOURCE_HOST}",
        )
        parser.add_argument(
            '--commit',
            action='store_true',
            help="Apply changes. Without this, runs in dry-run mode.",
        )

    def handle(self, *args, **options):
        """Validate the options, then run every fixup step atomically.

        Raises:
            CommandError: when ENVIRONMENT is 'prod', when the target host
                equals the source host, or when the target host is not a
                bare hostname.
        """
        target_host = options['target_host'].strip().lower()
        source_host = options['source_host'].strip().lower()
        commit = options['commit']

        if getattr(settings, 'ENVIRONMENT', None) == 'prod':
            raise CommandError("Refuse to run on production.")
        if target_host == source_host:
            raise CommandError(
                f"--target-host ({target_host}) cannot equal --source-host ({source_host})."
            )
        # The host is written verbatim into Site.hostname and spliced after
        # "https://" in every rewritten URL, so a value carrying a scheme or
        # path (e.g. "https://dev.openstax.org/") must be rejected up front.
        if '.' not in target_host or '/' in target_host:
            raise CommandError(f"Invalid --target-host: {target_host!r}")

        mode = self.style.WARNING('[COMMIT]') if commit else self.style.WARNING('[DRY RUN]')
        self.stdout.write(
            f"{mode} replacing {source_host} → {target_host}"
        )

        replacements = _replacements(source_host, target_host)

        with transaction.atomic():
            self._update_site(target_host)
            self._rewrite_concrete_models(replacements)
            self._rewrite_page_revisions(replacements)
            self._truncate_salesforce_synced_data()
            if not commit:
                # Dry run: every step above really executed (so the counts are
                # real), then the whole transaction is discarded.
                transaction.set_rollback(True)
                self.stdout.write(
                    self.style.WARNING("[DRY RUN] All changes rolled back. Pass --commit to apply.")
                )
            else:
                self.stdout.write(self.style.SUCCESS("Done."))

    def _update_site(self, target_host):
        """Point the default Wagtail Site at ``target_host`` if it differs."""
        try:
            site = Site.objects.get(is_default_site=True)
        except Site.DoesNotExist:
            self.stdout.write(self.style.WARNING("  no default Site found, skipping hostname update"))
            return
        old = site.hostname
        if old == target_host:
            self.stdout.write(f"  Site.hostname already {target_host}, no change")
            return
        site.hostname = target_host
        site.save()
        self.stdout.write(f"  Site.hostname: {old} → {target_host}")

    def _rewrite_concrete_models(self, replacements):
        """Walk all concrete project models, rewriting URL-bearing field values."""
        url_field_types = (RichTextField, StreamField, models.URLField)

        for model in apps.get_models():
            if model._meta.abstract or model._meta.proxy:
                continue
            if model._meta.app_label in SKIP_APP_LABELS:
                continue

            # Bucket fields: specialized URL fields vs plain text/char that MIGHT carry URLs.
            url_fields, text_fields = [], []
            for field in model._meta.fields:
                if isinstance(field, url_field_types):
                    url_fields.append(field)
                elif isinstance(field, (models.TextField, models.CharField)):
                    text_fields.append(field)

            if not url_fields and not text_fields:
                continue

            changed_rows = 0
            for instance in model.objects.all().iterator():
                row_changed = False
                # Every candidate field is visited (no short-circuit) so that
                # all of a row's fields are rewritten before the single save.
                for field in url_fields + text_fields:
                    if self._rewrite_field_on_instance(instance, field, replacements):
                        row_changed = True
                if row_changed:
                    instance.save()
                    changed_rows += 1
            if changed_rows:
                self.stdout.write(f"  {model._meta.label}: rewrote {changed_rows} row(s)")

    def _rewrite_field_on_instance(self, instance, field, replacements):
        """Rewrite one field's value in place on an instance. Returns whether it changed.

        The instance is only mutated, never saved; the caller saves once per
        row. ``None`` values and non-string, non-StreamField values are left
        alone.
        """
        value = getattr(instance, field.name)
        if value is None:
            return False

        if isinstance(field, StreamField):
            # StreamField stored as JSON text in DB; rewrite at the JSON level so any
            # block type with URLs inside it gets covered without per-block knowledge.
            try:
                if hasattr(value, 'raw_data'):
                    # json.dumps only serialises real list/tuple/dict objects.
                    # raw_data is a sequence view rather than a list (see the
                    # Wagtail StreamValue reference), so copy it into a list;
                    # list() of an actual list is a harmless shallow copy.
                    raw = list(value.raw_data)
                else:
                    raw = json.loads(str(value))
            except (TypeError, ValueError):
                return False
            json_str = json.dumps(raw)
            new_json, changed = _apply(json_str, replacements)
            if changed:
                setattr(instance, field.name, json.loads(new_json))
            return changed

        if isinstance(value, str):
            new_value, changed = _apply(value, replacements)
            if changed:
                setattr(instance, field.name, new_value)
            return changed

        return False

    def _truncate_salesforce_synced_data(self):
        """Empty Salesforce-synced data tables so the next sync repopulates them."""
        for label in SALESFORCE_SYNCED_MODELS:
            try:
                model = apps.get_model(label)
            except LookupError:
                self.stdout.write(self.style.WARNING(f"  {label}: model not found, skipping"))
                continue
            count = model.objects.count()
            if count:
                model.objects.all().delete()
                self.stdout.write(f"  {label}: emptied {count} row(s)")

    def _rewrite_page_revisions(self, replacements):
        """Rewrite the latest revision per page so the admin editor reflects new URLs."""
        count = 0
        for page in Page.objects.all().iterator():
            rev = page.get_latest_revision()
            if rev is None:
                continue
            try:
                content = rev.content
            except AttributeError:
                continue
            if not content:
                continue
            # Same JSON-level rewrite as for StreamField values.
            json_str = json.dumps(content)
            new_json, changed = _apply(json_str, replacements)
            if changed:
                rev.content = json.loads(new_json)
                rev.save(update_fields=['content'])
                count += 1
        if count:
            self.stdout.write(f"  wagtailcore.Revision: rewrote {count} latest revision(s)")
000000000..a2bb050ff --- /dev/null +++ b/pages/test_refresh_from_prod_fixup.py @@ -0,0 +1,103 @@ +"""Tests for the refresh_from_prod_fixup management command.""" +from io import StringIO + +from django.core.management import call_command +from django.core.management.base import CommandError +from django.test import TestCase, override_settings + +from salesforce.models import Partner, SalesforceForms +from snippets.models import SharedContent +from wagtail.models import Site + + +class RefreshFromProdFixupTests(TestCase): + def setUp(self): + self.site = Site.objects.get(is_default_site=True) + self.site.hostname = 'openstax.org' + self.site.save() + + self.snippet = SharedContent.objects.create( + title='Test', + heading='Heading', + content='Visit https://openstax.org/subjects for more.', + button_url='https://openstax.org/give', + ) + + def _call(self, *extra_args): + out = StringIO() + call_command( + 'refresh_from_prod_fixup', + '--target-host', 'dev.openstax.org', + *extra_args, + stdout=out, + ) + return out.getvalue() + + def test_dry_run_does_not_persist(self): + output = self._call() + self.snippet.refresh_from_db() + self.site.refresh_from_db() + self.assertEqual(self.snippet.button_url, 'https://openstax.org/give') + self.assertEqual(self.site.hostname, 'openstax.org') + self.assertIn('[DRY RUN]', output) + + def test_commit_rewrites_url_and_text_fields_and_site(self): + self._call('--commit') + self.snippet.refresh_from_db() + self.site.refresh_from_db() + self.assertEqual(self.snippet.button_url, 'https://dev.openstax.org/give') + self.assertEqual( + self.snippet.content, + 'Visit https://dev.openstax.org/subjects for more.', + ) + self.assertEqual(self.site.hostname, 'dev.openstax.org') + + def test_second_run_is_idempotent(self): + self._call('--commit') + self._call('--commit') # second run on already-rewritten data + self.snippet.refresh_from_db() + self.assertEqual(self.snippet.button_url, 'https://dev.openstax.org/give') + # Specifically: no 
nested-substitution like https://dev.dev.openstax.org + self.assertNotIn('dev.dev.', self.snippet.button_url) + self.assertNotIn('dev.dev.', self.snippet.content) + + def test_refuses_when_source_equals_target(self): + with self.assertRaises(CommandError): + call_command( + 'refresh_from_prod_fixup', + '--target-host', 'openstax.org', + '--commit', + stdout=StringIO(), + ) + + def test_refuses_when_target_is_not_a_hostname(self): + with self.assertRaises(CommandError): + call_command( + 'refresh_from_prod_fixup', + '--target-host', 'localhost', + '--commit', + stdout=StringIO(), + ) + + def test_truncates_salesforce_synced_models_leaves_config(self): + Partner.objects.create(partner_name='Synced Partner', salesforce_id='abc123') + forms_config = SalesforceForms.objects.create( + oid='form-oid', posting_url='https://openstax.org/form', + ) + self._call('--commit') + # Salesforce-synced data wiped (next sync will repopulate) + self.assertEqual(Partner.objects.count(), 0) + # Local config preserved + forms_config.refresh_from_db() + self.assertEqual(forms_config.oid, 'form-oid') + + @override_settings(ENVIRONMENT='prod') + def test_refuses_to_run_on_prod(self): + with self.assertRaises(CommandError): + call_command( + 'refresh_from_prod_fixup', + '--target-host', 'dev.openstax.org', + '--commit', + stdout=StringIO(), + ) + diff --git a/requirements/base.txt b/requirements/base.txt index 0a0fcc1f4..b380d8f79 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -33,4 +33,5 @@ vcrpy==7.0.0 wagtail==7.3.2 wagtail-autocomplete==0.12.0 wagtail-modeladmin==2.2.0 +wagtail-transfer==0.11 whitenoise==6.9.0 diff --git a/wagtailimportexport/__init__.py b/wagtailimportexport/__init__.py deleted file mode 100644 index 27dd626de..000000000 --- a/wagtailimportexport/__init__.py +++ /dev/null @@ -1 +0,0 @@ -#default_app_config = 'wagtailimportexport.apps.WagtailImportExportAppConfig' \ No newline at end of file diff --git a/wagtailimportexport/admin_urls.py 
b/wagtailimportexport/admin_urls.py deleted file mode 100644 index a32cfdf76..000000000 --- a/wagtailimportexport/admin_urls.py +++ /dev/null @@ -1,11 +0,0 @@ -from django.urls import re_path - -from wagtailimportexport import views - - -app_name = 'wagtailimportexport' -urlpatterns = [ - re_path(r'^import-page/$', views.import_page, name='import-page'), - re_path(r'^export-page/$', views.export_page, name='export-page'), - re_path(r'^$', views.index, name='index'), -] diff --git a/wagtailimportexport/apps.py b/wagtailimportexport/apps.py deleted file mode 100644 index bce1cd0a0..000000000 --- a/wagtailimportexport/apps.py +++ /dev/null @@ -1,8 +0,0 @@ -from django.apps import AppConfig - - -class WagtailImportExportAppConfig(AppConfig): - name = 'wagtailimportexport' - label = 'wagtailimportexport' - verbose_name = "Import/Export Tool" - default = True diff --git a/wagtailimportexport/config.py b/wagtailimportexport/config.py deleted file mode 100644 index a80908065..000000000 --- a/wagtailimportexport/config.py +++ /dev/null @@ -1,3 +0,0 @@ -app_settings = { - 'max_file_size': 20000000, # Limit max file size to 20.000.000 bytes if checkbox is selected. 
-} \ No newline at end of file diff --git a/wagtailimportexport/exporting.py b/wagtailimportexport/exporting.py deleted file mode 100644 index 60b357ba2..000000000 --- a/wagtailimportexport/exporting.py +++ /dev/null @@ -1,189 +0,0 @@ -import io -import json -import logging -import copy - -from django.core.files import File -from django.core.serializers.json import DjangoJSONEncoder -from django.db.models.base import ModelState -from django.db.models.fields.files import FieldFile - -from wagtail.models import Page -from wagtail.blocks import StreamValue -from wagtail.images.models import Image - -from wagtailimportexport import functions -from wagtailimportexport.config import app_settings -from wagtail.documents.models import Document - - -def export_page(settings={'root_page': None, 'export_unpublished': False, - 'export_documents': False, 'export_images': False, 'null_pk': True, - 'null_fk': False, 'null_users': False - }): - """ - Exports the root_page as well as its children (if the setting is set). - - Arguments: - settings -- A dictionary that holds settings from cleared form data. - - Returns: - A zip archive of the exported pages; if it fails at any point, returns - None and logs the error. - """ - - settings = copy.deepcopy(settings) - - # If root_page is not set, then set it the main directory as default. - if not settings['root_page']: - settings['root_page'] = Page.objects.filter(url_path='/').first() - - # Get the list of the pages, (that are the descendant of the root_page). - pages = Page.objects.descendant_of( - settings['root_page'], inclusive=True).order_by('path').specific() - - # Filter the pages if export_unpublished is set to false. - if not settings['export_unpublished']: - pages = pages.filter(live=True) - - # Initialize the variables. - page_data = [] - exported_paths = set() - - # Start looping through pages and export their content. 
- for (i, page) in enumerate(pages): - parent_path = page.path[:-(Page.steplen)] - - # skip over pages whose parents haven't already been exported - # (which means that export_unpublished is false and the parent was unpublished) - if i == 0 or (parent_path in exported_paths): - - # Turn page data to a dictionary. - data = json.loads(page.to_json()) - locale = data['locale'] - - # look up document titles - if page.content_type.model == 'book': - cover = functions.document_title(data['cover']) - title_image = functions.document_title(data['title_image']) - hi_res_pdf = functions.document_title(data['high_resolution_pdf']) - lo_res_pdf = functions.document_title(data['low_resolution_pdf']) - community_logo = functions.document_title(data['community_resource_logo']) - community_feature_link = functions.document_title(data['community_resource_feature_link']) - - # Get list (and metadata) of images and documents to be exported. - images = list_fileobjects(page, settings, Image) if settings['export_images'] else {} - documents = list_fileobjects(page, settings, Document) if settings['export_documents'] else {} - - # Remove FKs - if settings['null_fk']: - functions.null_fks(page, data) - - #Remove the owner of the page. - if settings['null_users'] and not data.get('owner'): - data['owner'] = None - - # Null all the images. - if settings['export_images']: - for image in images: - if data.get(image) is not None: - data[image] = None - - data['pk'] = None - data['locale'] = locale - # add document titles to data - if page.content_type.model == 'book': - data['cover'] = cover - data['title_image'] = title_image - data['high_resolution_pdf'] = hi_res_pdf - data['low_resolution_pdf'] = lo_res_pdf - data['community_resource_logo'] = community_logo - data['community_resource_feature_link'] = community_feature_link - - # Export page data. 
- page_data.append({ - 'content': data, - 'model': page.content_type.model, - 'app_label': page.content_type.app_label, - 'images': images, - 'documents': documents - }) - - exported_paths.add(page.path) - - return functions.zip_contents(page_data) - - -def list_fileobjects(page, settings, objtype): - """ - Returns a dict of all fields that has the related_model of objtype as well as their metadata. - - Arguments: - page -- Page instance with supported fields. - settings -- Settings dictionary from main method. - objtype -- Image, Document from Wagtail. - - Returns: - A dictionary of fields with their respective metadata. - """ - - data = json.loads(page.to_json()) - - if objtype == Image: - related_model_by = "" - elif objtype == Document: - related_model_by = "" - else: - return {} - - objects = {} - for field in page._meta.get_fields(): - if field.related_model and str(field.related_model) == related_model_by: - if data[field.name]: - - try: - # Get the object instance. - instance = objtype.objects.get(pk=data[field.name]) - - # Null the object if the filesize is larger. - if instance.file.size > app_settings['max_file_size'] and settings['ignore_large_files']: - objects[field.name] = None - else: - objects[field.name] = instance_to_data(instance, null_users=settings['null_users']) - - except (FileNotFoundError, objtype.DoesNotExist): - logging.error("File for " + str(field.name) + " is not found on the environment, skipping.") - objects[field.name] = None - - else: - objects[field.name] = None - - return objects - - -def instance_to_data(instance, null_users=False): - """ - A utility to create JSON-able data from a model instance. - - Arguments: - instance -- objects.get() object instance. - null_users -- Whether to null user references. - - Returns: - A dictionary of metadata of instance. 
- """ - - data = {} - - for key, value in instance.__dict__.items(): - if isinstance(value, ModelState): - continue - elif null_users == True and ('user_id' in key or 'owner' in key): - data[key] = None - elif isinstance(value, StreamValue): - data[key] = json.dumps(value.stream_data, cls=DjangoJSONEncoder) - elif isinstance(value, FieldFile) or isinstance(value, File): - data[key] = {'name': value.name, 'size': value.size} - else: - data[key] = value - return data \ No newline at end of file diff --git a/wagtailimportexport/forms.py b/wagtailimportexport/forms.py deleted file mode 100644 index 048a7fefb..000000000 --- a/wagtailimportexport/forms.py +++ /dev/null @@ -1,96 +0,0 @@ -from django import forms - -from wagtail.admin.widgets import AdminPageChooser -from wagtail.models import Page -from wagtail.admin import widgets as wagtailadmin_widgets - - -admin_page_params = { - 'can_choose_root': True, - 'show_edit_link': False, - 'user_perms': 'copy_to' -} - - -class ImportPage(forms.Form): - """ - This form renders the import fields for zip archives. - """ - - file = forms.FileField(label="Zip Archive to Import") - - parent_page = forms.ModelChoiceField( - queryset=Page.objects.all(), - widget=AdminPageChooser(**admin_page_params.copy()), - label="Destination Parent Page", - help_text="Imported pages will be created as children of this page." - ) - -class ExportPage(forms.Form): - """ - This form renders the export fields. - """ - - root_page = forms.ModelChoiceField( - queryset=Page.objects.all(), - widget=AdminPageChooser(**admin_page_params.copy()), - label="Root Page to Export", - help_text="All children pages (including the selected root page) will be exported." 
- ) - - export_unpublished = forms.BooleanField( - initial=True, - required=False, - label="Export Unpublished Pages", - help_text="If True, unpublished pages will be exported along with published pages.", - ) - - null_pk = forms.BooleanField( - widget=forms.HiddenInput(), - required = False, - initial=False, - label="Remove Primary Keys", - help_text="This is set to False as default and can be changed in code. Changing to True may break import functionality.", - ) - - null_fk = forms.BooleanField( - initial=True, - required=False, - label="Remove Foreign Keys", - help_text="If True, foreign keys will be nulled. Leave checked if exported archive will be imported to a different environment.", - ) - - null_users = forms.BooleanField( - initial=True, - required=False, - label="Remove User References", - help_text="If True, user fields (owner in pages, *user_id in images) will be nulled. Leave checked if exported archive will be imported to a different environment.", - ) - - export_images = forms.BooleanField( - initial=True, - required=False, - label="Export Images", - help_text="If True, image references will be nulled and images that are used on the page will be exported along with the rest of the content. Leave checked if exported archive will be imported to a different environment.", - ) - - export_documents = forms.BooleanField( - initial=True, - required=False, - label="Export Documents", - help_text="If True, document references will be nulled and documents that are used on the page will be exported along with the rest of the content. Leave checked if exported archive will be imported to a different environment.", - ) - - export_snippets = forms.BooleanField( - initial=True, - required=False, - label="Export Snippets", - help_text="If True, snippet references will be nulled and snippets that are used on the page will be exported along with the rest of the content. 
Leave checked if exported archive will be imported to a different environment.", - ) - - ignore_large_files = forms.BooleanField( - initial=True, - required=False, - label="Exclude Large Files", - help_text="If True, large files will be nullified and ignored during export.", - ) diff --git a/wagtailimportexport/functions.py b/wagtailimportexport/functions.py deleted file mode 100644 index 067ff1889..000000000 --- a/wagtailimportexport/functions.py +++ /dev/null @@ -1,246 +0,0 @@ -import tempfile -import zipfile -import os -import io -import json -import logging - -from django.core.files.storage import storages as get_storage_class -from django.core.serializers.json import DjangoJSONEncoder -from django.db.models.fields.related import ForeignKey -from django.db.models.fields.reverse_related import ManyToOneRel -from django.contrib.contenttypes.models import ContentType - -from wagtail.fields import StreamField -from wagtail.documents.models import Document - - -def null_pks(page, data): - """ - Nullifies primary keys within all supplied fields. - - Arguments: - page -- Page object. - data -- Page object in dictionary format. - - Returns: - N/A. Overwrites the argument. - """ - - # Nullify the main ID - data['id'] = None - data['pk'] = None - - # Loop through all fields. - for field_name, field_val in data.items(): - if type(field_val) != list: - continue - - for i, sub_item in enumerate(field_val): - if 'pk' in sub_item: - data[field_name][i]['pk'] = None - - -def find_null_child_blocks(subfield, location, data): - """ - Recursive function to find all children blocks - within streamfield and nullify fks. - - Arguments: - subfield -- A field. - location -- (Ordered) list of field keys that act - as a tree. - data -- Data object to overwrite the changes to. - - Returns: - N/A. Overwrites data. - """ - - # Some fields do not have child_blocks, and we should not - # investigate further if that's the case. 
- if "child_blocks" in subfield.__dict__.keys(): - - # Go through all fields. - for field_key, field_val in subfield.child_blocks.items(): - - # We want to catch the ForeignKey - if isinstance(field_val, ForeignKey): - # TODO: Implement overwriting. - pass - - # Recursive Calls - find_null_child_blocks(field_val, location + [field_key], data) - - -def find_null_child_relations(subfield, location, data): - """ - Recursive function to find all children relations - within manyotoone relationships and nullify fks. - - Arguments: - subfield -- A field. - location -- (Ordered) list of field keys that act - as a tree. - data -- Data object to overwrite the changes to. - - Returns: - N/A. Overwrites data. - """ - - # Some fields do not have related_model, and we should not - # investigate further if that's the case. - if "related_model" in subfield.__dict__.keys(): - - # Go through all fields. - for field in subfield.related_model._meta.fields: - - # We want to catch the ForeignKey - if isinstance(field, ForeignKey): - if not location[0] in data: - continue - - for i, value in enumerate(data[location[0]]): - if not field.name in data[location[0]][i]: - continue - - data[location[0]][i][field.name] = None - - -def null_fks(page, data): - """ - Nullifies foreign keys within all supplied fields. - - Arguments: - page -- Page object. - data -- Page object in dictionary format. - - Returns: - N/A. Overwrites the argument. - """ - - # Loop through all fields. - for field in page._meta.get_fields(): - - # Check whether the field is a ForeignKey. - # By nature, owner, content_type, live_revision - # are foreign keys defined by wagtail core pages. - if (isinstance(field, ForeignKey)): - data[field.name] = None - - # StreamFields often have foreign keys associated with them. - # if(isinstance(field, StreamField)): - # find_null_child_blocks(field.stream_block, [field.name], data) - # - # # Many to One relations often have foreign keys associated with them. 
- # if(isinstance(field, ManyToOneRel)): - # find_null_child_relations(field, [field.name], data) - - -def zip_contents(page_contents): - """ - Creates and returns a zip archive of all supplied items. - - Arguments: - page_contents -- A list of page dictionaries. - - Returns: - Zip file to be downloaded by the client. - """ - - file_storage = get_storage_class()() - - # Create a temporary directory. - with tempfile.TemporaryDirectory() as tempdir: - - # Create a temporary zip. - zfname = os.path.join(tempdir, 'content.zip') - - # Open the zip archive with write mode. - with zipfile.ZipFile(zfname, 'w') as zf: - - # Write the main content.json file to store all data. - zf.writestr( - 'content.json', - json.dumps(page_contents, indent=2, cls=DjangoJSONEncoder) - ) - - # Loop through pages to explore all used images and documents. - for page in page_contents: - - # Export all the images. - for image_def in page['images'].values(): - if not image_def: - continue - - filename = image_def['file']['name'] - - try: - with file_storage.open(filename, 'rb') as f: - zf.writestr(filename, f.read()) - except FileNotFoundError: - logging.error("File " + str(filename) + " is not found on local file storage and was not exported.") - - # Export all the documents. - for doc_def in page['documents'].values(): - if not doc_def: - continue - - filename = doc_def['file']['name'] - - try: - with file_storage.open(filename, 'rb') as f: - zf.writestr(filename, f.read()) - except FileNotFoundError: - logging.error("File " + str(filename) + " is not found on local file storage and was not exported.") - - with open(zfname, 'rb') as zf: - fd = zf.read() - - return io.BytesIO(fd) - - -def unzip_contents(zip_contents): - """ - Extracts all items in the zip archive and returns a mapping - of the contents, as well as their location in tempdir. - - Arguments: - zip_contents -- Zip file that is in memory. - - Returns: - Map of the extracted files. - """ - - # Create a temporary directory. 
- tempdir = tempfile.mkdtemp() - - # Extract all contents. - zip_contents.extractall(tempdir) - - # Return the mapping of all extracted members. - return {member: tempdir + '/' + member for member in zip_contents.namelist()} - - -def document_title(doc_pk): - doc = Document.objects.all().filter(pk=doc_pk) - if not doc: - return None - else: - return str(doc[0]) - - -def document_id(doc_title): - doc = Document.objects.all().filter(title=doc_title) - if not doc: - return None - else: - return doc[0].pk - - -def content_type_by_model(model): - content_type = ContentType.objects.all().filter(model=model) - if not content_type: - return None - else: - return str(content_type[0].pk) - diff --git a/wagtailimportexport/importing.py b/wagtailimportexport/importing.py deleted file mode 100644 index 8c61af851..000000000 --- a/wagtailimportexport/importing.py +++ /dev/null @@ -1,312 +0,0 @@ -import io -import json -import logging -import traceback -from zipfile import ZipFile - -from django.apps import apps -from django.core.files.images import ImageFile -from django.core.files.base import File -from django.contrib.contenttypes.models import ContentType -from django.db import models, transaction, IntegrityError - -from modelcluster.models import get_all_child_relations - -from wagtail.models import Page -from wagtail.images.models import Image - -from wagtailimportexport import functions -import snippets.models as snippets - - -def import_page(uploaded_archive, parent_page, overwrites={}): - """ - Imports uploaded_archive as children of parent_page. - - Arguments: - uploaded_archive -- A file object, which includes contents.json - and the media objects. - parent_page -- Page object, where the page(s) will be imported to. - - Returns: - numpages -- Integer value of number of pages that were successfully - imported. - numfails -- Integer value of number of pages that were failed to be - imported. - message -- String message to report any warning/issue. 
- """ - - # Read the zip archive and load as 'payload'. - payload = io.BytesIO(uploaded_archive.read()) - - # Open zip archive. - with ZipFile(payload, 'r') as zf: - try: - # Open content.json and load them into contents dictionary. - with zf.open('content.json') as mf: - contents = json.loads(mf.read().decode('utf-8-sig')) - error_msg = '' - - # First create the base Page records; these contain no foreign keys, so this allows us to - # build a complete mapping from old IDs to new IDs before we go on to importing the - # specific page models, which may require us to rewrite page IDs within foreign keys / rich - # text / streamfields. - page_content_type = ContentType.objects.get_for_model(Page) - - # Unzip all the files in the zip directory. - contents_mapping = functions.unzip_contents(zf) - - # Get the list of pages to skip. - existing_pages = list_existing_pages(contents) if not overwrites else [] - - # Dictionaries to store original paths. - pages_by_original_path = {} - pages_by_original_id = {} - - # Loop through all the pages. - for (i, page_record) in enumerate(contents): - - new_field_datas = {} - content_type = functions.content_type_by_model(page_record['model']) - #content_type = page_record['content']['content_type'] - - # Skip the existing pages. - if i in existing_pages: - error_msg = 'Import stopped. Duplicate slug: ' + str(page_record['content']['slug']) - continue - - # Reassign image IDs. - for (fieldname, filedata) in page_record["images"].items(): - - new_field_datas[fieldname] = None - - # Skip if the image is set to null. 
- if not filedata: - continue - - local_file_query = get_fileobject(filedata["file"]["name"].split("/")[-1], Image) - - local_file_id = local_file_query if local_file_query else create_fileobject( - filedata["title"], contents_mapping[filedata["file"]["name"]], Image) - - new_field_datas[fieldname] = local_file_id - - # Overwrite image and document IDs - for (field, new_value) in new_field_datas.items(): - page_record['content'][field] = new_value - - # Misc. overwrites - for (field, new_value) in overwrites.items(): - page_record['content'][field] = new_value - - if page_record['model'] == 'book': - # look up document ids - page_record['content']['cover'] = functions.document_id(page_record['content']['cover']) - page_record['content']['title_image'] = functions.document_id(page_record['content']['title_image']) - page_record['content']['high_resolution_pdf'] = functions.document_id(page_record['content']['high_resolution_pdf']) - page_record['content']['low_resolution_pdf'] = functions.document_id(page_record['content']['low_resolution_pdf']) - page_record['content']['community_resource_logo'] = functions.document_id(page_record['content']['community_resource_logo']) - page_record['content']['community_resource_feature_link'] = functions.document_id(page_record['content']['community_resource_feature_link']) - - # set page.pk to null if pk already exists - pages = Page.objects.all() - for p in pages: - if p.pk == page_record['content']['pk']: - page_record['content']['pk'] = None - break - - page_record['content']['content_type'] = content_type - # Create page instance. - page = Page.from_serializable_data(page_record['content']) - - original_path = page.path - original_id = page.id - - # Clear id and treebeard-related fields so that they get reassigned when we save via add_child - page.id = None - page.path = None - page.depth = None - page.numchild = 0 - page.url_path = None - page.content_type = page_content_type - - # Handle children of the imported page(s). 
- if i == 0: - parent_page.add_child(instance=page) - else: - # Child pages are created in the same sibling path order as the - # source tree because the export is ordered by path - parent_path = original_path[:-(Page.steplen)] - pages_by_original_path[parent_path].add_child(instance=page) - - pages_by_original_path[original_path] = page - pages_by_original_id[original_id] = page - - # Get the page model of the source page by app_label and model name - # The content type ID of the source page is not in general the same - # between the source and destination sites but the page model needs - # to exist on both. - try: - model = apps.get_model(page_record['app_label'], page_record['model']) - except LookupError: - logging.error("Importing file failed because the model " + page_record[ - 'model'] + " does not exist on this environment.") - return (0, 1, "Importing file failed because the model " + page_record[ - 'model'] + " does not exist on this environment.") - - specific_page = model.from_serializable_data(page_record['content'], check_fks=False, - strict_fks=False) - - base_page = pages_by_original_id[specific_page.id] - specific_page.page_ptr = base_page - specific_page.__dict__.update(base_page.__dict__) - specific_page.content_type = ContentType.objects.get_for_model(model) - update_page_references(specific_page, pages_by_original_id) - specific_page.save() - - return (len(contents) - len(existing_pages), len(existing_pages), error_msg) - - except LookupError as e: - # If content.json does not exist, then return the error, - # and terminate the import_page. - logging.error("Importing file failed because file does not exist: " + str(e)) - traceback.print_exception(type(e), e, e.__traceback__) - return (0, 1, "File does not exist: " + str(e)) - - return (0, 1, "") - - -def list_existing_pages(pages): - """ - Returns a list of pages that already exist in this - environment by looking up by slug. 
- - Arguments: - pages -- A list of pages in content.json - - Returns: - existing_pages -- List of pages that correspond to indexes - in 'pages'. - """ - - existing_pages = [] - - for (i, page_record) in enumerate(pages): - try: - # Trying to get the page. - localpage = Page.objects.get(slug=page_record['content']['slug']) - - if localpage: - existing_pages.append(i) - - except Page.DoesNotExist: - continue - - return existing_pages - - -def get_fileobject(title, objtype): - """ - Returns the id of the object if it exists, otherwise returns - False. - - Arguments: - title -- The filename to be queried. - objtype -- Image, Document from Wagtail. - - Returns: - False if the object does not exist in this environment, - object's integer ID if it does exist. - """ - - try: - # Check whether the object already exists. - localobj = objtype.objects.get(file=title) - - if localobj: - return localobj.id - - except objtype.DoesNotExist: - return False - - return False - - -def create_fileobject(title, uploaded_file, objtype): - """ - Creates a new object given the information and returns - the ID of the created object. Assumes the object with - title does not exist. - - Arguments: - title -- The filename of the object to be created. - uploaded_file -- The file object to create. - objtype -- Image, Document from Wagtail. - - Returns: - Integer ID of the created object if the creation is successful; - otherwise None. - """ - - try: - with open(uploaded_file, 'rb') as mf: - - # Create the file object based on objtype. - if objtype == File: - filedata = File(mf, name=mf.name.split("/")[-1]) - elif objtype == Image: - filedata = ImageFile(mf, name=mf.name.split("/")[-1]) - else: - return None - - try: - with transaction.atomic(): - # Create the object and return the ID. 
- localobj = objtype.objects.create(file=filedata, title=title) - return localobj.id - - except IntegrityError: - logging.error("Integrity error while uploading a file:", title) - return None - except FileNotFoundError: - logging.error("File " + uploaded_file + " is not found on imported archive, skipping.") - - return None - - -def update_page_references(model, pages_by_original_id): - """ - Updates the page references recursively. - - Arguments: - model -- - pages_by_original_id -- - - Returns: - N/A. Overwrites model attributes. - - """ - - for field in model._meta.get_fields(): - if isinstance(field, models.ForeignKey) and issubclass(field.related_model, Page): - linked_page_id = getattr(model, field.attname) - try: - # see if the linked page is one of the ones we're importing - linked_page = pages_by_original_id[linked_page_id] - except KeyError: - # any references to pages outside of the import should be left unchanged - continue - - # update fk to the linked page's new ID - setattr(model, field.attname, linked_page.id) - - # update references within inline child models, including the ParentalKey pointing back - # to the page - for rel in get_all_child_relations(model): - for child in getattr(model, rel.get_accessor_name()).all(): - # reset the child model's PK so that it will be inserted as a new record - # rather than updating an existing one - child.pk = None - # update page references on the child model, including the ParentalKey - update_page_references(child, pages_by_original_id) - \ No newline at end of file diff --git a/wagtailimportexport/migrations/__init__.py b/wagtailimportexport/migrations/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/wagtailimportexport/templates/wagtailimportexport/export-page.html b/wagtailimportexport/templates/wagtailimportexport/export-page.html deleted file mode 100644 index e8f471b60..000000000 --- a/wagtailimportexport/templates/wagtailimportexport/export-page.html +++ /dev/null @@ -1,31 
+0,0 @@ -{% extends "wagtailadmin/base.html" %} -{% load i18n %} -{% block titletag %}{% blocktrans %}Export Pages{% endblocktrans %}{% endblock %} -{% block content %} - {% trans "Export Pages" as title_str %} - {% include "wagtailadmin/shared/header.html" with title=title_str icon="download" %} - -
-
- {% csrf_token %} -
    - {% for field in form %} -
  • {% include "wagtailadmin/shared/field.html" %}
  • - {% endfor %} -
- - -
-
-{% endblock %} - -{% block extra_js %} - {{ block.super }} - {% include "wagtailadmin/pages/_editor_js.html" %} - {{ form.media.js }} -{% endblock %} - -{% block extra_css %} - {{ block.super }} - {{ form.media.css }} -{% endblock %} diff --git a/wagtailimportexport/templates/wagtailimportexport/import-page.html b/wagtailimportexport/templates/wagtailimportexport/import-page.html deleted file mode 100644 index 4f53e3198..000000000 --- a/wagtailimportexport/templates/wagtailimportexport/import-page.html +++ /dev/null @@ -1,31 +0,0 @@ -{% extends "wagtailadmin/base.html" %} -{% load i18n %} -{% block titletag %}{% blocktrans %}Import Pages{% endblocktrans %}{% endblock %} -{% block content %} - {% trans "Import Pages" as title_str %} - {% include "wagtailadmin/shared/header.html" with title=title_str icon="download" %} - -
-
- {% csrf_token %} -
    - {% for field in form %} -
  • {% include "wagtailadmin/shared/field.html" %}
  • - {% endfor %} -
- - -
-
-{% endblock %} - -{% block extra_js %} - {{ block.super }} - {% include "wagtailadmin/pages/_editor_js.html" %} - {{ form.media.js }} -{% endblock %} - -{% block extra_css %} - {{ block.super }} - {{ form.media.css }} -{% endblock %} \ No newline at end of file diff --git a/wagtailimportexport/templates/wagtailimportexport/index.html b/wagtailimportexport/templates/wagtailimportexport/index.html deleted file mode 100644 index 34cac3d5c..000000000 --- a/wagtailimportexport/templates/wagtailimportexport/index.html +++ /dev/null @@ -1,30 +0,0 @@ -{% extends "wagtailadmin/base.html" %} -{% load i18n %} -{% block titletag %}{% blocktrans %}Import/Export Pages{% endblocktrans %}{% endblock %} -{% block content %} - {% trans "Import/Export Pages" as title_str %} - {% include "wagtailadmin/shared/header.html" with title=title_str icon="download" %} - - -{% endblock %} diff --git a/wagtailimportexport/tests/__init__.py b/wagtailimportexport/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/wagtailimportexport/tests/test_functions.py b/wagtailimportexport/tests/test_functions.py deleted file mode 100644 index 4997a3903..000000000 --- a/wagtailimportexport/tests/test_functions.py +++ /dev/null @@ -1,32 +0,0 @@ -from django.test import TestCase - -from wagtailimportexport import functions - - -class TestNullPKs(TestCase): - """ - Test cases for null_pks method in functions.py - """ - def test_null(self): - pass - -class TestNullFKs(TestCase): - """ - Test cases for null_fks method in functions.py - """ - def test_null(self): - pass - -class TestZipContents(TestCase): - """ - Test cases for zip_contents method in functions.py - """ - def test_null(self): - pass - -class TestUnZipContents(TestCase): - """ - Test cases for unzip_contents method in functions.py - """ - def test_null(self): - pass \ No newline at end of file diff --git a/wagtailimportexport/tests/tests.py b/wagtailimportexport/tests/tests.py deleted file mode 100644 index 
ec9f65409..000000000 --- a/wagtailimportexport/tests/tests.py +++ /dev/null @@ -1,36 +0,0 @@ -from unittest import mock - -from django.forms import FileField, ModelChoiceField -from django.test import TestCase, Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.core.files import File - -from pages.models import HomePage -from wagtailimportexport.forms import ImportPage, ExportPage - - -class TemplateTests(TestCase): - def setUp(self): - self.client = Client() - - def test_import_template(self): - response = self.client.get('/admin/import-export/import-page/', follow=True) - self.assertEqual(response.status_code, 200) - - def test_export_template(self): - response = self.client.get('/admin/import-export/export-page/', follow=True) - self.assertEqual(response.status_code, 200) - - def test_import_form(self): - zip = SimpleUploadedFile("test.zip", b"file content", content_type="application/zip") - form_data = {"parent_page": 1} - form = ImportPage(form_data, files={'file': zip}) - self.assertTrue(form.is_valid(), form.errors) - - def test_export_form(self): - form_data = {"root_page": 1} - form = ExportPage(form_data) - self.assertTrue(form.is_valid(), form.errors) - - - diff --git a/wagtailimportexport/views.py b/wagtailimportexport/views.py deleted file mode 100644 index 4759ae640..000000000 --- a/wagtailimportexport/views.py +++ /dev/null @@ -1,102 +0,0 @@ -from django.http import JsonResponse -from django.shortcuts import redirect, render -from django.urls import reverse -from django.utils.translation import ngettext -from django.http import HttpResponse - -from wagtail.admin import messages - -from wagtailimportexport import forms, importing, exporting - - -def index(request): - """ - View for main menu of the Import/Export tool. Provides a list - of features. - """ - return render(request, 'wagtailimportexport/index.html') - -def import_page(request): - """ - View for the import page. 
- """ - if request.method == 'POST': - form = forms.ImportPage(request.POST, request.FILES) - - if form.is_valid(): - - # Read fields on the submitted form. - form_file = form.cleaned_data['file'] - form_parentpage = form.cleaned_data['parent_page'] - - # Import pages and get the response. - num_uploaded, num_failed, response = importing.import_page(form_file, form_parentpage) - - # Show messages depending on the response. - if not num_failed: - # All pages are imported. - messages.success( - request, ngettext("Imported %(count)s page.", "Imported %(count)s pages.", num_uploaded) - % {'count': num_uploaded} - ) - elif not num_uploaded: - # None of the pages are imported. - messages.error( - request, ngettext("Failed to import %(count)s page. %(reason)s", "Failed to import %(count)s pages. %(reason)s", num_failed) - % {'count': num_failed, 'reason': response} - ) - else: - # Some pages are imported and some failed. - messages.warning( - request, ngettext("Failed to import %(failed)s out of %(total)s page. %(reason)s", "Failed to import %(failed)s out of %(total)s pages. %(reason)s", num_failed + num_uploaded) - % {'failed': num_failed, 'total': num_failed + num_uploaded, 'reason': response} - ) - - # Redirect client to the parent page view on admin. - return redirect('wagtailadmin_explore', form_parentpage.pk) - else: - form = forms.ImportPage() - - # Redirect client to form. - return render(request, 'wagtailimportexport/import-page.html', { - 'form': form, - }) - -def export_page(request): - """ - View for the export page. 
- """ - - if request.method == 'POST': - form = forms.ExportPage(request.POST) - - if form.is_valid(): - export_file = exporting.export_page(settings=form.cleaned_data) - - if export_file: - # Grab ZIP file from in-memory, make response with correct MIME-type - response = HttpResponse(export_file.getvalue(), content_type = "application/x-zip-compressed") - - # ..and correct content-disposition - response['Content-Disposition'] = 'attachment; filename=wagtail-export.zip' - - return response - else: - form = forms.ExportPage() - - messages.error( - request, "Failed to generate an export file. Please refer to the logs for further details." - ) - - # Redirect client to form. - return render(request, 'wagtailimportexport/export-page.html', { - 'form': form, - }) - - else: - form = forms.ExportPage() - - # Redirect client to form. - return render(request, 'wagtailimportexport/export-page.html', { - 'form': form, - }) diff --git a/wagtailimportexport/wagtail_hooks.py b/wagtailimportexport/wagtail_hooks.py deleted file mode 100644 index 7001d7487..000000000 --- a/wagtailimportexport/wagtail_hooks.py +++ /dev/null @@ -1,37 +0,0 @@ -from django.urls import include, path -from django.urls import reverse - -from wagtail import hooks -from wagtail.admin.menu import MenuItem - -from wagtailimportexport import admin_urls - - -@hooks.register('register_admin_urls') -def register_admin_urls(): - """ - Register 'import-export/' url path to admin urls. - """ - return [ - path(r'import-export/', include(admin_urls, namespace='wagtailimportexport')), - ] - - -class ImportExportMenuItem(MenuItem): - """ - Add the menu item to admin side menu. This will be only shown if the user is - superuser. This will be only shown if the user is - superuser. - """ - def is_shown(self, request): - return request.user.is_superuser - - -@hooks.register('register_admin_menu_item') -def register_import_export_menu_item(): - """ - Add the menu item to admin side menu. 
- """ - return ImportExportMenuItem( - 'Import / Export', reverse('wagtailimportexport:index'), classname='icon icon-download', order=800 - )