From 778963c3bdf7b91448692a4103ef9143e06f786a Mon Sep 17 00:00:00 2001 From: Fran McDade <18710366+frano-m@users.noreply.github.com> Date: Wed, 13 May 2026 22:27:46 +1000 Subject: [PATCH 1/3] feat: [lungmap] add lungmap projects to google datasets catalog (#4808) --- .../schemaOrg/lungmapProjectDataset.test.ts | 74 ++++++ app/utils/schemaOrg/hcaProjectDataset.ts | 200 +--------------- app/utils/schemaOrg/lungmapProjectDataset.ts | 27 +++ app/utils/schemaOrg/projectDataset.ts | 224 ++++++++++++++++++ pages/[entityListType]/[...params].tsx | 4 + 5 files changed, 337 insertions(+), 192 deletions(-) create mode 100644 __tests__/utils/schemaOrg/lungmapProjectDataset.test.ts create mode 100644 app/utils/schemaOrg/lungmapProjectDataset.ts create mode 100644 app/utils/schemaOrg/projectDataset.ts diff --git a/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts new file mode 100644 index 000000000..67688c89b --- /dev/null +++ b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts @@ -0,0 +1,74 @@ +import type { ProjectsResponse } from "../../../app/apis/azul/hca-dcp/common/responses"; +import { buildLungmapProjectJsonLd } from "../../../app/utils/schemaOrg/lungmapProjectDataset"; + +const BROWSER_URL = "https://data-browser.lungmap.net"; + +/** + * Builds a minimal valid project response for the LungMAP wrapper. The full + * mapping is covered by `hcaProjectDataset.test.ts` (same shared core); this + * file only verifies the LungMAP-specific catalog identity surfaces correctly. + * @returns A `ProjectsResponse` shape sufficient for catalog-identity checks. 
+ */ +function makeProjectsResponse(): ProjectsResponse { + return { + dates: [], + donorOrganisms: [], + entryId: "abc", + fileTypeSummaries: [], + projects: [ + { + accessible: true, + accessions: [], + bionetworkName: [], + contributedAnalyses: {}, + contributors: [], + dataUseRestriction: null, + duosId: null, + estimatedCellCount: null, + laboratory: [], + matrices: {}, + projectDescription: + "A study of lung development and disease across many donors.", + projectId: "uuid-1", + projectShortname: "Lung Study", + projectTitle: "Lung development atlas", + }, + ], + protocols: [], + samples: [], + specimens: [], + status: 200, + } as unknown as ProjectsResponse; +} + +describe("buildLungmapProjectJsonLd", () => { + it("returns undefined when no project is present", () => { + const response = { ...makeProjectsResponse(), projects: [] }; + expect( + buildLungmapProjectJsonLd(response as ProjectsResponse, BROWSER_URL) + ).toBeUndefined(); + }); + + it("surfaces LungMAP as the catalog identity and uses the projects URL pattern", () => { + const result = buildLungmapProjectJsonLd( + makeProjectsResponse(), + BROWSER_URL + ); + expect(result).toBeDefined(); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "LungMAP Data Explorer", + url: BROWSER_URL, + }); + expect(result!.url).toBe(`${BROWSER_URL}/projects/uuid-1`); + }); + + it("pads short descriptions with the LungMAP catalog suffix", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = "Short."; + const result = buildLungmapProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Lung development atlas — Short. — LungMAP Data Explorer project." 
+ ); + }); +}); diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts index 20b9ed879..aa2bcd084 100644 --- a/app/utils/schemaOrg/hcaProjectDataset.ts +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -1,81 +1,17 @@ -import type { - AccessionResponse, - ContributorResponse, - PublicationResponse, -} from "../../apis/azul/hca-dcp/common/entities"; import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; -import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; -import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; -import { MAX_KEYWORDS } from "./constants"; -import type { - SchemaDataset, - SchemaOrganization, - SchemaPerson, - SchemaScholarlyArticle, -} from "./types"; -import { buildDescription, uniqueNonEmpty } from "./utils"; +import type { ProjectCatalogOptions } from "./projectDataset"; +import { buildProjectJsonLd } from "./projectDataset"; +import type { SchemaDataset } from "./types"; const CATALOG_NAME = "Human Cell Atlas Data Coordination Platform"; -const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} project.`; -/** - * Builds the citation array from project publications. Skips entries without a - * title. Prefers DOI for `sameAs`, falling back to the publication URL. - * @param publications - HCA project publications. - * @returns Array of schema.org ScholarlyArticle objects. - */ -function buildCitations( - publications: PublicationResponse[] -): SchemaScholarlyArticle[] { - const citations: SchemaScholarlyArticle[] = []; - for (const publication of publications ?? 
[]) { - if (!publication.publicationTitle) continue; - const article: SchemaScholarlyArticle = { - "@type": "ScholarlyArticle", - headline: publication.publicationTitle, - name: publication.publicationTitle, - }; - if (publication.doi) { - article.sameAs = `https://doi.org/${publication.doi}`; - } else if (publication.publicationUrl) { - article.sameAs = publication.publicationUrl; - } - citations.push(article); - } - return citations; -} - -/** - * Builds the creator array from project contributors. Skips entries without a - * name. When the contributor has an institution, attaches it as an affiliation. - * @param contributors - HCA project contributors. - * @returns Array of schema.org Person objects. - */ -function buildCreators(contributors: ContributorResponse[]): SchemaPerson[] { - const creators: SchemaPerson[] = []; - for (const contributor of contributors ?? []) { - if (!contributor.contactName) continue; - const person: SchemaPerson = { - "@type": "Person", - name: normaliseContactName(contributor.contactName), - }; - if (contributor.institution) { - const affiliation: SchemaOrganization = { - "@type": "Organization", - name: contributor.institution, - }; - person.affiliation = affiliation; - } - creators.push(person); - } - return creators; -} +const OPTIONS: ProjectCatalogOptions = { + catalogName: CATALOG_NAME, + descriptionFallbackSuffix: `${CATALOG_NAME} project.`, +}; /** * Builds a Schema.org Dataset JSON-LD object for an HCA DCP project. - * - * Returns `undefined` when the response does not carry a project we can - * describe (i.e. no project entity), so the caller can skip rendering. * @param data - HCA DCP project detail response from Azul. * @param browserURL - Site base URL used for canonical and catalog URLs. * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. 
@@ -84,125 +20,5 @@ export function buildHcaProjectJsonLd( data: ProjectsResponse, browserURL: string ): SchemaDataset | undefined { - const project = data.projects?.[0]; - if (!project) return undefined; - - const name = project.projectTitle || project.projectShortname; - const description = buildDescription( - project.projectDescription, - name, - DESCRIPTION_FALLBACK_SUFFIX - ); - const identifier = uniqueNonEmpty([ - project.projectId, - ...project.accessions.flatMap((accession) => - splitAccessionIds(accession.accession) - ), - ]); - - const jsonLd: SchemaDataset = { - "@context": "https://schema.org", - "@type": "Dataset", - description, - identifier, - includedInDataCatalog: { - "@type": "DataCatalog", - name: CATALOG_NAME, - url: browserURL, - }, - isAccessibleForFree: true, - name, - url: `${browserURL}/projects/${project.projectId}`, - }; - - const sameAs = buildSameAs(project.accessions); - if (sameAs.length > 0) jsonLd.sameAs = sameAs; - - const keywords = buildKeywords(data); - if (keywords.length > 0) jsonLd.keywords = keywords; - - const creator = buildCreators(project.contributors); - if (creator.length > 0) jsonLd.creator = creator; - - const citation = buildCitations(project.publications); - if (citation.length > 0) jsonLd.citation = citation; - - return jsonLd; -} - -/** - * Builds a keywords array by unioning biologically-meaningful fields from the - * project's aggregated donor/sample/specimen/protocol responses. - * @param data - HCA project detail response. - * @returns Deduplicated keywords array. - */ -function buildKeywords(data: ProjectsResponse): string[] { - const values: (string | null | undefined)[] = []; - for (const donor of data.donorOrganisms ?? []) { - values.push(...(donor.genusSpecies ?? [])); - values.push(...(donor.disease ?? [])); - } - for (const sample of data.samples ?? []) { - values.push(...(sample.organ ?? [])); - values.push(...(sample.organPart ?? [])); - values.push(...(sample.disease ?? 
[])); - values.push(...(sample.sampleEntityType ?? [])); - } - for (const specimen of data.specimens ?? []) { - values.push(...(specimen.organ ?? [])); - values.push(...(specimen.organPart ?? [])); - values.push(...(specimen.disease ?? [])); - } - for (const protocol of data.protocols ?? []) { - values.push(...(protocol.libraryConstructionApproach ?? [])); - values.push(...(protocol.instrumentManufacturerModel ?? [])); - } - return uniqueNonEmpty(values).slice(0, MAX_KEYWORDS); -} - -/** - * Builds the sameAs array of external accession URLs via identifiers.org. - * Only includes accessions whose namespace maps to a known identifier prefix. - * @param accessions - Project accessions from the Azul response. - * @returns Array of canonical accession URLs. - */ -function buildSameAs(accessions: AccessionResponse[]): string[] { - const urls: string[] = []; - for (const { accession, namespace } of accessions) { - const prefix = - ACCESSION_CONFIGS_BY_RESPONSE_KEY.get(namespace)?.identifierOrgPrefix; - if (!prefix) continue; - for (const id of splitAccessionIds(accession)) { - const url = transformAccessionURL(id, prefix); - if (url) urls.push(url); - } - } - return uniqueNonEmpty(urls); -} - -/** - * Normalises an HCA contributor's contactName from "Last,First,Middle" to - * "First Middle Last" for use as a Schema.org Person.name value. - * @param contactName - Raw contactName from the Azul response. - * @returns Human-readable contributor name. - */ -function normaliseContactName(contactName: string): string { - const parts = contactName.split(",").map((part) => part.trim()); - if (parts.length < 2) return contactName; - const [last, ...rest] = parts; - return [...rest, last].filter(Boolean).join(" "); -} - -/** - * Splits an Azul accession string into individual accession IDs. Azul returns - * accessions as a semicolon-separated string when a project carries multiple - * IDs under the same namespace (mirrors the split done by `mapAccessions`). 
- * @param accession - Raw accession value from the Azul response. - * @returns Trimmed, non-empty accession IDs. - */ -function splitAccessionIds(accession: string): string[] { - return accession - .split(";") - .map((id) => id.trim()) - .filter(Boolean); + return buildProjectJsonLd(data, browserURL, OPTIONS); } diff --git a/app/utils/schemaOrg/lungmapProjectDataset.ts b/app/utils/schemaOrg/lungmapProjectDataset.ts new file mode 100644 index 000000000..d07cba9d3 --- /dev/null +++ b/app/utils/schemaOrg/lungmapProjectDataset.ts @@ -0,0 +1,27 @@ +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import type { ProjectCatalogOptions } from "./projectDataset"; +import { buildProjectJsonLd } from "./projectDataset"; +import type { SchemaDataset } from "./types"; + +const CATALOG_NAME = "LungMAP Data Explorer"; + +const OPTIONS: ProjectCatalogOptions = { + catalogName: CATALOG_NAME, + descriptionFallbackSuffix: `${CATALOG_NAME} project.`, +}; + +/** + * Builds a Schema.org Dataset JSON-LD object for a LungMAP project. LungMAP + * shares the HCA Azul backend, so the response shape matches HCA's + * `ProjectsResponse` and the shared `buildProjectJsonLd` core does the + * mapping; this wrapper just supplies LungMAP-specific catalog identity. + * @param data - LungMAP project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. 
+ */ +export function buildLungmapProjectJsonLd( + data: ProjectsResponse, + browserURL: string +): SchemaDataset | undefined { + return buildProjectJsonLd(data, browserURL, OPTIONS); +} diff --git a/app/utils/schemaOrg/projectDataset.ts b/app/utils/schemaOrg/projectDataset.ts new file mode 100644 index 000000000..8703a298e --- /dev/null +++ b/app/utils/schemaOrg/projectDataset.ts @@ -0,0 +1,224 @@ +/** + * Shared Schema.org Dataset builder for consumers that surface HCA-style + * `ProjectResponse` data (HCA DCP, LungMAP). Per-consumer files (e.g. + * `hcaProjectDataset.ts`, `lungmapProjectDataset.ts`) supply a + * `ProjectCatalogOptions` describing catalog identity and call + * `buildProjectJsonLd` to produce the JSON-LD payload. + */ + +import type { + AccessionResponse, + ContributorResponse, + PublicationResponse, +} from "../../apis/azul/hca-dcp/common/entities"; +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; +import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; +import type { + SchemaDataset, + SchemaOrganization, + SchemaPerson, + SchemaScholarlyArticle, +} from "./types"; +import { buildDescription, uniqueNonEmpty } from "./utils"; + +/** + * Per-consumer catalog identity used to populate `includedInDataCatalog` and + * the description-padding fallback. Callers (e.g. HCA, LungMAP) supply this + * via thin wrappers so the shared builder stays consumer-agnostic. + */ +export interface ProjectCatalogOptions { + catalogName: string; + descriptionFallbackSuffix: string; +} + +/** + * Builds the citation array from project publications. Skips entries without a + * title. Prefers DOI for `sameAs`, falling back to the publication URL. + * @param publications - Project publications. + * @returns Array of schema.org ScholarlyArticle objects. 
+ */ +function buildCitations( + publications: PublicationResponse[] +): SchemaScholarlyArticle[] { + const citations: SchemaScholarlyArticle[] = []; + for (const publication of publications ?? []) { + if (!publication.publicationTitle) continue; + const article: SchemaScholarlyArticle = { + "@type": "ScholarlyArticle", + headline: publication.publicationTitle, + name: publication.publicationTitle, + }; + if (publication.doi) { + article.sameAs = `https://doi.org/${publication.doi}`; + } else if (publication.publicationUrl) { + article.sameAs = publication.publicationUrl; + } + citations.push(article); + } + return citations; +} + +/** + * Builds the creator array from project contributors. Skips entries without a + * name. When the contributor has an institution, attaches it as an affiliation. + * @param contributors - Project contributors. + * @returns Array of schema.org Person objects. + */ +function buildCreators(contributors: ContributorResponse[]): SchemaPerson[] { + const creators: SchemaPerson[] = []; + for (const contributor of contributors ?? []) { + if (!contributor.contactName) continue; + const person: SchemaPerson = { + "@type": "Person", + name: normaliseContactName(contributor.contactName), + }; + if (contributor.institution) { + const affiliation: SchemaOrganization = { + "@type": "Organization", + name: contributor.institution, + }; + person.affiliation = affiliation; + } + creators.push(person); + } + return creators; +} + +/** + * Builds a keywords array by unioning biologically-meaningful fields from the + * project's aggregated donor/sample/specimen/protocol responses. + * @param data - Project detail response. + * @returns Deduplicated keywords array. + */ +function buildKeywords(data: ProjectsResponse): string[] { + const values: (string | null | undefined)[] = []; + for (const donor of data.donorOrganisms ?? []) { + values.push(...(donor.genusSpecies ?? [])); + values.push(...(donor.disease ?? 
[])); + } + for (const sample of data.samples ?? []) { + values.push(...(sample.organ ?? [])); + values.push(...(sample.organPart ?? [])); + values.push(...(sample.disease ?? [])); + values.push(...(sample.sampleEntityType ?? [])); + } + for (const specimen of data.specimens ?? []) { + values.push(...(specimen.organ ?? [])); + values.push(...(specimen.organPart ?? [])); + values.push(...(specimen.disease ?? [])); + } + for (const protocol of data.protocols ?? []) { + values.push(...(protocol.libraryConstructionApproach ?? [])); + values.push(...(protocol.instrumentManufacturerModel ?? [])); + } + return uniqueNonEmpty(values); +} + +/** + * Builds a Schema.org Dataset JSON-LD object from a project detail response. + * + * Returns `undefined` when the response does not carry a project we can + * describe, so the caller can skip rendering. + * @param data - Project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @param options - Consumer-specific catalog identity. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. 
+ */ +export function buildProjectJsonLd( + data: ProjectsResponse, + browserURL: string, + options: ProjectCatalogOptions +): SchemaDataset | undefined { + const project = data.projects?.[0]; + if (!project) return undefined; + + const name = project.projectTitle || project.projectShortname; + const description = buildDescription( + project.projectDescription, + name, + options.descriptionFallbackSuffix + ); + const identifier = uniqueNonEmpty([ + project.projectId, + ...project.accessions.flatMap((accession) => + splitAccessionIds(accession.accession) + ), + ]); + + const jsonLd: SchemaDataset = { + "@context": "https://schema.org", + "@type": "Dataset", + description, + identifier, + includedInDataCatalog: { + "@type": "DataCatalog", + name: options.catalogName, + url: browserURL, + }, + isAccessibleForFree: true, + name, + url: `${browserURL}/projects/${project.projectId}`, + }; + + const sameAs = buildSameAs(project.accessions); + if (sameAs.length > 0) jsonLd.sameAs = sameAs; + + const keywords = buildKeywords(data); + if (keywords.length > 0) jsonLd.keywords = keywords; + + const creator = buildCreators(project.contributors); + if (creator.length > 0) jsonLd.creator = creator; + + const citation = buildCitations(project.publications); + if (citation.length > 0) jsonLd.citation = citation; + + return jsonLd; +} + +/** + * Builds the sameAs array of external accession URLs via identifiers.org. + * Only includes accessions whose namespace maps to a known identifier prefix. + * @param accessions - Project accessions from the Azul response. + * @returns Array of canonical accession URLs. 
+ */ +function buildSameAs(accessions: AccessionResponse[]): string[] { + const urls: string[] = []; + for (const { accession, namespace } of accessions) { + const prefix = + ACCESSION_CONFIGS_BY_RESPONSE_KEY.get(namespace)?.identifierOrgPrefix; + if (!prefix) continue; + for (const id of splitAccessionIds(accession)) { + const url = transformAccessionURL(id, prefix); + if (url) urls.push(url); + } + } + return uniqueNonEmpty(urls); +} + +/** + * Normalises an Azul contributor's contactName from "Last,First,Middle" to + * "First Middle Last" for use as a Schema.org Person.name value. + * @param contactName - Raw contactName from the Azul response. + * @returns Human-readable contributor name. + */ +function normaliseContactName(contactName: string): string { + const parts = contactName.split(",").map((part) => part.trim()); + if (parts.length < 2) return contactName; + const [last, ...rest] = parts; + return [...rest, last].filter(Boolean).join(" "); +} + +/** + * Splits an Azul accession string into individual accession IDs. Azul returns + * accessions as a semicolon-separated string when a project carries multiple + * IDs under the same namespace (mirrors the split done by `mapAccessions`). + * @param accession - Raw accession value from the Azul response. + * @returns Trimmed, non-empty accession IDs. 
+ */ +function splitAccessionIds(accession: string): string[] { + return accession + .split(";") + .map((id) => id.trim()) + .filter(Boolean); +} diff --git a/pages/[entityListType]/[...params].tsx b/pages/[entityListType]/[...params].tsx index d272dc225..bc3714561 100644 --- a/pages/[entityListType]/[...params].tsx +++ b/pages/[entityListType]/[...params].tsx @@ -32,6 +32,7 @@ import { JSX } from "react"; import { EntityGuard } from "../../app/components/Detail/components/EntityGuard/entityGuard"; import { buildAnvilDatasetJsonLd } from "../../app/utils/schemaOrg/anvilDataset"; import { buildHcaProjectJsonLd } from "../../app/utils/schemaOrg/hcaProjectDataset"; +import { buildLungmapProjectJsonLd } from "../../app/utils/schemaOrg/lungmapProjectDataset"; import type { SchemaDataset } from "../../app/utils/schemaOrg/types"; import { readFile } from "../../app/utils/tsvParser"; import { JsonLd } from "../../app/views/EntityDetailView/components/JsonLd/jsonLd"; @@ -75,11 +76,13 @@ export interface EntityDetailPageProps extends AzulEntityStaticResponse { // Catalog", which shares the "AnVIL" prefix but has a different entity shape. 
const APP_TITLE_ANVIL_CMG = "AnVIL Data Explorer"; const APP_TITLE_HCA_DCP = "HCA Data Explorer"; +const APP_TITLE_LUNGMAP = "LungMAP Data Explorer"; const EntityDetailPage = (props: EntityDetailPageProps): JSX.Element => { const { config: siteConfig } = useConfig(); const isAnVIL = siteConfig.appTitle === APP_TITLE_ANVIL_CMG; const isHcaDcp = siteConfig.appTitle === APP_TITLE_HCA_DCP; + const isLungMap = siteConfig.appTitle === APP_TITLE_LUNGMAP; const { query } = useRouter(); if (!props.entityListType) return <>; if (props.override) return ; @@ -98,6 +101,7 @@ const EntityDetailPage = (props: EntityDetailPageProps): JSX.Element => { <> {isAnVIL && renderJsonLd(props, "datasets", buildAnvilDatasetJsonLd)} {isHcaDcp && renderJsonLd(props, "projects", buildHcaProjectJsonLd)} + {isLungMap && renderJsonLd(props, "projects", buildLungmapProjectJsonLd)} ); From 358eb1dba5999ea17d5aabfd067544edf826194b Mon Sep 17 00:00:00 2001 From: Fran McDade <18710366+frano-m@users.noreply.github.com> Date: Fri, 22 May 2026 16:31:34 +1000 Subject: [PATCH 2/3] fix: cap shared project keywords at max_keywords to match anvil builder (#4808) Co-Authored-By: Claude Opus 4.7 (1M context) --- app/utils/schemaOrg/projectDataset.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/utils/schemaOrg/projectDataset.ts b/app/utils/schemaOrg/projectDataset.ts index 8703a298e..d89f874ae 100644 --- a/app/utils/schemaOrg/projectDataset.ts +++ b/app/utils/schemaOrg/projectDataset.ts @@ -14,6 +14,7 @@ import type { import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; +import { MAX_KEYWORDS } from "./constants"; import type { SchemaDataset, SchemaOrganization, @@ -112,7 +113,7 @@ function buildKeywords(data: 
ProjectsResponse): string[] { values.push(...(protocol.libraryConstructionApproach ?? [])); values.push(...(protocol.instrumentManufacturerModel ?? [])); } - return uniqueNonEmpty(values); + return uniqueNonEmpty(values).slice(0, MAX_KEYWORDS); } /** From af5b43aee2139d6626376ccabd994e62ca756f52 Mon Sep 17 00:00:00 2001 From: Fran McDade <18710366+frano-m@users.noreply.github.com> Date: Fri, 22 May 2026 16:47:29 +1000 Subject: [PATCH 3/3] chore: rewrite description fallback suffixes for anvil, hca, lungmap (#4808) - AnVIL suffix expanded to spell out NHGRI Analysis Visualization and Informatics Lab-space - HCA renamed catalog to "Human Cell Atlas Data Explorer", suffix matches - LungMAP suffix uses "A project in the LungMAP Data Explorer." - Update buildDescription jsdoc to reflect that the entity name's length carries the 50-char minimum in practice (suffix alone no longer self-sufficient) - Update test expectations accordingly Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/utils/schemaOrg/anvilDataset.test.ts | 4 ++-- .../utils/schemaOrg/hcaProjectDataset.test.ts | 6 +++--- .../schemaOrg/lungmapProjectDataset.test.ts | 2 +- app/utils/schemaOrg/anvilDataset.ts | 2 +- app/utils/schemaOrg/hcaProjectDataset.ts | 4 ++-- app/utils/schemaOrg/lungmapProjectDataset.ts | 2 +- app/utils/schemaOrg/utils.ts | 16 ++++++++++------ 7 files changed, 20 insertions(+), 16 deletions(-) diff --git a/__tests__/utils/schemaOrg/anvilDataset.test.ts b/__tests__/utils/schemaOrg/anvilDataset.test.ts index 160bb12e6..16ad30713 100644 --- a/__tests__/utils/schemaOrg/anvilDataset.test.ts +++ b/__tests__/utils/schemaOrg/anvilDataset.test.ts @@ -98,7 +98,7 @@ describe("buildAnvilDatasetJsonLd", () => { response.datasets[0].description = "Short."; const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); expect(result!.description).toBe( - "Rare disease dataset — Short. — A genomic dataset in the AnVIL Data Explorer catalog." + "Rare disease dataset — Short. 
— A dataset in the AnVIL Data Explorer for NHGRI's Analysis Visualization and Informatics Lab-space." ); expect(result!.description.length).toBeGreaterThanOrEqual( DESCRIPTION_LENGTH.MIN @@ -110,7 +110,7 @@ describe("buildAnvilDatasetJsonLd", () => { response.datasets[0].description = undefined; const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); expect(result!.description).toBe( - "Rare disease dataset — A genomic dataset in the AnVIL Data Explorer catalog." + "Rare disease dataset — A dataset in the AnVIL Data Explorer for NHGRI's Analysis Visualization and Informatics Lab-space." ); expect(result!.description.length).toBeGreaterThanOrEqual( DESCRIPTION_LENGTH.MIN diff --git a/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts index 64d5dde9a..eb7b5ad41 100644 --- a/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts +++ b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts @@ -68,7 +68,7 @@ describe("buildHcaProjectJsonLd", () => { expect(result!.isAccessibleForFree).toBe(true); expect(result!.includedInDataCatalog).toEqual({ "@type": "DataCatalog", - name: "Human Cell Atlas Data Coordination Platform", + name: "Human Cell Atlas Data Explorer", url: BROWSER_URL, }); }); @@ -95,7 +95,7 @@ describe("buildHcaProjectJsonLd", () => { response.projects[0].projectDescription = "Short."; const result = buildHcaProjectJsonLd(response, BROWSER_URL); expect(result!.description).toBe( - "Cells of the body — Short. — Human Cell Atlas Data Coordination Platform project." + "Cells of the body — Short. — A project in the Human Cell Atlas Data Explorer." ); expect(result!.description.length).toBeGreaterThanOrEqual( DESCRIPTION_LENGTH.MIN @@ -107,7 +107,7 @@ describe("buildHcaProjectJsonLd", () => { response.projects[0].projectDescription = ""; const result = buildHcaProjectJsonLd(response, BROWSER_URL); expect(result!.description).toBe( - "Cells of the body — Human Cell Atlas Data Coordination Platform project." 
+ "Cells of the body — A project in the Human Cell Atlas Data Explorer." ); expect(result!.description.length).toBeGreaterThanOrEqual( DESCRIPTION_LENGTH.MIN diff --git a/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts index 67688c89b..c52a3eb75 100644 --- a/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts +++ b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts @@ -68,7 +68,7 @@ describe("buildLungmapProjectJsonLd", () => { response.projects[0].projectDescription = "Short."; const result = buildLungmapProjectJsonLd(response, BROWSER_URL); expect(result!.description).toBe( - "Lung development atlas — Short. — LungMAP Data Explorer project." + "Lung development atlas — Short. — A project in the LungMAP Data Explorer." ); }); }); diff --git a/app/utils/schemaOrg/anvilDataset.ts b/app/utils/schemaOrg/anvilDataset.ts index 189eba2c8..7ada125c0 100644 --- a/app/utils/schemaOrg/anvilDataset.ts +++ b/app/utils/schemaOrg/anvilDataset.ts @@ -4,7 +4,7 @@ import type { SchemaDataset } from "./types"; import { buildDescription, uniqueNonEmpty } from "./utils"; const CATALOG_NAME = "AnVIL Data Explorer"; -const DESCRIPTION_FALLBACK_SUFFIX = `A genomic dataset in the ${CATALOG_NAME} catalog.`; +const DESCRIPTION_FALLBACK_SUFFIX = `A dataset in the ${CATALOG_NAME} for NHGRI's Analysis Visualization and Informatics Lab-space.`; /** * Builds a Schema.org Dataset JSON-LD object for an AnVIL CMG dataset. 
diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts index aa2bcd084..5cdf094a8 100644 --- a/app/utils/schemaOrg/hcaProjectDataset.ts +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -3,11 +3,11 @@ import type { ProjectCatalogOptions } from "./projectDataset"; import { buildProjectJsonLd } from "./projectDataset"; import type { SchemaDataset } from "./types"; -const CATALOG_NAME = "Human Cell Atlas Data Coordination Platform"; +const CATALOG_NAME = "Human Cell Atlas Data Explorer"; const OPTIONS: ProjectCatalogOptions = { catalogName: CATALOG_NAME, - descriptionFallbackSuffix: `${CATALOG_NAME} project.`, + descriptionFallbackSuffix: `A project in the ${CATALOG_NAME}.`, }; /** diff --git a/app/utils/schemaOrg/lungmapProjectDataset.ts b/app/utils/schemaOrg/lungmapProjectDataset.ts index d07cba9d3..6eecf5b40 100644 --- a/app/utils/schemaOrg/lungmapProjectDataset.ts +++ b/app/utils/schemaOrg/lungmapProjectDataset.ts @@ -7,7 +7,7 @@ const CATALOG_NAME = "LungMAP Data Explorer"; const OPTIONS: ProjectCatalogOptions = { catalogName: CATALOG_NAME, - descriptionFallbackSuffix: `${CATALOG_NAME} project.`, + descriptionFallbackSuffix: `A project in the ${CATALOG_NAME}.`, }; /** diff --git a/app/utils/schemaOrg/utils.ts b/app/utils/schemaOrg/utils.ts index d29358c49..8f40756e7 100644 --- a/app/utils/schemaOrg/utils.ts +++ b/app/utils/schemaOrg/utils.ts @@ -2,13 +2,17 @@ import { DESCRIPTION_LENGTH } from "./constants"; /** * Builds a Schema.org description string from a raw entity description, padding - * short or empty values with the entity name and a caller-supplied fallback - * suffix so the result satisfies Google's minimum description-length - * requirement (50 chars). + * short or empty values by prepending the entity name and appending a + * caller-supplied fallback suffix. 
The padded result is `name — suffix` (or + * `name — source — suffix` when the source description is non-empty but short), + * relying on the entity name's length plus the suffix to clear Google's + * 50-character description minimum in practice. * @param sourceDescription - Raw description (may contain HTML, may be empty). - * @param name - Entity name used in the padded fallback. - * @param fallbackSuffix - Caller-owned suffix (e.g. catalog + entity kind) used - * to reliably push padded descriptions past the 50-character minimum. The + * @param name - Entity name used in the padded fallback. Prepended to the + * output; its length is the main contributor to clearing the 50-char minimum + * when the source description is short or empty. + * @param fallbackSuffix - Caller-owned suffix (e.g. catalog + entity kind) + * appended after the name and (when present) the source description. The * caller controls phrasing and punctuation; the helper does not add a period. * @returns HTML-stripped description, padded when short, truncated when long. */