From 6638f85a83d74081a1a8e0fc0b0645a9471a53ed Mon Sep 17 00:00:00 2001 From: even1024 Date: Fri, 15 May 2026 04:49:47 +0200 Subject: [PATCH 1/8] biln fix --- api/tests/integration/ref/formats/biln.py.out | 7 ++++ api/tests/integration/tests/formats/biln.py | 28 +++++++++++++++ api/wasm/indigo-ketcher/test/test.js | 32 +++++++++++++++++ .../molecule/src/sequence_loader_helm.cpp | 35 ++++++++++++++----- .../molecule/src/sequence_saver.cpp | 14 +++++--- 5 files changed, 103 insertions(+), 13 deletions(-) diff --git a/api/tests/integration/ref/formats/biln.py.out b/api/tests/integration/ref/formats/biln.py.out index c07c6a5dc8..4035098c06 100644 --- a/api/tests/integration/ref/formats/biln.py.out +++ b/api/tests/integration/ref/formats/biln.py.out @@ -1,10 +1,17 @@ *** BILN interop *** +biln_bracketed_alias:BILN->HELM SUCCEED biln_cap:BILN->HELM SUCCEED biln_disulfides:BILN->HELM SUCCEED +biln_large_bond_id:BILN->HELM SUCCEED +biln_star_alias:BILN->HELM SUCCEED biln_three_chains:BILN->HELM SUCCEED +biln_underscore_alias:BILN->HELM SUCCEED +helm_bracketed_alias:HELM->BILN SUCCEED helm_cap:HELM->BILN SUCCEED helm_cycle:HELM->BILN SUCCEED +helm_star_alias:HELM->BILN SUCCEED helm_three_chains:HELM->BILN SUCCEED +helm_underscore_alias:HELM->BILN SUCCEED Test 'A(1,3)-C': got expected error 'Invalid BILN bond 1: expected two endpoints but found 1.' Test 'A--C': got expected error 'Invalid BILN string: empty monomer.' Test 'A-C(1,4)': got expected error 'Invalid BILN bond 1: expected two endpoints but found 1.' diff --git a/api/tests/integration/tests/formats/biln.py b/api/tests/integration/tests/formats/biln.py index e94a9fd8df..4dd69429c0 100644 --- a/api/tests/integration/tests/formats/biln.py +++ b/api/tests/integration/tests/formats/biln.py @@ -33,6 +33,22 @@ "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", ), + "biln_underscore_alias": ( + "A-1Nal-Cys_Bn-C", + "PEPTIDE1{A.[1Nal].[Cys_Bn].C}$$$$V2.0", + ), + "biln_bracketed_alias": ( + "A-[D-1Nal]-[Cys_Bn]-[C]", + "PEPTIDE1{A.[D-1Nal].[Cys_Bn].C}$$$$V2.0", + ), + "biln_star_alias": ( + "A-D*-C", + "PEPTIDE1{A.[D*].C}$$$$V2.0", + ), + "biln_large_bond_id": ( + "A-C(7563,3).C(7563,3)", + "PEPTIDE1{A.C}|PEPTIDE2{C}$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0", + ), "biln_disulfides": ( "D-T-H-F-P-I-C(1,3)-I-F-C(2,3)-C(3,3)-G-C(2,3)-C(4,3)-H-R-S-K-C(3,3)-G-M-C(4,3)-C(1,3)-K-T", "PEPTIDE1{D.T.H.F.P.I.C.I.F.C.C.G.C.C.H.R.S.K.C.G.M.C.C.K.T}$PEPTIDE1,PEPTIDE1,7:R3-23:R3|PEPTIDE1,PEPTIDE1,10:R3-13:R3|PEPTIDE1,PEPTIDE1,11:R3-19:R3|PEPTIDE1,PEPTIDE1,14:R3-22:R3$$$V2.0", @@ -66,6 +82,18 @@ "PEPTIDE1{[Abu].[Sar].[NMeL].V.[NMeL].A.[DAla].[NMeL].[NMeL].[NMeV].[NMeThr4RBut2enyl]}$PEPTIDE1,PEPTIDE1,1:R1-11:R2$$$V2.0", "Abu(1,1)-Sar-NMeL-V-NMeL-A-DAla-NMeL-NMeL-NMeV-NMeThr4RBut2enyl(1,2)", ), + "helm_underscore_alias": ( + "PEPTIDE1{A.[1Nal].[Cys_Bn].C}$$$$V2.0", + "A-1Nal-Cys_Bn-C", + ), + "helm_bracketed_alias": ( + "PEPTIDE1{A.[D-1Nal].[Cys_Bn].C}$$$$V2.0", + "A-[D-1Nal]-Cys_Bn-C", + ), + "helm_star_alias": ( + "PEPTIDE1{A.[D*].C}$$$$V2.0", + "A-D*-C", + ), } for name in sorted(helm_to_biln.keys()): diff --git a/api/wasm/indigo-ketcher/test/test.js b/api/wasm/indigo-ketcher/test/test.js index 4ac7344e09..e91ca4560f 100644 --- a/api/wasm/indigo-ketcher/test/test.js +++ b/api/wasm/indigo-ketcher/test/test.js @@ -1205,6 +1205,38 @@ M END }); } + { + test("BILN", "underscore_alias", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "A-1Nal-Cys_Bn-C"; + const res = indigo.convert(biln, "helm", options); + const res_helm = JSON.parse(res).struct; + assert.equal(res_helm, "PEPTIDE1{A.[1Nal].[Cys_Bn].C}$$$$V2.0"); + options.delete(); + }); + } + + { + test("BILN", "bracketed_alias", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "A-[D-1Nal]-[Cys_Bn]-[C]"; + const res = indigo.convert(biln, "helm", options); + const res_helm = JSON.parse(res).struct; + assert.equal(res_helm, "PEPTIDE1{A.[D-1Nal].[Cys_Bn].C}$$$$V2.0"); + options.delete(); + }); + } + { test("BILN", "cross_links", () => { var fs = require('fs'); diff --git a/core/indigo-core/molecule/src/sequence_loader_helm.cpp b/core/indigo-core/molecule/src/sequence_loader_helm.cpp index b5e3acacb7..46cf4560fc 100644 --- a/core/indigo-core/molecule/src/sequence_loader_helm.cpp +++ b/core/indigo-core/molecule/src/sequence_loader_helm.cpp @@ -17,6 +17,7 @@ ***************************************************************************/ #include +#include #include #include #include @@ -622,8 +623,6 @@ void SequenceLoader::loadBILN(KetDocument& document) std::vector bonds; std::unordered_map bond_to_idx; - constexpr int kMaxBilnIndex = 1000; - size_t data_pos = 0; auto skip_spaces = [&]() { while (data_pos < biln.size() && is_space(biln[data_pos])) @@ -636,9 +635,10 @@ void SequenceLoader::loadBILN(KetDocument& document) int value = 0; while (data_pos < biln.size() && std::isdigit(static_cast(biln[data_pos]))) { - value = value * 10 + (biln[data_pos] - '0'); - if (value > kMaxBilnIndex) + const int digit = biln[data_pos] - '0'; + if (value > (std::numeric_limits::max() - digit) / 10) throw Error("Invalid BILN bond annotation: %s number is too large.", field_name); + value = value * 10 + digit; data_pos++; } if (value == 0) @@ -680,12 +680,29 @@ void SequenceLoader::loadBILN(KetDocument& document) throw Error("Invalid BILN string: empty monomer."); std::string monomer_alias; - while (data_pos < biln.size() && biln[data_pos] != '(' && biln[data_pos] != '-' && biln[data_pos] != '.' && !is_space(biln[data_pos])) + if (biln[data_pos] == '[') { - char ch = biln[data_pos]; - if (!std::isalnum(static_cast(ch)) && ch != '[' && ch != ']' && ch != '#') - throw Error("Invalid BILN string: unexpected symbol '%c'.", ch); - monomer_alias += biln[data_pos++]; + data_pos++; + while (data_pos < biln.size() && biln[data_pos] != ']') + { + char ch = biln[data_pos]; + if (ch == '.' || ch == '(' || ch == ')' || ch == ',' || ch == '[' || is_space(ch)) + throw Error("Invalid BILN string: unexpected symbol '%c'.", ch); + monomer_alias += biln[data_pos++]; + } + if (data_pos >= biln.size()) + throw Error("Invalid BILN string: unexpected end of bracketed monomer."); + data_pos++; + } + else + { + while (data_pos < biln.size() && biln[data_pos] != '(' && biln[data_pos] != '-' && biln[data_pos] != '.' && !is_space(biln[data_pos])) + { + char ch = biln[data_pos]; + if (ch == ')' || ch == ',' || ch == '[' || ch == ']') + throw Error("Invalid BILN string: unexpected symbol '%c'.", ch); + monomer_alias += biln[data_pos++]; + } } if (monomer_alias.empty()) throw Error("Invalid BILN string: empty monomer."); diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index 8e3bef194b..263c981ae6 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -843,15 +843,22 @@ static std::string get_biln_attachment_idx(const KetConnectionEndPoint& ep) return ap.substr(1); } -static void check_biln_alias(const std::string& monomer_alias) +static std::string format_biln_alias(const std::string& monomer_alias) { if (monomer_alias.empty()) throw SequenceSaver::Error("Cannot save empty monomer alias in BILN format."); + bool needs_brackets = false; for (auto ch : monomer_alias) { - if (ch == '-' || ch == '.' || ch == '(' || ch == ')' || ch == ',' || std::isspace(static_cast(ch))) + if (ch == '-') + { + needs_brackets = true; + continue; + } + if (ch == '.' || ch == '(' || ch == ')' || ch == ',' || ch == '[' || ch == ']' || std::isspace(static_cast(ch))) throw SequenceSaver::Error("Cannot save monomer alias '%s' in BILN format.", monomer_alias.c_str()); } + return needs_brackets ? "[" + monomer_alias + "]" : monomer_alias; } std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector>& sequences) @@ -874,9 +881,8 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vectoralias().c_str()); - check_biln_alias(monomer->alias()); monomer_to_chain_pos.emplace(monomer_id, std::make_pair(chains.size(), chain.size())); - chain.emplace_back(monomer->alias()); + chain.emplace_back(format_biln_alias(monomer->alias())); } if (!chain.empty()) chains.emplace_back(chain); From 1be4006978dc9c004bd16427bd3678ba9f6366c0 Mon Sep 17 00:00:00 2001 From: even1024 Date: Fri, 15 May 2026 18:56:08 +0200 Subject: [PATCH 2/8] Fix BILN import/export --- api/tests/integration/ref/formats/biln.py.out | 37 +- api/tests/integration/tests/formats/biln.py | 161 ++++++- api/wasm/indigo-ketcher/test/test.js | 139 +++++- .../molecule/src/sequence_loader_helm.cpp | 163 +++++-- .../molecule/src/sequence_saver.cpp | 405 ++++++++++++++++-- 5 files changed, 795 insertions(+), 110 deletions(-) diff --git a/api/tests/integration/ref/formats/biln.py.out b/api/tests/integration/ref/formats/biln.py.out index 4035098c06..de62105ba1 100644 --- a/api/tests/integration/ref/formats/biln.py.out +++ b/api/tests/integration/ref/formats/biln.py.out @@ -1,17 +1,40 @@ *** BILN interop *** biln_bracketed_alias:BILN->HELM SUCCEED -biln_cap:BILN->HELM SUCCEED biln_disulfides:BILN->HELM SUCCEED +biln_explicit_backbone:BILN->HELM SUCCEED biln_large_bond_id:BILN->HELM SUCCEED biln_star_alias:BILN->HELM SUCCEED -biln_three_chains:BILN->HELM SUCCEED +biln_two_backbones:BILN->HELM SUCCEED biln_underscore_alias:BILN->HELM SUCCEED +biln_alphabetic_order:BILN->BILN SUCCEED +biln_amino_acid_count_order:BILN->BILN SUCCEED +biln_backbone_order:BILN->BILN SUCCEED +biln_bracketed_no_hyphen:BILN->BILN SUCCEED +biln_cycle_best:BILN->BILN SUCCEED +biln_cycle_reverse_rotation:BILN->BILN SUCCEED +biln_cycle_rotation:BILN->BILN SUCCEED +biln_cycle_with_extra_bond_order:BILN->BILN SUCCEED +biln_library_alias:BILN->BILN SUCCEED +biln_multiple_nonbackbone_order:BILN->BILN SUCCEED +biln_short_chain_order:BILN->BILN SUCCEED +biln_valid_large_bond_ids:BILN->BILN SUCCEED +helm_alias_to_biln_alias:HELM->BILN SUCCEED helm_bracketed_alias:HELM->BILN SUCCEED -helm_cap:HELM->BILN SUCCEED +helm_chem_backbone:HELM->BILN SUCCEED +helm_chem_with_biln_code:HELM->BILN SUCCEED helm_cycle:HELM->BILN SUCCEED helm_star_alias:HELM->BILN SUCCEED -helm_three_chains:HELM->BILN SUCCEED helm_underscore_alias:HELM->BILN SUCCEED -Test 'A(1,3)-C': got expected error 'Invalid BILN bond 1: expected two endpoints but found 1.' -Test 'A--C': got expected error 'Invalid BILN string: empty monomer.' -Test 'A-C(1,4)': got expected error 'Invalid BILN bond 1: expected two endpoints but found 1.' +Test 'CHEM1{[qweqwe]}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test 'PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test KET 'custom_chem_without_biln_code': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test 'A(1,3)-C': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A--C': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A-C(-1,3)-D(2,3)-E.F-G-H(-1,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A-C(1,3)-D(1,3)-E.F-G-H(1,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A-C(1,4)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A-C(1,4)-D(2,3)-E.F-G-H(1,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'A-C(1.25,3)-D(2,3)-E.F-G-H(1.25,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'Cys_SEt': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'D-2Thi-D-D-gGlu-meF-G-Lys-al': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test '[D-Cit](1,2)-aThr(1,1)(2,2)-meS(2,1)': got expected error 'The string cannot be interpreted as a valid BILN string.' diff --git a/api/tests/integration/tests/formats/biln.py b/api/tests/integration/tests/formats/biln.py index 4dd69429c0..55d1b251ee 100644 --- a/api/tests/integration/tests/formats/biln.py +++ b/api/tests/integration/tests/formats/biln.py @@ -25,13 +25,9 @@ ) biln_to_helm = { - "biln_cap": ( - "Ac(1,2).A-K(1,3)", - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", - ), - "biln_three_chains": ( - "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", + "biln_two_backbones": ( + "A-C-D.E-F-G", + "PEPTIDE1{A.C.D}|PEPTIDE2{E.F.G}$$$$V2.0", ), "biln_underscore_alias": ( "A-1Nal-Cys_Bn-C", @@ -49,6 +45,10 @@ "A-C(7563,3).C(7563,3)", "PEPTIDE1{A.C}|PEPTIDE2{C}$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0", ), + "biln_explicit_backbone": ( + "[D-Cit](1,2).aThr(1,1)(2,2).meS(2,1)", + "PEPTIDE1{[D-Cit].[aThr].[meS]}$$$$V2.0", + ), "biln_disulfides": ( "D-T-H-F-P-I-C(1,3)-I-F-C(2,3)-C(3,3)-G-C(2,3)-C(4,3)-H-R-S-K-C(3,3)-G-M-C(4,3)-C(1,3)-K-T", "PEPTIDE1{D.T.H.F.P.I.C.I.F.C.C.G.C.C.H.R.S.K.C.G.M.C.C.K.T}$PEPTIDE1,PEPTIDE1,7:R3-23:R3|PEPTIDE1,PEPTIDE1,10:R3-13:R3|PEPTIDE1,PEPTIDE1,11:R3-19:R3|PEPTIDE1,PEPTIDE1,14:R3-22:R3$$$V2.0", @@ -69,19 +69,72 @@ except IndigoException as e: print(name + ":FAILED - " + getIndigoExceptionText(e)) -helm_to_biln = { - "helm_cap": ( - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", - "Ac(1,2).A-K(1,3)", +biln_to_biln = { + "biln_cycle_best": ( + "A(1,1)-C-D-E(1,2)", + "A(1,1)-C-D-E(1,2)", ), - "helm_three_chains": ( - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", - "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", + "biln_cycle_rotation": ( + "C(1,1)-D-E-A(1,2)", + "A(1,1)-C-D-E(1,2)", ), - "helm_cycle": ( - "PEPTIDE1{[Abu].[Sar].[NMeL].V.[NMeL].A.[DAla].[NMeL].[NMeL].[NMeV].[NMeThr4RBut2enyl]}$PEPTIDE1,PEPTIDE1,1:R1-11:R2$$$V2.0", - "Abu(1,1)-Sar-NMeL-V-NMeL-A-DAla-NMeL-NMeL-NMeV-NMeThr4RBut2enyl(1,2)", + "biln_cycle_reverse_rotation": ( + "D(1,2)-C-A-E(1,1)", + "A(1,1)-C-D-E(1,2)", + ), + "biln_bracketed_no_hyphen": ( + "[D-2Thi]-[D]-[D-gGlu]-[meF]-[G]-[Lys-al]", + "[D-2Thi]-D-[D-gGlu]-meF-G-[Lys-al]", + ), + "biln_backbone_order": ( + "A-A-A-A-A-A.C-C-C-C.[PEG-2]-C-C-C-C-[PEG-2]", + "A-A-A-A-A-A.[PEG-2]-C-C-C-C-[PEG-2].C-C-C-C", + ), + "biln_short_chain_order": ( + "A-A-A6OH-A6OH-A6OH.C-C-C-C-C", + "A-A-A6OH-A6OH-A6OH.C-C-C-C-C", ), + "biln_amino_acid_count_order": ( + "C-C-A6OH-A6OH-C-C.C-C-C-C-C-A6OH", + "C-C-C-C-C-A6OH.C-C-A6OH-A6OH-C-C", + ), + "biln_alphabetic_order": ( + "C-D-E-F-G-A6OH.A-C-D-E-F-A6OH", + "A-C-D-E-F-A6OH.C-D-E-F-G-A6OH", + ), + "biln_multiple_nonbackbone_order": ( + "A-[Test-6-Ch](1,4)(2,3)-C.D(2,1).E(1,2)", + "A-[Test-6-Ch](1,3)(2,4)-C.D(1,1).E(2,2)", + ), + "biln_cycle_with_extra_bond_order": ( + "C(1,1)(2,3)-C-C(2,3)-C(1,2)", + "C(1,1)-C(2,3)-C-C(1,2)(2,3)", + ), + "biln_library_alias": ( + "Edc", + "Edc", + ), + "biln_valid_large_bond_ids": ( + "A-C(7563,3)-D(3,3)-E.F-G-H(7563,3)-I-K(3,3)", + "F-G-H(1,3)-I-K(2,3).A-C(1,3)-D(2,3)-E", + ), +} + +for name in sorted(biln_to_biln.keys()): + biln, biln_ref = biln_to_biln[name] + try: + doc = indigo.loadBiln(biln, lib) + canonical_biln = doc.biln(lib) + diff = find_diff(biln_ref, canonical_biln) + if diff: + print(name + ":FAILED") + print(diff) + else: + print(name + ":BILN->BILN SUCCEED") + except IndigoException as e: + print(name + ":FAILED - " + getIndigoExceptionText(e)) + +helm_to_biln = { "helm_underscore_alias": ( "PEPTIDE1{A.[1Nal].[Cys_Bn].C}$$$$V2.0", "A-1Nal-Cys_Bn-C", @@ -94,6 +147,22 @@ "PEPTIDE1{A.[D*].C}$$$$V2.0", "A-D*-C", ), + "helm_cycle": ( + "PEPTIDE1{A.C.D.E}$PEPTIDE1,PEPTIDE1,1:R1-4:R2$$$V2.0", + "A(1,1)-C-D-E(1,2)", + ), + "helm_chem_backbone": ( + "PEPTIDE1{A.A.A.A.A.A}|CHEM1{[PEG-2]}|PEPTIDE2{C.C.C.C}|CHEM2{[PEG-2]}$PEPTIDE1,CHEM1,6:R2-1:R1|CHEM1,PEPTIDE2,1:R2-1:R1|PEPTIDE2,CHEM2,4:R2-1:R1$$$V2.0", + "A-A-A-A-A-A-[PEG-2]-C-C-C-C-[PEG-2]", + ), + "helm_chem_with_biln_code": ( + "CHEM1{[PEG-2]}$$$$V2.0", + "[PEG-2]", + ), + "helm_alias_to_biln_alias": ( + "PEPTIDE1{[Cys_SEt]}$$$$V2.0", + "Edc", + ), } for name in sorted(helm_to_biln.keys()): @@ -110,10 +179,62 @@ except IndigoException as e: print(name + ":FAILED - " + getIndigoExceptionText(e)) +helm_errors = { + "CHEM1{[qweqwe]}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", + "PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", +} + +for helm in sorted(helm_errors.keys()): + error = helm_errors[helm] + try: + doc = indigo.loadHelm(helm, lib) + doc.biln(lib) + print("Test %s failed: exception expected." % helm) + except IndigoException as e: + text = getIndigoExceptionText(e) + if error in text: + print("Test '%s': got expected error '%s'" % (helm, error)) + else: + print( + "Test '%s': expected error '%s' but got '%s'" + % (helm, error, text) + ) + +ket_errors = { + "custom_chem_without_biln_code": ( + "CHEM1{[qweqwe]}$$$$V2.0", + "Only amino acids and CHEMs with BILN codes can get exported to BILN.", + ), +} + +for name in sorted(ket_errors.keys()): + helm, error = ket_errors[name] + try: + doc = indigo.loadHelm(helm, lib) + doc = indigo.loadKetDocument(doc.json()) + doc.biln(lib) + print("Test KET %s failed: exception expected." % name) + except IndigoException as e: + text = getIndigoExceptionText(e) + if error in text: + print("Test KET '%s': got expected error '%s'" % (name, error)) + else: + print( + "Test KET '%s': expected error '%s' but got '%s'" + % (name, error, text) + ) + biln_errors = { - "A(1,3)-C": "Invalid BILN bond 1: expected two endpoints but found 1.", - "A--C": "Invalid BILN string: empty monomer.", - "A-C(1,4)": "Invalid BILN bond 1: expected two endpoints but found 1.", + "A(1,3)-C": "The string cannot be interpreted as a valid BILN string.", + "A--C": "The string cannot be interpreted as a valid BILN string.", + "A-C(1,4)": "The string cannot be interpreted as a valid BILN string.", + "[D-Cit](1,2)-aThr(1,1)(2,2)-meS(2,1)": "The string cannot be interpreted as a valid BILN string.", + "D-2Thi-D-D-gGlu-meF-G-Lys-al": "The string cannot be interpreted as a valid BILN string.", + "A-C(-1,3)-D(2,3)-E.F-G-H(-1,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", + "A-C(1.25,3)-D(2,3)-E.F-G-H(1.25,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", + "A-C(1,3)-D(1,3)-E.F-G-H(1,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", + "A-C(1,4)-D(2,3)-E.F-G-H(1,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", + "Cys_SEt": "The string cannot be interpreted as a valid BILN string.", } for biln in sorted(biln_errors.keys()): diff --git a/api/wasm/indigo-ketcher/test/test.js b/api/wasm/indigo-ketcher/test/test.js index e91ca4560f..475c02de68 100644 --- a/api/wasm/indigo-ketcher/test/test.js +++ b/api/wasm/indigo-ketcher/test/test.js @@ -1237,6 +1237,141 @@ M END }); } + { + test("BILN", "cycle_canonical", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "D(1,2)-C-A-E(1,1)"; + const ket = JSON.parse(indigo.convert(biln, "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + const res_biln = JSON.parse(indigo.convert(ket, "biln", save_options)).struct; + assert.equal(res_biln, "A(1,1)-C-D-E(1,2)"); + options.delete(); + save_options.delete(); + }); + } + + { + test("BILN", "nonbackbone_bond_order", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "A-[Test-6-Ch](1,4)(2,3)-C.D(2,1).E(1,2)"; + const ket = JSON.parse(indigo.convert(biln, "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + const res_biln = JSON.parse(indigo.convert(ket, "biln", save_options)).struct; + assert.equal(res_biln, "A-[Test-6-Ch](1,3)(2,4)-C.D(1,1).E(2,2)"); + options.delete(); + save_options.delete(); + }); + } + + { + test("BILN", "cycle_with_extra_bond_order", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "C(1,1)(2,3)-C-C(2,3)-C(1,2)"; + const ket = JSON.parse(indigo.convert(biln, "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + const res_biln = JSON.parse(indigo.convert(ket, "biln", save_options)).struct; + assert.equal(res_biln, "C(1,1)-C(2,3)-C-C(1,2)(2,3)"); + options.delete(); + save_options.delete(); + }); + } + + { + test("BILN", "valid_large_bond_ids", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "A-C(7563,3)-D(3,3)-E.F-G-H(7563,3)-I-K(3,3)"; + const ket = JSON.parse(indigo.convert(biln, "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + const res_biln = JSON.parse(indigo.convert(ket, "biln", save_options)).struct; + assert.equal(res_biln, "F-G-H(1,3)-I-K(2,3).A-C(1,3)-D(2,3)-E"); + options.delete(); + save_options.delete(); + }); + } + + { + test("BILN", "custom_chem_without_biln_code_error", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-helm"); + options.set("monomerLibrary", monomersLib); + const ket = JSON.parse(indigo.convert("CHEM1{[qweqwe]}$$$$V2.0", "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + assert.throws(() => { + indigo.convert(ket, "biln", save_options); + }, /Only amino acids and CHEMs with BILN codes/); + options.delete(); + save_options.delete(); + }); + } + + { + test("BILN", "chem_with_biln_code", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-helm"); + options.set("monomerLibrary", monomersLib); + const res = indigo.convert("CHEM1{[PEG-2]}$$$$V2.0", "biln", options); + const res_biln = JSON.parse(res).struct; + assert.equal(res_biln, "[PEG-2]"); + options.delete(); + }); + } + + { + test("BILN", "helm_alias_to_biln_alias", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-helm"); + options.set("monomerLibrary", monomersLib); + const res = indigo.convert("PEPTIDE1{[Cys_SEt]}$$$$V2.0", "biln", options); + const res_biln = JSON.parse(res).struct; + assert.equal(res_biln, "Edc"); + options.delete(); + }); + } + { test("BILN", "cross_links", () => { var fs = require('fs'); @@ -1245,10 +1380,10 @@ M END options.set("output-content-type", "application/json"); options.set("input-format", "chemical/x-biln"); options.set("monomerLibrary", monomersLib); - const biln = "Ac(1,2).A-K(1,3)"; + const biln = "A-C(1,3)-A.C(1,3)"; const res = indigo.convert(biln, "helm", options); const res_helm = JSON.parse(res).struct; - assert.equal(res_helm, "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0"); + assert.equal(res_helm, "PEPTIDE1{A.C.A}|PEPTIDE2{C}$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0"); options.delete(); }); } diff --git a/core/indigo-core/molecule/src/sequence_loader_helm.cpp b/core/indigo-core/molecule/src/sequence_loader_helm.cpp index 46cf4560fc..57efc6a1bf 100644 --- a/core/indigo-core/molecule/src/sequence_loader_helm.cpp +++ b/core/indigo-core/molecule/src/sequence_loader_helm.cpp @@ -597,6 +597,7 @@ void SequenceLoader::loadBILN(KetDocument& document) std::string biln; _scanner.readAll(biln); + auto throw_invalid_biln = []() -> void { throw Error("The string cannot be interpreted as a valid BILN string."); }; auto is_space = [](char ch) { return std::isspace(static_cast(ch)) != 0; }; auto begin = biln.begin(); while (begin != biln.end() && is_space(*begin)) @@ -607,10 +608,11 @@ void SequenceLoader::loadBILN(KetDocument& document) biln = std::string(begin, end); if (biln.empty()) - throw Error("Empty BILN string."); + throw_invalid_biln(); struct BilnEndpoint { + size_t monomer_idx; std::string monomer_id; int attachment_idx; }; @@ -619,7 +621,13 @@ void SequenceLoader::loadBILN(KetDocument& document) int bond_idx; std::vector endpoints; }; + struct BilnChain + { + std::vector monomer_indices; + bool reverse = false; + }; + std::vector chains; std::vector bonds; std::unordered_map bond_to_idx; @@ -629,27 +637,36 @@ void SequenceLoader::loadBILN(KetDocument& document) data_pos++; }; auto read_positive_int = [&](const char* field_name) -> int { + (void)field_name; skip_spaces(); if (data_pos >= biln.size() || !std::isdigit(static_cast(biln[data_pos]))) - throw Error("Invalid BILN bond annotation: expected %s number.", field_name); + throw_invalid_biln(); int value = 0; while (data_pos < biln.size() && std::isdigit(static_cast(biln[data_pos]))) { const int digit = biln[data_pos] - '0'; if (value > (std::numeric_limits::max() - digit) / 10) - throw Error("Invalid BILN bond annotation: %s number is too large.", field_name); + throw_invalid_biln(); value = value * 10 + digit; data_pos++; } if (value == 0) - throw Error("Invalid BILN bond annotation: %s number should be positive.", field_name); + throw_invalid_biln(); return value; }; - auto remember_bond_endpoint = [&](int bond_idx, const std::string& monomer_id, int attachment_idx) { + auto remember_bond_endpoint = [&](int bond_idx, size_t monomer_idx, const std::string& monomer_id, int attachment_idx) { auto [it, inserted] = bond_to_idx.emplace(bond_idx, bonds.size()); if (inserted) bonds.push_back({bond_idx, {}}); - bonds.at(it->second).endpoints.push_back({monomer_id, attachment_idx}); + bonds.at(it->second).endpoints.push_back({monomer_idx, monomer_id, attachment_idx}); + }; + auto get_biln_monomer_class = [&](const std::string& monomer_alias) { + if (_library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias).size() > 0) + return MonomerClass::AminoAcid; + if (_library.getMonomerTemplateIdByAlias(MonomerClass::CHEM, monomer_alias).size() > 0) + return MonomerClass::CHEM; + throw_invalid_biln(); + return MonomerClass::Unknown; }; _row = 0; @@ -665,11 +682,10 @@ void SequenceLoader::loadBILN(KetDocument& document) if (data_pos >= biln.size()) break; if (biln[data_pos] == '.') - throw Error("Invalid BILN string: empty chain."); + throw_invalid_biln(); _col = 0; - size_t previous_monomer_idx = 0; - int chain_monomer_count = 0; + BilnChain chain; while (data_pos < biln.size()) { @@ -677,7 +693,7 @@ void SequenceLoader::loadBILN(KetDocument& document) if (data_pos >= biln.size()) break; if (biln[data_pos] == '-' || biln[data_pos] == '.') - throw Error("Invalid BILN string: empty monomer."); + throw_invalid_biln(); std::string monomer_alias; if (biln[data_pos] == '[') @@ -687,11 +703,11 @@ void SequenceLoader::loadBILN(KetDocument& document) { char ch = biln[data_pos]; if (ch == '.' || ch == '(' || ch == ')' || ch == ',' || ch == '[' || is_space(ch)) - throw Error("Invalid BILN string: unexpected symbol '%c'.", ch); + throw_invalid_biln(); monomer_alias += biln[data_pos++]; } if (data_pos >= biln.size()) - throw Error("Invalid BILN string: unexpected end of bracketed monomer."); + throw_invalid_biln(); data_pos++; } else @@ -700,21 +716,20 @@ void SequenceLoader::loadBILN(KetDocument& document) { char ch = biln[data_pos]; if (ch == ')' || ch == ',' || ch == '[' || ch == ']') - throw Error("Invalid BILN string: unexpected symbol '%c'.", ch); + throw_invalid_biln(); monomer_alias += biln[data_pos++]; } } if (monomer_alias.empty()) - throw Error("Invalid BILN string: empty monomer."); + throw_invalid_biln(); Vec3f monomer_pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0); ambiguous_template_opts options; + const auto monomer_class = get_biln_monomer_class(monomer_alias); auto monomer_idx = - addHelmMonomer(document, std::make_tuple(monomer_alias, false, std::string(), std::string(), options), MonomerClass::AminoAcid, monomer_pos); - if (chain_monomer_count > 0) - addMonomerConnection(document, previous_monomer_idx, monomer_idx); - previous_monomer_idx = monomer_idx; + addHelmMonomer(document, std::make_tuple(monomer_alias, false, std::string(), std::string(), options), monomer_class, monomer_pos); std::string monomer_id = std::to_string(monomer_idx); + chain.monomer_indices.push_back(monomer_idx); skip_spaces(); while (data_pos < biln.size() && biln[data_pos] == '(') @@ -723,18 +738,17 @@ void SequenceLoader::loadBILN(KetDocument& document) int bond_idx = read_positive_int("bond"); skip_spaces(); if (data_pos >= biln.size() || biln[data_pos] != ',') - throw Error("Invalid BILN bond annotation: expected ','."); + throw_invalid_biln(); data_pos++; int attachment_idx = read_positive_int("attachment point"); skip_spaces(); if (data_pos >= biln.size() || biln[data_pos] != ')') - throw Error("Invalid BILN bond annotation: expected ')'."); + throw_invalid_biln(); data_pos++; - remember_bond_endpoint(bond_idx, monomer_id, attachment_idx); + remember_bond_endpoint(bond_idx, monomer_idx, monomer_id, attachment_idx); skip_spaces(); } - chain_monomer_count++; _col++; skip_spaces(); if (data_pos >= biln.size()) @@ -749,50 +763,111 @@ void SequenceLoader::loadBILN(KetDocument& document) data_pos++; skip_spaces(); if (data_pos >= biln.size()) - throw Error("Invalid BILN string: empty chain."); + throw_invalid_biln(); break; } - throw Error("Invalid BILN string: unexpected symbol '%c'.", biln[data_pos]); + throw_invalid_biln(); } - if (chain_monomer_count == 0) - throw Error("Invalid BILN string: empty chain."); + if (chain.monomer_indices.empty()) + throw_invalid_biln(); + chains.push_back(chain); _row++; } std::set> used_biln_endpoints; auto attachment_point = [](const BilnEndpoint& ep) { return std::string("R") + std::to_string(ep.attachment_idx); }; + auto validate_endpoint = [&](const BilnEndpoint& ep, const std::string& ap) -> const std::unique_ptr& { + const auto& monomer = document.monomers().at(ep.monomer_id); + if (monomer->attachmentPoints().count(ap) == 0) + throw_invalid_biln(); + if (!used_biln_endpoints.emplace(ep.monomer_id, ap).second) + throw_invalid_biln(); + return monomer; + }; for (const auto& bond : bonds) { const auto& endpoints = bond.endpoints; if (endpoints.size() != 2) - throw Error("Invalid BILN bond %d: expected two endpoints but found %d.", bond.bond_idx, static_cast(endpoints.size())); + throw_invalid_biln(); const auto& ep1 = endpoints[0]; const auto& ep2 = endpoints[1]; - auto validate_endpoint = [&](const BilnEndpoint& ep, const std::string& ap) -> const std::unique_ptr& { - const auto& monomer = document.monomers().at(ep.monomer_id); - if (monomer->attachmentPoints().count(ap) == 0) - throw Error("Unknown attachment point '%s' in monomer '%s(%s)'", ap.c_str(), monomer->alias().c_str(), monomer->ref().c_str()); - if (monomer->connections().count(ap) > 0) - { - const auto& connection = monomer->connections().at(ap); - throw Error("Monomer '%s(%s)' attachment point '%s' already connected to monomer'%s' attachment point '%s'", monomer->alias().c_str(), - monomer->ref().c_str(), ap.c_str(), connection.first.c_str(), connection.second.c_str()); - } - if (!used_biln_endpoints.emplace(ep.monomer_id, ap).second) - throw Error("Monomer '%s(%s)' attachment point '%s' already connected.", monomer->alias().c_str(), monomer->ref().c_str(), ap.c_str()); - return monomer; - }; const auto ap1 = attachment_point(ep1); const auto ap2 = attachment_point(ep2); - const auto& monomer1 = validate_endpoint(ep1, ap1); - const auto& monomer2 = validate_endpoint(ep2, ap2); + std::ignore = validate_endpoint(ep1, ap1); + std::ignore = validate_endpoint(ep2, ap2); + } + + std::set> used_implicit_endpoints = used_biln_endpoints; + auto reserve_implicit_endpoint = [&](size_t monomer_idx, const std::string& ap, std::set>& endpoints) -> bool { + const auto monomer_id = std::to_string(monomer_idx); + const auto& monomer = document.monomers().at(monomer_id); + if (monomer->attachmentPoints().count(ap) == 0) + return false; + return endpoints.emplace(monomer_id, ap).second; + }; + auto can_apply_chain_orientation = [&](const BilnChain& chain, bool reverse) { + auto endpoints = used_implicit_endpoints; + for (size_t idx = 1; idx < chain.monomer_indices.size(); idx++) + { + auto left_idx = chain.monomer_indices[idx - 1]; + auto right_idx = chain.monomer_indices[idx]; + if (!reserve_implicit_endpoint(left_idx, reverse ? kAttachmentPointR1 : kAttachmentPointR2, endpoints)) + return false; + if (!reserve_implicit_endpoint(right_idx, reverse ? kAttachmentPointR2 : kAttachmentPointR1, endpoints)) + return false; + } + return true; + }; + auto apply_chain_orientation = [&](BilnChain& chain) { + if (chain.monomer_indices.size() < 2) + return; + if (can_apply_chain_orientation(chain, false)) + chain.reverse = false; + else if (can_apply_chain_orientation(chain, true)) + chain.reverse = true; + else + throw_invalid_biln(); + for (size_t idx = 1; idx < chain.monomer_indices.size(); idx++) + { + auto left_idx = chain.monomer_indices[idx - 1]; + auto right_idx = chain.monomer_indices[idx]; + if (!reserve_implicit_endpoint(left_idx, chain.reverse ? kAttachmentPointR1 : kAttachmentPointR2, used_implicit_endpoints)) + throw_invalid_biln(); + if (!reserve_implicit_endpoint(right_idx, chain.reverse ? kAttachmentPointR2 : kAttachmentPointR1, used_implicit_endpoints)) + throw_invalid_biln(); + } + }; + for (auto& chain : chains) + apply_chain_orientation(chain); + for (const auto& chain : chains) + { + for (size_t idx = 1; idx < chain.monomer_indices.size(); idx++) + { + const auto& left_monomer = document.monomers().at(std::to_string(chain.monomer_indices[idx - 1])); + const auto& right_monomer = document.monomers().at(std::to_string(chain.monomer_indices[idx])); + document.addConnection(left_monomer->ref(), chain.reverse ? kAttachmentPointR1 : kAttachmentPointR2, right_monomer->ref(), + chain.reverse ? kAttachmentPointR2 : kAttachmentPointR1); + } + } + + for (const auto& bond : bonds) + { + const auto& ep1 = bond.endpoints[0]; + const auto& ep2 = bond.endpoints[1]; + const auto ap1 = attachment_point(ep1); + const auto ap2 = attachment_point(ep2); + const auto& monomer1 = document.monomers().at(ep1.monomer_id); + const auto& monomer2 = document.monomers().at(ep2.monomer_id); KetConnectionEndPoint endpoint1, endpoint2; setKetStrProp(endpoint1, monomerId, monomer1->ref()); setKetStrProp(endpoint1, attachmentPointId, ap1); setKetStrProp(endpoint2, monomerId, monomer2->ref()); setKetStrProp(endpoint2, attachmentPointId, ap2); - document.addExplicitConnection(endpoint1, endpoint2); + if ((ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1)) + document.addConnection(monomer1->ref(), ap1, monomer2->ref(), ap2); + else + document.addExplicitConnection(endpoint1, endpoint2); } } diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index 263c981ae6..75f6c7d475 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -30,7 +30,10 @@ #include "molecule/monomer_commons.h" #include "molecule/monomers_template_library.h" #include "molecule/smiles_saver.h" +#include #include +#include +#include #ifdef _MSC_VER #pragma warning(push, 4) @@ -863,63 +866,391 @@ static std::string format_biln_alias(const std::string& monomer_alias) std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector>& sequences) { + (void)sequences; + static const char* biln_export_error = "Only amino acids and CHEMs with BILN codes can get exported to BILN."; + if (doc.moleculesRefs().size() > 0) - throw Error("Cannot save micro-molecules to BILN format."); + throw Error(biln_export_error); - std::string biln_string; - std::vector> chains; - std::map> monomer_to_chain_pos; - const auto& monomers = doc.monomers(); + struct BilnNode + { + std::string monomer_id; + std::string monomer_ref; + std::string alias; + std::string biln_alias; + MonomerClass monomer_class; + }; + struct BilnConnection + { + int node1; + std::string ap1; + int node2; + std::string ap2; + int bond_idx = 0; + }; + struct BilnChain + { + std::vector nodes; + std::string sort_key; + int amino_acid_count = 0; + int effective_amino_acid_count = 0; + int monomer_count = 0; + }; - for (auto& sequence : sequences) + std::vector nodes; + std::map monomer_ref_to_node; + const auto& monomers = doc.monomers(); + const auto& monomer_ids = doc.monomersIds(); + auto get_biln_alias = [&](MonomerClass monomer_class, const std::string& monomer_alias) { + std::string template_id = _library.getMonomerTemplateIdByAlias(monomer_class, monomer_alias); + if (template_id.empty()) + template_id = _library.getMonomerTemplateIdByAliasHELM(monomer_class, monomer_alias); + if (template_id.empty()) + throw Error(biln_export_error); + return format_biln_alias(getKetStrProp(_library.getMonomerTemplateById(template_id), alias)); + }; + for (const auto& monomer_id : monomer_ids) { - std::vector chain; - for (const auto& monomer_id : sequence) - { - auto monomer_class = doc.getMonomerClass(monomer_id); - const auto& monomer = monomers.at(monomer_id); - if (monomer_class != MonomerClass::AminoAcid) - throw Error("Cannot save in BILN format - expected AminoAcid monomer but found %s monomer %s.", - MonomerTemplate::MonomerClassToStr(monomer_class).c_str(), monomer->alias().c_str()); - monomer_to_chain_pos.emplace(monomer_id, std::make_pair(chains.size(), chain.size())); - chain.emplace_back(format_biln_alias(monomer->alias())); - } - if (!chain.empty()) - chains.emplace_back(chain); + auto monomer_class = doc.getMonomerClass(monomer_id); + const auto& monomer = monomers.at(monomer_id); + if (monomer_class != MonomerClass::AminoAcid && monomer_class != MonomerClass::CHEM) + throw Error(biln_export_error); + auto biln_alias = get_biln_alias(monomer_class, monomer->alias()); + const int node_idx = static_cast(nodes.size()); + nodes.push_back({monomer_id, monomer->ref(), monomer->alias(), biln_alias, monomer_class}); + monomer_ref_to_node.emplace(monomer->ref(), node_idx); } - int bond_idx = 1; - for (const auto& connection : doc.nonSequenceConnections()) + std::vector next(nodes.size(), -1); + std::vector prev(nodes.size(), -1); + std::vector explicit_connections; + std::set> used_connection_endpoints; + + auto is_biln_backbone_connection = [](const std::string& ap1, const std::string& ap2) { + return (ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1); + }; + auto read_endpoint = [&](const KetConnectionEndPoint& ep) -> std::pair { + if (!hasKetStrProp(ep, monomerId) || !hasKetStrProp(ep, attachmentPointId)) + throw Error(biln_export_error); + if (hasKetStrProp(ep, moleculeId) || hasKetStrProp(ep, atomId)) + throw Error(biln_export_error); + const auto& monomer_ref = getKetStrProp(ep, monomerId); + const auto node_it = monomer_ref_to_node.find(monomer_ref); + if (node_it == monomer_ref_to_node.end()) + throw Error(biln_export_error); + const auto& ap = getKetStrProp(ep, attachmentPointId); + std::ignore = get_biln_attachment_idx(ep); + if (monomers.at(nodes.at(node_it->second).monomer_id)->attachmentPoints().count(ap) == 0) + throw Error("Cannot save in BILN format - unsupported attachment point '%s'.", ap.c_str()); + if (!used_connection_endpoints.emplace(node_it->second, ap).second) + throw Error("Cannot save in BILN format - attachment point '%s' of monomer '%s' is used more than once.", ap.c_str(), + nodes.at(node_it->second).alias.c_str()); + return {node_it->second, ap}; + }; + + for (const auto& connection : doc.connections()) { + if (connection.connType() != KetConnection::TYPE::SINGLE) + throw Error(biln_export_error); const auto& ep1 = connection.ep1(); const auto& ep2 = connection.ep2(); - if (!hasKetStrProp(ep1, monomerId) || !hasKetStrProp(ep2, monomerId)) - throw Error("Cannot save in BILN format - only monomer connections are supported."); - - auto add_bond = [&](const KetConnectionEndPoint& ep) { - const auto& monomer_ref = getKetStrProp(ep, monomerId); - const auto& monomer_id = doc.monomerIdByRef(monomer_ref); - const auto pos_it = monomer_to_chain_pos.find(monomer_id); - if (pos_it == monomer_to_chain_pos.end()) - throw Error("Cannot save in BILN format - connection endpoint '%s' is not in a peptide sequence.", monomer_ref.c_str()); - auto [chain_idx, monomer_idx] = pos_it->second; - chains.at(chain_idx).at(monomer_idx) += "(" + std::to_string(bond_idx) + "," + get_biln_attachment_idx(ep) + ")"; + auto [node1, ap1] = read_endpoint(ep1); + auto [node2, ap2] = read_endpoint(ep2); + if (is_biln_backbone_connection(ap1, ap2)) + { + int left = ap1 == kAttachmentPointR2 ? node1 : node2; + int right = ap1 == kAttachmentPointR2 ? node2 : node1; + if (next.at(left) != -1 || prev.at(right) != -1) + throw Error("Cannot save in BILN format - branched backbones are not supported."); + next.at(left) = right; + prev.at(right) = left; + } + else + { + explicit_connections.push_back({node1, ap1, node2, ap2}); + } + } + + auto make_sort_key = [&](const std::vector& chain_nodes) { + std::string key; + for (size_t idx = 0; idx < chain_nodes.size(); idx++) + { + if (idx > 0) + key += '-'; + key += nodes.at(chain_nodes[idx]).biln_alias; + } + return key; + }; + auto finish_chain = [&](std::vector chain_nodes) { + BilnChain chain; + chain.nodes = std::move(chain_nodes); + chain.monomer_count = static_cast(chain.nodes.size()); + for (int node_idx : chain.nodes) + if (nodes.at(node_idx).monomer_class == MonomerClass::AminoAcid) + chain.amino_acid_count++; + chain.effective_amino_acid_count = chain.monomer_count <= 5 ? chain.monomer_count : chain.amino_acid_count; + chain.sort_key = make_sort_key(chain.nodes); + return chain; + }; + auto make_cycle_key = [&](const std::vector& chain_nodes, bool reverse) { + std::map node_to_pos; + std::set chain_node_set; + for (int idx = 0; idx < static_cast(chain_nodes.size()); idx++) + { + node_to_pos.emplace(chain_nodes[idx], idx); + chain_node_set.emplace(chain_nodes[idx]); + } + + std::vector candidate_connections; + for (const auto& connection : explicit_connections) + { + if (chain_node_set.count(connection.node1) && chain_node_set.count(connection.node2)) + candidate_connections.push_back(connection); + } + candidate_connections.push_back( + {chain_nodes.front(), reverse ? kAttachmentPointR2 : kAttachmentPointR1, chain_nodes.back(), reverse ? kAttachmentPointR1 : kAttachmentPointR2}); + + std::vector> node_to_bonds(chain_nodes.size()); + for (int bond_idx = 0; bond_idx < static_cast(candidate_connections.size()); bond_idx++) + { + auto pos1 = node_to_pos.at(candidate_connections[bond_idx].node1); + node_to_bonds.at(pos1).push_back(bond_idx); + if (candidate_connections[bond_idx].node2 != candidate_connections[bond_idx].node1) + { + auto pos2 = node_to_pos.at(candidate_connections[bond_idx].node2); + node_to_bonds.at(pos2).push_back(bond_idx); + } + } + + int next_bond_idx = 1; + auto endpoint_position = [&](int node_idx) { return node_to_pos.at(node_idx); }; + auto append_bond_endpoint = [&](std::string& monomer_text, BilnConnection& bond, int node_idx) { + if (bond.bond_idx == 0) + bond.bond_idx = next_bond_idx++; + if (bond.node1 == node_idx) + monomer_text += "(" + std::to_string(bond.bond_idx) + "," + bond.ap1.substr(1) + ")"; + if (bond.node2 == node_idx) + monomer_text += "(" + std::to_string(bond.bond_idx) + "," + bond.ap2.substr(1) + ")"; }; - add_bond(ep1); - add_bond(ep2); - bond_idx++; + std::string key; + for (int monomer_idx = 0; monomer_idx < static_cast(chain_nodes.size()); monomer_idx++) + { + if (monomer_idx > 0) + key += '-'; + const int node_idx = chain_nodes[monomer_idx]; + std::string monomer_text = nodes.at(node_idx).biln_alias; + auto& incident_bonds = node_to_bonds.at(monomer_idx); + std::sort(incident_bonds.begin(), incident_bonds.end(), [&](int left_idx, int right_idx) { + auto other_pos = [&](const BilnConnection& bond, int current_node) { + if (bond.node1 == current_node && bond.node2 != current_node) + return endpoint_position(bond.node2); + return endpoint_position(bond.node1); + }; + const auto& left_bond = candidate_connections.at(left_idx); + const auto& right_bond = candidate_connections.at(right_idx); + if ((left_bond.bond_idx == 0) != (right_bond.bond_idx == 0)) + return left_bond.bond_idx != 0; + if (left_bond.bond_idx != 0 && right_bond.bond_idx != 0) + return left_bond.bond_idx < right_bond.bond_idx; + auto left_pos = other_pos(left_bond, node_idx); + auto right_pos = other_pos(right_bond, node_idx); + if (left_pos != right_pos) + return left_pos < right_pos; + return std::tie(left_bond.node1, left_bond.ap1, left_bond.node2, left_bond.ap2) < + std::tie(right_bond.node1, right_bond.ap1, right_bond.node2, right_bond.ap2); + }); + for (int bond_idx : incident_bonds) + append_bond_endpoint(monomer_text, candidate_connections.at(bond_idx), node_idx); + key += monomer_text; + } + return key; + }; + + std::vector chains; + std::vector visited(nodes.size(), false); + for (int start_node = 0; start_node < static_cast(nodes.size()); start_node++) + { + if (visited.at(start_node)) + continue; + std::vector component; + std::vector stack = {start_node}; + visited.at(start_node) = true; + while (!stack.empty()) + { + int node = stack.back(); + stack.pop_back(); + component.push_back(node); + for (int adjacent : {next.at(node), prev.at(node)}) + { + if (adjacent != -1 && !visited.at(adjacent)) + { + visited.at(adjacent) = true; + stack.push_back(adjacent); + } + } + } + + bool is_cycle = component.size() > 1; + for (int node : component) + { + if (next.at(node) == -1 || prev.at(node) == -1) + { + is_cycle = false; + break; + } + } + + if (is_cycle) + { + std::vector directed_cycle; + int node = component.front(); + do + { + if (std::find(directed_cycle.begin(), directed_cycle.end(), node) != directed_cycle.end()) + throw Error("Cannot save in BILN format - invalid cyclic backbone."); + directed_cycle.push_back(node); + node = next.at(node); + if (node == -1) + throw Error("Cannot save in BILN format - invalid cyclic backbone."); + } while (node != component.front()); + if (directed_cycle.size() != component.size()) + throw Error("Cannot save in BILN format - invalid cyclic backbone."); + + std::string best_key; + std::vector best_nodes; + bool best_reverse = false; + const int cycle_size = static_cast(directed_cycle.size()); + for (int offset = 0; offset < cycle_size; offset++) + { + std::vector candidate; + for (int idx = 0; idx < cycle_size; idx++) + candidate.push_back(directed_cycle.at((offset + idx) % cycle_size)); + auto candidate_key = make_cycle_key(candidate, false); + if (best_nodes.empty() || candidate_key < best_key) + { + best_key = candidate_key; + best_nodes = candidate; + best_reverse = false; + } + + candidate.clear(); + for (int idx = 0; idx < cycle_size; idx++) + candidate.push_back(directed_cycle.at((offset - idx + cycle_size) % cycle_size)); + candidate_key = make_cycle_key(candidate, true); + if (candidate_key < best_key) + { + best_key = candidate_key; + best_nodes = candidate; + best_reverse = true; + } + } + explicit_connections.push_back( + {best_nodes.front(), best_reverse ? kAttachmentPointR2 : kAttachmentPointR1, best_nodes.back(), best_reverse ? kAttachmentPointR1 : kAttachmentPointR2}); + chains.push_back(finish_chain(best_nodes)); + } + else + { + int start = -1; + for (int node : component) + { + if (prev.at(node) == -1) + { + if (start != -1) + throw Error("Cannot save in BILN format - invalid backbone."); + start = node; + } + } + if (start == -1) + start = component.front(); + std::vector chain_nodes; + int node = start; + while (node != -1) + { + if (std::find(chain_nodes.begin(), chain_nodes.end(), node) != chain_nodes.end()) + throw Error("Cannot save in BILN format - invalid backbone."); + chain_nodes.push_back(node); + node = next.at(node); + } + if (chain_nodes.size() != component.size()) + throw Error("Cannot save in BILN format - invalid backbone."); + chains.push_back(finish_chain(chain_nodes)); + } } + std::sort(chains.begin(), chains.end(), [](const BilnChain& left, const BilnChain& right) { + if (left.effective_amino_acid_count != right.effective_amino_acid_count) + return left.effective_amino_acid_count > right.effective_amino_acid_count; + if (left.monomer_count != right.monomer_count) + return left.monomer_count > right.monomer_count; + return left.sort_key < right.sort_key; + }); + + std::map> node_to_chain_pos; + for (int chain_idx = 0; chain_idx < static_cast(chains.size()); chain_idx++) + for (int monomer_idx = 0; monomer_idx < static_cast(chains[chain_idx].nodes.size()); monomer_idx++) + node_to_chain_pos.emplace(chains[chain_idx].nodes[monomer_idx], std::make_pair(chain_idx, monomer_idx)); + + auto endpoint_position = [&](int node_idx) { return node_to_chain_pos.at(node_idx); }; + std::vector>> node_to_bonds(chains.size()); for (size_t chain_idx = 0; chain_idx < chains.size(); chain_idx++) + node_to_bonds.at(chain_idx).resize(chains[chain_idx].nodes.size()); + for (int bond_idx = 0; bond_idx < static_cast(explicit_connections.size()); bond_idx++) + { + auto pos1 = endpoint_position(explicit_connections[bond_idx].node1); + node_to_bonds.at(pos1.first).at(pos1.second).push_back(bond_idx); + if (explicit_connections[bond_idx].node2 != explicit_connections[bond_idx].node1) + { + auto pos2 = endpoint_position(explicit_connections[bond_idx].node2); + node_to_bonds.at(pos2.first).at(pos2.second).push_back(bond_idx); + } + } + + int next_bond_idx = 1; + auto append_bond_endpoint = [&](std::string& monomer_text, BilnConnection& bond, int node_idx) { + if (bond.bond_idx == 0) + bond.bond_idx = next_bond_idx++; + if (bond.node1 == node_idx) + monomer_text += "(" + std::to_string(bond.bond_idx) + "," + bond.ap1.substr(1) + ")"; + if (bond.node2 == node_idx) + monomer_text += "(" + std::to_string(bond.bond_idx) + "," + bond.ap2.substr(1) + ")"; + }; + + std::string biln_string; + for (int chain_idx = 0; chain_idx < static_cast(chains.size()); chain_idx++) { if (chain_idx > 0) biln_string += '.'; - for (size_t monomer_idx = 0; monomer_idx < chains[chain_idx].size(); monomer_idx++) + const auto& chain = chains[chain_idx]; + for (int monomer_idx = 0; monomer_idx < static_cast(chain.nodes.size()); monomer_idx++) { if (monomer_idx > 0) biln_string += '-'; - biln_string += chains[chain_idx][monomer_idx]; + const int node_idx = chain.nodes[monomer_idx]; + std::string monomer_text = nodes.at(node_idx).biln_alias; + auto& incident_bonds = node_to_bonds.at(chain_idx).at(monomer_idx); + std::sort(incident_bonds.begin(), incident_bonds.end(), [&](int left_idx, int right_idx) { + auto other_pos = [&](const BilnConnection& bond, int current_node) { + if (bond.node1 == current_node && bond.node2 != current_node) + return endpoint_position(bond.node2); + return endpoint_position(bond.node1); + }; + const auto& left_bond = explicit_connections.at(left_idx); + const auto& right_bond = explicit_connections.at(right_idx); + if ((left_bond.bond_idx == 0) != (right_bond.bond_idx == 0)) + return left_bond.bond_idx != 0; + if (left_bond.bond_idx != 0 && right_bond.bond_idx != 0) + return left_bond.bond_idx < right_bond.bond_idx; + auto left_pos = other_pos(left_bond, node_idx); + auto right_pos = other_pos(right_bond, node_idx); + if (left_pos != right_pos) + return left_pos < right_pos; + return std::tie(left_bond.node1, left_bond.ap1, left_bond.node2, left_bond.ap2) < + std::tie(right_bond.node1, right_bond.ap1, right_bond.node2, right_bond.ap2); + }); + for (int bond_idx : incident_bonds) + append_bond_endpoint(monomer_text, explicit_connections.at(bond_idx), node_idx); + biln_string += monomer_text; } } return biln_string; From 4ce5d18e74618ac1fbcf2b63f337d68bb7037564 Mon Sep 17 00:00:00 2001 From: even1024 Date: Fri, 15 May 2026 19:10:10 +0200 Subject: [PATCH 3/8] clang fix --- core/indigo-core/molecule/src/sequence_saver.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index 75f6c7d475..ff624c693d 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -1145,8 +1145,8 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector Date: Fri, 15 May 2026 21:31:25 +0200 Subject: [PATCH 4/8] remote fix --- .../indigo-service/backend/service/tests/api/indigo_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/indigo-service/backend/service/tests/api/indigo_test.py b/utils/indigo-service/backend/service/tests/api/indigo_test.py index d71dc44aa2..685fbed83b 100644 --- a/utils/indigo-service/backend/service/tests/api/indigo_test.py +++ b/utils/indigo-service/backend/service/tests/api/indigo_test.py @@ -3917,10 +3917,10 @@ def test_convert_biln(self): self.assertEqual(helm_ref, result_helm) # BILN with cross-links - biln_cross = "Ac(1,2).A-K(1,3)" + biln_cross = "A-C(1,3).C(1,3)" helm_cross_ref = ( - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}" - "$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0" + "PEPTIDE1{A.C}|PEPTIDE2{C}" + "$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0" ) headers, data = self.get_headers( { From df67b76e38d77308ca918a8c4f5f4914113c63e4 Mon Sep 17 00:00:00 2001 From: even1024 Date: Fri, 15 May 2026 21:44:02 +0200 Subject: [PATCH 5/8] black fix --- utils/indigo-service/backend/service/tests/api/indigo_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/indigo-service/backend/service/tests/api/indigo_test.py b/utils/indigo-service/backend/service/tests/api/indigo_test.py index 685fbed83b..d3391fd5b2 100644 --- a/utils/indigo-service/backend/service/tests/api/indigo_test.py +++ b/utils/indigo-service/backend/service/tests/api/indigo_test.py @@ -3919,8 +3919,7 @@ def test_convert_biln(self): # BILN with cross-links biln_cross = "A-C(1,3).C(1,3)" helm_cross_ref = ( - "PEPTIDE1{A.C}|PEPTIDE2{C}" - "$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0" + "PEPTIDE1{A.C}|PEPTIDE2{C}" "$PEPTIDE1,PEPTIDE2,2:R3-1:R3$$$V2.0" ) headers, data = self.get_headers( { From 6887a04797026e274d874fca8e38dc75ad64603a Mon Sep 17 00:00:00 2001 From: even1024 Date: Mon, 18 May 2026 20:14:26 +0200 Subject: [PATCH 6/8] Fix BILN edge cases --- api/tests/integration/ref/formats/biln.py.out | 6 ++ api/tests/integration/tests/formats/biln.py | 28 ++++++ api/wasm/indigo-ketcher/test/test.js | 16 ++++ .../molecule/src/sequence_loader_helm.cpp | 16 +++- .../molecule/src/sequence_saver.cpp | 88 +++++++++++++++---- .../backend/service/tests/api/indigo_test.py | 21 +++++ 6 files changed, 155 insertions(+), 20 deletions(-) diff --git a/api/tests/integration/ref/formats/biln.py.out b/api/tests/integration/ref/formats/biln.py.out index de62105ba1..8e881916e1 100644 --- a/api/tests/integration/ref/formats/biln.py.out +++ b/api/tests/integration/ref/formats/biln.py.out @@ -1,9 +1,11 @@ *** BILN interop *** biln_bracketed_alias:BILN->HELM SUCCEED +biln_cap:BILN->HELM SUCCEED biln_disulfides:BILN->HELM SUCCEED biln_explicit_backbone:BILN->HELM SUCCEED biln_large_bond_id:BILN->HELM SUCCEED biln_star_alias:BILN->HELM SUCCEED +biln_three_chains:BILN->HELM SUCCEED biln_two_backbones:BILN->HELM SUCCEED biln_underscore_alias:BILN->HELM SUCCEED biln_alphabetic_order:BILN->BILN SUCCEED @@ -13,6 +15,7 @@ biln_bracketed_no_hyphen:BILN->BILN SUCCEED biln_cycle_best:BILN->BILN SUCCEED biln_cycle_reverse_rotation:BILN->BILN SUCCEED biln_cycle_rotation:BILN->BILN SUCCEED +biln_cycle_with_external_bond_order:BILN->BILN SUCCEED biln_cycle_with_extra_bond_order:BILN->BILN SUCCEED biln_library_alias:BILN->BILN SUCCEED biln_multiple_nonbackbone_order:BILN->BILN SUCCEED @@ -20,13 +23,16 @@ biln_short_chain_order:BILN->BILN SUCCEED biln_valid_large_bond_ids:BILN->BILN SUCCEED helm_alias_to_biln_alias:HELM->BILN SUCCEED helm_bracketed_alias:HELM->BILN SUCCEED +helm_cap:HELM->BILN SUCCEED helm_chem_backbone:HELM->BILN SUCCEED helm_chem_with_biln_code:HELM->BILN SUCCEED helm_cycle:HELM->BILN SUCCEED helm_star_alias:HELM->BILN SUCCEED +helm_three_chains:HELM->BILN SUCCEED helm_underscore_alias:HELM->BILN SUCCEED Test 'CHEM1{[qweqwe]}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' Test 'PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test 'PEPTIDE1{[Abu].[Sar].[NMeL].V.[NMeL].A.[DAla].[NMeL].[NMeL].[NMeV].[NMeThr4RBut2enyl]}$PEPTIDE1,PEPTIDE1,1:R1-11:R2$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' Test KET 'custom_chem_without_biln_code': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' Test 'A(1,3)-C': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'A--C': got expected error 'The string cannot be interpreted as a valid BILN string.' diff --git a/api/tests/integration/tests/formats/biln.py b/api/tests/integration/tests/formats/biln.py index 55d1b251ee..44517252ac 100644 --- a/api/tests/integration/tests/formats/biln.py +++ b/api/tests/integration/tests/formats/biln.py @@ -25,6 +25,16 @@ ) biln_to_helm = { + # Legacy integration coverage: terminal caps must keep their explicit + # non-backbone BILN bonds during import/export roundtrip. + "biln_cap": ( + "Ac(1,2).A-K(1,3)", + "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", + ), + "biln_three_chains": ( + "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", + "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", + ), "biln_two_backbones": ( "A-C-D.E-F-G", "PEPTIDE1{A.C.D}|PEPTIDE2{E.F.G}$$$$V2.0", @@ -110,6 +120,10 @@ "C(1,1)(2,3)-C-C(2,3)-C(1,2)", "C(1,1)-C(2,3)-C-C(1,2)(2,3)", ), + "biln_cycle_with_external_bond_order": ( + "C(1,1)-C(2,3)-C(1,2).C(2,3)", + "C(1,1)(2,3)-C-C(1,2).C(2,3)", + ), "biln_library_alias": ( "Edc", "Edc", @@ -135,6 +149,14 @@ print(name + ":FAILED - " + getIndigoExceptionText(e)) helm_to_biln = { + "helm_cap": ( + "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", + "Ac(1,2).A-K(1,3)", + ), + "helm_three_chains": ( + "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", + "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", + ), "helm_underscore_alias": ( "PEPTIDE1{A.[1Nal].[Cys_Bn].C}$$$$V2.0", "A-1Nal-Cys_Bn-C", @@ -182,6 +204,9 @@ helm_errors = { "CHEM1{[qweqwe]}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", "PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", + # Legacy integration input retained as an error expectation. #3541 export + # requirement 1 forbids exporting PEPTIDE monomers without BILN codes. + "PEPTIDE1{[Abu].[Sar].[NMeL].V.[NMeL].A.[DAla].[NMeL].[NMeL].[NMeV].[NMeThr4RBut2enyl]}$PEPTIDE1,PEPTIDE1,1:R1-11:R2$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", } for helm in sorted(helm_errors.keys()): @@ -225,6 +250,9 @@ ) biln_errors = { + # #3541 import requirement 1: any invalid BILN string must return the + # generic error text below. This intentionally replaces older detailed + # error expectations for the pre-existing invalid integration cases. "A(1,3)-C": "The string cannot be interpreted as a valid BILN string.", "A--C": "The string cannot be interpreted as a valid BILN string.", "A-C(1,4)": "The string cannot be interpreted as a valid BILN string.", diff --git a/api/wasm/indigo-ketcher/test/test.js b/api/wasm/indigo-ketcher/test/test.js index 475c02de68..c9f6227706 100644 --- a/api/wasm/indigo-ketcher/test/test.js +++ b/api/wasm/indigo-ketcher/test/test.js @@ -1372,6 +1372,22 @@ M END }); } + { + test("BILN", "cap_cross_links", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "Ac(1,2).A-K(1,3)"; + const res = indigo.convert(biln, "helm", options); + const res_helm = JSON.parse(res).struct; + assert.equal(res_helm, "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0"); + options.delete(); + }); + } + { test("BILN", "cross_links", () => { var fs = require('fs'); diff --git a/core/indigo-core/molecule/src/sequence_loader_helm.cpp b/core/indigo-core/molecule/src/sequence_loader_helm.cpp index 57efc6a1bf..82a375e9f9 100644 --- a/core/indigo-core/molecule/src/sequence_loader_helm.cpp +++ b/core/indigo-core/molecule/src/sequence_loader_helm.cpp @@ -665,6 +665,8 @@ void SequenceLoader::loadBILN(KetDocument& document) return MonomerClass::AminoAcid; if (_library.getMonomerTemplateIdByAlias(MonomerClass::CHEM, monomer_alias).size() > 0) return MonomerClass::CHEM; + if (_library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias + "-").size() > 0) + return MonomerClass::AminoAcid; throw_invalid_biln(); return MonomerClass::Unknown; }; @@ -777,6 +779,17 @@ void SequenceLoader::loadBILN(KetDocument& document) std::set> used_biln_endpoints; auto attachment_point = [](const BilnEndpoint& ep) { return std::string("R") + std::to_string(ep.attachment_idx); }; + auto is_terminal_cap_alias = [&](const std::string& monomer_alias) { + auto template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias); + if (template_id.empty()) + template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias + "-"); + if (template_id.empty()) + return false; + const auto& monomer_template = _library.getMonomerTemplateById(template_id); + const auto& template_alias = getKetStrProp(monomer_template, alias); + return template_alias.size() > 1 && template_alias.back() == '-' && monomer_template.attachmentPoints().size() == 1; + }; + auto is_terminal_cap = [&](const std::unique_ptr& monomer) { return is_terminal_cap_alias(monomer->alias()); }; auto validate_endpoint = [&](const BilnEndpoint& ep, const std::string& ap) -> const std::unique_ptr& { const auto& monomer = document.monomers().at(ep.monomer_id); if (monomer->attachmentPoints().count(ap) == 0) @@ -864,7 +877,8 @@ void SequenceLoader::loadBILN(KetDocument& document) setKetStrProp(endpoint1, attachmentPointId, ap1); setKetStrProp(endpoint2, monomerId, monomer2->ref()); setKetStrProp(endpoint2, attachmentPointId, ap2); - if ((ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1)) + if (((ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1)) && + !is_terminal_cap(monomer1) && !is_terminal_cap(monomer2)) document.addConnection(monomer1->ref(), ap1, monomer2->ref(), ap2); else document.addExplicitConnection(endpoint1, endpoint2); diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index ff624c693d..f80c3c4b0e 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -850,8 +850,11 @@ static std::string format_biln_alias(const std::string& monomer_alias) { if (monomer_alias.empty()) throw SequenceSaver::Error("Cannot save empty monomer alias in BILN format."); + auto biln_alias = monomer_alias; + if (biln_alias.size() > 1 && biln_alias.back() == '-') + biln_alias.pop_back(); bool needs_brackets = false; - for (auto ch : monomer_alias) + for (auto ch : biln_alias) { if (ch == '-') { @@ -861,7 +864,7 @@ static std::string format_biln_alias(const std::string& monomer_alias) if (ch == '.' || ch == '(' || ch == ')' || ch == ',' || ch == '[' || ch == ']' || std::isspace(static_cast(ch))) throw SequenceSaver::Error("Cannot save monomer alias '%s' in BILN format.", monomer_alias.c_str()); } - return needs_brackets ? "[" + monomer_alias + "]" : monomer_alias; + return needs_brackets ? "[" + biln_alias + "]" : biln_alias; } std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector>& sequences) @@ -905,6 +908,8 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector explicit_connections; std::set> used_connection_endpoints; + auto is_terminal_cap_alias = [&](const std::string& monomer_alias) { + auto template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias); + if (template_id.empty()) + template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias + "-"); + if (template_id.empty()) + return false; + const auto& monomer_template = _library.getMonomerTemplateById(template_id); + const auto& template_alias = getKetStrProp(monomer_template, alias); + return template_alias.size() > 1 && template_alias.back() == '-' && monomer_template.attachmentPoints().size() == 1; + }; + auto is_terminal_cap_node = [&](int node_idx) { + const auto& node = nodes.at(node_idx); + return node.monomer_class == MonomerClass::AminoAcid && (is_terminal_cap_alias(node.alias) || is_terminal_cap_alias(node.biln_alias)); + }; auto is_biln_backbone_connection = [](const std::string& ap1, const std::string& ap2) { return (ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1); }; @@ -956,7 +975,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector candidate_connections; for (const auto& connection : explicit_connections) { - if (chain_node_set.count(connection.node1) && chain_node_set.count(connection.node2)) + if (chain_node_set.count(connection.node1) || chain_node_set.count(connection.node2)) candidate_connections.push_back(connection); } candidate_connections.push_back( @@ -1013,17 +1032,17 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector> node_to_bonds(chain_nodes.size()); for (int bond_idx = 0; bond_idx < static_cast(candidate_connections.size()); bond_idx++) { - auto pos1 = node_to_pos.at(candidate_connections[bond_idx].node1); - node_to_bonds.at(pos1).push_back(bond_idx); - if (candidate_connections[bond_idx].node2 != candidate_connections[bond_idx].node1) + auto pos1 = node_to_pos.find(candidate_connections[bond_idx].node1); + if (pos1 != node_to_pos.end()) + node_to_bonds.at(pos1->second).push_back(bond_idx); + auto pos2 = node_to_pos.find(candidate_connections[bond_idx].node2); + if (pos2 != node_to_pos.end() && candidate_connections[bond_idx].node2 != candidate_connections[bond_idx].node1) { - auto pos2 = node_to_pos.at(candidate_connections[bond_idx].node2); - node_to_bonds.at(pos2).push_back(bond_idx); + node_to_bonds.at(pos2->second).push_back(bond_idx); } } int next_bond_idx = 1; - auto endpoint_position = [&](int node_idx) { return node_to_pos.at(node_idx); }; auto append_bond_endpoint = [&](std::string& monomer_text, BilnConnection& bond, int node_idx) { if (bond.bond_idx == 0) bond.bond_idx = next_bond_idx++; @@ -1042,10 +1061,12 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vectorsecond, nodes.at(other_node).biln_alias, other_node); + return std::make_tuple(1, 0, nodes.at(other_node).biln_alias, other_node); }; const auto& left_bond = candidate_connections.at(left_idx); const auto& right_bond = candidate_connections.at(right_idx); @@ -1053,10 +1074,10 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector right.effective_amino_acid_count; if (left.monomer_count != right.monomer_count) diff --git a/utils/indigo-service/backend/service/tests/api/indigo_test.py b/utils/indigo-service/backend/service/tests/api/indigo_test.py index d3391fd5b2..6a175eeb56 100644 --- a/utils/indigo-service/backend/service/tests/api/indigo_test.py +++ b/utils/indigo-service/backend/service/tests/api/indigo_test.py @@ -3916,6 +3916,27 @@ def test_convert_biln(self): result_helm = json.loads(result.text)["struct"] self.assertEqual(helm_ref, result_helm) + # BILN with terminal cap cross-link + biln_cross = "Ac(1,2).A-K(1,3)" + helm_cross_ref = ( + "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}" + "$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0" + ) + headers, data = self.get_headers( + { + "struct": biln_cross, + "options": {"monomerLibrary": monomer_library}, + "input_format": "chemical/x-biln", + "output_format": "chemical/x-helm", + } + ) + result = requests.post( + self.url_prefix + "/convert", headers=headers, data=data + ) + self.assertEqual(200, result.status_code) + result_helm = json.loads(result.text)["struct"] + self.assertEqual(helm_cross_ref, result_helm) + # BILN with cross-links biln_cross = "A-C(1,3).C(1,3)" helm_cross_ref = ( From 58959cafc50f3b76b0a97e546632a252f8d41df7 Mon Sep 17 00:00:00 2001 From: even1024 Date: Tue, 19 May 2026 00:27:46 +0200 Subject: [PATCH 7/8] Fix BILN cap validation --- api/tests/integration/ref/formats/biln.py.out | 5 + api/tests/integration/tests/formats/biln.py | 19 ++- api/wasm/indigo-ketcher/test/test.js | 64 ++++++- .../molecule/src/sequence_loader_helm.cpp | 80 +++++++-- .../molecule/src/sequence_saver.cpp | 156 ++++++++++++++++-- .../backend/service/tests/api/indigo_test.py | 2 +- 6 files changed, 293 insertions(+), 33 deletions(-) diff --git a/api/tests/integration/ref/formats/biln.py.out b/api/tests/integration/ref/formats/biln.py.out index 8e881916e1..e445929d89 100644 --- a/api/tests/integration/ref/formats/biln.py.out +++ b/api/tests/integration/ref/formats/biln.py.out @@ -17,6 +17,7 @@ biln_cycle_reverse_rotation:BILN->BILN SUCCEED biln_cycle_rotation:BILN->BILN SUCCEED biln_cycle_with_external_bond_order:BILN->BILN SUCCEED biln_cycle_with_extra_bond_order:BILN->BILN SUCCEED +biln_equal_chain_topology_order:BILN->BILN SUCCEED biln_library_alias:BILN->BILN SUCCEED biln_multiple_nonbackbone_order:BILN->BILN SUCCEED biln_short_chain_order:BILN->BILN SUCCEED @@ -33,7 +34,9 @@ helm_underscore_alias:HELM->BILN SUCCEED Test 'CHEM1{[qweqwe]}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' Test 'PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' Test 'PEPTIDE1{[Abu].[Sar].[NMeL].V.[NMeL].A.[DAla].[NMeL].[NMeL].[NMeV].[NMeThr4RBut2enyl]}$PEPTIDE1,PEPTIDE1,1:R1-11:R2$$$V2.0': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test 'PEPTIDE1{[Ac]}|PEPTIDE2{K}$PEPTIDE1,PEPTIDE2,1:R1-1:R3$$$V2.0': got expected error 'Cannot save in BILN format - unsupported attachment point 'R1'.' Test KET 'custom_chem_without_biln_code': got expected error 'Only amino acids and CHEMs with BILN codes can get exported to BILN.' +Test KET 'unresolved_cap_invalid_attachment': got expected error 'Cannot save in BILN format - unsupported attachment point 'R1'.' Test 'A(1,3)-C': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'A--C': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'A-C(-1,3)-D(2,3)-E.F-G-H(-1,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' @@ -41,6 +44,8 @@ Test 'A-C(1,3)-D(1,3)-E.F-G-H(1,3)-I-K(2,3)': got expected error 'The string can Test 'A-C(1,4)': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'A-C(1,4)-D(2,3)-E.F-G-H(1,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'A-C(1.25,3)-D(2,3)-E.F-G-H(1.25,3)-I-K(2,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'Ac(1,1).K(1,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' +Test 'Ac(1,4).K(1,3)': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'Cys_SEt': got expected error 'The string cannot be interpreted as a valid BILN string.' Test 'D-2Thi-D-D-gGlu-meF-G-Lys-al': got expected error 'The string cannot be interpreted as a valid BILN string.' Test '[D-Cit](1,2)-aThr(1,1)(2,2)-meS(2,1)': got expected error 'The string cannot be interpreted as a valid BILN string.' diff --git a/api/tests/integration/tests/formats/biln.py b/api/tests/integration/tests/formats/biln.py index 44517252ac..80bf3abf48 100644 --- a/api/tests/integration/tests/formats/biln.py +++ b/api/tests/integration/tests/formats/biln.py @@ -29,11 +29,11 @@ # non-backbone BILN bonds during import/export roundtrip. "biln_cap": ( "Ac(1,2).A-K(1,3)", - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", + "PEPTIDE1{[ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0", ), "biln_three_chains": ( "Ac(1,2).A-K(1,3)(2,2).Me(2,1)", - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}|PEPTIDE3{[Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", + "PEPTIDE1{[ac]}|PEPTIDE2{A.K}|PEPTIDE3{[-Me]}$PEPTIDE1,PEPTIDE2,1:R2-2:R3|PEPTIDE2,PEPTIDE3,2:R2-1:R1$$$V2.0", ), "biln_two_backbones": ( "A-C-D.E-F-G", @@ -124,6 +124,10 @@ "C(1,1)-C(2,3)-C(1,2).C(2,3)", "C(1,1)(2,3)-C-C(1,2).C(2,3)", ), + "biln_equal_chain_topology_order": ( + "C(1,3).C(1,1)", + "C(1,1).C(1,3)", + ), "biln_library_alias": ( "Edc", "Edc", @@ -203,6 +207,9 @@ helm_errors = { "CHEM1{[qweqwe]}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", + # Unresolved HELM aliases that map to real BILN terminal caps must still be + # validated against the resolved library template attachment points. + "PEPTIDE1{[Ac]}|PEPTIDE2{K}$PEPTIDE1,PEPTIDE2,1:R1-1:R3$$$V2.0": "Cannot save in BILN format - unsupported attachment point 'R1'.", "PEPTIDE1{A}|RNA1{R(A)P}$$$$V2.0": "Only amino acids and CHEMs with BILN codes can get exported to BILN.", # Legacy integration input retained as an error expectation. #3541 export # requirement 1 forbids exporting PEPTIDE monomers without BILN codes. @@ -230,6 +237,10 @@ "CHEM1{[qweqwe]}$$$$V2.0", "Only amino acids and CHEMs with BILN codes can get exported to BILN.", ), + "unresolved_cap_invalid_attachment": ( + "PEPTIDE1{[Ac]}|PEPTIDE2{K}$PEPTIDE1,PEPTIDE2,1:R1-1:R3$$$V2.0", + "Cannot save in BILN format - unsupported attachment point 'R1'.", + ), } for name in sorted(ket_errors.keys()): @@ -262,6 +273,10 @@ "A-C(1.25,3)-D(2,3)-E.F-G-H(1.25,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", "A-C(1,3)-D(1,3)-E.F-G-H(1,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", "A-C(1,4)-D(2,3)-E.F-G-H(1,3)-I-K(2,3)": "The string cannot be interpreted as a valid BILN string.", + # Terminal caps must be resolved against their real templates, not imported + # as unresolved monomers with synthetic R1..R4 attachment points. + "Ac(1,1).K(1,3)": "The string cannot be interpreted as a valid BILN string.", + "Ac(1,4).K(1,3)": "The string cannot be interpreted as a valid BILN string.", "Cys_SEt": "The string cannot be interpreted as a valid BILN string.", } diff --git a/api/wasm/indigo-ketcher/test/test.js b/api/wasm/indigo-ketcher/test/test.js index c9f6227706..b38b3f22ef 100644 --- a/api/wasm/indigo-ketcher/test/test.js +++ b/api/wasm/indigo-ketcher/test/test.js @@ -1383,11 +1383,52 @@ M END const biln = "Ac(1,2).A-K(1,3)"; const res = indigo.convert(biln, "helm", options); const res_helm = JSON.parse(res).struct; - assert.equal(res_helm, "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0"); + assert.equal(res_helm, "PEPTIDE1{[ac]}|PEPTIDE2{A.K}$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0"); options.delete(); }); } + { + test("BILN", "cap_invalid_attachment", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + assert.throws(() => { + indigo.convert("Ac(1,1).K(1,3)", "ket", options); + }, /The string cannot be interpreted as a valid BILN string/); + options.delete(); + }); + } + + { + test("BILN", "helm_cap_invalid_attachment", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + const helm = "PEPTIDE1{[Ac]}|PEPTIDE2{K}$PEPTIDE1,PEPTIDE2,1:R1-1:R3$$$V2.0"; + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-helm"); + options.set("monomerLibrary", monomersLib); + const ket = JSON.parse(indigo.convert(helm, "ket", options)).struct; + assert.throws(() => { + indigo.convert(helm, "biln", options); + }, /unsupported attachment point 'R1'/); + + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + assert.throws(() => { + indigo.convert(ket, "biln", save_options); + }, /unsupported attachment point 'R1'/); + options.delete(); + save_options.delete(); + }); + } + { test("BILN", "cross_links", () => { var fs = require('fs'); @@ -1404,6 +1445,27 @@ M END }); } + { + test("BILN", "equal_chain_topology_order", () => { + var fs = require('fs'); + let options = new indigo.MapStringString(); + const monomersLib = fs.readFileSync("monomer_library.ket"); + options.set("output-content-type", "application/json"); + options.set("input-format", "chemical/x-biln"); + options.set("monomerLibrary", monomersLib); + const biln = "C(1,3).C(1,1)"; + const ket = JSON.parse(indigo.convert(biln, "ket", options)).struct; + let save_options = new indigo.MapStringString(); + save_options.set("output-content-type", "application/json"); + save_options.set("input-format", "chemical/x-indigo-ket"); + save_options.set("monomerLibrary", monomersLib); + const res_biln = JSON.parse(indigo.convert(ket, "biln", save_options)).struct; + assert.equal(res_biln, "C(1,1).C(1,3)"); + options.delete(); + save_options.delete(); + }); + } + { test("AxoLabs", "basic", () => { var fs = require('fs'); diff --git a/core/indigo-core/molecule/src/sequence_loader_helm.cpp b/core/indigo-core/molecule/src/sequence_loader_helm.cpp index 82a375e9f9..3e447c212a 100644 --- a/core/indigo-core/molecule/src/sequence_loader_helm.cpp +++ b/core/indigo-core/molecule/src/sequence_loader_helm.cpp @@ -621,6 +621,11 @@ void SequenceLoader::loadBILN(KetDocument& document) int bond_idx; std::vector endpoints; }; + struct PendingBilnEndpoint + { + int bond_idx; + int attachment_idx; + }; struct BilnChain { std::vector monomer_indices; @@ -660,15 +665,52 @@ void SequenceLoader::loadBILN(KetDocument& document) bonds.push_back({bond_idx, {}}); bonds.at(it->second).endpoints.push_back({monomer_idx, monomer_id, attachment_idx}); }; - auto get_biln_monomer_class = [&](const std::string& monomer_alias) { + auto resolve_biln_monomer = [&](const std::string& monomer_alias, const std::vector& endpoints, bool has_prev_in_chain, + bool has_next_in_chain) { if (_library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias).size() > 0) - return MonomerClass::AminoAcid; + return std::make_pair(MonomerClass::AminoAcid, monomer_alias); if (_library.getMonomerTemplateIdByAlias(MonomerClass::CHEM, monomer_alias).size() > 0) - return MonomerClass::CHEM; - if (_library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias + "-").size() > 0) - return MonomerClass::AminoAcid; + return std::make_pair(MonomerClass::CHEM, monomer_alias); + + bool uses_r1 = false; + bool uses_r2 = false; + bool uses_other = false; + for (const auto& endpoint : endpoints) + { + if (endpoint.attachment_idx == 1) + uses_r1 = true; + else if (endpoint.attachment_idx == 2) + uses_r2 = true; + else + uses_other = true; + } + + auto right_cap_alias = monomer_alias + "-"; + auto left_cap_alias = "-" + monomer_alias; + const bool has_right_cap = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, right_cap_alias).size() > 0; + const bool has_left_cap = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, left_cap_alias).size() > 0; + if (has_right_cap && !has_left_cap) + return std::make_pair(MonomerClass::AminoAcid, right_cap_alias); + if (has_left_cap && !has_right_cap) + return std::make_pair(MonomerClass::AminoAcid, left_cap_alias); + if (has_right_cap && has_left_cap) + { + if (uses_other || (uses_r1 && uses_r2)) + throw_invalid_biln(); + if (uses_r1) + return std::make_pair(MonomerClass::AminoAcid, left_cap_alias); + if (uses_r2) + return std::make_pair(MonomerClass::AminoAcid, right_cap_alias); + if (has_prev_in_chain && has_next_in_chain) + throw_invalid_biln(); + if (has_prev_in_chain) + return std::make_pair(MonomerClass::AminoAcid, left_cap_alias); + if (has_next_in_chain) + return std::make_pair(MonomerClass::AminoAcid, right_cap_alias); + return std::make_pair(MonomerClass::AminoAcid, right_cap_alias); + } throw_invalid_biln(); - return MonomerClass::Unknown; + return std::make_pair(MonomerClass::Unknown, monomer_alias); }; _row = 0; @@ -725,14 +767,8 @@ void SequenceLoader::loadBILN(KetDocument& document) if (monomer_alias.empty()) throw_invalid_biln(); - Vec3f monomer_pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0); - ambiguous_template_opts options; - const auto monomer_class = get_biln_monomer_class(monomer_alias); - auto monomer_idx = - addHelmMonomer(document, std::make_tuple(monomer_alias, false, std::string(), std::string(), options), monomer_class, monomer_pos); - std::string monomer_id = std::to_string(monomer_idx); - chain.monomer_indices.push_back(monomer_idx); - + const bool has_prev_in_chain = !chain.monomer_indices.empty(); + std::vector pending_endpoints; skip_spaces(); while (data_pos < biln.size() && biln[data_pos] == '(') { @@ -747,10 +783,21 @@ void SequenceLoader::loadBILN(KetDocument& document) if (data_pos >= biln.size() || biln[data_pos] != ')') throw_invalid_biln(); data_pos++; - remember_bond_endpoint(bond_idx, monomer_idx, monomer_id, attachment_idx); + pending_endpoints.push_back({bond_idx, attachment_idx}); skip_spaces(); } + Vec3f monomer_pos(_col * LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH, -LayoutOptions::DEFAULT_MONOMER_BOND_LENGTH * _row, 0); + ambiguous_template_opts options; + const bool has_next_in_chain = data_pos < biln.size() && biln[data_pos] == '-'; + const auto [monomer_class, monomer_load_alias] = resolve_biln_monomer(monomer_alias, pending_endpoints, has_prev_in_chain, has_next_in_chain); + auto monomer_idx = + addHelmMonomer(document, std::make_tuple(monomer_load_alias, false, std::string(), std::string(), options), monomer_class, monomer_pos); + std::string monomer_id = std::to_string(monomer_idx); + chain.monomer_indices.push_back(monomer_idx); + for (const auto& endpoint : pending_endpoints) + remember_bond_endpoint(endpoint.bond_idx, monomer_idx, monomer_id, endpoint.attachment_idx); + _col++; skip_spaces(); if (data_pos >= biln.size()) @@ -787,7 +834,8 @@ void SequenceLoader::loadBILN(KetDocument& document) return false; const auto& monomer_template = _library.getMonomerTemplateById(template_id); const auto& template_alias = getKetStrProp(monomer_template, alias); - return template_alias.size() > 1 && template_alias.back() == '-' && monomer_template.attachmentPoints().size() == 1; + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && + monomer_template.attachmentPoints().size() == 1; }; auto is_terminal_cap = [&](const std::unique_ptr& monomer) { return is_terminal_cap_alias(monomer->alias()); }; auto validate_endpoint = [&](const BilnEndpoint& ep, const std::string& ap) -> const std::unique_ptr& { diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index f80c3c4b0e..45577f151d 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -846,13 +846,18 @@ static std::string get_biln_attachment_idx(const KetConnectionEndPoint& ep) return ap.substr(1); } -static std::string format_biln_alias(const std::string& monomer_alias) +static std::string format_biln_alias(const std::string& monomer_alias, bool strip_terminal_cap = false) { if (monomer_alias.empty()) throw SequenceSaver::Error("Cannot save empty monomer alias in BILN format."); auto biln_alias = monomer_alias; - if (biln_alias.size() > 1 && biln_alias.back() == '-') - biln_alias.pop_back(); + if (strip_terminal_cap && biln_alias.size() > 1) + { + if (biln_alias.back() == '-') + biln_alias.pop_back(); + else if (biln_alias.front() == '-') + biln_alias.erase(biln_alias.begin()); + } bool needs_brackets = false; for (auto ch : biln_alias) { @@ -881,8 +886,14 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector biln_template_ids; MonomerClass monomer_class; }; + struct BilnAlias + { + std::string alias; + std::vector template_ids; + }; struct BilnConnection { int node1; @@ -895,6 +906,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector nodes; std::string sort_key; + std::string topology_sort_key; int amino_acid_count = 0; int effective_amino_acid_count = 0; int monomer_count = 0; @@ -905,14 +917,38 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector& template_ids) { + if (template_ids.empty()) + throw Error(biln_export_error); + const auto& monomer_template = _library.getMonomerTemplateById(template_ids.front()); + const auto& template_alias = getKetStrProp(monomer_template, alias); + const bool strip_terminal_cap = monomer_class == MonomerClass::AminoAcid && template_alias.size() > 1 && + (template_alias.back() == '-' || template_alias.front() == '-') && + monomer_template.attachmentPoints().size() == 1; + return BilnAlias{format_biln_alias(template_alias, strip_terminal_cap), template_ids}; + }; + std::string template_id = _library.getMonomerTemplateIdByAlias(monomer_class, monomer_alias); if (template_id.empty()) template_id = _library.getMonomerTemplateIdByAliasHELM(monomer_class, monomer_alias); - if (template_id.empty() && monomer_class == MonomerClass::AminoAcid) + if (!template_id.empty()) + return make_biln_alias({template_id}); + + if (monomer_class == MonomerClass::AminoAcid) + { + std::vector cap_template_ids; template_id = _library.getMonomerTemplateIdByAlias(monomer_class, monomer_alias + "-"); - if (template_id.empty()) - throw Error(biln_export_error); - return format_biln_alias(getKetStrProp(_library.getMonomerTemplateById(template_id), alias)); + if (!template_id.empty()) + cap_template_ids.push_back(template_id); + template_id = _library.getMonomerTemplateIdByAlias(monomer_class, "-" + monomer_alias); + if (!template_id.empty() && std::find(cap_template_ids.begin(), cap_template_ids.end(), template_id) == cap_template_ids.end()) + cap_template_ids.push_back(template_id); + if (!cap_template_ids.empty()) + return make_biln_alias(cap_template_ids); + } + + throw Error(biln_export_error); + return BilnAlias{}; }; for (const auto& monomer_id : monomer_ids) { @@ -922,7 +958,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vectoralias()); const int node_idx = static_cast(nodes.size()); - nodes.push_back({monomer_id, monomer->ref(), monomer->alias(), biln_alias, monomer_class}); + nodes.push_back({monomer_id, monomer->ref(), monomer->alias(), biln_alias.alias, biln_alias.template_ids, monomer_class}); monomer_ref_to_node.emplace(monomer->ref(), node_idx); } @@ -930,6 +966,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector prev(nodes.size(), -1); std::vector explicit_connections; std::set> used_connection_endpoints; + std::map> node_used_attachment_points; auto is_terminal_cap_alias = [&](const std::string& monomer_alias) { auto template_id = _library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias); @@ -939,11 +976,23 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector 1 && template_alias.back() == '-' && monomer_template.attachmentPoints().size() == 1; + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && + monomer_template.attachmentPoints().size() == 1; + }; + auto is_terminal_cap_template = [&](const std::string& template_id) { + const auto& monomer_template = _library.getMonomerTemplateById(template_id); + const auto& template_alias = getKetStrProp(monomer_template, alias); + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && + monomer_template.attachmentPoints().size() == 1; }; auto is_terminal_cap_node = [&](int node_idx) { const auto& node = nodes.at(node_idx); - return node.monomer_class == MonomerClass::AminoAcid && (is_terminal_cap_alias(node.alias) || is_terminal_cap_alias(node.biln_alias)); + if (node.monomer_class != MonomerClass::AminoAcid) + return false; + for (const auto& template_id : node.biln_template_ids) + if (is_terminal_cap_template(template_id)) + return true; + return is_terminal_cap_alias(node.alias) || is_terminal_cap_alias(node.biln_alias); }; auto is_biln_backbone_connection = [](const std::string& ap1, const std::string& ap2) { return (ap1 == kAttachmentPointR1 && ap2 == kAttachmentPointR2) || (ap1 == kAttachmentPointR2 && ap2 == kAttachmentPointR1); @@ -959,11 +1008,22 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vectorsecond).monomer_id)->attachmentPoints().count(ap) == 0) + const auto& node = nodes.at(node_it->second); + bool supported_by_biln_template = false; + for (const auto& template_id : node.biln_template_ids) + { + if (_library.getMonomerTemplateById(template_id).attachmentPoints().count(ap) > 0) + { + supported_by_biln_template = true; + break; + } + } + if (monomers.at(node.monomer_id)->attachmentPoints().count(ap) == 0 || !supported_by_biln_template) throw Error("Cannot save in BILN format - unsupported attachment point '%s'.", ap.c_str()); if (!used_connection_endpoints.emplace(node_it->second, ap).second) throw Error("Cannot save in BILN format - attachment point '%s' of monomer '%s' is used more than once.", ap.c_str(), - nodes.at(node_it->second).alias.c_str()); + node.alias.c_str()); + node_used_attachment_points[node_it->second].emplace(ap); return {node_it->second, ap}; }; @@ -990,6 +1050,32 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector& chain_nodes) { std::string key; for (size_t idx = 0; idx < chain_nodes.size(); idx++) @@ -1199,6 +1285,48 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector node_to_pos; + for (int idx = 0; idx < static_cast(chain.nodes.size()); idx++) + node_to_pos.emplace(chain.nodes[idx], idx); + + std::vector endpoint_keys; + auto add_endpoint_key = [&](int local_node, const std::string& local_ap, int other_node, const std::string& other_ap) { + std::string key = std::to_string(node_to_pos.at(local_node)) + ":" + nodes.at(local_node).biln_alias + ":" + local_ap + ">"; + const auto other_pos = node_to_pos.find(other_node); + if (other_pos != node_to_pos.end()) + key += "I:" + std::to_string(other_pos->second) + ":" + nodes.at(other_node).biln_alias + ":" + other_ap; + else + key += "E:" + nodes.at(other_node).biln_alias + ":" + other_ap; + endpoint_keys.push_back(key); + }; + + for (const auto& connection : explicit_connections) + { + const auto node1_pos = node_to_pos.find(connection.node1); + const auto node2_pos = node_to_pos.find(connection.node2); + if (node1_pos == node_to_pos.end() && node2_pos == node_to_pos.end()) + continue; + if (node1_pos != node_to_pos.end()) + add_endpoint_key(connection.node1, connection.ap1, connection.node2, connection.ap2); + if (node2_pos != node_to_pos.end() && connection.node2 != connection.node1) + add_endpoint_key(connection.node2, connection.ap2, connection.node1, connection.ap1); + } + + std::sort(endpoint_keys.begin(), endpoint_keys.end()); + std::string key; + for (const auto& endpoint_key : endpoint_keys) + { + if (!key.empty()) + key += "|"; + key += endpoint_key; + } + return key; + }; + + for (auto& chain : chains) + chain.topology_sort_key = make_chain_topology_key(chain); + std::sort(chains.begin(), chains.end(), [&](const BilnChain& left, const BilnChain& right) { auto terminal_cap_rank = [&](const BilnChain& chain) { if (chain.monomer_count != 1 || chain.nodes.empty()) @@ -1233,7 +1361,9 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector right.effective_amino_acid_count; if (left.monomer_count != right.monomer_count) return left.monomer_count > right.monomer_count; - return left.sort_key < right.sort_key; + if (left.sort_key != right.sort_key) + return left.sort_key < right.sort_key; + return left.topology_sort_key < right.topology_sort_key; }); std::map> node_to_chain_pos; diff --git a/utils/indigo-service/backend/service/tests/api/indigo_test.py b/utils/indigo-service/backend/service/tests/api/indigo_test.py index 6a175eeb56..50fe24f38e 100644 --- a/utils/indigo-service/backend/service/tests/api/indigo_test.py +++ b/utils/indigo-service/backend/service/tests/api/indigo_test.py @@ -3919,7 +3919,7 @@ def test_convert_biln(self): # BILN with terminal cap cross-link biln_cross = "Ac(1,2).A-K(1,3)" helm_cross_ref = ( - "PEPTIDE1{[Ac]}|PEPTIDE2{A.K}" + "PEPTIDE1{[ac]}|PEPTIDE2{A.K}" "$PEPTIDE1,PEPTIDE2,1:R2-2:R3$$$V2.0" ) headers, data = self.get_headers( From e06d2ab6e0ccaad9f9c444e00ac0d043efd4f4bc Mon Sep 17 00:00:00 2001 From: even1024 Date: Wed, 20 May 2026 10:40:30 +0200 Subject: [PATCH 8/8] Apply BILN clang-format --- .../molecule/src/sequence_loader_helm.cpp | 5 ++--- core/indigo-core/molecule/src/sequence_saver.cpp | 12 ++++-------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/core/indigo-core/molecule/src/sequence_loader_helm.cpp b/core/indigo-core/molecule/src/sequence_loader_helm.cpp index 3e447c212a..c7795d68fe 100644 --- a/core/indigo-core/molecule/src/sequence_loader_helm.cpp +++ b/core/indigo-core/molecule/src/sequence_loader_helm.cpp @@ -666,7 +666,7 @@ void SequenceLoader::loadBILN(KetDocument& document) bonds.at(it->second).endpoints.push_back({monomer_idx, monomer_id, attachment_idx}); }; auto resolve_biln_monomer = [&](const std::string& monomer_alias, const std::vector& endpoints, bool has_prev_in_chain, - bool has_next_in_chain) { + bool has_next_in_chain) { if (_library.getMonomerTemplateIdByAlias(MonomerClass::AminoAcid, monomer_alias).size() > 0) return std::make_pair(MonomerClass::AminoAcid, monomer_alias); if (_library.getMonomerTemplateIdByAlias(MonomerClass::CHEM, monomer_alias).size() > 0) @@ -834,8 +834,7 @@ void SequenceLoader::loadBILN(KetDocument& document) return false; const auto& monomer_template = _library.getMonomerTemplateById(template_id); const auto& template_alias = getKetStrProp(monomer_template, alias); - return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && - monomer_template.attachmentPoints().size() == 1; + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && monomer_template.attachmentPoints().size() == 1; }; auto is_terminal_cap = [&](const std::unique_ptr& monomer) { return is_terminal_cap_alias(monomer->alias()); }; auto validate_endpoint = [&](const BilnEndpoint& ep, const std::string& ap) -> const std::unique_ptr& { diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index 45577f151d..e7d806da5a 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -923,8 +923,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector 1 && - (template_alias.back() == '-' || template_alias.front() == '-') && - monomer_template.attachmentPoints().size() == 1; + (template_alias.back() == '-' || template_alias.front() == '-') && monomer_template.attachmentPoints().size() == 1; return BilnAlias{format_biln_alias(template_alias, strip_terminal_cap), template_ids}; }; @@ -976,14 +975,12 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vector 1 && (template_alias.back() == '-' || template_alias.front() == '-') && - monomer_template.attachmentPoints().size() == 1; + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && monomer_template.attachmentPoints().size() == 1; }; auto is_terminal_cap_template = [&](const std::string& template_id) { const auto& monomer_template = _library.getMonomerTemplateById(template_id); const auto& template_alias = getKetStrProp(monomer_template, alias); - return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && - monomer_template.attachmentPoints().size() == 1; + return template_alias.size() > 1 && (template_alias.back() == '-' || template_alias.front() == '-') && monomer_template.attachmentPoints().size() == 1; }; auto is_terminal_cap_node = [&](int node_idx) { const auto& node = nodes.at(node_idx); @@ -1021,8 +1018,7 @@ std::string SequenceSaver::saveBILN(KetDocument& doc, const std::vectorattachmentPoints().count(ap) == 0 || !supported_by_biln_template) throw Error("Cannot save in BILN format - unsupported attachment point '%s'.", ap.c_str()); if (!used_connection_endpoints.emplace(node_it->second, ap).second) - throw Error("Cannot save in BILN format - attachment point '%s' of monomer '%s' is used more than once.", ap.c_str(), - node.alias.c_str()); + throw Error("Cannot save in BILN format - attachment point '%s' of monomer '%s' is used more than once.", ap.c_str(), node.alias.c_str()); node_used_attachment_points[node_it->second].emplace(ap); return {node_it->second, ap}; };