diff --git a/RecoTracker/LSTCore/interface/Common.h b/RecoTracker/LSTCore/interface/Common.h index b5dece7656b2d..ec0ad431898d5 100644 --- a/RecoTracker/LSTCore/interface/Common.h +++ b/RecoTracker/LSTCore/interface/Common.h @@ -87,7 +87,8 @@ namespace lst { using ArrayUxHits = edm::StdArray; }; struct Params_T5 { - static constexpr int kLayers = 5, kHits = 10; + static constexpr int kLayers = 7, kHits = 14; // 5 base + max 2 extensions + static constexpr int kBaseLayers = 5; static constexpr int kEmbed = 6; using ArrayU8xLayers = edm::StdArray; using ArrayU16xLayers = edm::StdArray; @@ -95,7 +96,8 @@ namespace lst { using ArrayFxEmbed = edm::StdArray; }; struct Params_pT5 { - static constexpr int kLayers = 7, kHits = 14; + static constexpr int kLayers = 9, kHits = 18; // 2 pixel + 7 OT (= T5::kLayers after extension) + static constexpr int kBaseLayers = 7; using ArrayU8xLayers = edm::StdArray; using ArrayU16xLayers = edm::StdArray; using ArrayUxHits = edm::StdArray; diff --git a/RecoTracker/LSTCore/interface/ModulesSoA.h b/RecoTracker/LSTCore/interface/ModulesSoA.h index 9fca722a8b5b8..dfc4678d38581 100644 --- a/RecoTracker/LSTCore/interface/ModulesSoA.h +++ b/RecoTracker/LSTCore/interface/ModulesSoA.h @@ -33,6 +33,7 @@ namespace lst { SOA_COLUMN(short, subdets), SOA_COLUMN(short, sides), SOA_COLUMN(float, eta), + SOA_COLUMN(float, phi), SOA_COLUMN(float, r), SOA_COLUMN(float, z), SOA_COLUMN(bool, isInverted), diff --git a/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h b/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h index 0175cdde74beb..e3cc1548c085f 100644 --- a/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h +++ b/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h @@ -27,6 +27,7 @@ namespace lst { SOA_COLUMN(FPX, centerX), // T3-based circle center x SOA_COLUMN(FPX, centerY), // T3-based circle center y SOA_COLUMN(bool, isDup), + SOA_COLUMN(unsigned int, nLayers), SOA_SCALAR(unsigned int, nPixelQuintuplets), SOA_SCALAR(unsigned int, totOccupancyPixelQuintuplets)); diff --git a/RecoTracker/LSTCore/interface/QuintupletsSoA.h b/RecoTracker/LSTCore/interface/QuintupletsSoA.h index ec68919078300..5e05d39f604aa 100644 --- a/RecoTracker/LSTCore/interface/QuintupletsSoA.h +++ b/RecoTracker/LSTCore/interface/QuintupletsSoA.h @@ -19,9 +19,11 @@ namespace lst { SOA_COLUMN(Params_T5::ArrayFxEmbed, t5Embed), // t5 embedding vector SOA_COLUMN(FPX, eta), SOA_COLUMN(FPX, phi), - SOA_COLUMN(FPX, score_rphisum), // r-phi based score - SOA_COLUMN(char, isDup), // duplicate flag - SOA_COLUMN(bool, tightCutFlag), // tight pass to be a TC + SOA_COLUMN(FPX, score_rphisum), // r-phi based score + SOA_COLUMN(char, isDup), // duplicate flag + SOA_COLUMN(unsigned int, nLayers), // number of active layers (5 base) + SOA_COLUMN(bool, tightCutFlag), // tight pass to be a TC + SOA_COLUMN(bool, partOfPT5), SOA_COLUMN(float, regressionRadius), SOA_COLUMN(float, regressionCenterX), SOA_COLUMN(float, regressionCenterY), @@ -29,15 +31,16 @@ namespace lst { SOA_COLUMN(FPX, innerRadius), // inner triplet circle radius SOA_COLUMN(FPX, bridgeRadius), // "middle"/bridge triplet radius SOA_COLUMN(FPX, outerRadius), // outer triplet radius - SOA_COLUMN(FPX, pt), + SOA_COLUMN(FPX, pt) #ifdef CUT_VALUE_DEBUG + , SOA_COLUMN(float, rzChiSquared), // r-z only chi2 SOA_COLUMN(float, chiSquared), SOA_COLUMN(float, nonAnchorChiSquared), SOA_COLUMN(float, dBeta1), - SOA_COLUMN(float, dBeta2), + SOA_COLUMN(float, dBeta2) #endif - SOA_COLUMN(bool, partOfPT5)); + ); using QuintupletsSoA = QuintupletsSoALayout<>; using Quintuplets = QuintupletsSoA::View; diff --git a/RecoTracker/LSTCore/src/ModuleMethods.h b/RecoTracker/LSTCore/src/ModuleMethods.h index 1d5d75f18ac2b..2aaccbc776452 100644 --- a/RecoTracker/LSTCore/src/ModuleMethods.h +++ b/RecoTracker/LSTCore/src/ModuleMethods.h @@ -169,6 +169,7 @@ namespace lst { float m_y, float m_z, float& eta, + float& phi, float& r) { subdet = (detId & (7 << 25)) >> 25; side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18; @@ -179,6 +180,7 @@ namespace lst { r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z); eta = ((m_z > 0) - (m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y)); + phi = std::atan2(m_y, m_x); } inline void loadCentroidsFromFile(const char* filePath, ModuleMetaData& mmd, uint16_t& nModules) { @@ -252,6 +254,7 @@ namespace lst { std::span host_subdets = modules_view.subdets(); std::span host_sides = modules_view.sides(); std::span host_eta = modules_view.eta(); + std::span host_phi = modules_view.phi(); std::span host_r = modules_view.r(); std::span host_z = modules_view.z(); std::span host_isInverted = modules_view.isInverted(); @@ -279,7 +282,7 @@ namespace lst { float m_z = mmd.module_z[detId]; unsigned int m_t = mmd.module_type[detId]; - float eta, r; + float eta, phi, r; uint16_t index; unsigned short layer, ring, rod, module, subdet, side; @@ -294,9 +297,10 @@ namespace lst { isInverted = false; isLower = false; eta = 0; + phi = 0; r = 0; } else { - setDerivedQuantities(detId, layer, ring, rod, module, subdet, side, m_x, m_y, m_z, eta, r); + setDerivedQuantities(detId, layer, ring, rod, module, subdet, side, m_x, m_y, m_z, eta, phi, r); isInverted = parseIsInverted(subdet, side, module, layer); isLower = parseIsLower(isInverted, detId); } @@ -319,6 +323,7 @@ namespace lst { host_subdets[index] = subdet; host_sides[index] = side; host_eta[index] = eta; + host_phi[index] = phi; host_r[index] = r; host_z[index] = m_z; host_isInverted[index] = isInverted; diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h index 33759e0b49004..fee34d0b2c88b 100644 --- a/RecoTracker/LSTCore/src/alpaka/Kernels.h +++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h @@ -1,6 +1,8 @@ #ifndef RecoTracker_LSTCore_src_alpaka_Kernels_h #define RecoTracker_LSTCore_src_alpaka_Kernels_h +#include + #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "FWCore/Utilities/interface/CMSUnrollLoop.h" @@ -56,8 +58,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { int nMatched = 0; for (int i = 0; i < Params_T5::kHits; i++) { + // Skip sentinel values from extended slots + if (hits1[i] == lst::kTCEmptyHitIdx) + continue; bool matched = false; for (int j = 0; j < Params_T5::kHits; j++) { + if (hits2[j] == lst::kTCEmptyHitIdx) + continue; if (hits1[i] == hits2[j]) { matched = true; break; @@ -83,8 +90,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { int nMatched = 0; for (int i = 0; i < Params_pT5::kHits; i++) { + // Skip sentinel values from extended slots + if (hits1[i] == lst::kTCEmptyHitIdx) + continue; bool matched = false; for (int j = 0; j < Params_pT5::kHits; j++) { + if (hits2[j] == lst::kTCEmptyHitIdx) + continue; if (hits1[i] == hits2[j]) { matched = true; break; @@ -186,18 +198,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { for (unsigned int ix1 : cms::alpakatools::uniform_elements_y(acc, nQuintuplets_lowmod)) { unsigned int ix = quintupletModuleIndices_lowmod + ix1; + if (quintuplets.isDup()[ix]) + continue; float eta1 = __H2F(quintuplets.eta()[ix]); float phi1 = __H2F(quintuplets.phi()[ix]); - float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]); + float dnnScore1 = quintuplets.dnnScore()[ix]; for (unsigned int jx1 : cms::alpakatools::uniform_elements_x(acc, ix1 + 1, nQuintuplets_lowmod)) { unsigned int jx = quintupletModuleIndices_lowmod + jx1; + if (quintuplets.isDup()[jx]) + continue; float eta2 = __H2F(quintuplets.eta()[jx]); float phi2 = __H2F(quintuplets.phi()[jx]); float dEta = alpaka::math::abs(acc, eta1 - eta2); float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2); - float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]); if (dEta > 0.1f) continue; @@ -206,12 +221,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { continue; int nMatched = checkHitsT5(ix, jx, quintuplets); - const int minNHitsForDup_T5 = 7; - if (nMatched >= minNHitsForDup_T5) { - if (score_rphisum1 >= score_rphisum2) { + // Proportional sharing: at least 60% of the shorter track's hits. + unsigned int nLayersIx = quintuplets.nLayers()[ix]; + unsigned int nLayersJx = quintuplets.nLayers()[jx]; + unsigned int nHitsIx = 2 * nLayersIx; + unsigned int nHitsJx = 2 * nLayersJx; + int minNHitsForDup = static_cast(0.6f * (nHitsIx < nHitsJx ? nHitsIx : nHitsJx)); + if (nMatched >= minNHitsForDup) { + // Tiebreak: longer track wins; otherwise rphisum at high pT, DNN score at low pT. + if (nLayersIx > nLayersJx) { + rmQuintupletFromMemory(quintuplets, jx); + } else if (nLayersJx > nLayersIx) { rmQuintupletFromMemory(quintuplets, ix); } else { - rmQuintupletFromMemory(quintuplets, jx); + float ptIx = __H2F(quintuplets.innerRadius()[ix]) * lst::k2Rinv1GeVf * 2; + float ptJx = __H2F(quintuplets.innerRadius()[jx]) * lst::k2Rinv1GeVf * 2; + if (ptIx > 5.0f || ptJx > 5.0f) { + float rphisum1 = __H2F(quintuplets.score_rphisum()[ix]); + float rphisum2 = __H2F(quintuplets.score_rphisum()[jx]); + if (rphisum1 >= rphisum2) + rmQuintupletFromMemory(quintuplets, ix); + else + rmQuintupletFromMemory(quintuplets, jx); + } else { + float dnnScore2 = quintuplets.dnnScore()[jx]; + if (dnnScore1 <= dnnScore2) + rmQuintupletFromMemory(quintuplets, ix); + else + rmQuintupletFromMemory(quintuplets, jx); + } } } } @@ -220,6 +258,204 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } }; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void tryExtendT5( + TAcc const& acc, Quintuplets quintuplets, unsigned int winnerIdx, unsigned int loserIdx, int loserSlot) { + if (loserSlot < 0) + return; + + unsigned int newSlot = alpaka::atomicAdd(acc, &quintuplets.nLayers()[winnerIdx], 1u, alpaka::hierarchy::Threads{}); + + if (newSlot >= Params_T5::kLayers) { + alpaka::atomicSub(acc, &quintuplets.nLayers()[winnerIdx], 1u, alpaka::hierarchy::Threads{}); + return; + } + + quintuplets.logicalLayers()[winnerIdx][newSlot] = quintuplets.logicalLayers()[loserIdx][loserSlot]; + quintuplets.lowerModuleIndices()[winnerIdx][newSlot] = quintuplets.lowerModuleIndices()[loserIdx][loserSlot]; + quintuplets.hitIndices()[winnerIdx][2 * newSlot] = quintuplets.hitIndices()[loserIdx][2 * loserSlot]; + quintuplets.hitIndices()[winnerIdx][2 * newSlot + 1] = quintuplets.hitIndices()[loserIdx][2 * loserSlot + 1]; + } + + struct ExtendT5FromDupT5 { + // Packed [score:32 | T5 index:28 | layer slot:4] for atomic best-per-OT-layer tracking. + static constexpr int kPackedScoreShift = 32; + static constexpr int kPackedIndexShift = 4; + static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF; + static constexpr unsigned int kPackedSlotMask = 0xF; + static constexpr int kT5DuplicateMinSharedHits = 8; + + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + ModulesConst modules, + ObjectRangesConst ranges, + Quintuplets quintuplets, + QuintupletsOccupancyConst quintupletsOccupancy) const { + // Best candidate per OT logical layer (1..11), packed score|index|slot. + uint64_t* sharedBestPacked = alpaka::declareSharedVar(acc); + + // One block per T5 in 1D; block index = ref T5 index. + const unsigned int refT5Index = alpaka::getIdx(acc)[0u]; + + // Skip empty/unallocated T5 slots. + if (quintuplets.nLayers()[refT5Index] == 0) + return; + + // Initialize shared memory once per block. + if (cms::alpakatools::once_per_block(acc)) { + for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { + sharedBestPacked[logicalLayerBin] = 0; + } + } + alpaka::syncBlockThreads(acc); + + const float baseEta = __H2F(quintuplets.eta()[refT5Index]); + const float basePhi = __H2F(quintuplets.phi()[refT5Index]); + const uint8_t baseStartLogicalLayer = quintuplets.logicalLayers()[refT5Index][0]; + + // Hoist ref data once: hit indices and embedding read every candidate iteration otherwise. + float refEmbed[Params_T5::kEmbed]; + CMS_UNROLL_LOOP + for (unsigned int e = 0; e < Params_T5::kEmbed; ++e) + refEmbed[e] = quintuplets.t5Embed()[refT5Index][e]; + + constexpr unsigned int kRefHits = 2 * Params_T5::kBaseLayers; + unsigned int refHits[kRefHits]; + CMS_UNROLL_LOOP + for (unsigned int h = 0; h < kRefHits; ++h) + refHits[h] = quintuplets.hitIndices()[refT5Index][h]; + + const int threadIndexFlat = alpaka::getIdx(acc)[0u]; + const int blockDimFlat = alpaka::getWorkDiv(acc)[0u]; + + // Ref-starts-at-layer-1 special case: only ref's slot-1 module can host a valid candidate. + const bool restrictToRefSlot1 = (baseStartLogicalLayer == 1); + const uint16_t nEligibleT5Modules = ranges.nEligibleT5Modules(); + const int loopCount = restrictToRefSlot1 ? 1 : static_cast(nEligibleT5Modules); + + for (int idx = threadIndexFlat; idx < loopCount; idx += blockDimFlat) { + const uint16_t lowerModuleIndex = restrictToRefSlot1 ? quintuplets.lowerModuleIndices()[refT5Index][1] + : ranges.indicesOfEligibleT5Modules()[idx]; + + if (!restrictToRefSlot1) { + // Skip same-starting-layer modules (logical = physical + 6 for endcap, see Triplet.h). + const short modSubdet = modules.subdets()[lowerModuleIndex]; + const int moduleLogicalLayer = + static_cast(modules.layers()[lowerModuleIndex]) + (modSubdet == Endcap ? 6 : 0); + if (moduleLogicalLayer == static_cast(baseStartLogicalLayer)) + continue; + + // Module-level eta/phi pre-cut; margin covers per-T5 window plus T5-vs-module spread. + if (alpaka::math::abs(acc, baseEta - modules.eta()[lowerModuleIndex]) > 0.3f) + continue; + if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, modules.phi()[lowerModuleIndex])) > 0.5f) + continue; + } + + const int firstQuintupletInModule = ranges.quintupletModuleIndices()[lowerModuleIndex]; + if (firstQuintupletInModule == -1) + continue; + + const unsigned int nQuintupletsInModule = quintupletsOccupancy.nQuintuplets()[lowerModuleIndex]; + + for (unsigned int quintupletOffset = 0; quintupletOffset < nQuintupletsInModule; ++quintupletOffset) { + const unsigned int testT5Index = firstQuintupletInModule + quintupletOffset; + if (testT5Index == refT5Index) + continue; + + // Per-T5 eta/phi window. + const float candidateEta = __H2F(quintuplets.eta()[testT5Index]); + if (alpaka::math::abs(acc, baseEta - candidateEta) > 0.1f) + continue; + + const float candidatePhi = __H2F(quintuplets.phi()[testT5Index]); + if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, candidatePhi)) > 0.1f) + continue; + + // Embedding distance against hoisted refEmbed. + float embedDistance2 = 0.f; + CMS_UNROLL_LOOP + for (unsigned int embedIndex = 0; embedIndex < Params_T5::kEmbed; ++embedIndex) { + const float diff = refEmbed[embedIndex] - quintuplets.t5Embed()[testT5Index][embedIndex]; + embedDistance2 += diff * diff; + } + if (embedDistance2 > 1.0f) + continue; + + // Hit matching against hoisted ref hits; record the candidate slot with no shared hit. + int sharedHitCount = 0; + int unmatchedLayerSlot = -1; + CMS_UNROLL_LOOP + for (unsigned int layerIndex = 0; layerIndex < Params_T5::kBaseLayers; ++layerIndex) { + const unsigned int candidateHit0 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 0]; + const unsigned int candidateHit1 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 1]; + + bool hit0InBase = false; + bool hit1InBase = false; + CMS_UNROLL_LOOP + for (unsigned int baseHitIndex = 0; baseHitIndex < kRefHits; ++baseHitIndex) { + const unsigned int baseHit = refHits[baseHitIndex]; + hit0InBase = hit0InBase || (candidateHit0 == baseHit); + hit1InBase = hit1InBase || (candidateHit1 == baseHit); + } + + sharedHitCount += int(hit0InBase) + int(hit1InBase); + if (!hit0InBase && !hit1InBase) + unmatchedLayerSlot = layerIndex; + } + + if (sharedHitCount < kT5DuplicateMinSharedHits) + continue; + if (unmatchedLayerSlot < 0) + continue; + + // Score = DNN output; layer bin = candidate's unmatched OT layer (1..11) - 1. + const float candidateScore = quintuplets.dnnScore()[testT5Index]; + const uint8_t newLogicalLayer = quintuplets.logicalLayers()[testT5Index][unmatchedLayerSlot]; + const int logicalLayerBin = static_cast(newLogicalLayer) - 1; + + uint64_t scoreBits = std::bit_cast(candidateScore); + uint64_t newPacked = (scoreBits << kPackedScoreShift) | + (static_cast(testT5Index & kPackedIndexMask) << kPackedIndexShift) | + (unmatchedLayerSlot & kPackedSlotMask); + + // Atomic CAS into shared best-per-layer slot, retry until we win or are beaten. + uint64_t oldPacked = sharedBestPacked[logicalLayerBin]; + while (true) { + const float oldScore = std::bit_cast(static_cast(oldPacked >> kPackedScoreShift)); + if (candidateScore <= oldScore) + break; + + uint64_t assumedOld = alpaka::atomicCas( + acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{}); + + if (assumedOld == oldPacked) { + break; + } else { + oldPacked = assumedOld; + } + } + } + } + + alpaka::syncBlockThreads(acc); + + // One thread per block applies the per-layer winners. + if (cms::alpakatools::once_per_block(acc)) { + CMS_UNROLL_LOOP + for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { + uint64_t bestPacked = sharedBestPacked[logicalLayerBin]; + if ((bestPacked >> kPackedScoreShift) == 0) + continue; + + const int bestT5Index = static_cast((bestPacked >> kPackedIndexShift) & kPackedIndexMask); + const int bestT5LayerSlot = static_cast(bestPacked & kPackedSlotMask); + + tryExtendT5(acc, quintuplets, refT5Index, bestT5Index, bestT5LayerSlot); + } + } + } + }; + struct RemoveDupQuintupletsBeforeTC { ALPAKA_FN_ACC void operator()(Acc2D const& acc, Quintuplets quintuplets, @@ -250,7 +486,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { const bool isPT5_ix = quintuplets.partOfPT5()[ix]; const float eta1 = __H2F(quintuplets.eta()[ix]); const float phi1 = __H2F(quintuplets.phi()[ix]); - const float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]); + const float dnnScore1 = quintuplets.dnnScore()[ix]; for (unsigned int jx1 = 0; jx1 < nQuintuplets_lowmod2; jx1++) { unsigned int jx = quintupletModuleIndices_lowmod2 + jx1; @@ -286,15 +522,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { d2 += diff * diff; } - if (((dR2 < 0.001f || nMatched >= minNHitsForDup_T5) && d2 < 1.0f) || (dR2 < 0.02f && d2 < 0.1f)) { - const float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]); - if (isPT5_jx || score_rphisum1 > score_rphisum2) { - rmQuintupletFromMemory(quintuplets, ix, true); - } else if (isPT5_ix || score_rphisum1 < score_rphisum2) { - rmQuintupletFromMemory(quintuplets, jx, true); + // 99th percentile of true-dup d2 distribution measured on 100 PU200 events. + constexpr float d2Thresh = 0.25f; + if (((dR2 < 0.001f || nMatched >= minNHitsForDup_T5) && d2 < d2Thresh) || (dR2 < 0.02f && d2 < 0.1f)) { + float ptIx = __H2F(quintuplets.innerRadius()[ix]) * lst::k2Rinv1GeVf * 2; + float ptJx = __H2F(quintuplets.innerRadius()[jx]) * lst::k2Rinv1GeVf * 2; + bool highPt = (ptIx > 5.0f || ptJx > 5.0f); + bool ixLoses; + if (isPT5_jx) { + ixLoses = true; + } else if (isPT5_ix) { + ixLoses = false; + } else if (highPt) { + float rphisum1 = __H2F(quintuplets.score_rphisum()[ix]); + float rphisum2 = __H2F(quintuplets.score_rphisum()[jx]); + ixLoses = (rphisum1 > rphisum2) || (rphisum1 == rphisum2 && ix < jx); } else { - rmQuintupletFromMemory(quintuplets, (ix < jx ? ix : jx), true); + float dnnScore2 = quintuplets.dnnScore()[jx]; + ixLoses = (dnnScore1 < dnnScore2) || (dnnScore1 == dnnScore2 && ix < jx); } + if (ixLoses) + rmQuintupletFromMemory(quintuplets, ix, true); + else + rmQuintupletFromMemory(quintuplets, jx, true); } } } diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc index 86e18590b5775..9fde48df6dafe 100644 --- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc @@ -750,29 +750,7 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) tc_pls_triplets, nTotal); - // Get number of TCs to configure grid - auto nTrackCandidates_buf_h = cms::alpakatools::make_host_buffer(queue_); - auto nTrackCandidates_buf_d = - cms::alpakatools::make_device_view(queue_, (*trackCandidatesBaseDC_)->nTrackCandidates()); - alpaka::memcpy(queue_, nTrackCandidates_buf_h, nTrackCandidates_buf_d); - alpaka::wait(queue_); - unsigned int nTC = *nTrackCandidates_buf_h.data(); - - if (nTC == 0) - return; - auto const wd = cms::alpakatools::make_workdiv(nTC, 128); - - alpaka::exec(queue_, - wd, - ExtendTrackCandidatesFromDupT5{}, - modules_.const_view().modules(), - rangesDC_->const_view(), - quintupletsDC_->const_view().quintuplets(), - quintupletsDC_->const_view().quintupletsOccupancy(), - trackCandidatesBaseDC_->view(), - trackCandidatesExtendedDC_->view()); - - // Check if TC buffer was possibly truncated + // Check if either n_max_pixel_track_candidates or n_max_nonpixel_track_candidates was reached auto nTrackCanTotalHost_buf = cms::alpakatools::make_host_buffer(queue_); alpaka::memcpy(queue_, nTrackCanTotalHost_buf, @@ -971,6 +949,8 @@ void LSTEvent::createQuintuplets() { auto quintuplets = quintupletsDC_->view().quintuplets(); auto isDup_view = cms::alpakatools::make_device_view(queue_, quintuplets.isDup()); alpaka::memset(queue_, isDup_view, 0u); + auto nLayers_view = cms::alpakatools::make_device_view(queue_, quintuplets.nLayers()); + alpaka::memset(queue_, nLayers_view, 0u); auto tightCutFlag_view = cms::alpakatools::make_device_view(queue_, quintuplets.tightCutFlag()); alpaka::memset(queue_, tightCutFlag_view, 0u); auto partOfPT5_view = cms::alpakatools::make_device_view(queue_, quintuplets.partOfPT5()); @@ -994,6 +974,18 @@ void LSTEvent::createQuintuplets() { nEligibleT5Modules, ptCut_); + if (nTotalQuintuplets > 0) { + auto const extendT5_workDiv = cms::alpakatools::make_workdiv(nTotalQuintuplets, 128); + + alpaka::exec(queue_, + extendT5_workDiv, + ExtendT5FromDupT5{}, + modules_.const_view().modules(), + rangesDC_->const_view(), + quintupletsDC_->view().quintuplets(), + quintupletsDC_->const_view().quintupletsOccupancy()); + } + auto const removeDupQuintupletsAfterBuild_workDiv = cms::alpakatools::make_workdiv({max_blocks, 1, 1}, {1, 16, 16}); diff --git a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h index d137889a98fbf..96366af78066e 100644 --- a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h @@ -46,40 +46,37 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { pixelQuintuplets.centerX()[pixelQuintupletIndex] = __F2H(centerX); pixelQuintuplets.centerY()[pixelQuintupletIndex] = __F2H(centerY); + // Copy pixel layers (always 2) pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][0] = 0; pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][1] = 0; - pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][2] = quintuplets.logicalLayers()[t5Index][0]; - pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][3] = quintuplets.logicalLayers()[t5Index][1]; - pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][4] = quintuplets.logicalLayers()[t5Index][2]; - pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][5] = quintuplets.logicalLayers()[t5Index][3]; - pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][6] = quintuplets.logicalLayers()[t5Index][4]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][0] = segments.innerLowerModuleIndices()[pixelIndex]; pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][1] = segments.outerLowerModuleIndices()[pixelIndex]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][2] = quintuplets.lowerModuleIndices()[t5Index][0]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][3] = quintuplets.lowerModuleIndices()[t5Index][1]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][4] = quintuplets.lowerModuleIndices()[t5Index][2]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][5] = quintuplets.lowerModuleIndices()[t5Index][3]; - pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][6] = quintuplets.lowerModuleIndices()[t5Index][4]; unsigned int pixelInnerMD = segments.mdIndices()[pixelIndex][0]; unsigned int pixelOuterMD = segments.mdIndices()[pixelIndex][1]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][0] = mds.anchorHitIndices()[pixelInnerMD]; pixelQuintuplets.hitIndices()[pixelQuintupletIndex][1] = mds.outerHitIndices()[pixelInnerMD]; pixelQuintuplets.hitIndices()[pixelQuintupletIndex][2] = mds.anchorHitIndices()[pixelOuterMD]; pixelQuintuplets.hitIndices()[pixelQuintupletIndex][3] = mds.outerHitIndices()[pixelOuterMD]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][4] = quintuplets.hitIndices()[t5Index][0]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][5] = quintuplets.hitIndices()[t5Index][1]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][6] = quintuplets.hitIndices()[t5Index][2]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][7] = quintuplets.hitIndices()[t5Index][3]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][8] = quintuplets.hitIndices()[t5Index][4]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][9] = quintuplets.hitIndices()[t5Index][5]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][10] = quintuplets.hitIndices()[t5Index][6]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][11] = quintuplets.hitIndices()[t5Index][7]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][12] = quintuplets.hitIndices()[t5Index][8]; - pixelQuintuplets.hitIndices()[pixelQuintupletIndex][13] = quintuplets.hitIndices()[t5Index][9]; + // Copy T5 layers (respects nLayers for extended T5s); fill remaining slots with sentinels. + unsigned int t5Layers = quintuplets.nLayers()[t5Index]; + auto& dstLayers = pixelQuintuplets.logicalLayers()[pixelQuintupletIndex]; + auto& dstModules = pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex]; + auto& dstHits = pixelQuintuplets.hitIndices()[pixelQuintupletIndex]; + auto const& srcLayers = quintuplets.logicalLayers()[t5Index]; + auto const& srcModules = quintuplets.lowerModuleIndices()[t5Index]; + auto const& srcHits = quintuplets.hitIndices()[t5Index]; + for (unsigned int i = 0; i < Params_T5::kLayers; ++i) { + const bool inT5 = (i < t5Layers); + dstLayers[2 + i] = inT5 ? srcLayers[i] : uint8_t{0}; + dstModules[2 + i] = inT5 ? srcModules[i] : lst::kTCEmptyLowerModule; + dstHits[4 + 2 * i] = inT5 ? srcHits[2 * i] : lst::kTCEmptyHitIdx; + dstHits[4 + 2 * i + 1] = inT5 ? srcHits[2 * i + 1] : lst::kTCEmptyHitIdx; + } + + // Set pT5 nLayers = 2 pixel + T5 layers + pixelQuintuplets.nLayers()[pixelQuintupletIndex] = 2 + t5Layers; #ifdef CUT_VALUE_DEBUG pixelQuintuplets.rzChiSquared()[pixelQuintupletIndex] = rzChiSquared; @@ -408,7 +405,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float a = -100 / kR1GeVf * charge; - for (size_t i = 0; i < Params_T5::kLayers; i++) { + for (size_t i = 0; i < Params_T5::kBaseLayers; i++) { float zsi = zs[i] / 100; float rtsi = rts[i] / 100; uint16_t lowerModuleIndex = lowerModuleIndices[i]; @@ -532,23 +529,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { uint16_t lowerModuleIndex4 = quintuplets.lowerModuleIndices()[quintupletIndex][3]; uint16_t lowerModuleIndex5 = quintuplets.lowerModuleIndices()[quintupletIndex][4]; - uint16_t lowerModuleIndices[Params_T5::kLayers] = { + uint16_t lowerModuleIndices[Params_T5::kBaseLayers] = { lowerModuleIndex1, lowerModuleIndex2, lowerModuleIndex3, lowerModuleIndex4, lowerModuleIndex5}; float rtPix[Params_pLS::kLayers] = {pixelData.rt_InLo, pixelData.rt_InUp}; float xPix[Params_pLS::kLayers] = {pixelData.x_InLo, pixelData.x_InUp}; float yPix[Params_pLS::kLayers] = {pixelData.y_InLo, pixelData.y_InUp}; float zPix[Params_pLS::kLayers] = {pixelData.z_InLo, pixelData.z_InUp}; - float zs[Params_T5::kLayers] = {mds.anchorZ()[firstMDIndex], - mds.anchorZ()[secondMDIndex], - mds.anchorZ()[thirdMDIndex], - mds.anchorZ()[fourthMDIndex], - mds.anchorZ()[fifthMDIndex]}; - float rts[Params_T5::kLayers] = {mds.anchorRt()[firstMDIndex], - mds.anchorRt()[secondMDIndex], - mds.anchorRt()[thirdMDIndex], - mds.anchorRt()[fourthMDIndex], - mds.anchorRt()[fifthMDIndex]}; + float zs[Params_T5::kBaseLayers] = {mds.anchorZ()[firstMDIndex], + mds.anchorZ()[secondMDIndex], + mds.anchorZ()[thirdMDIndex], + mds.anchorZ()[fourthMDIndex], + mds.anchorZ()[fifthMDIndex]}; + float rts[Params_T5::kBaseLayers] = {mds.anchorRt()[firstMDIndex], + mds.anchorRt()[secondMDIndex], + mds.anchorRt()[thirdMDIndex], + mds.anchorRt()[fourthMDIndex], + mds.anchorRt()[fifthMDIndex]}; rzChiSquared = 0; @@ -582,16 +579,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } //outer T5 - float xs[Params_T5::kLayers] = {mds.anchorX()[firstMDIndex], - mds.anchorX()[secondMDIndex], - mds.anchorX()[thirdMDIndex], - mds.anchorX()[fourthMDIndex], - mds.anchorX()[fifthMDIndex]}; - float ys[Params_T5::kLayers] = {mds.anchorY()[firstMDIndex], - mds.anchorY()[secondMDIndex], - mds.anchorY()[thirdMDIndex], - mds.anchorY()[fourthMDIndex], - mds.anchorY()[fifthMDIndex]}; + float xs[Params_T5::kBaseLayers] = {mds.anchorX()[firstMDIndex], + mds.anchorX()[secondMDIndex], + mds.anchorX()[thirdMDIndex], + mds.anchorX()[fourthMDIndex], + mds.anchorX()[fifthMDIndex]}; + float ys[Params_T5::kBaseLayers] = {mds.anchorY()[firstMDIndex], + mds.anchorY()[secondMDIndex], + mds.anchorY()[thirdMDIndex], + mds.anchorY()[fourthMDIndex], + mds.anchorY()[fifthMDIndex]}; //get the appropriate centers centerX = pixelData.circleCenterX; diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h index 15b0aa1f7f405..c748b6b7f35a2 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -63,6 +63,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { quintuplets.phi()[quintupletIndex] = __F2H(phi); quintuplets.score_rphisum()[quintupletIndex] = __F2H(scores); quintuplets.isDup()[quintupletIndex] = 0; + quintuplets.nLayers()[quintupletIndex] = Params_T5::kBaseLayers; quintuplets.tightCutFlag()[quintupletIndex] = tightCutFlag; quintuplets.regressionRadius()[quintupletIndex] = regressionRadius; quintuplets.regressionCenterX()[quintupletIndex] = regressionCenterX; @@ -97,6 +98,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { for (unsigned int i = 0; i < Params_T5::kEmbed; ++i) { quintuplets.t5Embed()[quintupletIndex][i] = t5Embed[i]; } + + // Initialize extended layer slots with sentinel values + for (int i = Params_T5::kBaseLayers; i < Params_T5::kLayers; ++i) { + quintuplets.logicalLayers()[quintupletIndex][i] = 0; + quintuplets.lowerModuleIndices()[quintupletIndex][i] = lst::kTCEmptyLowerModule; + quintuplets.hitIndices()[quintupletIndex][2 * i] = lst::kTCEmptyHitIdx; + quintuplets.hitIndices()[quintupletIndex][2 * i + 1] = lst::kTCEmptyHitIdx; + } } //bounds can be found at http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_RZFix/t5_rz_thresholds.txt @@ -1633,7 +1642,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { computeSigmasForRegression(acc, modules, lowerModuleIndices, delta1, delta2, slopes, isFlat); regressionRadius = computeRadiusUsingRegression(acc, - Params_T5::kLayers, + Params_T5::kBaseLayers, xVec, yVec, delta1, @@ -1647,7 +1656,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { //compute the other chisquared //non anchor is always shifted for tilted and endcap! - float nonAnchorDelta1[Params_T5::kLayers], nonAnchorDelta2[Params_T5::kLayers], nonAnchorSlopes[Params_T5::kLayers]; + float nonAnchorDelta1[Params_T5::kBaseLayers], nonAnchorDelta2[Params_T5::kBaseLayers], + nonAnchorSlopes[Params_T5::kBaseLayers]; float nonAnchorxs[] = {mds.outerX()[firstMDIndex], mds.outerX()[secondMDIndex], mds.outerX()[thirdMDIndex], @@ -1666,10 +1676,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { nonAnchorDelta2, nonAnchorSlopes, isFlat, - Params_T5::kLayers, + Params_T5::kBaseLayers, false); nonAnchorChiSquared = computeChiSquared(acc, - Params_T5::kLayers, + Params_T5::kBaseLayers, nonAnchorxs, nonAnchorys, nonAnchorDelta1, diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h index b6aaa14e9e14d..3afb7295e1dab 100644 --- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -125,6 +125,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { unsigned int hit0 = hitIndices[2 * i]; unsigned int hit1 = hitIndices[2 * i + 1]; + // Skip empty slots (sentinel values from extended T5/pT5 arrays) + if (hit0 == lst::kTCEmptyHitIdx) + continue; + int layerSlot; if (i < nPixelLayers) { @@ -239,7 +243,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { if (quintuplets.isDup()[iT5] || quintuplets.partOfPT5()[iT5]) continue; - unsigned int loop_bound = pixelQuintuplets.nPixelQuintuplets() + pixelTriplets.nPixelTriplets(); + const unsigned int nPT5 = pixelQuintuplets.nPixelQuintuplets(); + const unsigned int loop_bound = nPT5 + pixelTriplets.nPixelTriplets(); float eta1 = __H2F(quintuplets.eta()[iT5]); float phi1 = __H2F(quintuplets.phi()[iT5]); @@ -249,14 +254,25 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { iEmbedT5[k] = quintuplets.t5Embed()[iT5][k]; } + // Pre-load T5 hits and iT5-only dup-cleaning constants outside the jx loop. + unsigned int iT5Hits[Params_T5::kHits]; + CMS_UNROLL_LOOP for (int i = 0; i < Params_T5::kHits; ++i) { iT5Hits[i] = quintuplets.hitIndices()[iT5][i]; } + // Longer (extended) T5s get a tighter cut: 3x smaller d2 and 6 (vs 4) shared OT hits. + const bool isExtT5 = quintuplets.nLayers()[iT5] > Params_T5::kBaseLayers; + const float d2Lo = isExtT5 ? 0.03f : 0.1f; + const float d2Hi = isExtT5 ? 0.3f : 1.0f; + const int otThresh = isExtT5 ? 6 : 4; + // Cross-clean against both pT5s and pT3s for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, loop_bound)) { + const bool isPT5 = (jx < nPT5); + const unsigned int ptidx = isPT5 ? 0u : (jx - nPT5); + float eta2, phi2; - if (jx < pixelQuintuplets.nPixelQuintuplets()) { + if (isPT5) { eta2 = __H2F(pixelQuintuplets.eta()[jx]); phi2 = __H2F(pixelQuintuplets.phi()[jx]); } else { - unsigned int ptidx = jx - pixelQuintuplets.nPixelQuintuplets(); eta2 = __H2F(pixelTriplets.eta()[ptidx]); phi2 = __H2F(pixelTriplets.phi()[ptidx]); } @@ -265,7 +281,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; - if (jx < pixelQuintuplets.nPixelQuintuplets()) { + if (isPT5) { unsigned int jT5 = pixelQuintuplets.quintupletIndices()[jx]; float d2 = 0.f; // Compute distance-squared between the two t5 embeddings. @@ -273,11 +289,50 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { float df = iEmbedT5[k] - quintuplets.t5Embed()[jT5][k]; d2 += df * df; } - if ((dR2 < 0.02f && d2 < 0.1f) || (dR2 < 1e-3f && d2 < 1.0f)) { - quintuplets.isDup()[iT5] = true; + + if ((dR2 < 0.02f && d2 < d2Lo) || (dR2 < 1e-3f && d2 < d2Hi)) { + quintuplets.isDup()[iT5] |= 4; + } else if (dEta < 0.15f && alpaka::math::abs(acc, dPhi) < 0.15f) { + // OT hit matching: T5 hits vs pT5 OT hits + int nOTMatched = 0; + for (int i = 0; i < Params_T5::kHits; ++i) { + unsigned int hitI = iT5Hits[i]; + if (hitI == lst::kTCEmptyHitIdx) + continue; + for (int j = Params_pLS::kHits; j < Params_pT5::kHits; ++j) { + unsigned int pT5Hit = pixelQuintuplets.hitIndices()[jx][j]; + if (pT5Hit == lst::kTCEmptyHitIdx) + continue; + if (hitI == pT5Hit) { + nOTMatched++; + break; + } + } + } + if (nOTMatched >= otThresh) + quintuplets.isDup()[iT5] |= 4; } } else if (dR2 < 1e-3f) { - quintuplets.isDup()[iT5] = true; + quintuplets.isDup()[iT5] |= 4; + } else if (dEta < 0.15f && alpaka::math::abs(acc, dPhi) < 0.15f) { + // OT hit matching: T5 hits vs pT3 OT hits (same extended logic as pT5 path) + int nOTMatched = 0; + for (int i = 0; i < Params_T5::kHits; ++i) { + unsigned int hitI = iT5Hits[i]; + if (hitI == lst::kTCEmptyHitIdx) + continue; + for (int j = Params_pLS::kHits; j < Params_pT3::kHits; ++j) { + unsigned int pT3Hit = pixelTriplets.hitIndices()[ptidx][j]; + if (pT3Hit == lst::kTCEmptyHitIdx) + continue; + if (hitI == pT3Hit) { + nOTMatched++; + break; + } + } + } + if (nOTMatched >= otThresh) + quintuplets.isDup()[iT5] |= 4; } if (quintuplets.isDup()[iT5]) @@ -703,229 +758,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { } }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void countSharedT5HitsAndFindUnmatched(QuintupletsConst quintuplets, - unsigned int testT5Index, - unsigned int refT5Index, - int& sharedHitCount, - int& unmatchedLayerSlot) { - sharedHitCount = 0; - unmatchedLayerSlot = -1; - - CMS_UNROLL_LOOP - for (int layerIndex = 0; layerIndex < Params_T5::kLayers; ++layerIndex) { - const unsigned int candidateHit0 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 0]; - const unsigned int candidateHit1 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 1]; - - bool hit0InBase = false; - bool hit1InBase = false; - // Suppress compiler warning with HIP backend -#ifndef ALPAKA_ACC_GPU_HIP_ENABLED - CMS_UNROLL_LOOP -#endif - for (int baseHitIndex = 0; baseHitIndex < Params_T5::kHits; ++baseHitIndex) { - const unsigned int baseHit = quintuplets.hitIndices()[refT5Index][baseHitIndex]; - if (candidateHit0 == baseHit) - hit0InBase = true; - if (candidateHit1 == baseHit) - hit1InBase = true; - if (hit0InBase && hit1InBase) - break; - } - - if (hit0InBase) - ++sharedHitCount; - if (hit1InBase) - ++sharedHitCount; - - if (!hit0InBase && !hit1InBase) { - unmatchedLayerSlot = layerIndex; - } - } - } - - struct ExtendTrackCandidatesFromDupT5 { - static constexpr int kPackedScoreShift = 32; // Bit shift for packing the score (upper 32 bits). - static constexpr int kPackedIndexShift = 4; // Bit shift for packing the T5 index. - static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF; // Mask for the 28-bit T5 index. - static constexpr unsigned int kPackedSlotMask = 0xF; // Mask for the 4-bit layer slot. - static constexpr int kT5DuplicateMinSharedHits = 8; // Minimum shared hits required to consider a T5 for merging. - - ALPAKA_FN_ACC void operator()(Acc1D const& acc, - ModulesConst modules, - ObjectRangesConst ranges, - QuintupletsConst quintuplets, - QuintupletsOccupancyConst quintupletsOccupancy, - TrackCandidatesBase candsBase, - TrackCandidatesExtended candsExtended) const { - // Assert that the number of blocks equals nTC - ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0u] == candsBase.nTrackCandidates())); - - // Shared memory: Packed best candidate info per logical OT layer (1..11) - // Format: [Score (32 bits) | Quintuplet Index (28 bits) | Layer Slot (4 bits)] - uint64_t* sharedBestPacked = alpaka::declareSharedVar(acc); - - // In 1D, the Grid index [0] is the Block index (which is our TC index) - const unsigned int trackCandidateIndex = alpaka::getIdx(acc)[0u]; - - // Initialize shared memory once per block - if (cms::alpakatools::once_per_block(acc)) { - for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { - sharedBestPacked[logicalLayerBin] = 0; // 0.0f score, 0 index, 0 slot - } - } - alpaka::syncBlockThreads(acc); - - // Only consider merging T5s onto T5/pT5 TCs for now - const LSTObjType trackCandidateType = candsBase.trackCandidateType()[trackCandidateIndex]; - if (!(trackCandidateType == LSTObjType::T5 || trackCandidateType == LSTObjType::pT5)) - return; - - // Base quintuplet (for T5: objectIndices[0], for pT5: objectIndices[1]) - const unsigned int refT5Index = (trackCandidateType == LSTObjType::T5) - ? candsExtended.objectIndices()[trackCandidateIndex][0] - : candsExtended.objectIndices()[trackCandidateIndex][1]; - - const float baseEta = __H2F(quintuplets.eta()[refT5Index]); - const float basePhi = __H2F(quintuplets.phi()[refT5Index]); - const uint8_t baseStartLogicalLayer = quintuplets.logicalLayers()[refT5Index][0]; - - // Module range to scan - int lowerModuleBegin = 0; - int lowerModuleEnd = static_cast(modules.nLowerModules()); - - // If starting at layer 1, restrict to the module of the second hit - if (baseStartLogicalLayer == 1) { - lowerModuleBegin = quintuplets.lowerModuleIndices()[refT5Index][1]; - lowerModuleEnd = lowerModuleBegin + 1; - } - - // Linear thread index and block dimension in 1D - const int threadIndexFlat = alpaka::getIdx(acc)[0u]; - const int blockDimFlat = alpaka::getWorkDiv(acc)[0u]; - - // Scan over lower modules - for (int lowerModuleIndex = lowerModuleBegin + threadIndexFlat; lowerModuleIndex < lowerModuleEnd; - lowerModuleIndex += blockDimFlat) { - const int firstQuintupletInModule = ranges.quintupletModuleIndices()[lowerModuleIndex]; - if (firstQuintupletInModule == -1) - continue; - - const unsigned int nQuintupletsInModule = quintupletsOccupancy.nQuintuplets()[lowerModuleIndex]; - - // Scan over quintuplets in this module - for (unsigned int quintupletOffset = 0; quintupletOffset < nQuintupletsInModule; ++quintupletOffset) { - const unsigned int testT5Index = firstQuintupletInModule + quintupletOffset; - if (testT5Index == refT5Index) - continue; - - // Require different starting layer for now - if (quintuplets.logicalLayers()[testT5Index][0] == baseStartLogicalLayer) - continue; - - // Quick eta / phi window selection - const float candidateEta = __H2F(quintuplets.eta()[testT5Index]); - if (alpaka::math::abs(acc, baseEta - candidateEta) > 0.1f) - continue; - - const float candidatePhi = __H2F(quintuplets.phi()[testT5Index]); - if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, candidatePhi)) > 0.1f) - continue; - - // Embedding distance - float embedDistance2 = 0.f; - CMS_UNROLL_LOOP - for (unsigned int embedIndex = 0; embedIndex < Params_T5::kEmbed; ++embedIndex) { - const float diff = - quintuplets.t5Embed()[refT5Index][embedIndex] - quintuplets.t5Embed()[testT5Index][embedIndex]; - embedDistance2 += diff * diff; - } - if (embedDistance2 > 1.0f) - continue; - - // Hit matching against base T5 hits - int sharedHitCount = 0; - int unmatchedLayerSlot = -1; - - countSharedT5HitsAndFindUnmatched(quintuplets, testT5Index, refT5Index, sharedHitCount, unmatchedLayerSlot); - - if (sharedHitCount < kT5DuplicateMinSharedHits) - continue; - if (unmatchedLayerSlot < 0) - continue; - - // Candidate score is the T5 DNN output - const float candidateScore = quintuplets.dnnScore()[testT5Index]; - - // New OT logical layer from the candidate - const uint8_t newLogicalLayer = quintuplets.logicalLayers()[testT5Index][unmatchedLayerSlot]; - const int logicalLayerBin = static_cast(newLogicalLayer) - 1; // 0..kLogicalOTLayers-1 - - // Pack the Score, Index, and Slot into 64 bits for atomic update - uint64_t scoreBits = (uint64_t)std::bit_cast(candidateScore); - uint64_t newPacked = (scoreBits << kPackedScoreShift) | - ((uint64_t)(testT5Index & kPackedIndexMask) << kPackedIndexShift) | - (unmatchedLayerSlot & kPackedSlotMask); - - // Atomic CAS loop - uint64_t oldPacked = sharedBestPacked[logicalLayerBin]; - while (true) { - const float oldScore = std::bit_cast((int)(oldPacked >> kPackedScoreShift)); - - // If we aren't strictly better, stop trying - if (candidateScore <= oldScore) { - break; - } - - // Try to swap. atomicCas returns the value that was previously there - uint64_t assumedOld = alpaka::atomicCas( - acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{}); - - if (assumedOld == oldPacked) { - break; // Success - } else { - oldPacked = assumedOld; // Failed, update view and retry - } - } - } - } - - // One thread per block finalizes by actually extending the TC - alpaka::syncBlockThreads(acc); - - if (cms::alpakatools::once_per_block(acc)) { - CMS_UNROLL_LOOP - for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) { - uint64_t bestPacked = sharedBestPacked[logicalLayerBin]; - - // Check if valid update occurred (non-zero score) - int bestScoreInt = (int)(bestPacked >> kPackedScoreShift); - if (bestScoreInt == 0) - continue; - - // Unpack Index (bits 4-31) and Slot (bits 0-3) - const int bestT5Index = (int)((bestPacked >> kPackedIndexShift) & kPackedIndexMask); - const int bestT5LayerSlot = (int)(bestPacked & kPackedSlotMask); - - const uint8_t logicalLayer = static_cast(logicalLayerBin + 1); // 1..11 - const int layerSlot = (logicalLayer - 1) + Params_TC::kPixelLayerSlots; // OT slots for TC - - const uint16_t lowerModuleIndex = quintuplets.lowerModuleIndices()[bestT5Index][bestT5LayerSlot]; - const unsigned int hitIndex0 = quintuplets.hitIndices()[bestT5Index][2 * bestT5LayerSlot + 0]; - const unsigned int hitIndex1 = quintuplets.hitIndices()[bestT5Index][2 * bestT5LayerSlot + 1]; - - addTrackCandidateLayerHits(candsBase, - candsExtended, - trackCandidateIndex, - layerSlot, - logicalLayer, - lowerModuleIndex, - hitIndex0, - hitIndex1); - } - } - } - }; - struct AddpT5asTrackCandidate { ALPAKA_FN_ACC void operator()(Acc1D const& acc, uint16_t nLowerModules, diff --git a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc index 03aa3128ad960..79aedd7f13b55 100644 --- a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc +++ b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc @@ -353,7 +353,7 @@ std::vector getHitsFrompT3(LSTEvent* event, unsigned int pT3) { std::vector hits; hits.reserve(allHits.size()); for (auto const hit : allHits) - if (hits.empty() || hits.back() != hit) // should eventually check all and type + if (hit != lst::kTCEmptyHitIdx && (hits.empty() || hits.back() != hit)) hits.emplace_back(hit); return hits; } @@ -446,7 +446,7 @@ std::vector getHitsFrompT5(LSTEvent* event, unsigned int pT5) { std::vector hits; hits.reserve(allHits.size()); for (auto const hit : allHits) - if (hits.empty() || hits.back() != hit) // should eventually check all and type + if (hit != lst::kTCEmptyHitIdx && (hits.empty() || hits.back() != hit)) hits.emplace_back(hit); return hits; } diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc index c5a54d3a53ffe..510ee36af7a53 100644 --- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc +++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc @@ -517,6 +517,14 @@ void createQuintupletBranches() { ana.tx->createBranch>("t5_pMatched"); ana.tx->createBranch>("t5_sim_vxy"); ana.tx->createBranch>("t5_sim_vz"); + ana.tx->createBranch>("t5_nLayers"); // 5 base + ana.tx->createBranch>("t5_isDupBitmask"); // dup-cleaning bitmask + ana.tx->createBranch>("t5_partOfPT5"); + ana.tx->createBranch>("t5_tightCutFlag"); + ana.tx->createBranch>("t5_score_rphisum"); + ana.tx->createBranch>>("t5_hitIndices"); // SoA hit indices, length 2*nLayers + ana.tx->createBranch>>("t5_logicalLayers"); // logical layer ids, length nLayers + ana.tx->createBranch>("t5_moduleIdx"); // T5's lower module index } //________________________________________________________________________________________________________________________________ @@ -579,13 +587,14 @@ void createPixelTripletBranches() { #endif ana.tx->createBranch>("sim_pT3_matched"); ana.tx->createBranch>("pT3_score"); - ana.tx->createBranch>("pT3_pt"); // pt (taken from the pLS) - ana.tx->createBranch>("pT3_eta"); // eta (taken from the pLS) - ana.tx->createBranch>("pT3_phi"); // phi (taken from the pLS) - ana.tx->createBranch>("pT3_plsIdx"); // idx to pLS - ana.tx->createBranch>("pT3_t3Idx"); // idx to T3 - ana.tx->createBranch>("pT3_isFake"); // 1 if pT3 is fake 0 other if not - ana.tx->createBranch>("pT3_isDuplicate"); // 1 if pT3 is duplicate 0 other if not + ana.tx->createBranch>("pT3_pt"); // pt (taken from the pLS) + ana.tx->createBranch>("pT3_eta"); // eta (taken from the pLS) + ana.tx->createBranch>("pT3_phi"); // phi (taken from the pLS) + ana.tx->createBranch>("pT3_plsIdx"); // idx to pLS + ana.tx->createBranch>("pT3_t3Idx"); // idx to T3 + ana.tx->createBranch>>("pT3_otHitIndices"); // OT hit indices from the T3 + ana.tx->createBranch>("pT3_isFake"); // 1 if pT3 is fake 0 other if not + ana.tx->createBranch>("pT3_isDuplicate"); // 1 if pT3 is duplicate 0 other if not ana.tx->createBranch>("pT3_simIdx"); // idx of best matched (highest nhit and > 75%) simulated track ana.tx->createBranch>("pT3_pix_eta"); ana.tx->createBranch>("pT3_pix_phi"); @@ -1649,6 +1658,25 @@ std::map setQuintupletBranches(LSTEvent* event, ana.tx->pushbackToBranch>("t5_embed", current_t5_embed); ana.tx->pushbackToBranch("t5_dnnScore", quintuplets.dnnScore()[t5Idx]); + unsigned int nL = quintuplets.nLayers()[t5Idx]; + ana.tx->pushbackToBranch("t5_nLayers", static_cast(nL)); + ana.tx->pushbackToBranch("t5_moduleIdx", static_cast(idx)); + std::vector hitIdxVec; + std::vector logLayerVec; + hitIdxVec.reserve(2 * nL); + logLayerVec.reserve(nL); + for (unsigned int iL = 0; iL < nL; ++iL) { + hitIdxVec.push_back(static_cast(quintuplets.hitIndices()[t5Idx][2 * iL])); + hitIdxVec.push_back(static_cast(quintuplets.hitIndices()[t5Idx][2 * iL + 1])); + logLayerVec.push_back(static_cast(quintuplets.logicalLayers()[t5Idx][iL])); + } + ana.tx->pushbackToBranch>("t5_hitIndices", hitIdxVec); + ana.tx->pushbackToBranch>("t5_logicalLayers", logLayerVec); + ana.tx->pushbackToBranch("t5_isDupBitmask", static_cast(quintuplets.isDup()[t5Idx])); + ana.tx->pushbackToBranch("t5_partOfPT5", quintuplets.partOfPT5()[t5Idx] ? 1 : 0); + ana.tx->pushbackToBranch("t5_tightCutFlag", quintuplets.tightCutFlag()[t5Idx] ? 1 : 0); + ana.tx->pushbackToBranch("t5_score_rphisum", __H2F(quintuplets.score_rphisum()[t5Idx])); + bool isfake = true; for (size_t isim = 0; isim < simidx.size(); ++isim) { if (simidxfrac[isim] > matchfrac) { @@ -1944,6 +1972,10 @@ std::map setPixelTripletBranches(LSTEvent* event, unsigned int t3_idx = t3_idx_map.at(t3Idx); ana.tx->pushbackToBranch("pT3_t3Idx", t3_idx); } + std::vector otHits; + for (int ih = Params_pLS::kHits; ih < Params_pT3::kHits; ++ih) + otHits.push_back(static_cast(pixelTriplets.hitIndices()[ipT3][ih])); + ana.tx->pushbackToBranch>("pT3_otHitIndices", otHits); bool isfake = true; for (size_t isim = 0; isim < simidx.size(); ++isim) { if (simidxfrac[isim] > matchfrac) {