diff --git a/RecoTracker/LSTCore/interface/Common.h b/RecoTracker/LSTCore/interface/Common.h
index b5dece7656b2d..ec0ad431898d5 100644
--- a/RecoTracker/LSTCore/interface/Common.h
+++ b/RecoTracker/LSTCore/interface/Common.h
@@ -87,7 +87,8 @@ namespace lst {
     using ArrayUxHits = edm::StdArray<unsigned int, kHits>;
   };
   struct Params_T5 {
-    static constexpr int kLayers = 5, kHits = 10;
+    static constexpr int kLayers = 7, kHits = 14;  // 5 base + max 2 extensions
+    static constexpr int kBaseLayers = 5;
     static constexpr int kEmbed = 6;
     using ArrayU8xLayers = edm::StdArray<uint8_t, kLayers>;
     using ArrayU16xLayers = edm::StdArray<uint16_t, kLayers>;
@@ -95,7 +96,8 @@ namespace lst {
     using ArrayFxEmbed = edm::StdArray<float, kEmbed>;
   };
   struct Params_pT5 {
-    static constexpr int kLayers = 7, kHits = 14;
+    static constexpr int kLayers = 9, kHits = 18;  // 2 pixel + 7 OT (= T5::kLayers after extension)
+    static constexpr int kBaseLayers = 7;
     using ArrayU8xLayers = edm::StdArray<uint8_t, kLayers>;
     using ArrayU16xLayers = edm::StdArray<uint16_t, kLayers>;
     using ArrayUxHits = edm::StdArray<unsigned int, kHits>;
diff --git a/RecoTracker/LSTCore/interface/ModulesSoA.h b/RecoTracker/LSTCore/interface/ModulesSoA.h
index 9fca722a8b5b8..dfc4678d38581 100644
--- a/RecoTracker/LSTCore/interface/ModulesSoA.h
+++ b/RecoTracker/LSTCore/interface/ModulesSoA.h
@@ -33,6 +33,7 @@ namespace lst {
                       SOA_COLUMN(short, subdets),
                       SOA_COLUMN(short, sides),
                       SOA_COLUMN(float, eta),
+                      SOA_COLUMN(float, phi),
                       SOA_COLUMN(float, r),
                       SOA_COLUMN(float, z),
                       SOA_COLUMN(bool, isInverted),
diff --git a/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h b/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h
index 0175cdde74beb..e3cc1548c085f 100644
--- a/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h
+++ b/RecoTracker/LSTCore/interface/PixelQuintupletsSoA.h
@@ -27,6 +27,7 @@ namespace lst {
                       SOA_COLUMN(FPX, centerX),  // T3-based circle center x
                       SOA_COLUMN(FPX, centerY),  // T3-based circle center y
                       SOA_COLUMN(bool, isDup),
+                      SOA_COLUMN(unsigned int, nLayers),
                       SOA_SCALAR(unsigned int, nPixelQuintuplets),
                       SOA_SCALAR(unsigned int, totOccupancyPixelQuintuplets));
 
diff --git a/RecoTracker/LSTCore/interface/QuintupletsSoA.h b/RecoTracker/LSTCore/interface/QuintupletsSoA.h
index ec68919078300..5e05d39f604aa 100644
--- a/RecoTracker/LSTCore/interface/QuintupletsSoA.h
+++ b/RecoTracker/LSTCore/interface/QuintupletsSoA.h
@@ -19,9 +19,11 @@ namespace lst {
                       SOA_COLUMN(Params_T5::ArrayFxEmbed, t5Embed),                // t5 embedding vector
                       SOA_COLUMN(FPX, eta),
                       SOA_COLUMN(FPX, phi),
-                      SOA_COLUMN(FPX, score_rphisum),  // r-phi based score
-                      SOA_COLUMN(char, isDup),         // duplicate flag
-                      SOA_COLUMN(bool, tightCutFlag),  // tight pass to be a TC
+                      SOA_COLUMN(FPX, score_rphisum),     // r-phi based score
+                      SOA_COLUMN(char, isDup),            // duplicate flag
+                      SOA_COLUMN(unsigned int, nLayers),  // number of active layers (5 base)
+                      SOA_COLUMN(bool, tightCutFlag),     // tight pass to be a TC
+                      SOA_COLUMN(bool, partOfPT5),
                       SOA_COLUMN(float, regressionRadius),
                       SOA_COLUMN(float, regressionCenterX),
                       SOA_COLUMN(float, regressionCenterY),
@@ -29,15 +31,16 @@ namespace lst {
                       SOA_COLUMN(FPX, innerRadius),   // inner triplet circle radius
                       SOA_COLUMN(FPX, bridgeRadius),  // "middle"/bridge triplet radius
                       SOA_COLUMN(FPX, outerRadius),   // outer triplet radius
-                      SOA_COLUMN(FPX, pt),
+                      SOA_COLUMN(FPX, pt)
 #ifdef CUT_VALUE_DEBUG
+                          ,
                       SOA_COLUMN(float, rzChiSquared),  // r-z only chi2
                       SOA_COLUMN(float, chiSquared),
                       SOA_COLUMN(float, nonAnchorChiSquared),
                       SOA_COLUMN(float, dBeta1),
-                      SOA_COLUMN(float, dBeta2),
+                      SOA_COLUMN(float, dBeta2)
 #endif
-                      SOA_COLUMN(bool, partOfPT5));
+  );
 
   using QuintupletsSoA = QuintupletsSoALayout<>;
   using Quintuplets = QuintupletsSoA::View;
diff --git a/RecoTracker/LSTCore/src/ModuleMethods.h b/RecoTracker/LSTCore/src/ModuleMethods.h
index 1d5d75f18ac2b..2aaccbc776452 100644
--- a/RecoTracker/LSTCore/src/ModuleMethods.h
+++ b/RecoTracker/LSTCore/src/ModuleMethods.h
@@ -169,6 +169,7 @@ namespace lst {
                                    float m_y,
                                    float m_z,
                                    float& eta,
+                                   float& phi,
                                    float& r) {
     subdet = (detId & (7 << 25)) >> 25;
     side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18;
@@ -179,6 +180,7 @@ namespace lst {
 
     r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z);
     eta = ((m_z > 0) - (m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y));
+    phi = std::atan2(m_y, m_x);
   }
 
   inline void loadCentroidsFromFile(const char* filePath, ModuleMetaData& mmd, uint16_t& nModules) {
@@ -252,6 +254,7 @@ namespace lst {
     std::span<short> host_subdets = modules_view.subdets();
     std::span<short> host_sides = modules_view.sides();
     std::span<float> host_eta = modules_view.eta();
+    std::span<float> host_phi = modules_view.phi();
     std::span<float> host_r = modules_view.r();
     std::span<float> host_z = modules_view.z();
     std::span<bool> host_isInverted = modules_view.isInverted();
@@ -279,7 +282,7 @@ namespace lst {
       float m_z = mmd.module_z[detId];
       unsigned int m_t = mmd.module_type[detId];
 
-      float eta, r;
+      float eta, phi, r;
 
       uint16_t index;
       unsigned short layer, ring, rod, module, subdet, side;
@@ -294,9 +297,10 @@ namespace lst {
         isInverted = false;
         isLower = false;
         eta = 0;
+        phi = 0;
         r = 0;
       } else {
-        setDerivedQuantities(detId, layer, ring, rod, module, subdet, side, m_x, m_y, m_z, eta, r);
+        setDerivedQuantities(detId, layer, ring, rod, module, subdet, side, m_x, m_y, m_z, eta, phi, r);
         isInverted = parseIsInverted(subdet, side, module, layer);
         isLower = parseIsLower(isInverted, detId);
       }
@@ -319,6 +323,7 @@ namespace lst {
       host_subdets[index] = subdet;
       host_sides[index] = side;
       host_eta[index] = eta;
+      host_phi[index] = phi;
       host_r[index] = r;
       host_z[index] = m_z;
       host_isInverted[index] = isInverted;
diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h
index 33759e0b49004..fee34d0b2c88b 100644
--- a/RecoTracker/LSTCore/src/alpaka/Kernels.h
+++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h
@@ -1,6 +1,8 @@
 #ifndef RecoTracker_LSTCore_src_alpaka_Kernels_h
 #define RecoTracker_LSTCore_src_alpaka_Kernels_h
 
+#include <bit>
+
 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 
@@ -56,8 +58,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     int nMatched = 0;
     for (int i = 0; i < Params_T5::kHits; i++) {
+      // Skip sentinel values from extended slots
+      if (hits1[i] == lst::kTCEmptyHitIdx)
+        continue;
       bool matched = false;
       for (int j = 0; j < Params_T5::kHits; j++) {
+        if (hits2[j] == lst::kTCEmptyHitIdx)
+          continue;
         if (hits1[i] == hits2[j]) {
           matched = true;
           break;
@@ -83,8 +90,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     int nMatched = 0;
     for (int i = 0; i < Params_pT5::kHits; i++) {
+      // Skip sentinel values from extended slots
+      if (hits1[i] == lst::kTCEmptyHitIdx)
+        continue;
       bool matched = false;
       for (int j = 0; j < Params_pT5::kHits; j++) {
+        if (hits2[j] == lst::kTCEmptyHitIdx)
+          continue;
         if (hits1[i] == hits2[j]) {
           matched = true;
           break;
@@ -186,18 +198,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
         for (unsigned int ix1 : cms::alpakatools::uniform_elements_y(acc, nQuintuplets_lowmod)) {
           unsigned int ix = quintupletModuleIndices_lowmod + ix1;
+          if (quintuplets.isDup()[ix])
+            continue;
           float eta1 = __H2F(quintuplets.eta()[ix]);
           float phi1 = __H2F(quintuplets.phi()[ix]);
-          float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
+          float dnnScore1 = quintuplets.dnnScore()[ix];
 
           for (unsigned int jx1 : cms::alpakatools::uniform_elements_x(acc, ix1 + 1, nQuintuplets_lowmod)) {
             unsigned int jx = quintupletModuleIndices_lowmod + jx1;
+            if (quintuplets.isDup()[jx])
+              continue;
 
             float eta2 = __H2F(quintuplets.eta()[jx]);
             float phi2 = __H2F(quintuplets.phi()[jx]);
             float dEta = alpaka::math::abs(acc, eta1 - eta2);
             float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
-            float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
 
             if (dEta > 0.1f)
               continue;
@@ -206,12 +221,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
               continue;
 
             int nMatched = checkHitsT5(ix, jx, quintuplets);
-            const int minNHitsForDup_T5 = 7;
-            if (nMatched >= minNHitsForDup_T5) {
-              if (score_rphisum1 >= score_rphisum2) {
+            // Proportional sharing: at least 60% of the shorter track's hits.
+            unsigned int nLayersIx = quintuplets.nLayers()[ix];
+            unsigned int nLayersJx = quintuplets.nLayers()[jx];
+            unsigned int nHitsIx = 2 * nLayersIx;
+            unsigned int nHitsJx = 2 * nLayersJx;
+            int minNHitsForDup = static_cast<int>(0.6f * (nHitsIx < nHitsJx ? nHitsIx : nHitsJx));
+            if (nMatched >= minNHitsForDup) {
+              // Tiebreak: longer track wins; otherwise rphisum at high pT, DNN score at low pT.
+              if (nLayersIx > nLayersJx) {
+                rmQuintupletFromMemory(quintuplets, jx);
+              } else if (nLayersJx > nLayersIx) {
                 rmQuintupletFromMemory(quintuplets, ix);
               } else {
-                rmQuintupletFromMemory(quintuplets, jx);
+                float ptIx = __H2F(quintuplets.innerRadius()[ix]) * lst::k2Rinv1GeVf * 2;
+                float ptJx = __H2F(quintuplets.innerRadius()[jx]) * lst::k2Rinv1GeVf * 2;
+                if (ptIx > 5.0f || ptJx > 5.0f) {
+                  float rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
+                  float rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
+                  if (rphisum1 >= rphisum2)
+                    rmQuintupletFromMemory(quintuplets, ix);
+                  else
+                    rmQuintupletFromMemory(quintuplets, jx);
+                } else {
+                  float dnnScore2 = quintuplets.dnnScore()[jx];
+                  if (dnnScore1 <= dnnScore2)
+                    rmQuintupletFromMemory(quintuplets, ix);
+                  else
+                    rmQuintupletFromMemory(quintuplets, jx);
+                }
               }
             }
           }
@@ -220,6 +258,204 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
   };
 
+  template <typename TAcc>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void tryExtendT5(
+      TAcc const& acc, Quintuplets quintuplets, unsigned int winnerIdx, unsigned int loserIdx, int loserSlot) {
+    if (loserSlot < 0)
+      return;
+
+    unsigned int newSlot = alpaka::atomicAdd(acc, &quintuplets.nLayers()[winnerIdx], 1u, alpaka::hierarchy::Threads{});
+
+    if (newSlot >= Params_T5::kLayers) {
+      alpaka::atomicSub(acc, &quintuplets.nLayers()[winnerIdx], 1u, alpaka::hierarchy::Threads{});
+      return;
+    }
+
+    quintuplets.logicalLayers()[winnerIdx][newSlot] = quintuplets.logicalLayers()[loserIdx][loserSlot];
+    quintuplets.lowerModuleIndices()[winnerIdx][newSlot] = quintuplets.lowerModuleIndices()[loserIdx][loserSlot];
+    quintuplets.hitIndices()[winnerIdx][2 * newSlot] = quintuplets.hitIndices()[loserIdx][2 * loserSlot];
+    quintuplets.hitIndices()[winnerIdx][2 * newSlot + 1] = quintuplets.hitIndices()[loserIdx][2 * loserSlot + 1];
+  }
+
+  struct ExtendT5FromDupT5 {
+    // Packed [score:32 | T5 index:28 | layer slot:4] for atomic best-per-OT-layer tracking.
+    static constexpr int kPackedScoreShift = 32;
+    static constexpr int kPackedIndexShift = 4;
+    static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF;
+    static constexpr unsigned int kPackedSlotMask = 0xF;
+    static constexpr int kT5DuplicateMinSharedHits = 8;
+
+    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
+                                  ModulesConst modules,
+                                  ObjectRangesConst ranges,
+                                  Quintuplets quintuplets,
+                                  QuintupletsOccupancyConst quintupletsOccupancy) const {
+      // Best candidate per OT logical layer (1..11), packed score|index|slot.
+      uint64_t* sharedBestPacked = alpaka::declareSharedVar<uint64_t[lst::kLogicalOTLayers], __COUNTER__>(acc);
+
+      // One block per T5 in 1D; block index = ref T5 index.
+      const unsigned int refT5Index = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
+
+      // Skip empty/unallocated T5 slots.
+      if (quintuplets.nLayers()[refT5Index] == 0)
+        return;
+
+      // Initialize shared memory once per block.
+      if (cms::alpakatools::once_per_block(acc)) {
+        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
+          sharedBestPacked[logicalLayerBin] = 0;
+        }
+      }
+      alpaka::syncBlockThreads(acc);
+
+      const float baseEta = __H2F(quintuplets.eta()[refT5Index]);
+      const float basePhi = __H2F(quintuplets.phi()[refT5Index]);
+      const uint8_t baseStartLogicalLayer = quintuplets.logicalLayers()[refT5Index][0];
+
+      // Hoist ref data once: hit indices and embedding read every candidate iteration otherwise.
+      float refEmbed[Params_T5::kEmbed];
+      CMS_UNROLL_LOOP
+      for (unsigned int e = 0; e < Params_T5::kEmbed; ++e)
+        refEmbed[e] = quintuplets.t5Embed()[refT5Index][e];
+
+      constexpr unsigned int kRefHits = 2 * Params_T5::kBaseLayers;
+      unsigned int refHits[kRefHits];
+      CMS_UNROLL_LOOP
+      for (unsigned int h = 0; h < kRefHits; ++h)
+        refHits[h] = quintuplets.hitIndices()[refT5Index][h];
+
+      const int threadIndexFlat = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
+      const int blockDimFlat = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
+
+      // Ref-starts-at-layer-1 special case: only ref's slot-1 module can host a valid candidate.
+      const bool restrictToRefSlot1 = (baseStartLogicalLayer == 1);
+      const uint16_t nEligibleT5Modules = ranges.nEligibleT5Modules();
+      const int loopCount = restrictToRefSlot1 ? 1 : static_cast<int>(nEligibleT5Modules);
+
+      for (int idx = threadIndexFlat; idx < loopCount; idx += blockDimFlat) {
+        const uint16_t lowerModuleIndex = restrictToRefSlot1 ? quintuplets.lowerModuleIndices()[refT5Index][1]
+                                                             : ranges.indicesOfEligibleT5Modules()[idx];
+
+        if (!restrictToRefSlot1) {
+          // Skip same-starting-layer modules (logical = physical + 6 for endcap, see Triplet.h).
+          const short modSubdet = modules.subdets()[lowerModuleIndex];
+          const int moduleLogicalLayer =
+              static_cast<int>(modules.layers()[lowerModuleIndex]) + (modSubdet == Endcap ? 6 : 0);
+          if (moduleLogicalLayer == static_cast<int>(baseStartLogicalLayer))
+            continue;
+
+          // Module-level eta/phi pre-cut; margin covers per-T5 window plus T5-vs-module spread.
+          if (alpaka::math::abs(acc, baseEta - modules.eta()[lowerModuleIndex]) > 0.3f)
+            continue;
+          if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, modules.phi()[lowerModuleIndex])) > 0.5f)
+            continue;
+        }
+
+        const int firstQuintupletInModule = ranges.quintupletModuleIndices()[lowerModuleIndex];
+        if (firstQuintupletInModule == -1)
+          continue;
+
+        const unsigned int nQuintupletsInModule = quintupletsOccupancy.nQuintuplets()[lowerModuleIndex];
+
+        for (unsigned int quintupletOffset = 0; quintupletOffset < nQuintupletsInModule; ++quintupletOffset) {
+          const unsigned int testT5Index = firstQuintupletInModule + quintupletOffset;
+          if (testT5Index == refT5Index)
+            continue;
+
+          // Per-T5 eta/phi window.
+          const float candidateEta = __H2F(quintuplets.eta()[testT5Index]);
+          if (alpaka::math::abs(acc, baseEta - candidateEta) > 0.1f)
+            continue;
+
+          const float candidatePhi = __H2F(quintuplets.phi()[testT5Index]);
+          if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, candidatePhi)) > 0.1f)
+            continue;
+
+          // Embedding distance against hoisted refEmbed.
+          float embedDistance2 = 0.f;
+          CMS_UNROLL_LOOP
+          for (unsigned int embedIndex = 0; embedIndex < Params_T5::kEmbed; ++embedIndex) {
+            const float diff = refEmbed[embedIndex] - quintuplets.t5Embed()[testT5Index][embedIndex];
+            embedDistance2 += diff * diff;
+          }
+          if (embedDistance2 > 1.0f)
+            continue;
+
+          // Hit matching against hoisted ref hits; record the candidate slot with no shared hit.
+          int sharedHitCount = 0;
+          int unmatchedLayerSlot = -1;
+          CMS_UNROLL_LOOP
+          for (unsigned int layerIndex = 0; layerIndex < Params_T5::kBaseLayers; ++layerIndex) {
+            const unsigned int candidateHit0 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 0];
+            const unsigned int candidateHit1 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 1];
+
+            bool hit0InBase = false;
+            bool hit1InBase = false;
+            CMS_UNROLL_LOOP
+            for (unsigned int baseHitIndex = 0; baseHitIndex < kRefHits; ++baseHitIndex) {
+              const unsigned int baseHit = refHits[baseHitIndex];
+              hit0InBase = hit0InBase || (candidateHit0 == baseHit);
+              hit1InBase = hit1InBase || (candidateHit1 == baseHit);
+            }
+
+            sharedHitCount += int(hit0InBase) + int(hit1InBase);
+            if (!hit0InBase && !hit1InBase)
+              unmatchedLayerSlot = layerIndex;
+          }
+
+          if (sharedHitCount < kT5DuplicateMinSharedHits)
+            continue;
+          if (unmatchedLayerSlot < 0)
+            continue;
+
+          // Score = DNN output; layer bin = candidate's unmatched OT layer (1..11) - 1.
+          const float candidateScore = quintuplets.dnnScore()[testT5Index];
+          const uint8_t newLogicalLayer = quintuplets.logicalLayers()[testT5Index][unmatchedLayerSlot];
+          const int logicalLayerBin = static_cast<int>(newLogicalLayer) - 1;
+
+          uint64_t scoreBits = std::bit_cast<uint32_t>(candidateScore);
+          uint64_t newPacked = (scoreBits << kPackedScoreShift) |
+                               (static_cast<uint64_t>(testT5Index & kPackedIndexMask) << kPackedIndexShift) |
+                               (unmatchedLayerSlot & kPackedSlotMask);
+
+          // Atomic CAS into shared best-per-layer slot, retry until we win or are beaten.
+          uint64_t oldPacked = sharedBestPacked[logicalLayerBin];
+          while (true) {
+            const float oldScore = std::bit_cast<float>(static_cast<uint32_t>(oldPacked >> kPackedScoreShift));
+            if (candidateScore <= oldScore)
+              break;
+
+            uint64_t assumedOld = alpaka::atomicCas(
+                acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{});
+
+            if (assumedOld == oldPacked) {
+              break;
+            } else {
+              oldPacked = assumedOld;
+            }
+          }
+        }
+      }
+
+      alpaka::syncBlockThreads(acc);
+
+      // One thread per block applies the per-layer winners.
+      if (cms::alpakatools::once_per_block(acc)) {
+        CMS_UNROLL_LOOP
+        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
+          uint64_t bestPacked = sharedBestPacked[logicalLayerBin];
+          if ((bestPacked >> kPackedScoreShift) == 0)
+            continue;
+
+          const int bestT5Index = static_cast<int>((bestPacked >> kPackedIndexShift) & kPackedIndexMask);
+          const int bestT5LayerSlot = static_cast<int>(bestPacked & kPackedSlotMask);
+
+          tryExtendT5(acc, quintuplets, refT5Index, bestT5Index, bestT5LayerSlot);
+        }
+      }
+    }
+  };
+
   struct RemoveDupQuintupletsBeforeTC {
     ALPAKA_FN_ACC void operator()(Acc2D const& acc,
                                   Quintuplets quintuplets,
@@ -250,7 +486,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             const bool isPT5_ix = quintuplets.partOfPT5()[ix];
             const float eta1 = __H2F(quintuplets.eta()[ix]);
             const float phi1 = __H2F(quintuplets.phi()[ix]);
-            const float score_rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
+            const float dnnScore1 = quintuplets.dnnScore()[ix];
 
             for (unsigned int jx1 = 0; jx1 < nQuintuplets_lowmod2; jx1++) {
               unsigned int jx = quintupletModuleIndices_lowmod2 + jx1;
@@ -286,15 +522,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                 d2 += diff * diff;
               }
 
-              if (((dR2 < 0.001f || nMatched >= minNHitsForDup_T5) && d2 < 1.0f) || (dR2 < 0.02f && d2 < 0.1f)) {
-                const float score_rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
-                if (isPT5_jx || score_rphisum1 > score_rphisum2) {
-                  rmQuintupletFromMemory(quintuplets, ix, true);
-                } else if (isPT5_ix || score_rphisum1 < score_rphisum2) {
-                  rmQuintupletFromMemory(quintuplets, jx, true);
+              // 99th percentile of true-dup d2 distribution measured on 100 PU200 events.
+              constexpr float d2Thresh = 0.25f;
+              if (((dR2 < 0.001f || nMatched >= minNHitsForDup_T5) && d2 < d2Thresh) || (dR2 < 0.02f && d2 < 0.1f)) {
+                float ptIx = __H2F(quintuplets.innerRadius()[ix]) * lst::k2Rinv1GeVf * 2;
+                float ptJx = __H2F(quintuplets.innerRadius()[jx]) * lst::k2Rinv1GeVf * 2;
+                bool highPt = (ptIx > 5.0f || ptJx > 5.0f);
+                bool ixLoses;
+                if (isPT5_jx) {
+                  ixLoses = true;
+                } else if (isPT5_ix) {
+                  ixLoses = false;
+                } else if (highPt) {
+                  float rphisum1 = __H2F(quintuplets.score_rphisum()[ix]);
+                  float rphisum2 = __H2F(quintuplets.score_rphisum()[jx]);
+                  ixLoses = (rphisum1 > rphisum2) || (rphisum1 == rphisum2 && ix < jx);
                 } else {
-                  rmQuintupletFromMemory(quintuplets, (ix < jx ? ix : jx), true);
+                  float dnnScore2 = quintuplets.dnnScore()[jx];
+                  ixLoses = (dnnScore1 < dnnScore2) || (dnnScore1 == dnnScore2 && ix < jx);
                 }
+                if (ixLoses)
+                  rmQuintupletFromMemory(quintuplets, ix, true);
+                else
+                  rmQuintupletFromMemory(quintuplets, jx, true);
               }
             }
           }
diff --git a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
index 86e18590b5775..9fde48df6dafe 100644
--- a/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
+++ b/RecoTracker/LSTCore/src/alpaka/LSTEvent.dev.cc
@@ -750,29 +750,7 @@ void LSTEvent::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets)
                       tc_pls_triplets,
                       nTotal);
 
-  // Get number of TCs to configure grid
-  auto nTrackCandidates_buf_h = cms::alpakatools::make_host_buffer<unsigned int>(queue_);
-  auto nTrackCandidates_buf_d =
-      cms::alpakatools::make_device_view(queue_, (*trackCandidatesBaseDC_)->nTrackCandidates());
-  alpaka::memcpy(queue_, nTrackCandidates_buf_h, nTrackCandidates_buf_d);
-  alpaka::wait(queue_);
-  unsigned int nTC = *nTrackCandidates_buf_h.data();
-
-  if (nTC == 0)
-    return;
-  auto const wd = cms::alpakatools::make_workdiv<Acc1D>(nTC, 128);
-
-  alpaka::exec<Acc1D>(queue_,
-                      wd,
-                      ExtendTrackCandidatesFromDupT5{},
-                      modules_.const_view().modules(),
-                      rangesDC_->const_view(),
-                      quintupletsDC_->const_view().quintuplets(),
-                      quintupletsDC_->const_view().quintupletsOccupancy(),
-                      trackCandidatesBaseDC_->view(),
-                      trackCandidatesExtendedDC_->view());
-
-  // Check if TC buffer was possibly truncated
+  // Check if either n_max_pixel_track_candidates or n_max_nonpixel_track_candidates was reached
   auto nTrackCanTotalHost_buf = cms::alpakatools::make_host_buffer<unsigned int>(queue_);
   alpaka::memcpy(queue_,
                  nTrackCanTotalHost_buf,
@@ -971,6 +949,8 @@ void LSTEvent::createQuintuplets() {
     auto quintuplets = quintupletsDC_->view().quintuplets();
     auto isDup_view = cms::alpakatools::make_device_view(queue_, quintuplets.isDup());
     alpaka::memset(queue_, isDup_view, 0u);
+    auto nLayers_view = cms::alpakatools::make_device_view(queue_, quintuplets.nLayers());
+    alpaka::memset(queue_, nLayers_view, 0u);
     auto tightCutFlag_view = cms::alpakatools::make_device_view(queue_, quintuplets.tightCutFlag());
     alpaka::memset(queue_, tightCutFlag_view, 0u);
     auto partOfPT5_view = cms::alpakatools::make_device_view(queue_, quintuplets.partOfPT5());
@@ -994,6 +974,18 @@ void LSTEvent::createQuintuplets() {
                       nEligibleT5Modules,
                       ptCut_);
 
+  if (nTotalQuintuplets > 0) {
+    auto const extendT5_workDiv = cms::alpakatools::make_workdiv<Acc1D>(nTotalQuintuplets, 128);
+
+    alpaka::exec<Acc1D>(queue_,
+                        extendT5_workDiv,
+                        ExtendT5FromDupT5{},
+                        modules_.const_view().modules(),
+                        rangesDC_->const_view(),
+                        quintupletsDC_->view().quintuplets(),
+                        quintupletsDC_->const_view().quintupletsOccupancy());
+  }
+
   auto const removeDupQuintupletsAfterBuild_workDiv =
       cms::alpakatools::make_workdiv<Acc3D>({max_blocks, 1, 1}, {1, 16, 16});
 
diff --git a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
index d137889a98fbf..96366af78066e 100644
--- a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h
@@ -46,40 +46,37 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     pixelQuintuplets.centerX()[pixelQuintupletIndex] = __F2H(centerX);
     pixelQuintuplets.centerY()[pixelQuintupletIndex] = __F2H(centerY);
 
+    // Copy pixel layers (always 2)
     pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][0] = 0;
     pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][1] = 0;
-    pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][2] = quintuplets.logicalLayers()[t5Index][0];
-    pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][3] = quintuplets.logicalLayers()[t5Index][1];
-    pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][4] = quintuplets.logicalLayers()[t5Index][2];
-    pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][5] = quintuplets.logicalLayers()[t5Index][3];
-    pixelQuintuplets.logicalLayers()[pixelQuintupletIndex][6] = quintuplets.logicalLayers()[t5Index][4];
-
     pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][0] = segments.innerLowerModuleIndices()[pixelIndex];
     pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][1] = segments.outerLowerModuleIndices()[pixelIndex];
-    pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][2] = quintuplets.lowerModuleIndices()[t5Index][0];
-    pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][3] = quintuplets.lowerModuleIndices()[t5Index][1];
-    pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][4] = quintuplets.lowerModuleIndices()[t5Index][2];
-    pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][5] = quintuplets.lowerModuleIndices()[t5Index][3];
-    pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex][6] = quintuplets.lowerModuleIndices()[t5Index][4];
 
     unsigned int pixelInnerMD = segments.mdIndices()[pixelIndex][0];
     unsigned int pixelOuterMD = segments.mdIndices()[pixelIndex][1];
-
     pixelQuintuplets.hitIndices()[pixelQuintupletIndex][0] = mds.anchorHitIndices()[pixelInnerMD];
     pixelQuintuplets.hitIndices()[pixelQuintupletIndex][1] = mds.outerHitIndices()[pixelInnerMD];
     pixelQuintuplets.hitIndices()[pixelQuintupletIndex][2] = mds.anchorHitIndices()[pixelOuterMD];
     pixelQuintuplets.hitIndices()[pixelQuintupletIndex][3] = mds.outerHitIndices()[pixelOuterMD];
 
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][4] = quintuplets.hitIndices()[t5Index][0];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][5] = quintuplets.hitIndices()[t5Index][1];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][6] = quintuplets.hitIndices()[t5Index][2];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][7] = quintuplets.hitIndices()[t5Index][3];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][8] = quintuplets.hitIndices()[t5Index][4];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][9] = quintuplets.hitIndices()[t5Index][5];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][10] = quintuplets.hitIndices()[t5Index][6];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][11] = quintuplets.hitIndices()[t5Index][7];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][12] = quintuplets.hitIndices()[t5Index][8];
-    pixelQuintuplets.hitIndices()[pixelQuintupletIndex][13] = quintuplets.hitIndices()[t5Index][9];
+    // Copy T5 layers (respects nLayers for extended T5s); fill remaining slots with sentinels.
+    unsigned int t5Layers = quintuplets.nLayers()[t5Index];
+    auto& dstLayers = pixelQuintuplets.logicalLayers()[pixelQuintupletIndex];
+    auto& dstModules = pixelQuintuplets.lowerModuleIndices()[pixelQuintupletIndex];
+    auto& dstHits = pixelQuintuplets.hitIndices()[pixelQuintupletIndex];
+    auto const& srcLayers = quintuplets.logicalLayers()[t5Index];
+    auto const& srcModules = quintuplets.lowerModuleIndices()[t5Index];
+    auto const& srcHits = quintuplets.hitIndices()[t5Index];
+    for (unsigned int i = 0; i < Params_T5::kLayers; ++i) {
+      const bool inT5 = (i < t5Layers);
+      dstLayers[2 + i] = inT5 ? srcLayers[i] : uint8_t{0};
+      dstModules[2 + i] = inT5 ? srcModules[i] : lst::kTCEmptyLowerModule;
+      dstHits[4 + 2 * i] = inT5 ? srcHits[2 * i] : lst::kTCEmptyHitIdx;
+      dstHits[4 + 2 * i + 1] = inT5 ? srcHits[2 * i + 1] : lst::kTCEmptyHitIdx;
+    }
+
+    // Set pT5 nLayers = 2 pixel + T5 layers
+    pixelQuintuplets.nLayers()[pixelQuintupletIndex] = 2 + t5Layers;
 
 #ifdef CUT_VALUE_DEBUG
     pixelQuintuplets.rzChiSquared()[pixelQuintupletIndex] = rzChiSquared;
@@ -408,7 +405,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     float a = -100 / kR1GeVf * charge;
 
-    for (size_t i = 0; i < Params_T5::kLayers; i++) {
+    for (size_t i = 0; i < Params_T5::kBaseLayers; i++) {
       float zsi = zs[i] / 100;
       float rtsi = rts[i] / 100;
       uint16_t lowerModuleIndex = lowerModuleIndices[i];
@@ -532,23 +529,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     uint16_t lowerModuleIndex4 = quintuplets.lowerModuleIndices()[quintupletIndex][3];
     uint16_t lowerModuleIndex5 = quintuplets.lowerModuleIndices()[quintupletIndex][4];
 
-    uint16_t lowerModuleIndices[Params_T5::kLayers] = {
+    uint16_t lowerModuleIndices[Params_T5::kBaseLayers] = {
         lowerModuleIndex1, lowerModuleIndex2, lowerModuleIndex3, lowerModuleIndex4, lowerModuleIndex5};
 
     float rtPix[Params_pLS::kLayers] = {pixelData.rt_InLo, pixelData.rt_InUp};
     float xPix[Params_pLS::kLayers] = {pixelData.x_InLo, pixelData.x_InUp};
     float yPix[Params_pLS::kLayers] = {pixelData.y_InLo, pixelData.y_InUp};
     float zPix[Params_pLS::kLayers] = {pixelData.z_InLo, pixelData.z_InUp};
-    float zs[Params_T5::kLayers] = {mds.anchorZ()[firstMDIndex],
-                                    mds.anchorZ()[secondMDIndex],
-                                    mds.anchorZ()[thirdMDIndex],
-                                    mds.anchorZ()[fourthMDIndex],
-                                    mds.anchorZ()[fifthMDIndex]};
-    float rts[Params_T5::kLayers] = {mds.anchorRt()[firstMDIndex],
-                                     mds.anchorRt()[secondMDIndex],
-                                     mds.anchorRt()[thirdMDIndex],
-                                     mds.anchorRt()[fourthMDIndex],
-                                     mds.anchorRt()[fifthMDIndex]};
+    float zs[Params_T5::kBaseLayers] = {mds.anchorZ()[firstMDIndex],
+                                        mds.anchorZ()[secondMDIndex],
+                                        mds.anchorZ()[thirdMDIndex],
+                                        mds.anchorZ()[fourthMDIndex],
+                                        mds.anchorZ()[fifthMDIndex]};
+    float rts[Params_T5::kBaseLayers] = {mds.anchorRt()[firstMDIndex],
+                                         mds.anchorRt()[secondMDIndex],
+                                         mds.anchorRt()[thirdMDIndex],
+                                         mds.anchorRt()[fourthMDIndex],
+                                         mds.anchorRt()[fifthMDIndex]};
 
     rzChiSquared = 0;
 
@@ -582,16 +579,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
 
     //outer T5
-    float xs[Params_T5::kLayers] = {mds.anchorX()[firstMDIndex],
-                                    mds.anchorX()[secondMDIndex],
-                                    mds.anchorX()[thirdMDIndex],
-                                    mds.anchorX()[fourthMDIndex],
-                                    mds.anchorX()[fifthMDIndex]};
-    float ys[Params_T5::kLayers] = {mds.anchorY()[firstMDIndex],
-                                    mds.anchorY()[secondMDIndex],
-                                    mds.anchorY()[thirdMDIndex],
-                                    mds.anchorY()[fourthMDIndex],
-                                    mds.anchorY()[fifthMDIndex]};
+    float xs[Params_T5::kBaseLayers] = {mds.anchorX()[firstMDIndex],
+                                        mds.anchorX()[secondMDIndex],
+                                        mds.anchorX()[thirdMDIndex],
+                                        mds.anchorX()[fourthMDIndex],
+                                        mds.anchorX()[fifthMDIndex]};
+    float ys[Params_T5::kBaseLayers] = {mds.anchorY()[firstMDIndex],
+                                        mds.anchorY()[secondMDIndex],
+                                        mds.anchorY()[thirdMDIndex],
+                                        mds.anchorY()[fourthMDIndex],
+                                        mds.anchorY()[fifthMDIndex]};
 
     //get the appropriate centers
     centerX = pixelData.circleCenterX;
diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
index 15b0aa1f7f405..c748b6b7f35a2 100644
--- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
+++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h
@@ -63,6 +63,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     quintuplets.phi()[quintupletIndex] = __F2H(phi);
     quintuplets.score_rphisum()[quintupletIndex] = __F2H(scores);
     quintuplets.isDup()[quintupletIndex] = 0;
+    quintuplets.nLayers()[quintupletIndex] = Params_T5::kBaseLayers;
     quintuplets.tightCutFlag()[quintupletIndex] = tightCutFlag;
     quintuplets.regressionRadius()[quintupletIndex] = regressionRadius;
     quintuplets.regressionCenterX()[quintupletIndex] = regressionCenterX;
@@ -97,6 +98,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     for (unsigned int i = 0; i < Params_T5::kEmbed; ++i) {
       quintuplets.t5Embed()[quintupletIndex][i] = t5Embed[i];
     }
+
+    // Initialize extended layer slots with sentinel values
+    for (int i = Params_T5::kBaseLayers; i < Params_T5::kLayers; ++i) {
+      quintuplets.logicalLayers()[quintupletIndex][i] = 0;
+      quintuplets.lowerModuleIndices()[quintupletIndex][i] = lst::kTCEmptyLowerModule;
+      quintuplets.hitIndices()[quintupletIndex][2 * i] = lst::kTCEmptyHitIdx;
+      quintuplets.hitIndices()[quintupletIndex][2 * i + 1] = lst::kTCEmptyHitIdx;
+    }
   }
 
   //bounds can be found at http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_RZFix/t5_rz_thresholds.txt
@@ -1633,7 +1642,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     computeSigmasForRegression(acc, modules, lowerModuleIndices, delta1, delta2, slopes, isFlat);
     regressionRadius = computeRadiusUsingRegression(acc,
-                                                    Params_T5::kLayers,
+                                                    Params_T5::kBaseLayers,
                                                     xVec,
                                                     yVec,
                                                     delta1,
@@ -1647,7 +1656,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
 
     //compute the other chisquared
     //non anchor is always shifted for tilted and endcap!
-    float nonAnchorDelta1[Params_T5::kLayers], nonAnchorDelta2[Params_T5::kLayers], nonAnchorSlopes[Params_T5::kLayers];
+    float nonAnchorDelta1[Params_T5::kBaseLayers], nonAnchorDelta2[Params_T5::kBaseLayers],
+        nonAnchorSlopes[Params_T5::kBaseLayers];
     float nonAnchorxs[] = {mds.outerX()[firstMDIndex],
                            mds.outerX()[secondMDIndex],
                            mds.outerX()[thirdMDIndex],
@@ -1666,10 +1676,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                                nonAnchorDelta2,
                                nonAnchorSlopes,
                                isFlat,
-                               Params_T5::kLayers,
+                               Params_T5::kBaseLayers,
                                false);
     nonAnchorChiSquared = computeChiSquared(acc,
-                                            Params_T5::kLayers,
+                                            Params_T5::kBaseLayers,
                                             nonAnchorxs,
                                             nonAnchorys,
                                             nonAnchorDelta1,
diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
index b6aaa14e9e14d..3afb7295e1dab 100644
--- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
+++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h
@@ -125,6 +125,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
       unsigned int hit0 = hitIndices[2 * i];
       unsigned int hit1 = hitIndices[2 * i + 1];
 
+      // Skip empty slots (sentinel values from extended T5/pT5 arrays)
+      if (hit0 == lst::kTCEmptyHitIdx)
+        continue;
+
       int layerSlot;
 
       if (i < nPixelLayers) {
@@ -239,7 +243,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
           if (quintuplets.isDup()[iT5] || quintuplets.partOfPT5()[iT5])
             continue;
 
-          unsigned int loop_bound = pixelQuintuplets.nPixelQuintuplets() + pixelTriplets.nPixelTriplets();
+          const unsigned int nPT5 = pixelQuintuplets.nPixelQuintuplets();
+          const unsigned int loop_bound = nPT5 + pixelTriplets.nPixelTriplets();
 
           float eta1 = __H2F(quintuplets.eta()[iT5]);
           float phi1 = __H2F(quintuplets.phi()[iT5]);
@@ -249,14 +254,25 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             iEmbedT5[k] = quintuplets.t5Embed()[iT5][k];
           }
 
+          // Pre-load T5 hits and iT5-only dup-cleaning constants outside the jx loop.
+          unsigned int iT5Hits[Params_T5::kHits];
+          CMS_UNROLL_LOOP for (int i = 0; i < Params_T5::kHits; ++i) { iT5Hits[i] = quintuplets.hitIndices()[iT5][i]; }
+          // Longer (extended) T5s get a tighter cut: 3x smaller d2 and 6 (vs 4) shared OT hits.
+          const bool isExtT5 = quintuplets.nLayers()[iT5] > Params_T5::kBaseLayers;
+          const float d2Lo = isExtT5 ? 0.03f : 0.1f;
+          const float d2Hi = isExtT5 ? 0.3f : 1.0f;
+          const int otThresh = isExtT5 ? 6 : 4;
+
           // Cross-clean against both pT5s and pT3s
           for (unsigned int jx : cms::alpakatools::uniform_elements_x(acc, loop_bound)) {
+            const bool isPT5 = (jx < nPT5);
+            const unsigned int ptidx = isPT5 ? 0u : (jx - nPT5);
+
             float eta2, phi2;
-            if (jx < pixelQuintuplets.nPixelQuintuplets()) {
+            if (isPT5) {
               eta2 = __H2F(pixelQuintuplets.eta()[jx]);
               phi2 = __H2F(pixelQuintuplets.phi()[jx]);
             } else {
-              unsigned int ptidx = jx - pixelQuintuplets.nPixelQuintuplets();
               eta2 = __H2F(pixelTriplets.eta()[ptidx]);
               phi2 = __H2F(pixelTriplets.phi()[ptidx]);
             }
@@ -265,7 +281,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
             float dPhi = cms::alpakatools::deltaPhi(acc, phi1, phi2);
             float dR2 = dEta * dEta + dPhi * dPhi;
 
-            if (jx < pixelQuintuplets.nPixelQuintuplets()) {
+            if (isPT5) {
               unsigned int jT5 = pixelQuintuplets.quintupletIndices()[jx];
               float d2 = 0.f;
               // Compute distance-squared between the two t5 embeddings.
@@ -273,11 +289,50 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
                 float df = iEmbedT5[k] - quintuplets.t5Embed()[jT5][k];
                 d2 += df * df;
               }
-              if ((dR2 < 0.02f && d2 < 0.1f) || (dR2 < 1e-3f && d2 < 1.0f)) {
-                quintuplets.isDup()[iT5] = true;
+
+              if ((dR2 < 0.02f && d2 < d2Lo) || (dR2 < 1e-3f && d2 < d2Hi)) {
+                quintuplets.isDup()[iT5] |= 4;
+              } else if (dEta < 0.15f && alpaka::math::abs(acc, dPhi) < 0.15f) {
+                // OT hit matching: T5 hits vs pT5 OT hits
+                int nOTMatched = 0;
+                for (int i = 0; i < Params_T5::kHits; ++i) {
+                  unsigned int hitI = iT5Hits[i];
+                  if (hitI == lst::kTCEmptyHitIdx)
+                    continue;
+                  for (int j = Params_pLS::kHits; j < Params_pT5::kHits; ++j) {
+                    unsigned int pT5Hit = pixelQuintuplets.hitIndices()[jx][j];
+                    if (pT5Hit == lst::kTCEmptyHitIdx)
+                      continue;
+                    if (hitI == pT5Hit) {
+                      nOTMatched++;
+                      break;
+                    }
+                  }
+                }
+                if (nOTMatched >= otThresh)
+                  quintuplets.isDup()[iT5] |= 4;
               }
             } else if (dR2 < 1e-3f) {
-              quintuplets.isDup()[iT5] = true;
+              quintuplets.isDup()[iT5] |= 4;
+            } else if (dEta < 0.15f && alpaka::math::abs(acc, dPhi) < 0.15f) {
+              // OT hit matching: T5 hits vs pT3 OT hits (same extended logic as pT5 path)
+              int nOTMatched = 0;
+              for (int i = 0; i < Params_T5::kHits; ++i) {
+                unsigned int hitI = iT5Hits[i];
+                if (hitI == lst::kTCEmptyHitIdx)
+                  continue;
+                for (int j = Params_pLS::kHits; j < Params_pT3::kHits; ++j) {
+                  unsigned int pT3Hit = pixelTriplets.hitIndices()[ptidx][j];
+                  if (pT3Hit == lst::kTCEmptyHitIdx)
+                    continue;
+                  if (hitI == pT3Hit) {
+                    nOTMatched++;
+                    break;
+                  }
+                }
+              }
+              if (nOTMatched >= otThresh)
+                quintuplets.isDup()[iT5] |= 4;
             }
 
             if (quintuplets.isDup()[iT5])
@@ -703,229 +758,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::lst {
     }
   };
 
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void countSharedT5HitsAndFindUnmatched(QuintupletsConst quintuplets,
-                                                                        unsigned int testT5Index,
-                                                                        unsigned int refT5Index,
-                                                                        int& sharedHitCount,
-                                                                        int& unmatchedLayerSlot) {
-    sharedHitCount = 0;
-    unmatchedLayerSlot = -1;
-
-    CMS_UNROLL_LOOP
-    for (int layerIndex = 0; layerIndex < Params_T5::kLayers; ++layerIndex) {
-      const unsigned int candidateHit0 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 0];
-      const unsigned int candidateHit1 = quintuplets.hitIndices()[testT5Index][2 * layerIndex + 1];
-
-      bool hit0InBase = false;
-      bool hit1InBase = false;
-      // Suppress compiler warning with HIP backend
-#ifndef ALPAKA_ACC_GPU_HIP_ENABLED
-      CMS_UNROLL_LOOP
-#endif
-      for (int baseHitIndex = 0; baseHitIndex < Params_T5::kHits; ++baseHitIndex) {
-        const unsigned int baseHit = quintuplets.hitIndices()[refT5Index][baseHitIndex];
-        if (candidateHit0 == baseHit)
-          hit0InBase = true;
-        if (candidateHit1 == baseHit)
-          hit1InBase = true;
-        if (hit0InBase && hit1InBase)
-          break;
-      }
-
-      if (hit0InBase)
-        ++sharedHitCount;
-      if (hit1InBase)
-        ++sharedHitCount;
-
-      if (!hit0InBase && !hit1InBase) {
-        unmatchedLayerSlot = layerIndex;
-      }
-    }
-  }
-
-  struct ExtendTrackCandidatesFromDupT5 {
-    static constexpr int kPackedScoreShift = 32;                 // Bit shift for packing the score (upper 32 bits).
-    static constexpr int kPackedIndexShift = 4;                  // Bit shift for packing the T5 index.
-    static constexpr unsigned int kPackedIndexMask = 0xFFFFFFF;  // Mask for the 28-bit T5 index.
-    static constexpr unsigned int kPackedSlotMask = 0xF;         // Mask for the 4-bit layer slot.
-    static constexpr int kT5DuplicateMinSharedHits = 8;  // Minimum shared hits required to consider a T5 for merging.
-
-    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
-                                  ModulesConst modules,
-                                  ObjectRangesConst ranges,
-                                  QuintupletsConst quintuplets,
-                                  QuintupletsOccupancyConst quintupletsOccupancy,
-                                  TrackCandidatesBase candsBase,
-                                  TrackCandidatesExtended candsExtended) const {
-      // Assert that the number of blocks equals nTC
-      ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u] == candsBase.nTrackCandidates()));
-
-      // Shared memory: Packed best candidate info per logical OT layer (1..11)
-      // Format: [Score (32 bits) | Quintuplet Index (28 bits) | Layer Slot (4 bits)]
-      uint64_t* sharedBestPacked = alpaka::declareSharedVar<uint64_t[lst::kLogicalOTLayers], __COUNTER__>(acc);
-
-      // In 1D, the Grid index [0] is the Block index (which is our TC index)
-      const unsigned int trackCandidateIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
-
-      // Initialize shared memory once per block
-      if (cms::alpakatools::once_per_block(acc)) {
-        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
-          sharedBestPacked[logicalLayerBin] = 0;  // 0.0f score, 0 index, 0 slot
-        }
-      }
-      alpaka::syncBlockThreads(acc);
-
-      // Only consider merging T5s onto T5/pT5 TCs for now
-      const LSTObjType trackCandidateType = candsBase.trackCandidateType()[trackCandidateIndex];
-      if (!(trackCandidateType == LSTObjType::T5 || trackCandidateType == LSTObjType::pT5))
-        return;
-
-      // Base quintuplet (for T5: objectIndices[0], for pT5: objectIndices[1])
-      const unsigned int refT5Index = (trackCandidateType == LSTObjType::T5)
-                                          ? candsExtended.objectIndices()[trackCandidateIndex][0]
-                                          : candsExtended.objectIndices()[trackCandidateIndex][1];
-
-      const float baseEta = __H2F(quintuplets.eta()[refT5Index]);
-      const float basePhi = __H2F(quintuplets.phi()[refT5Index]);
-      const uint8_t baseStartLogicalLayer = quintuplets.logicalLayers()[refT5Index][0];
-
-      // Module range to scan
-      int lowerModuleBegin = 0;
-      int lowerModuleEnd = static_cast<int>(modules.nLowerModules());
-
-      // If starting at layer 1, restrict to the module of the second hit
-      if (baseStartLogicalLayer == 1) {
-        lowerModuleBegin = quintuplets.lowerModuleIndices()[refT5Index][1];
-        lowerModuleEnd = lowerModuleBegin + 1;
-      }
-
-      // Linear thread index and block dimension in 1D
-      const int threadIndexFlat = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
-      const int blockDimFlat = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
-
-      // Scan over lower modules
-      for (int lowerModuleIndex = lowerModuleBegin + threadIndexFlat; lowerModuleIndex < lowerModuleEnd;
-           lowerModuleIndex += blockDimFlat) {
-        const int firstQuintupletInModule = ranges.quintupletModuleIndices()[lowerModuleIndex];
-        if (firstQuintupletInModule == -1)
-          continue;
-
-        const unsigned int nQuintupletsInModule = quintupletsOccupancy.nQuintuplets()[lowerModuleIndex];
-
-        // Scan over quintuplets in this module
-        for (unsigned int quintupletOffset = 0; quintupletOffset < nQuintupletsInModule; ++quintupletOffset) {
-          const unsigned int testT5Index = firstQuintupletInModule + quintupletOffset;
-          if (testT5Index == refT5Index)
-            continue;
-
-          // Require different starting layer for now
-          if (quintuplets.logicalLayers()[testT5Index][0] == baseStartLogicalLayer)
-            continue;
-
-          // Quick eta / phi window selection
-          const float candidateEta = __H2F(quintuplets.eta()[testT5Index]);
-          if (alpaka::math::abs(acc, baseEta - candidateEta) > 0.1f)
-            continue;
-
-          const float candidatePhi = __H2F(quintuplets.phi()[testT5Index]);
-          if (alpaka::math::abs(acc, cms::alpakatools::deltaPhi(acc, basePhi, candidatePhi)) > 0.1f)
-            continue;
-
-          // Embedding distance
-          float embedDistance2 = 0.f;
-          CMS_UNROLL_LOOP
-          for (unsigned int embedIndex = 0; embedIndex < Params_T5::kEmbed; ++embedIndex) {
-            const float diff =
-                quintuplets.t5Embed()[refT5Index][embedIndex] - quintuplets.t5Embed()[testT5Index][embedIndex];
-            embedDistance2 += diff * diff;
-          }
-          if (embedDistance2 > 1.0f)
-            continue;
-
-          // Hit matching against base T5 hits
-          int sharedHitCount = 0;
-          int unmatchedLayerSlot = -1;
-
-          countSharedT5HitsAndFindUnmatched(quintuplets, testT5Index, refT5Index, sharedHitCount, unmatchedLayerSlot);
-
-          if (sharedHitCount < kT5DuplicateMinSharedHits)
-            continue;
-          if (unmatchedLayerSlot < 0)
-            continue;
-
-          // Candidate score is the T5 DNN output
-          const float candidateScore = quintuplets.dnnScore()[testT5Index];
-
-          // New OT logical layer from the candidate
-          const uint8_t newLogicalLayer = quintuplets.logicalLayers()[testT5Index][unmatchedLayerSlot];
-          const int logicalLayerBin = static_cast<int>(newLogicalLayer) - 1;  // 0..kLogicalOTLayers-1
-
-          // Pack the Score, Index, and Slot into 64 bits for atomic update
-          uint64_t scoreBits = (uint64_t)std::bit_cast<int>(candidateScore);
-          uint64_t newPacked = (scoreBits << kPackedScoreShift) |
-                               ((uint64_t)(testT5Index & kPackedIndexMask) << kPackedIndexShift) |
-                               (unmatchedLayerSlot & kPackedSlotMask);
-
-          // Atomic CAS loop
-          uint64_t oldPacked = sharedBestPacked[logicalLayerBin];
-          while (true) {
-            const float oldScore = std::bit_cast<float>((int)(oldPacked >> kPackedScoreShift));
-
-            // If we aren't strictly better, stop trying
-            if (candidateScore <= oldScore) {
-              break;
-            }
-
-            // Try to swap. atomicCas returns the value that was previously there
-            uint64_t assumedOld = alpaka::atomicCas(
-                acc, &sharedBestPacked[logicalLayerBin], oldPacked, newPacked, alpaka::hierarchy::Threads{});
-
-            if (assumedOld == oldPacked) {
-              break;  // Success
-            } else {
-              oldPacked = assumedOld;  // Failed, update view and retry
-            }
-          }
-        }
-      }
-
-      // One thread per block finalizes by actually extending the TC
-      alpaka::syncBlockThreads(acc);
-
-      if (cms::alpakatools::once_per_block(acc)) {
-        CMS_UNROLL_LOOP
-        for (int logicalLayerBin = 0; logicalLayerBin < lst::kLogicalOTLayers; ++logicalLayerBin) {
-          uint64_t bestPacked = sharedBestPacked[logicalLayerBin];
-
-          // Check if valid update occurred (non-zero score)
-          int bestScoreInt = (int)(bestPacked >> kPackedScoreShift);
-          if (bestScoreInt == 0)
-            continue;
-
-          // Unpack Index (bits 4-31) and Slot (bits 0-3)
-          const int bestT5Index = (int)((bestPacked >> kPackedIndexShift) & kPackedIndexMask);
-          const int bestT5LayerSlot = (int)(bestPacked & kPackedSlotMask);
-
-          const uint8_t logicalLayer = static_cast<uint8_t>(logicalLayerBin + 1);  // 1..11
-          const int layerSlot = (logicalLayer - 1) + Params_TC::kPixelLayerSlots;  // OT slots for TC
-
-          const uint16_t lowerModuleIndex = quintuplets.lowerModuleIndices()[bestT5Index][bestT5LayerSlot];
-          const unsigned int hitIndex0 = quintuplets.hitIndices()[bestT5Index][2 * bestT5LayerSlot + 0];
-          const unsigned int hitIndex1 = quintuplets.hitIndices()[bestT5Index][2 * bestT5LayerSlot + 1];
-
-          addTrackCandidateLayerHits(candsBase,
-                                     candsExtended,
-                                     trackCandidateIndex,
-                                     layerSlot,
-                                     logicalLayer,
-                                     lowerModuleIndex,
-                                     hitIndex0,
-                                     hitIndex1);
-        }
-      }
-    }
-  };
-
   struct AddpT5asTrackCandidate {
     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                   uint16_t nLowerModules,
diff --git a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc
index 03aa3128ad960..79aedd7f13b55 100644
--- a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc
+++ b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc
@@ -353,7 +353,7 @@ std::vector<unsigned int> getHitsFrompT3(LSTEvent* event, unsigned int pT3) {
   std::vector<unsigned int> hits;
   hits.reserve(allHits.size());
   for (auto const hit : allHits)
-    if (hits.empty() || hits.back() != hit)  // should eventually check all and type
+    if (hit != lst::kTCEmptyHitIdx && (hits.empty() || hits.back() != hit))
       hits.emplace_back(hit);
   return hits;
 }
@@ -446,7 +446,7 @@ std::vector<unsigned int> getHitsFrompT5(LSTEvent* event, unsigned int pT5) {
   std::vector<unsigned int> hits;
   hits.reserve(allHits.size());
   for (auto const hit : allHits)
-    if (hits.empty() || hits.back() != hit)  // should eventually check all and type
+    if (hit != lst::kTCEmptyHitIdx && (hits.empty() || hits.back() != hit))
       hits.emplace_back(hit);
   return hits;
 }
diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
index c5a54d3a53ffe..510ee36af7a53 100644
--- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
+++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc
@@ -517,6 +517,14 @@ void createQuintupletBranches() {
   ana.tx->createBranch<std::vector<float>>("t5_pMatched");
   ana.tx->createBranch<std::vector<float>>("t5_sim_vxy");
   ana.tx->createBranch<std::vector<float>>("t5_sim_vz");
+  ana.tx->createBranch<std::vector<int>>("t5_nLayers");       // 5 base
+  ana.tx->createBranch<std::vector<int>>("t5_isDupBitmask");  // dup-cleaning bitmask
+  ana.tx->createBranch<std::vector<int>>("t5_partOfPT5");
+  ana.tx->createBranch<std::vector<int>>("t5_tightCutFlag");
+  ana.tx->createBranch<std::vector<float>>("t5_score_rphisum");
+  ana.tx->createBranch<std::vector<std::vector<int>>>("t5_hitIndices");     // SoA hit indices, length 2*nLayers
+  ana.tx->createBranch<std::vector<std::vector<int>>>("t5_logicalLayers");  // logical layer ids, length nLayers
+  ana.tx->createBranch<std::vector<int>>("t5_moduleIdx");                   // T5's lower module index
 }
 
 //________________________________________________________________________________________________________________________________
@@ -579,13 +587,14 @@ void createPixelTripletBranches() {
 #endif
   ana.tx->createBranch<std::vector<int>>("sim_pT3_matched");
   ana.tx->createBranch<std::vector<float>>("pT3_score");
-  ana.tx->createBranch<std::vector<float>>("pT3_pt");         // pt (taken from the pLS)
-  ana.tx->createBranch<std::vector<float>>("pT3_eta");        // eta (taken from the pLS)
-  ana.tx->createBranch<std::vector<float>>("pT3_phi");        // phi (taken from the pLS)
-  ana.tx->createBranch<std::vector<int>>("pT3_plsIdx");       // idx to pLS
-  ana.tx->createBranch<std::vector<int>>("pT3_t3Idx");        // idx to T3
-  ana.tx->createBranch<std::vector<int>>("pT3_isFake");       // 1 if pT3 is fake 0 other if not
-  ana.tx->createBranch<std::vector<int>>("pT3_isDuplicate");  // 1 if pT3 is duplicate 0 other if not
+  ana.tx->createBranch<std::vector<float>>("pT3_pt");                       // pt (taken from the pLS)
+  ana.tx->createBranch<std::vector<float>>("pT3_eta");                      // eta (taken from the pLS)
+  ana.tx->createBranch<std::vector<float>>("pT3_phi");                      // phi (taken from the pLS)
+  ana.tx->createBranch<std::vector<int>>("pT3_plsIdx");                     // idx to pLS
+  ana.tx->createBranch<std::vector<int>>("pT3_t3Idx");                      // idx to T3
+  ana.tx->createBranch<std::vector<std::vector<int>>>("pT3_otHitIndices");  // OT hit indices from the T3
+  ana.tx->createBranch<std::vector<int>>("pT3_isFake");                     // 1 if pT3 is fake 0 other if not
+  ana.tx->createBranch<std::vector<int>>("pT3_isDuplicate");                // 1 if pT3 is duplicate 0 other if not
   ana.tx->createBranch<std::vector<int>>("pT3_simIdx");  // idx of best matched (highest nhit and > 75%) simulated track
   ana.tx->createBranch<std::vector<float>>("pT3_pix_eta");
   ana.tx->createBranch<std::vector<float>>("pT3_pix_phi");
@@ -1649,6 +1658,25 @@ std::map<unsigned int, unsigned int> setQuintupletBranches(LSTEvent* event,
       ana.tx->pushbackToBranch<std::vector<float>>("t5_embed", current_t5_embed);
       ana.tx->pushbackToBranch<float>("t5_dnnScore", quintuplets.dnnScore()[t5Idx]);
 
+      unsigned int nL = quintuplets.nLayers()[t5Idx];
+      ana.tx->pushbackToBranch<int>("t5_nLayers", static_cast<int>(nL));
+      ana.tx->pushbackToBranch<int>("t5_moduleIdx", static_cast<int>(idx));
+      std::vector<int> hitIdxVec;
+      std::vector<int> logLayerVec;
+      hitIdxVec.reserve(2 * nL);
+      logLayerVec.reserve(nL);
+      for (unsigned int iL = 0; iL < nL; ++iL) {
+        hitIdxVec.push_back(static_cast<int>(quintuplets.hitIndices()[t5Idx][2 * iL]));
+        hitIdxVec.push_back(static_cast<int>(quintuplets.hitIndices()[t5Idx][2 * iL + 1]));
+        logLayerVec.push_back(static_cast<int>(quintuplets.logicalLayers()[t5Idx][iL]));
+      }
+      ana.tx->pushbackToBranch<std::vector<int>>("t5_hitIndices", hitIdxVec);
+      ana.tx->pushbackToBranch<std::vector<int>>("t5_logicalLayers", logLayerVec);
+      ana.tx->pushbackToBranch<int>("t5_isDupBitmask", static_cast<int>(quintuplets.isDup()[t5Idx]));
+      ana.tx->pushbackToBranch<int>("t5_partOfPT5", quintuplets.partOfPT5()[t5Idx] ? 1 : 0);
+      ana.tx->pushbackToBranch<int>("t5_tightCutFlag", quintuplets.tightCutFlag()[t5Idx] ? 1 : 0);
+      ana.tx->pushbackToBranch<float>("t5_score_rphisum", __H2F(quintuplets.score_rphisum()[t5Idx]));
+
       bool isfake = true;
       for (size_t isim = 0; isim < simidx.size(); ++isim) {
         if (simidxfrac[isim] > matchfrac) {
@@ -1944,6 +1972,10 @@ std::map<unsigned int, unsigned int> setPixelTripletBranches(LSTEvent* event,
       unsigned int t3_idx = t3_idx_map.at(t3Idx);
       ana.tx->pushbackToBranch<int>("pT3_t3Idx", t3_idx);
     }
+    std::vector<int> otHits;
+    for (int ih = Params_pLS::kHits; ih < Params_pT3::kHits; ++ih)
+      otHits.push_back(static_cast<int>(pixelTriplets.hitIndices()[ipT3][ih]));
+    ana.tx->pushbackToBranch<std::vector<int>>("pT3_otHitIndices", otHits);
     bool isfake = true;
     for (size_t isim = 0; isim < simidx.size(); ++isim) {
       if (simidxfrac[isim] > matchfrac) {