From 376d0d07f81275d2e47406a1591aee8c5ea12d3c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 16 Jun 2026 10:12:39 -0300 Subject: [PATCH 01/17] small improvements --- src/Common/CurrentMetrics.cpp | 2 ++ src/Common/ProfileEvents.cpp | 4 +++ .../ExportPartitionManifestUpdatingTask.cpp | 13 ++++++-- .../ExportPartitionTaskScheduler.cpp | 2 +- .../MergeTree/ExportPartitionUtils.cpp | 32 +++++++++++++++++++ src/Storages/MergeTree/ExportPartitionUtils.h | 12 +++++++ src/Storages/StorageReplicatedMergeTree.cpp | 4 +-- src/Storages/StorageReplicatedMergeTree.h | 9 +++++- 8 files changed, 71 insertions(+), 7 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index c46cc25687aa..3dfee74eb565 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -13,6 +13,8 @@ M(MergeParts, "Number of source parts participating in current background merges") \ M(Move, "Number of currently executing moves") \ M(Export, "Number of currently executing exports") \ + M(ExportPartitionLockWaitingReaders, "Number of threads currently waiting to acquire the export partition in-memory state lock for reading (shared).") \ + M(ExportPartitionLockWaitingWriters, "Number of threads currently waiting to acquire the export partition in-memory state lock for writing (exclusive).") \ M(PartMutation, "Number of mutations (ALTER DELETE/UPDATE)") \ M(ReplicatedFetch, "Number of data parts being fetched from replica") \ M(ReplicatedSend, "Number of data parts being sent to replicas") \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 993e9d6a85cd..2c29449707c0 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -344,6 +344,10 @@ M(ExportPartitionZooKeeperMulti, "Number of 'multi' requests to ZooKeeper made by the export partition feature.", ValueType::Number) \ M(ExportPartitionZooKeeperExists, "Number of 'exists' requests to ZooKeeper made by the export partition feature.", ValueType::Number) \ M(ExportPartsRejectedByMemoryLimit, "Number of background export part tasks rejected due to background memory limit.", ValueType::Number) \ + M(ExportPartitionLockReadWaitMicroseconds, "Total time spent waiting to acquire the export partition in-memory state lock for reading (shared).", ValueType::Microseconds) \ + M(ExportPartitionLockWriteWaitMicroseconds, "Total time spent waiting to acquire the export partition in-memory state lock for writing (exclusive).", ValueType::Microseconds) \ + M(ExportPartitionLockReadAcquisitions, "Number of times the export partition in-memory state lock was acquired for reading (shared).", ValueType::Number) \ + M(ExportPartitionLockWriteAcquisitions, "Number of times the export partition in-memory state lock was acquired for writing (exclusive).", ValueType::Number) \ \ M(DistributedConnectionTries, "Total count of distributed connection attempts.", ValueType::Number) \ M(DistributedConnectionUsable, "Total count of successful distributed connections to a usable server (with required table, but maybe stale).", ValueType::Number) \ diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index b45f3667be87..f6140f845839 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -310,7 +310,7 @@ std::vector ExportPartitionManifestUpdatingTask:: std::vector snapshots; { - std::lock_guard lock(storage.export_merge_tree_partition_mutex); + auto lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); snapshots.reserve(storage.export_merge_tree_partition_task_entries_by_key.size()); for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_key) @@ -379,7 +379,11 @@ void ExportPartitionManifestUpdatingTask::poll() } { - std::lock_guard lock(storage.export_merge_tree_partition_mutex); + /// Writer critical section. With the read-write lock this is the exclusive side; the + /// expensive Iceberg/REST-catalog commits are still performed below, AFTER the lock is + /// released (see deferred_commits), so the shared-lock reader of + /// system.replicated_partition_exports is not held up by catalog round-trips. + auto lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); @@ -644,7 +648,10 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { - std::lock_guard task_entries_lock(storage.export_merge_tree_partition_mutex); + /// Writer critical section (exclusive side of the read-write lock). No catalog I/O is done + /// here, only ZooKeeper status reads and in-memory updates, so the shared-lock reader of + /// system.replicated_partition_exports is not blocked by slow catalog commits. + auto task_entries_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); auto zk = storage.getZooKeeper(); LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", local_status_changes.size()); diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp index d722fb77b2c0..f503507cab7a 100644 --- a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp @@ -77,7 +77,7 @@ void ExportPartitionTaskScheduler::run() const uint32_t seed = uint32_t(std::hash{}(storage.replica_name)) ^ uint32_t(scheduled_exports_count); pcg64_fast rng(seed); - std::lock_guard lock(storage.export_merge_tree_partition_mutex); + auto lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); auto zk = storage.getZooKeeper(); diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index 4a447c8de3ab..c51aef45d1f9 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,16 @@ namespace ProfileEvents extern const Event ExportPartitionZooKeeperGetChildren; extern const Event ExportPartitionZooKeeperSet; extern const Event ExportPartitionZooKeeperMulti; + extern const Event ExportPartitionLockReadWaitMicroseconds; + extern const Event ExportPartitionLockWriteWaitMicroseconds; + extern const Event ExportPartitionLockReadAcquisitions; + extern const Event ExportPartitionLockWriteAcquisitions; +} + +namespace CurrentMetrics +{ + extern const Metric ExportPartitionLockWaitingReaders; + extern const Metric ExportPartitionLockWaitingWriters; } namespace DB @@ -47,6 +59,26 @@ namespace fs = std::filesystem; namespace ExportPartitionUtils { + std::shared_lock lockShared(SharedMutex & mutex) + { + CurrentMetrics::Increment waiting(CurrentMetrics::ExportPartitionLockWaitingReaders); + Stopwatch watch; + std::shared_lock lock(mutex); + ProfileEvents::increment(ProfileEvents::ExportPartitionLockReadWaitMicroseconds, watch.elapsedMicroseconds()); + ProfileEvents::increment(ProfileEvents::ExportPartitionLockReadAcquisitions); + return lock; + } + + std::unique_lock lockExclusive(SharedMutex & mutex) + { + CurrentMetrics::Increment waiting(CurrentMetrics::ExportPartitionLockWaitingWriters); + Stopwatch watch; + std::unique_lock lock(mutex); + ProfileEvents::increment(ProfileEvents::ExportPartitionLockWriteWaitMicroseconds, watch.elapsedMicroseconds()); + ProfileEvents::increment(ProfileEvents::ExportPartitionLockWriteAcquisitions); + return lock; + } + std::vector getPartitionValuesForIcebergCommit( MergeTreeData & storage, const String & partition_id) { diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h index eb67d288d71e..90cacf81506f 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.h +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -3,8 +3,11 @@ #include #include #include +#include +#include #include #include +#include #include #include "Storages/IStorage.h" #include @@ -22,6 +25,15 @@ struct ExportReplicatedMergeTreePartitionManifest; namespace ExportPartitionUtils { + /// Instrumented acquisition of the storage-wide export partition state lock. + /// `lockShared` is for readers (e.g. system.replicated_partition_exports); `lockExclusive` + /// is for the brief in-memory mutations performed by the background tasks and KILL EXPORT + /// PARTITION. Both record wait time and waiting-thread counts via the ExportPartitionLock* + /// metrics. The lock MUST NOT be held across ZooKeeper round-trips - gather ZK data first, + /// then take the lock only to apply the result. + std::shared_lock lockShared(SharedMutex & mutex); + std::unique_lock lockExclusive(SharedMutex & mutex); + std::vector getExportedPaths(const LoggerPtr & log, const zkutil::ZooKeeperPtr & zk, const std::string & export_path); ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 7aaecac34832..3a749bc74a04 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -6131,7 +6131,7 @@ void StorageReplicatedMergeTree::shutdown(bool) } { - std::lock_guard lock(export_merge_tree_partition_mutex); + auto lock = ExportPartitionUtils::lockExclusive(export_merge_tree_partition_mutex); export_merge_tree_partition_task_entries.clear(); } @@ -10130,7 +10130,7 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & return CancellationCode::CancelSent; }; - std::lock_guard lock(export_merge_tree_partition_mutex); + auto lock = ExportPartitionUtils::lockExclusive(export_merge_tree_partition_mutex); const auto zk = getZooKeeper(); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 30b038901e96..f9f1fd19308c 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -525,7 +526,13 @@ class StorageReplicatedMergeTree final : public MergeTreeData Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; - std::mutex export_merge_tree_partition_mutex; + /// Read-write lock guarding the in-memory mirror of export tasks + /// (export_merge_tree_partition_task_entries*). Readers (system.replicated_partition_exports + /// via getPartitionExportsInfo) take it shared; the background poll/scheduler/status tasks and + /// KILL EXPORT PARTITION take it exclusively, but only for brief in-memory mutations - never + /// across ZooKeeper round-trips. Acquire via ExportPartitionUtils::lockShared / lockExclusive + /// so contention is reflected in the ExportPartitionLock* metrics. + mutable SharedMutex export_merge_tree_partition_mutex; BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; From 22e5746b59074755a9ef10fdcfb1bd7a9b28be4c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 17 Jun 2026 19:15:23 -0300 Subject: [PATCH 02/17] refactor export updating task to make lock free network requests --- .../ExportPartitionManifestUpdatingTask.cpp | 127 +++++++++++------- .../ExportPartitionManifestUpdatingTask.h | 9 ++ .../ExportPartitionTaskScheduler.cpp | 47 +++++-- src/Storages/StorageReplicatedMergeTree.cpp | 28 +++- src/Storages/StorageReplicatedMergeTree.h | 22 ++- 5 files changed, 163 insertions(+), 70 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index f6140f845839..54ee78596f7b 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -153,6 +153,7 @@ namespace const ExportReplicatedMergeTreePartitionManifest & metadata, const time_t now, const bool is_pending, + SharedMutex & mirror_mutex, auto & entries_by_key, std::vector & deferred_commits ) @@ -168,9 +169,14 @@ namespace zk->tryRemoveRecursive(fs::path(entry_path)); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRemoveRecursive); - auto it = entries_by_key.find(key); - if (it != entries_by_key.end()) - entries_by_key.erase(it); + /// Brief exclusive mirror lock around the in-memory erase only - the ZK removal above + /// ran lock-free. + { + auto mirror_lock = ExportPartitionUtils::lockExclusive(mirror_mutex); + auto it = entries_by_key.find(key); + if (it != entries_by_key.end()) + entries_by_key.erase(it); + } LOG_INFO(log, "ExportPartition Manifest Updating Task: Removed {}: expired", key); return true; @@ -379,11 +385,15 @@ void ExportPartitionManifestUpdatingTask::poll() } { - /// Writer critical section. With the read-write lock this is the exclusive side; the - /// expensive Iceberg/REST-catalog commits are still performed below, AFTER the lock is - /// released (see deferred_commits), so the shared-lock reader of - /// system.replicated_partition_exports is not held up by catalog round-trips. - auto lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + /// Task-serialization critical section: background_task_serialization_mutex is held + /// across the ZooKeeper reads below so poll() and handleStatusChanges() never overlap, + /// but it is NOT the mirror lock. Every in-memory mutation of the task container takes + /// export_merge_tree_partition_mutex exclusively for the brief mutation only (see the + /// lockExclusive scopes below and inside addTask / tryCleanup / removeStaleEntries), so + /// the shared-lock reader of system.replicated_partition_exports is never blocked by a + /// ZooKeeper round-trip. The expensive Iceberg/REST-catalog commits run afterwards, with + /// no lock held at all (see deferred_commits). + std::lock_guard task_guard(background_task_serialization_mutex); LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); @@ -434,12 +444,15 @@ void ExportPartitionManifestUpdatingTask::poll() /// If the entry is up to date and we don't have the cleanup lock, refresh the in-memory /// last_exception (surfaced by system.replicated_partition_exports) and early exit. - /// Direct mutation of the `mutable` field is safe under export_merge_tree_partition_mutex, - /// which is held throughout poll(). + /// The `mutable` field is mutated under a brief exclusive mirror lock so a concurrent + /// shared-lock reader never observes a half-written value. if (!cleanup_lock && has_local_entry_and_is_up_to_date) { if (!last_exception_per_replica.empty()) + { + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); local_entry->last_exception_per_replica = std::move(last_exception_per_replica); + } continue; } @@ -486,6 +499,7 @@ void ExportPartitionManifestUpdatingTask::poll() metadata, now, *status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, + storage.export_merge_tree_partition_mutex, entries_by_key, deferred_commits); @@ -496,9 +510,13 @@ void ExportPartitionManifestUpdatingTask::poll() if (has_local_entry_and_is_up_to_date) { /// Same refresh as the early-exit branch above; we also reach this point when - /// holding the cleanup lock (cleanup did not consume the entry). + /// holding the cleanup lock (cleanup did not consume the entry). Mutated under a + /// brief exclusive mirror lock. if (!last_exception_per_replica.empty()) + { + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); local_entry->last_exception_per_replica = std::move(last_exception_per_replica); + } LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); continue; } @@ -511,9 +529,10 @@ void ExportPartitionManifestUpdatingTask::poll() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); } - /// `export_merge_tree_partition_mutex` released here. Everything below runs without it - /// so concurrent readers of `system.replicated_partition_exports` and other writers are - /// not blocked by the (potentially slow) catalog round-trips below. + /// `background_task_serialization_mutex` released here (the mirror lock was only ever held + /// briefly, per mutation, inside the section above). Everything below runs without any lock + /// so concurrent readers of `system.replicated_partition_exports` and handleStatusChanges() + /// are not blocked by the (potentially slow) catalog round-trips below. /// /// `cleanup_lock` (the ZK ephemeral node) is INTENTIONALLY still held here and is only /// destructed at end of function. This preserves the existing cross-replica invariant: @@ -593,7 +612,10 @@ void ExportPartitionManifestUpdatingTask::addTask( } /// Insert or update entry. The multi_index container automatically maintains both indexes. + /// Only the container mutation takes the mirror lock (exclusively, briefly); the part-reference + /// gathering above ran lock-free. ExportReplicatedMergeTreePartitionTaskEntry entry {metadata, status, std::move(part_references), std::move(last_exception_per_replica)}; + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); auto it = entries_by_key.find(key); if (it != entries_by_key.end()) entries_by_key.replace(it, entry); @@ -606,18 +628,23 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( auto & entries_by_key ) { - for (auto it = entries_by_key.begin(); it != entries_by_key.end();) + /// Collect the stale keys first (read-only iteration; poll() is the sole container mutator + /// while it holds background_task_serialization_mutex). Then kill the local export tasks and + /// erase the entries, taking the mirror lock exclusively only for the brief erase and never + /// across killExportPart (which takes export_manifests_mutex). + std::vector> stale_keys; /// (composite key, transaction id) + for (const auto & entry : entries_by_key) { - const auto & key = it->getCompositeKey(); + const auto & key = entry.getCompositeKey(); if (zk_children.contains(key)) - { - ++it; continue; - } + stale_keys.emplace_back(key, entry.manifest.transaction_id); + } - const auto & transaction_id = it->manifest.transaction_id; + for (const auto & [key, transaction_id] : stale_keys) + { LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Export task {} was deleted, calling killExportPartition for transaction {}", key, transaction_id); - + try { storage.killExportPart(transaction_id); @@ -627,7 +654,10 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); } - it = entries_by_key.erase(it); + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + auto it = entries_by_key.find(key); + if (it != entries_by_key.end()) + entries_by_key.erase(it); } } @@ -648,10 +678,12 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { - /// Writer critical section (exclusive side of the read-write lock). No catalog I/O is done - /// here, only ZooKeeper status reads and in-memory updates, so the shared-lock reader of - /// system.replicated_partition_exports is not blocked by slow catalog commits. - auto task_entries_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + /// Task-serialization critical section: background_task_serialization_mutex serializes + /// this against poll() and is held across the ZooKeeper status reads below. No catalog I/O + /// happens here. Each in-memory entry mutation takes export_merge_tree_partition_mutex + /// exclusively for the brief write only, so the shared-lock reader of + /// system.replicated_partition_exports is never blocked by a ZooKeeper round-trip. + std::lock_guard task_guard(background_task_serialization_mutex); auto zk = storage.getZooKeeper(); LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", local_status_changes.size()); @@ -695,20 +727,17 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: status changed for task {}. New status: {}", key, magic_enum::enum_name(*new_status).data()); - /// Refresh last_exception leaves too. Status transitions to FAILED (via commit budget) - /// and KILLED (via timeout) atomically write a per-replica leaf in the same multi, so - /// reading them here ensures the system table surfaces the cause together with the - /// visible state change. No new watch is added — this piggybacks on the existing - /// status watch. An empty result means "nothing actionable" and leaves the previous - /// snapshot intact. - if (auto fetched = readLastExceptionPerReplica( - zk, fs::path(storage.zookeeper_path) / "exports" / key, key, storage.log.load()); - !fetched.empty()) - { - it->last_exception_per_replica = std::move(fetched); - } - - /// If status changed to KILLED, cancel local export operations + /// Refresh last_exception leaves too (ZooKeeper read, lock-free). Status transitions to + /// FAILED (via commit budget) and KILLED (via timeout) atomically write a per-replica + /// leaf in the same multi, so reading them here ensures the system table surfaces the + /// cause together with the visible state change. No new watch is added — this piggybacks + /// on the existing status watch. An empty result means "nothing actionable" and leaves + /// the previous snapshot intact. + auto fetched = readLastExceptionPerReplica( + zk, fs::path(storage.zookeeper_path) / "exports" / key, key, storage.log.load()); + + /// If status changed to KILLED, cancel local export operations. killExportPart takes + /// export_manifests_mutex (not the mirror lock), so call it without the mirror lock held. if (*new_status == ExportReplicatedMergeTreePartitionTaskEntry::Status::KILLED) { try @@ -722,12 +751,20 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() } } - it->status = *new_status; - - if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + /// Apply the in-memory updates under a brief exclusive mirror lock. { - /// we no longer need to keep the data parts alive - it->part_references.clear(); + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + + if (!fetched.empty()) + it->last_exception_per_replica = std::move(fetched); + + it->status = *new_status; + + if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + /// we no longer need to keep the data parts alive + it->part_references.clear(); + } } local_status_changes.pop(); diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h index 32487f2dc68c..2bf3cf01bd1b 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h @@ -45,6 +45,15 @@ class ExportPartitionManifestUpdatingTask std::mutex status_changes_mutex; std::queue status_changes; + + /// Serializes the full bodies of poll() and handleStatusChanges() against each other. + /// Held across ZooKeeper I/O so those two tasks never overlap; the mirror lock + /// (StorageReplicatedMergeTree::export_merge_tree_partition_mutex) is then taken only + /// briefly under this, for the in-memory container mutations. This is what lets the + /// system.replicated_partition_exports reader (which takes the mirror lock shared and + /// briefly) avoid waiting behind slow-network ZooKeeper round-trips. + /// Lock ordering: this -> export_merge_tree_partition_mutex -> export_manifests_mutex. + std::mutex background_task_serialization_mutex; }; } diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp index f503507cab7a..f61303f6a65f 100644 --- a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp @@ -77,12 +77,39 @@ void ExportPartitionTaskScheduler::run() const uint32_t seed = uint32_t(std::hash{}(storage.replica_name)) ^ uint32_t(scheduled_exports_count); pcg64_fast rng(seed); - auto lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + /// Snapshot the PENDING entries under a brief shared lock, then perform all ZooKeeper + /// work and exportPartToTable below WITHOUT holding the lock. The scheduler is a pure + /// reader of the in-memory mirror - it no longer writes entry.status (the status + /// converges via the status watch -> handleStatusChanges and poll()), so a shared lock + /// is sufficient and the system.replicated_partition_exports reader is never blocked by + /// the scheduler. + struct PendingEntrySnapshot + { + std::string key; + ExportReplicatedMergeTreePartitionManifest manifest; + }; + + std::vector pending_entries; + { + auto lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + + // Iterate sorted by create_time + for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_create_time) + { + /// No need to query zk for status if the local one is not PENDING + if (entry.status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... Local status is {}", magic_enum::enum_name(entry.status).data()); + continue; + } + + pending_entries.push_back(PendingEntrySnapshot{entry.getCompositeKey(), entry.manifest}); + } + } auto zk = storage.getZooKeeper(); - // Iterate sorted by create_time - for (auto & entry : storage.export_merge_tree_partition_task_entries_by_create_time) + for (const auto & pending : pending_entries) { if (scheduled_exports_count >= available_move_executors) { @@ -90,18 +117,11 @@ void ExportPartitionTaskScheduler::run() break; } - const auto & manifest = entry.manifest; - const auto key = entry.getCompositeKey(); + const auto & manifest = pending.manifest; + const auto & key = pending.key; const auto database = storage.getContext()->resolveDatabase(manifest.destination_database); const auto & table = manifest.destination_table; - /// No need to query zk for status if the local one is not PENDING - if (entry.status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) - { - LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... Local status is {}", magic_enum::enum_name(entry.status).data()); - continue; - } - const auto destination_storage_id = StorageID(QualifiedTableName {database, table}); const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, storage.getContext()); @@ -131,8 +151,7 @@ void ExportPartitionTaskScheduler::run() if (status_in_zk.value() != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) { - entry.status = status_in_zk.value(); - LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping {}... Status from zk is {}", key, magic_enum::enum_name(entry.status).data()); + LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping {}... Status from zk is {}", key, magic_enum::enum_name(status_in_zk.value()).data()); continue; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 3a749bc74a04..1b4aa61c834b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -10130,23 +10130,39 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & return CancellationCode::CancelSent; }; - auto lock = ExportPartitionUtils::lockExclusive(export_merge_tree_partition_mutex); - const auto zk = getZooKeeper(); + /// Look up the entry in the in-memory mirror under a brief shared lock and copy out what we + /// need; release it before any ZooKeeper round-trip so the system.replicated_partition_exports + /// reader is never blocked. This is a pure read of the container, so a shared lock is enough - + /// the KILLED status set in ZooKeeper below propagates back into the mirror via the status + /// watch -> handleStatusChanges. + bool local_entry_found = false; + bool local_entry_pending = false; + std::string local_composite_key; + { + auto lock = ExportPartitionUtils::lockShared(export_merge_tree_partition_mutex); + const auto entry = export_merge_tree_partition_task_entries_by_transaction_id.find(transaction_id); + if (entry != export_merge_tree_partition_task_entries_by_transaction_id.end()) + { + local_entry_found = true; + local_entry_pending = entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING; + local_composite_key = entry->getCompositeKey(); + } + } + /// if we have the entry locally, no need to list from zk. we can save some requests. - const auto & entry = export_merge_tree_partition_task_entries_by_transaction_id.find(transaction_id); - if (entry != export_merge_tree_partition_task_entries_by_transaction_id.end()) + if (local_entry_found) { LOG_INFO(log, "Export partition task found locally, trying to cancel it"); /// found locally, no need to get children on zk - if (entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + if (!local_entry_pending) { LOG_INFO(log, "Export partition task is not pending, can not cancel it"); return CancellationCode::CancelCannotBeSent; } - return try_set_status_to_killed(zk, fs::path(zookeeper_path) / "exports" / entry->getCompositeKey() / "status"); + return try_set_status_to_killed(zk, fs::path(zookeeper_path) / "exports" / local_composite_key / "status"); } else { diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index f9f1fd19308c..2e1cd79fd71a 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -527,11 +527,23 @@ class StorageReplicatedMergeTree final : public MergeTreeData Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; /// Read-write lock guarding the in-memory mirror of export tasks - /// (export_merge_tree_partition_task_entries*). Readers (system.replicated_partition_exports - /// via getPartitionExportsInfo) take it shared; the background poll/scheduler/status tasks and - /// KILL EXPORT PARTITION take it exclusively, but only for brief in-memory mutations - never - /// across ZooKeeper round-trips. Acquire via ExportPartitionUtils::lockShared / lockExclusive - /// so contention is reflected in the ExportPartitionLock* metrics. + /// (export_merge_tree_partition_task_entries*). It is taken ONLY for brief in-memory + /// accesses, never across a ZooKeeper round-trip: + /// - Readers take it shared: system.replicated_partition_exports via getPartitionExportsInfo, + /// the scheduler run() snapshotting PENDING entries, and KILL EXPORT PARTITION looking up + /// the entry locally. + /// - Writers take it exclusively for the duration of a single container mutation only: + /// poll() and handleStatusChanges() apply each insert/replace/erase/field-update under it, + /// and shutdown clears the container. + /// The ZooKeeper I/O of poll() and handleStatusChanges() is instead serialized by + /// ExportPartitionManifestUpdatingTask::background_task_serialization_mutex (the "task lock"), + /// which those two tasks hold across their ZooKeeper reads but which readers never touch. This + /// split is what keeps the shared-lock reader off the slow-network ZooKeeper critical path + /// (DB::SharedMutex is writer-preferring, so a writer that held this across ZooKeeper would + /// otherwise block every new reader). + /// Lock ordering: background_task_serialization_mutex -> this -> export_manifests_mutex. + /// Acquire via ExportPartitionUtils::lockShared / lockExclusive so contention is reflected in + /// the ExportPartitionLock* metrics. mutable SharedMutex export_merge_tree_partition_mutex; BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; From accaf963c44d20c3604a180282219b9bf073e6e1 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 19 Jun 2026 09:46:49 -0300 Subject: [PATCH 03/17] my own modifications --- .../ExportPartitionManifestUpdatingTask.cpp | 91 ++++++++++--------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 54ee78596f7b..734d0fa1c466 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -426,12 +426,6 @@ void ExportPartitionManifestUpdatingTask::poll() const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); - /// Read last_exception leaves (no watch). Surfacing exceptions in the system table relies - /// on this read being part of every poll cycle: per-part failures during PENDING do not - /// trigger a status watch, so the only refresh path while the task is still in-flight is - /// the periodic poll. An empty result collapses every "nothing actionable" case - /// (transient ZK error, no children, all leaves ZNONODE/malformed) into a no-op so the - /// in-memory copy stays intact. auto last_exception_per_replica = readLastExceptionPerReplica( zk, fs::path(entry_path), key, storage.log.load()); @@ -439,40 +433,43 @@ void ExportPartitionManifestUpdatingTask::poll() /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough /// we need to make sure it is the same transaction id. If it is not, it needs to be replaced. - bool has_local_entry_and_is_up_to_date = local_entry != entries_by_key.end() + bool has_local_entry = local_entry != entries_by_key.end() && local_entry->manifest.transaction_id == metadata.transaction_id; - /// If the entry is up to date and we don't have the cleanup lock, refresh the in-memory - /// last_exception (surfaced by system.replicated_partition_exports) and early exit. - /// The `mutable` field is mutated under a brief exclusive mirror lock so a concurrent - /// shared-lock reader never observes a half-written value. - if (!cleanup_lock && has_local_entry_and_is_up_to_date) - { - if (!last_exception_per_replica.empty()) - { - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); - local_entry->last_exception_per_replica = std::move(last_exception_per_replica); - } - continue; - } + std::string status_string; - std::weak_ptr weak_manifest_updater = storage.export_merge_tree_partition_manifest_updater; + /// In theory, we should be notified when the status changes by the status watch + /// but in practice, the watch is not always reliable (e.g. if the ZooKeeper session is lost) + /// so we need to read the status from the ZK node directly. + if (has_local_entry) + { + ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); + ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet); - auto status_watch_callback = std::make_shared([weak_manifest_updater, key](const Coordination::WatchResponse &) + zk->tryGet(fs::path(entry_path) / "status", status_string); + } + else { - /// If the table is dropped but the watch is not removed, we need to prevent use after free - /// below code assumes that if manifest updater is still alive, the status handling task is also alive - if (auto manifest_updater = weak_manifest_updater.lock()) + /// If we don't have a local entry, we need to arm a status watch to be notified when the status changes + std::weak_ptr weak_manifest_updater = storage.export_merge_tree_partition_manifest_updater; + auto status_watch_callback = std::make_shared([weak_manifest_updater, key](const Coordination::WatchResponse &) { - manifest_updater->addStatusChange(key); - manifest_updater->storage.export_merge_tree_partition_status_handling_task->schedule(); - } - }); + /// If the table is dropped but the watch is not removed, we need to prevent use after free + /// below code assumes that if manifest updater is still alive, the status handling task is also alive + if (auto manifest_updater = weak_manifest_updater.lock()) + { + manifest_updater->addStatusChange(key); + manifest_updater->storage.export_merge_tree_partition_status_handling_task->schedule(); + } + }); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); - ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetWatch); - std::string status_string; - if (!zk->tryGetWatch(fs::path(entry_path) / "status", status_string, nullptr, status_watch_callback)) + ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); + ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetWatch); + + zk->tryGetWatch(fs::path(entry_path) / "status", status_string, nullptr, status_watch_callback); + } + + if (status_string.empty()) { LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: missing status", key); continue; @@ -507,21 +504,29 @@ void ExportPartitionManifestUpdatingTask::poll() continue; } - if (has_local_entry_and_is_up_to_date) + if (!has_local_entry) { - /// Same refresh as the early-exit branch above; we also reach this point when - /// holding the cleanup lock (cleanup did not consume the entry). Mutated under a - /// brief exclusive mirror lock. + addTask(metadata, *status, std::move(last_exception_per_replica), key, entries_by_key); + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Added new entry for task {}", key); + } + + /// If we already have the local entry, we need to update it if the status has changed or if there are new last exceptions + const bool status_changed = local_entry->status != *status; + if (!last_exception_per_replica.empty() || status_changed) + { + auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); if (!last_exception_per_replica.empty()) - { - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); local_entry->last_exception_per_replica = std::move(last_exception_per_replica); + if (status_changed) + { + local_entry->status = *status; + if (local_entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + /// terminal now - we no longer need to keep the data parts alive + local_entry->part_references.clear(); } - LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); - continue; } - - addTask(metadata, *status, std::move(last_exception_per_replica), key, entries_by_key); + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); + } /// Remove entries that were deleted by someone else From ccee3a402902fcdf38bdb9edfd8114ebba76af80 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 19 Jun 2026 10:03:58 -0300 Subject: [PATCH 04/17] add missing continue --- src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 734d0fa1c466..a532f3464963 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -508,6 +508,7 @@ void ExportPartitionManifestUpdatingTask::poll() { addTask(metadata, *status, std::move(last_exception_per_replica), key, entries_by_key); LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Added new entry for task {}", key); + continue; } /// If we already have the local entry, we need to update it if the status has changed or if there are new last exceptions From 3b9da6db4fdfd530fa9dbf0059edd1d3d3162079 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 19 Jun 2026 10:27:04 -0300 Subject: [PATCH 05/17] add best-effort comment --- src/Storages/StorageReplicatedMergeTree.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1b4aa61c834b..bed2ee163e7e 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -10095,6 +10095,8 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & /// Called from a query thread (KILL EXPORT PARTITION via InterpreterKillQueryQuery), which does not have a component set. auto component_guard = Coordination::setCurrentComponent("StorageReplicatedMergeTree::killExportPartition"); + /// This is best-effort, even if we manage to set it to killed, it might be overwritten by a successful commit. + auto try_set_status_to_killed = [this](const zkutil::ZooKeeperPtr & zk, const std::string & status_path) { Coordination::Stat stat; From 18aa8d6cff4d4d4b19d32f1822bd72f6d6199ea7 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 19 Jun 2026 10:38:51 -0300 Subject: [PATCH 06/17] remove unnecessary comments --- .../ExportPartitionManifestUpdatingTask.cpp | 6 ------ src/Storages/StorageReplicatedMergeTree.h | 18 ------------------ 2 files changed, 24 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index a532f3464963..9ef541037b9e 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -387,12 +387,6 @@ void ExportPartitionManifestUpdatingTask::poll() { /// Task-serialization critical section: background_task_serialization_mutex is held /// across the ZooKeeper reads below so poll() and handleStatusChanges() never overlap, - /// but it is NOT the mirror lock. Every in-memory mutation of the task container takes - /// export_merge_tree_partition_mutex exclusively for the brief mutation only (see the - /// lockExclusive scopes below and inside addTask / tryCleanup / removeStaleEntries), so - /// the shared-lock reader of system.replicated_partition_exports is never blocked by a - /// ZooKeeper round-trip. The expensive Iceberg/REST-catalog commits run afterwards, with - /// no lock held at all (see deferred_commits). std::lock_guard task_guard(background_task_serialization_mutex); LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 2e1cd79fd71a..86e590404f2b 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -526,24 +526,6 @@ class StorageReplicatedMergeTree final : public MergeTreeData Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; - /// Read-write lock guarding the in-memory mirror of export tasks - /// (export_merge_tree_partition_task_entries*). It is taken ONLY for brief in-memory - /// accesses, never across a ZooKeeper round-trip: - /// - Readers take it shared: system.replicated_partition_exports via getPartitionExportsInfo, - /// the scheduler run() snapshotting PENDING entries, and KILL EXPORT PARTITION looking up - /// the entry locally. - /// - Writers take it exclusively for the duration of a single container mutation only: - /// poll() and handleStatusChanges() apply each insert/replace/erase/field-update under it, - /// and shutdown clears the container. - /// The ZooKeeper I/O of poll() and handleStatusChanges() is instead serialized by - /// ExportPartitionManifestUpdatingTask::background_task_serialization_mutex (the "task lock"), - /// which those two tasks hold across their ZooKeeper reads but which readers never touch. This - /// split is what keeps the shared-lock reader off the slow-network ZooKeeper critical path - /// (DB::SharedMutex is writer-preferring, so a writer that held this across ZooKeeper would - /// otherwise block every new reader). - /// Lock ordering: background_task_serialization_mutex -> this -> export_manifests_mutex. - /// Acquire via ExportPartitionUtils::lockShared / lockExclusive so contention is reflected in - /// the ExportPartitionLock* metrics. mutable SharedMutex export_merge_tree_partition_mutex; BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; From 15b140fb15b12207dd5e4b3aaac1d41084d94c5c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jun 2026 14:00:37 -0300 Subject: [PATCH 07/17] faster, but unsure about ub and code complexity --- .../ExportPartitionManifestUpdatingTask.cpp | 91 ++++++++++++++----- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 57741ece593d..dd446f0aaad4 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -360,7 +360,12 @@ void ExportPartitionManifestUpdatingTask::poll() /// across the ZooKeeper reads below so poll() and handleStatusChanges() never overlap, std::lock_guard task_guard(background_task_serialization_mutex); - LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); + size_t entries_count_before = 0; + { + auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + entries_count_before = storage.export_merge_tree_partition_task_entries_by_key.size(); + } + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), entries_count_before); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetChildrenWatch); @@ -394,12 +399,23 @@ void ExportPartitionManifestUpdatingTask::poll() auto last_exception_per_replica = readLastExceptionPerReplica( zk, fs::path(entry_path), key, storage.log.load()); - const auto local_entry = entries_by_key.find(key); - + /// Snapshot the lookup under a brief shared mirror lock so the read never races a + /// concurrent exclusive mutation. We capture only existence and the current status into + /// locals and release the lock immediately - the iterator is not carried across the + /// ZooKeeper I/O below (and never across the shared->exclusive gap, since DB::SharedMutex + /// is not upgradeable). /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough /// we need to make sure it is the same transaction id. If it is not, it needs to be replaced. - bool has_local_entry = local_entry != entries_by_key.end() - && local_entry->manifest.transaction_id == metadata.transaction_id; + bool has_local_entry = false; + ExportReplicatedMergeTreePartitionTaskEntry::Status local_status{}; + { + auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + const auto local_entry = entries_by_key.find(key); + has_local_entry = local_entry != entries_by_key.end() + && local_entry->manifest.transaction_id == metadata.transaction_id; + if (has_local_entry) + local_status = local_entry->status; + } std::string status_string; @@ -470,19 +486,24 @@ void ExportPartitionManifestUpdatingTask::poll() continue; } - /// If we already have the local entry, we need to update it if the status has changed or if there are new last exceptions - const bool status_changed = local_entry->status != *status; + /// If we already have the local entry, we need to update it if the status has changed or if there are new last exceptions. + /// `status_changed` is computed from the snapshot taken above; the actual mutation re-finds + /// the iterator under a brief exclusive mirror lock so no iterator crosses the shared->exclusive gap. + const bool status_changed = local_status != *status; if (!last_exception_per_replica.empty() || status_changed) { auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + auto it = entries_by_key.find(key); + if (it == entries_by_key.end()) + continue; if (!last_exception_per_replica.empty()) - local_entry->last_exception_per_replica = std::move(last_exception_per_replica); + it->last_exception_per_replica = std::move(last_exception_per_replica); if (status_changed) { - local_entry->status = *status; - if (local_entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + it->status = *status; + if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) /// terminal now - we no longer need to keep the data parts alive - local_entry->part_references.clear(); + it->part_references.clear(); } } LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); @@ -492,7 +513,12 @@ void ExportPartitionManifestUpdatingTask::poll() /// Remove entries that were deleted by someone else removeStaleEntries(zk_children, entries_by_key); - LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); + size_t entries_count_after = 0; + { + auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + entries_count_after = entries_by_key.size(); + } + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_count_after); } /// `background_task_serialization_mutex` released here (the mirror lock was only ever held /// briefly, per mutation, inside the section above). Everything below runs without any lock @@ -598,12 +624,15 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( /// erase the entries, taking the mirror lock exclusively only for the brief erase and never /// across killExportPart (which takes export_manifests_mutex). std::vector> stale_keys; /// (composite key, transaction id) - for (const auto & entry : entries_by_key) { - const auto & key = entry.getCompositeKey(); - if (zk_children.contains(key)) - continue; - stale_keys.emplace_back(key, entry.manifest.transaction_id); + auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + for (const auto & entry : entries_by_key) + { + const auto & key = entry.getCompositeKey(); + if (zk_children.contains(key)) + continue; + stale_keys.emplace_back(key, entry.manifest.transaction_id); + } } for (const auto & [key, transaction_id] : stale_keys) @@ -664,11 +693,19 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() "Failpoint: simulating exception during status change handling for key {}", key); }); - auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); - if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) + /// Snapshot existence and the transaction id under a brief shared mirror lock. The + /// iterator is not carried across the ZooKeeper I/O below; the actual write re-finds it + /// under an exclusive lock (DB::SharedMutex is not upgradeable). + std::string transaction_id; { - local_status_changes.pop(); - continue; + auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); + auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); + if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) + { + local_status_changes.pop(); + continue; + } + transaction_id = it->manifest.transaction_id; } ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); @@ -708,7 +745,7 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { LOG_INFO(storage.log, "ExportPartition Manifest Updating task: killing export partition for task {}", key); - storage.killExportPart(it->manifest.transaction_id); + storage.killExportPart(transaction_id); } catch (...) { @@ -716,10 +753,18 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() } } - /// Apply the in-memory updates under a brief exclusive mirror lock. + /// Apply the in-memory updates under a brief exclusive mirror lock. Re-find the iterator + /// here so it is obtained and used under the same exclusive lock that guards the write. { auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); + auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); + if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) + { + local_status_changes.pop(); + continue; + } + if (!fetched.empty()) it->last_exception_per_replica = std::move(fetched); From 415a80a57a0e165c02e1fd6cd9135c4b1ab0c16a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jun 2026 20:45:26 -0300 Subject: [PATCH 08/17] multiversion as opposed to locking --- src/Common/CurrentMetrics.cpp | 2 - src/Common/ProfileEvents.cpp | 4 - .../ExportPartitionManifestUpdatingTask.cpp | 234 +++++++----------- .../ExportPartitionManifestUpdatingTask.h | 14 +- .../ExportPartitionTaskScheduler.cpp | 51 ++-- .../MergeTree/ExportPartitionUtils.cpp | 32 --- src/Storages/MergeTree/ExportPartitionUtils.h | 12 - src/Storages/StorageReplicatedMergeTree.cpp | 28 ++- src/Storages/StorageReplicatedMergeTree.h | 13 +- 9 files changed, 142 insertions(+), 248 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 3dfee74eb565..c46cc25687aa 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -13,8 +13,6 @@ M(MergeParts, "Number of source parts participating in current background merges") \ M(Move, "Number of currently executing moves") \ M(Export, "Number of currently executing exports") \ - M(ExportPartitionLockWaitingReaders, "Number of threads currently waiting to acquire the export partition in-memory state lock for reading (shared).") \ - M(ExportPartitionLockWaitingWriters, "Number of threads currently waiting to acquire the export partition in-memory state lock for writing (exclusive).") \ M(PartMutation, "Number of mutations (ALTER DELETE/UPDATE)") \ M(ReplicatedFetch, "Number of data parts being fetched from replica") \ M(ReplicatedSend, "Number of data parts being sent to replicas") \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 2c29449707c0..993e9d6a85cd 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -344,10 +344,6 @@ M(ExportPartitionZooKeeperMulti, "Number of 'multi' requests to ZooKeeper made by the export partition feature.", ValueType::Number) \ M(ExportPartitionZooKeeperExists, "Number of 'exists' requests to ZooKeeper made by the export partition feature.", ValueType::Number) \ M(ExportPartsRejectedByMemoryLimit, "Number of background export part tasks rejected due to background memory limit.", ValueType::Number) \ - M(ExportPartitionLockReadWaitMicroseconds, "Total time spent waiting to acquire the export partition in-memory state lock for reading (shared).", ValueType::Microseconds) \ - M(ExportPartitionLockWriteWaitMicroseconds, "Total time spent waiting to acquire the export partition in-memory state lock for writing (exclusive).", ValueType::Microseconds) \ - M(ExportPartitionLockReadAcquisitions, "Number of times the export partition in-memory state lock was acquired for reading (shared).", ValueType::Number) \ - M(ExportPartitionLockWriteAcquisitions, "Number of times the export partition in-memory state lock was acquired for writing (exclusive).", ValueType::Number) \ \ M(DistributedConnectionTries, "Total count of distributed connection attempts.", ValueType::Number) \ M(DistributedConnectionUsable, "Total count of successful distributed connections to a usable server (with required table, but maybe stale).", ValueType::Number) \ diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index dd446f0aaad4..87c59df7ab5c 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -39,7 +39,7 @@ namespace { /// Work item describing a commit-recovery attempt that has been deferred out of /// the `poll()` critical section. Captures everything by value so it can be - /// executed safely after `export_merge_tree_partition_mutex` has been released. + /// executed safely after the M_task critical section has been released. struct CommitRecoveryWork { ExportReplicatedMergeTreePartitionManifest metadata; @@ -245,11 +245,9 @@ namespace return; } - /// A replica exported the last part but the commit never landed. Capture everything - /// needed to run commit() outside `export_merge_tree_partition_mutex`. The - /// commit path performs network I/O (REST catalog + S3) with up to - /// MAX_TRANSACTION_RETRIES = 100 retries; holding the storage-wide mutex across - /// that work is what caused `system.replicated_partition_exports` to hang. + /// A replica exported the last part but the commit never landed. Run commit() + /// outside M_task: it does network I/O (REST catalog + S3, up to + /// MAX_TRANSACTION_RETRIES=100) and must not block the other background task. /// /// The outer poll() loop stays on the normal path: it will call addTask() so the /// in-memory container reflects the PENDING entry. The status watch registered by @@ -271,51 +269,52 @@ ExportPartitionManifestUpdatingTask::ExportPartitionManifestUpdatingTask(Storage { } -std::vector ExportPartitionManifestUpdatingTask::getPartitionExportsInfo() const +void ExportPartitionManifestUpdatingTask::publishReadModel() { - /// Snapshot just the fields we need under the lock, then build the public - /// ReplicatedPartitionExportInfo vector after releasing it. This keeps the critical - /// section O(entries) of cheap struct copies (manifest + enum) rather than also - /// covering the formatting / Array construction performed by the caller. - struct EntrySnapshot - { - ExportReplicatedMergeTreePartitionManifest manifest; - ExportReplicatedMergeTreePartitionTaskEntry::Status status; - std::map last_exception_per_replica; - }; - - std::vector snapshots; - + /// Called under M_task. part_references are not copied: the published version is informational + /// only, and parts must be pinned solely by the authoritative container. + auto model = std::make_unique(); + for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_key) { - auto lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - - snapshots.reserve(storage.export_merge_tree_partition_task_entries_by_key.size()); - for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_key) - snapshots.push_back(EntrySnapshot{entry.manifest, entry.status, entry.last_exception_per_replica}); + ExportReplicatedMergeTreePartitionTaskEntry copy = entry; + copy.part_references.clear(); /// never pin parts inside a published version + model->insert(std::move(copy)); } + storage.export_read_model.set(std::move(model)); +} + +std::vector ExportPartitionManifestUpdatingTask::getPartitionExportsInfo() const +{ + /// Wait-free read of the published snapshot: a shared_ptr copy, no lock, no ZooKeeper. + const auto model = storage.export_read_model.get(); std::vector infos; - infos.reserve(snapshots.size()); + if (!model) + return infos; + + infos.reserve(model->size()); - for (auto & snapshot : snapshots) + for (const auto & entry : model->get()) { + const auto & manifest = entry.manifest; + ReplicatedPartitionExportInfo info; - info.destination_database = snapshot.manifest.destination_database; - info.destination_table = snapshot.manifest.destination_table; - info.partition_id = snapshot.manifest.partition_id; - info.transaction_id = snapshot.manifest.transaction_id; - info.query_id = snapshot.manifest.query_id; - info.create_time = snapshot.manifest.create_time; - info.source_replica = snapshot.manifest.source_replica; - info.parts_count = snapshot.manifest.number_of_parts; - info.parts_to_do = snapshot.manifest.parts.size(); - info.parts = std::move(snapshot.manifest.parts); - info.status = magic_enum::enum_name(snapshot.status); - - info.last_exception_per_replica.reserve(snapshot.last_exception_per_replica.size()); + info.destination_database = manifest.destination_database; + info.destination_table = manifest.destination_table; + info.partition_id = manifest.partition_id; + info.transaction_id = manifest.transaction_id; + info.query_id = manifest.query_id; + info.create_time = manifest.create_time; + info.source_replica = manifest.source_replica; + info.parts_count = manifest.number_of_parts; + info.parts_to_do = manifest.parts.size(); + info.parts = manifest.parts; + info.status = magic_enum::enum_name(entry.status); + + info.last_exception_per_replica.reserve(entry.last_exception_per_replica.size()); size_t total_exception_count = 0; - for (const auto & [_, ex] : snapshot.last_exception_per_replica) + for (const auto & [_, ex] : entry.last_exception_per_replica) { total_exception_count += ex.count; info.last_exception_per_replica.push_back(ex); @@ -356,16 +355,12 @@ void ExportPartitionManifestUpdatingTask::poll() } { - /// Task-serialization critical section: background_task_serialization_mutex is held - /// across the ZooKeeper reads below so poll() and handleStatusChanges() never overlap, + /// M_task: serializes poll() vs handleStatusChanges() and guards the authoritative + /// container across the ZooKeeper reads below. Readers use the read-model we republish at + /// the end of this block. std::lock_guard task_guard(background_task_serialization_mutex); - size_t entries_count_before = 0; - { - auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - entries_count_before = storage.export_merge_tree_partition_task_entries_by_key.size(); - } - LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), entries_count_before); + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetChildrenWatch); @@ -399,23 +394,12 @@ void ExportPartitionManifestUpdatingTask::poll() auto last_exception_per_replica = readLastExceptionPerReplica( zk, fs::path(entry_path), key, storage.log.load()); - /// Snapshot the lookup under a brief shared mirror lock so the read never races a - /// concurrent exclusive mutation. We capture only existence and the current status into - /// locals and release the lock immediately - the iterator is not carried across the - /// ZooKeeper I/O below (and never across the shared->exclusive gap, since DB::SharedMutex - /// is not upgradeable). + /// Plain lookup is safe: poll() is the sole mutator while holding M_task. /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough /// we need to make sure it is the same transaction id. If it is not, it needs to be replaced. - bool has_local_entry = false; - ExportReplicatedMergeTreePartitionTaskEntry::Status local_status{}; - { - auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - const auto local_entry = entries_by_key.find(key); - has_local_entry = local_entry != entries_by_key.end() - && local_entry->manifest.transaction_id == metadata.transaction_id; - if (has_local_entry) - local_status = local_entry->status; - } + const auto local_entry = entries_by_key.find(key); + const bool has_local_entry = local_entry != entries_by_key.end() + && local_entry->manifest.transaction_id == metadata.transaction_id; std::string status_string; @@ -487,23 +471,17 @@ void ExportPartitionManifestUpdatingTask::poll() } /// If we already have the local entry, we need to update it if the status has changed or if there are new last exceptions. - /// `status_changed` is computed from the snapshot taken above; the actual mutation re-finds - /// the iterator under a brief exclusive mirror lock so no iterator crosses the shared->exclusive gap. - const bool status_changed = local_status != *status; + const bool status_changed = local_entry->status != *status; if (!last_exception_per_replica.empty() || status_changed) { - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); - auto it = entries_by_key.find(key); - if (it == entries_by_key.end()) - continue; if (!last_exception_per_replica.empty()) - it->last_exception_per_replica = std::move(last_exception_per_replica); + local_entry->last_exception_per_replica = std::move(last_exception_per_replica); if (status_changed) { - it->status = *status; - if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + local_entry->status = *status; + if (local_entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) /// terminal now - we no longer need to keep the data parts alive - it->part_references.clear(); + local_entry->part_references.clear(); } } LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); @@ -513,17 +491,13 @@ void ExportPartitionManifestUpdatingTask::poll() /// Remove entries that were deleted by someone else removeStaleEntries(zk_children, entries_by_key); - size_t entries_count_after = 0; - { - auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - entries_count_after = entries_by_key.size(); - } - LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_count_after); + /// Publish this poll() cycle's changes to readers. + publishReadModel(); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); } - /// `background_task_serialization_mutex` released here (the mirror lock was only ever held - /// briefly, per mutation, inside the section above). Everything below runs without any lock - /// so concurrent readers of `system.replicated_partition_exports` and handleStatusChanges() - /// are not blocked by the (potentially slow) catalog round-trips below. + /// M_task released here. The catalog round-trips below run without any lock, so readers and + /// handleStatusChanges() are not blocked by them. /// /// `cleanup_lock` (the ZK ephemeral node) is INTENTIONALLY still held here and is only /// destructed at end of function. This preserves the existing cross-replica invariant: @@ -602,11 +576,8 @@ void ExportPartitionManifestUpdatingTask::addTask( } } - /// Insert or update entry. The multi_index container automatically maintains both indexes. - /// Only the container mutation takes the mirror lock (exclusively, briefly); the part-reference - /// gathering above ran lock-free. + /// Called from poll() under M_task (sole mutator), so no extra locking is required. ExportReplicatedMergeTreePartitionTaskEntry entry {metadata, status, std::move(part_references), std::move(last_exception_per_replica)}; - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); auto it = entries_by_key.find(key); if (it != entries_by_key.end()) entries_by_key.replace(it, entry); @@ -619,20 +590,15 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( auto & entries_by_key ) { - /// Collect the stale keys first (read-only iteration; poll() is the sole container mutator - /// while it holds background_task_serialization_mutex). Then kill the local export tasks and - /// erase the entries, taking the mirror lock exclusively only for the brief erase and never - /// across killExportPart (which takes export_manifests_mutex). + /// Called from poll() under M_task. Collect stale keys first, then kill and erase them + /// (killExportPart takes export_manifests_mutex; lock order M_task -> export_manifests_mutex). std::vector> stale_keys; /// (composite key, transaction id) + for (const auto & entry : entries_by_key) { - auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - for (const auto & entry : entries_by_key) - { - const auto & key = entry.getCompositeKey(); - if (zk_children.contains(key)) - continue; - stale_keys.emplace_back(key, entry.manifest.transaction_id); - } + const auto & key = entry.getCompositeKey(); + if (zk_children.contains(key)) + continue; + stale_keys.emplace_back(key, entry.manifest.transaction_id); } for (const auto & [key, transaction_id] : stale_keys) @@ -648,7 +614,6 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); } - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); auto it = entries_by_key.find(key); if (it != entries_by_key.end()) entries_by_key.erase(it); @@ -663,7 +628,7 @@ void ExportPartitionManifestUpdatingTask::addStatusChange(const std::string & ke void ExportPartitionManifestUpdatingTask::handleStatusChanges() { - /// copy the events to a local queue to avoid holding the status_changes_mutex while also holding export_merge_tree_partition_mutex + /// copy the events to a local queue to avoid holding status_changes_mutex under M_task std::queue local_status_changes; { std::lock_guard lock(status_changes_mutex); @@ -672,14 +637,13 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { - /// Task-serialization critical section: background_task_serialization_mutex serializes - /// this against poll() and is held across the ZooKeeper status reads below. No catalog I/O - /// happens here. Each in-memory entry mutation takes export_merge_tree_partition_mutex - /// exclusively for the brief write only, so the shared-lock reader of - /// system.replicated_partition_exports is never blocked by a ZooKeeper round-trip. + /// M_task: serializes this against poll() and guards the authoritative container across the + /// ZooKeeper reads below. Readers use the read-model we republish at the end of this batch. std::lock_guard task_guard(background_task_serialization_mutex); auto zk = storage.getZooKeeper(); + const bool had_changes = !local_status_changes.empty(); + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", local_status_changes.size()); while (!local_status_changes.empty()) @@ -693,20 +657,15 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() "Failpoint: simulating exception during status change handling for key {}", key); }); - /// Snapshot existence and the transaction id under a brief shared mirror lock. The - /// iterator is not carried across the ZooKeeper I/O below; the actual write re-finds it - /// under an exclusive lock (DB::SharedMutex is not upgradeable). - std::string transaction_id; + /// Holding M_task (sole mutator), the iterator stays valid across the ZooKeeper reads + /// and killExportPart below (killExportPart touches only export_manifests_mutex). + const auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); + if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) { - auto mirror_lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); - if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) - { - local_status_changes.pop(); - continue; - } - transaction_id = it->manifest.transaction_id; + local_status_changes.pop(); + continue; } + const std::string transaction_id = it->manifest.transaction_id; ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet); @@ -738,8 +697,7 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() auto fetched = readLastExceptionPerReplica( zk, fs::path(storage.zookeeper_path) / "exports" / key, key, storage.log.load()); - /// If status changed to KILLED, cancel local export operations. killExportPart takes - /// export_manifests_mutex (not the mirror lock), so call it without the mirror lock held. + /// If status changed to KILLED, cancel local export operations. if (*new_status == ExportReplicatedMergeTreePartitionTaskEntry::Status::KILLED) { try @@ -753,32 +711,24 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() } } - /// Apply the in-memory updates under a brief exclusive mirror lock. Re-find the iterator - /// here so it is obtained and used under the same exclusive lock that guards the write. - { - auto mirror_lock = ExportPartitionUtils::lockExclusive(storage.export_merge_tree_partition_mutex); - - auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); - if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) - { - local_status_changes.pop(); - continue; - } - - if (!fetched.empty()) - it->last_exception_per_replica = std::move(fetched); + /// Apply the in-memory updates directly (poll() cannot run concurrently under M_task). + if (!fetched.empty()) + it->last_exception_per_replica = std::move(fetched); - it->status = *new_status; + it->status = *new_status; - if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) - { - /// we no longer need to keep the data parts alive - it->part_references.clear(); - } + if (it->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + /// we no longer need to keep the data parts alive + it->part_references.clear(); } local_status_changes.pop(); } + + /// Publish this batch's status transitions to readers. + if (had_changes) + publishReadModel(); } catch (...) { diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h index 2bf3cf01bd1b..6acdf8df3190 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h @@ -43,16 +43,16 @@ class ExportPartitionManifestUpdatingTask auto & entries_by_key ); + /// Republish export_read_model as a fresh immutable copy (part_references stripped). Called + /// under M_task at the end of each poll() / handleStatusChanges() batch. + void publishReadModel(); + std::mutex status_changes_mutex; std::queue status_changes; - /// Serializes the full bodies of poll() and handleStatusChanges() against each other. - /// Held across ZooKeeper I/O so those two tasks never overlap; the mirror lock - /// (StorageReplicatedMergeTree::export_merge_tree_partition_mutex) is then taken only - /// briefly under this, for the in-memory container mutations. This is what lets the - /// system.replicated_partition_exports reader (which takes the mirror lock shared and - /// briefly) avoid waiting behind slow-network ZooKeeper round-trips. - /// Lock ordering: this -> export_merge_tree_partition_mutex -> export_manifests_mutex. + /// M_task: serializes poll() and handleStatusChanges() and is the sole guard of the + /// writer-private authoritative container. Held across ZooKeeper I/O; no reader takes it + /// (readers use export_read_model). Lock ordering: this -> export_manifests_mutex. std::mutex background_task_serialization_mutex; }; diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp index f61303f6a65f..293f3bdb1708 100644 --- a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp @@ -77,39 +77,17 @@ void ExportPartitionTaskScheduler::run() const uint32_t seed = uint32_t(std::hash{}(storage.replica_name)) ^ uint32_t(scheduled_exports_count); pcg64_fast rng(seed); - /// Snapshot the PENDING entries under a brief shared lock, then perform all ZooKeeper - /// work and exportPartToTable below WITHOUT holding the lock. The scheduler is a pure - /// reader of the in-memory mirror - it no longer writes entry.status (the status - /// converges via the status watch -> handleStatusChanges and poll()), so a shared lock - /// is sufficient and the system.replicated_partition_exports reader is never blocked by - /// the scheduler. - struct PendingEntrySnapshot - { - std::string key; - ExportReplicatedMergeTreePartitionManifest manifest; - }; - - std::vector pending_entries; - { - auto lock = ExportPartitionUtils::lockShared(storage.export_merge_tree_partition_mutex); - - // Iterate sorted by create_time - for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_create_time) - { - /// No need to query zk for status if the local one is not PENDING - if (entry.status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) - { - LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... Local status is {}", magic_enum::enum_name(entry.status).data()); - continue; - } - - pending_entries.push_back(PendingEntrySnapshot{entry.getCompositeKey(), entry.manifest}); - } - } + /// Hold the published snapshot for the whole pass and iterate it directly (sorted by + /// create_time). It is immutable and the shared_ptr copy never blocks the writer. The scheduler + /// is a pure reader; status converges via the status watch -> handleStatusChanges and poll(). + const auto model = storage.export_read_model.get(); + if (!model) + return; auto zk = storage.getZooKeeper(); - for (const auto & pending : pending_entries) + // Iterate sorted by create_time + for (const auto & entry : model->get()) { if (scheduled_exports_count >= available_move_executors) { @@ -117,8 +95,15 @@ void ExportPartitionTaskScheduler::run() break; } - const auto & manifest = pending.manifest; - const auto & key = pending.key; + /// No need to query zk for status if the local one is not PENDING + if (entry.status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... Local status is {}", magic_enum::enum_name(entry.status).data()); + continue; + } + + const auto & manifest = entry.manifest; + const auto key = entry.getCompositeKey(); const auto database = storage.getContext()->resolveDatabase(manifest.destination_database); const auto & table = manifest.destination_table; @@ -158,7 +143,7 @@ void ExportPartitionTaskScheduler::run() ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetChildren); std::vector parts_in_processing_or_pending; - + if (Coordination::Error::ZOK != zk->tryGetChildren(fs::path(storage.zookeeper_path) / "exports" / key / "processing", parts_in_processing_or_pending)) { LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get parts in processing or pending, skipping"); diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp index d6928a0f9ecd..679e07d0b132 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.cpp +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include #include #include @@ -30,16 +28,6 @@ namespace ProfileEvents extern const Event ExportPartitionZooKeeperGetChildren; extern const Event ExportPartitionZooKeeperSet; extern const Event ExportPartitionZooKeeperMulti; - extern const Event ExportPartitionLockReadWaitMicroseconds; - extern const Event ExportPartitionLockWriteWaitMicroseconds; - extern const Event ExportPartitionLockReadAcquisitions; - extern const Event ExportPartitionLockWriteAcquisitions; -} - -namespace CurrentMetrics -{ - extern const Metric ExportPartitionLockWaitingReaders; - extern const Metric ExportPartitionLockWaitingWriters; } namespace DB @@ -69,26 +57,6 @@ namespace fs = std::filesystem; namespace ExportPartitionUtils { - std::shared_lock lockShared(SharedMutex & mutex) - { - CurrentMetrics::Increment waiting(CurrentMetrics::ExportPartitionLockWaitingReaders); - Stopwatch watch; - std::shared_lock lock(mutex); - ProfileEvents::increment(ProfileEvents::ExportPartitionLockReadWaitMicroseconds, watch.elapsedMicroseconds()); - ProfileEvents::increment(ProfileEvents::ExportPartitionLockReadAcquisitions); - return lock; - } - - std::unique_lock lockExclusive(SharedMutex & mutex) - { - CurrentMetrics::Increment waiting(CurrentMetrics::ExportPartitionLockWaitingWriters); - Stopwatch watch; - std::unique_lock lock(mutex); - ProfileEvents::increment(ProfileEvents::ExportPartitionLockWriteWaitMicroseconds, watch.elapsedMicroseconds()); - ProfileEvents::increment(ProfileEvents::ExportPartitionLockWriteAcquisitions); - return lock; - } - Block getPartitionSourceBlockForIcebergCommit( MergeTreeData & storage, const String & partition_id) { diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h index 15e130a6873e..0434bc59a2cb 100644 --- a/src/Storages/MergeTree/ExportPartitionUtils.h +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -3,11 +3,8 @@ #include #include #include -#include -#include #include #include -#include #include #include "Storages/IStorage.h" #include @@ -26,15 +23,6 @@ struct ExportReplicatedMergeTreePartitionManifest; namespace ExportPartitionUtils { - /// Instrumented acquisition of the storage-wide export partition state lock. - /// `lockShared` is for readers (e.g. system.replicated_partition_exports); `lockExclusive` - /// is for the brief in-memory mutations performed by the background tasks and KILL EXPORT - /// PARTITION. Both record wait time and waiting-thread counts via the ExportPartitionLock* - /// metrics. The lock MUST NOT be held across ZooKeeper round-trips - gather ZK data first, - /// then take the lock only to apply the result. - std::shared_lock lockShared(SharedMutex & mutex); - std::unique_lock lockExclusive(SharedMutex & mutex); - std::vector getExportedPaths(const LoggerPtr & log, const zkutil::ZooKeeperPtr & zk, const std::string & export_path); ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 8286b91507a1..2fc34081499f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -480,6 +480,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , export_merge_tree_partition_task_entries_by_key(export_merge_tree_partition_task_entries.get()) , export_merge_tree_partition_task_entries_by_transaction_id(export_merge_tree_partition_task_entries.get()) , export_merge_tree_partition_task_entries_by_create_time(export_merge_tree_partition_task_entries.get()) + , export_read_model(std::make_unique()) , cleanup_thread(*this) , deduplication_hashes_cache(*this, "deduplication_hashes") , async_block_ids_cache(*this, "async_blocks") @@ -6132,8 +6133,10 @@ void StorageReplicatedMergeTree::shutdown(bool) } { - auto lock = ExportPartitionUtils::lockExclusive(export_merge_tree_partition_mutex); + /// Export tasks were deactivate()d above, so no writer is running. Clear the container and + /// publish an empty read-model so late readers observe no tasks. export_merge_tree_partition_task_entries.clear(); + export_read_model.set(std::make_unique()); } { @@ -10109,22 +10112,23 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & const auto zk = getZooKeeper(); - /// Look up the entry in the in-memory mirror under a brief shared lock and copy out what we - /// need; release it before any ZooKeeper round-trip so the system.replicated_partition_exports - /// reader is never blocked. This is a pure read of the container, so a shared lock is enough - - /// the KILLED status set in ZooKeeper below propagates back into the mirror via the status - /// watch -> handleStatusChanges. + /// Read the published snapshot (shared_ptr copy, no lock, no ZooKeeper). The KILLED status set + /// below propagates back into the mirror via the status watch -> handleStatusChanges. bool local_entry_found = false; bool local_entry_pending = false; std::string local_composite_key; { - auto lock = ExportPartitionUtils::lockShared(export_merge_tree_partition_mutex); - const auto entry = export_merge_tree_partition_task_entries_by_transaction_id.find(transaction_id); - if (entry != export_merge_tree_partition_task_entries_by_transaction_id.end()) + const auto model = export_read_model.get(); + if (model) { - local_entry_found = true; - local_entry_pending = entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING; - local_composite_key = entry->getCompositeKey(); + const auto & by_transaction_id = model->get(); + const auto entry = by_transaction_id.find(transaction_id); + if (entry != by_transaction_id.end()) + { + local_entry_found = true; + local_entry_pending = entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING; + local_composite_key = entry->getCompositeKey(); + } } } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 86e590404f2b..ab897e7bc020 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -526,16 +527,20 @@ class StorageReplicatedMergeTree final : public MergeTreeData Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; - mutable SharedMutex export_merge_tree_partition_mutex; - BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; + /// Writer-private mirror, mutated only by poll() / handleStatusChanges() under M_task. Readers + /// use export_read_model instead. part_references pin PENDING tasks' data parts and live here only. ExportPartitionTaskEntriesContainer export_merge_tree_partition_task_entries; - - // Convenience references to indexes + + // Convenience references to indexes (writer-private, see above) ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_key; ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_transaction_id; ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_create_time; + + /// Immutable snapshot republished after each writer batch (part_references stripped). Readers + /// (system table, scheduler, KILL) get() a consistent version with no lock and no ZooKeeper. + MultiVersion export_read_model; /// A thread that removes old parts, log entries, and blocks. ReplicatedMergeTreeCleanupThread cleanup_thread; From 041439e236322af75fc79e4bb7aae75e6253fd7a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jun 2026 20:59:50 -0300 Subject: [PATCH 09/17] simplify a thing or two --- .../MergeTree/ExportPartitionManifestUpdatingTask.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 87c59df7ab5c..4cf0cd1e8c51 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -285,14 +285,12 @@ void ExportPartitionManifestUpdatingTask::publishReadModel() std::vector ExportPartitionManifestUpdatingTask::getPartitionExportsInfo() const { - /// Wait-free read of the published snapshot: a shared_ptr copy, no lock, no ZooKeeper. const auto model = storage.export_read_model.get(); - std::vector infos; if (!model) - return infos; + return {}; - infos.reserve(model->size()); + std::vector infos(model->size()); for (const auto & entry : model->get()) { @@ -491,7 +489,6 @@ void ExportPartitionManifestUpdatingTask::poll() /// Remove entries that were deleted by someone else removeStaleEntries(zk_children, entries_by_key); - /// Publish this poll() cycle's changes to readers. publishReadModel(); LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); From ef56768b4f9e3fe6c7ecc9f7155b25c5f71f9986 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jun 2026 21:07:22 -0300 Subject: [PATCH 10/17] unrelated fix --- .../MergeTree/ExportPartitionManifestUpdatingTask.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 4cf0cd1e8c51..a6adbe12bb9c 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -478,8 +478,16 @@ void ExportPartitionManifestUpdatingTask::poll() { local_entry->status = *status; if (local_entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { /// terminal now - we no longer need to keep the data parts alive local_entry->part_references.clear(); + + /// looks like we missed a status change event, we should kill local operations. + if (local_entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::KILLED) + { + storage.killExportPart(local_entry->manifest.transaction_id); + } + } } } LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); From 653f722feb71999e44b335139f2993aa28ea878d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jun 2026 21:29:27 -0300 Subject: [PATCH 11/17] opsy --- src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index a6adbe12bb9c..f5766edb3ca6 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -290,7 +290,8 @@ std::vector ExportPartitionManifestUpdatingTask:: if (!model) return {}; - std::vector infos(model->size()); + std::vector infos; + infos.reserve(model->size()); for (const auto & entry : model->get()) { From 07855106e6871495d2d62d12a5bf2e8df5d487fa Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 08:43:15 -0300 Subject: [PATCH 12/17] address a few comments --- .../ExportPartitionManifestUpdatingTask.cpp | 25 +++++++------------ src/Storages/StorageReplicatedMergeTree.cpp | 3 +-- src/Storages/StorageReplicatedMergeTree.h | 1 - 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index f5766edb3ca6..b9ad4d5880bf 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -596,33 +596,27 @@ void ExportPartitionManifestUpdatingTask::removeStaleEntries( auto & entries_by_key ) { - /// Called from poll() under M_task. Collect stale keys first, then kill and erase them - /// (killExportPart takes export_manifests_mutex; lock order M_task -> export_manifests_mutex). - std::vector> stale_keys; /// (composite key, transaction id) - for (const auto & entry : entries_by_key) + for (auto it = entries_by_key.begin(); it != entries_by_key.end();) { - const auto & key = entry.getCompositeKey(); + const auto key = it->getCompositeKey(); if (zk_children.contains(key)) + { + ++it; continue; - stale_keys.emplace_back(key, entry.manifest.transaction_id); - } + } - for (const auto & [key, transaction_id] : stale_keys) - { - LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Export task {} was deleted, calling killExportPartition for transaction {}", key, transaction_id); + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Export task {} was deleted, calling killExportPartition for transaction {}", key, it->manifest.transaction_id); try { - storage.killExportPart(transaction_id); + storage.killExportPart(it->manifest.transaction_id); } catch (...) { tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); } - auto it = entries_by_key.find(key); - if (it != entries_by_key.end()) - entries_by_key.erase(it); + it = entries_by_key.erase(it); } } @@ -671,7 +665,6 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() local_status_changes.pop(); continue; } - const std::string transaction_id = it->manifest.transaction_id; ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet); @@ -709,7 +702,7 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { LOG_INFO(storage.log, "ExportPartition Manifest Updating task: killing export partition for task {}", key); - storage.killExportPart(transaction_id); + storage.killExportPart(it->manifest.transaction_id); } catch (...) { diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 2fc34081499f..5a8d3c3d662b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -10118,8 +10118,7 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & bool local_entry_pending = false; std::string local_composite_key; { - const auto model = export_read_model.get(); - if (model) + if (const auto model = export_read_model.get()) { const auto & by_transaction_id = model->get(); const auto entry = by_transaction_id.find(transaction_id); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index ab897e7bc020..f043a0985242 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include From 778f935a2542eb34719e25b5b8ad457533dc7802 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 12:48:52 -0300 Subject: [PATCH 13/17] simplify things --- .../ExportPartitionManifestUpdatingTask.cpp | 71 ++++++++++--------- .../ExportPartitionManifestUpdatingTask.h | 11 ++- src/Storages/StorageReplicatedMergeTree.cpp | 10 +-- src/Storages/StorageReplicatedMergeTree.h | 9 --- 4 files changed, 42 insertions(+), 59 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index b9ad4d5880bf..3aea12a5b478 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -269,20 +269,6 @@ ExportPartitionManifestUpdatingTask::ExportPartitionManifestUpdatingTask(Storage { } -void ExportPartitionManifestUpdatingTask::publishReadModel() -{ - /// Called under M_task. part_references are not copied: the published version is informational - /// only, and parts must be pinned solely by the authoritative container. - auto model = std::make_unique(); - for (const auto & entry : storage.export_merge_tree_partition_task_entries_by_key) - { - ExportReplicatedMergeTreePartitionTaskEntry copy = entry; - copy.part_references.clear(); /// never pin parts inside a published version - model->insert(std::move(copy)); - } - storage.export_read_model.set(std::move(model)); -} - std::vector ExportPartitionManifestUpdatingTask::getPartitionExportsInfo() const { const auto model = storage.export_read_model.get(); @@ -354,12 +340,20 @@ void ExportPartitionManifestUpdatingTask::poll() } { - /// M_task: serializes poll() vs handleStatusChanges() and guards the authoritative - /// container across the ZooKeeper reads below. Readers use the read-model we republish at - /// the end of this block. + /// M_task: serializes poll() vs handleStatusChanges(). We copy the current read-model into a + /// private mutable container, mutate that copy across the ZooKeeper reads below, and publish + /// it atomically via export_read_model.set() at the end. Readers never see partial updates. std::lock_guard task_guard(background_task_serialization_mutex); - LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); + const auto current_model = storage.export_read_model.get(); + + auto working_model = current_model + ? std::make_unique(*current_model) + : std::make_unique(); + + auto & entries_by_key = working_model->get(); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. Current number of entries: {}", storage.getStorageID().getNameForLogs(), entries_by_key.size()); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGetChildrenWatch); @@ -370,8 +364,6 @@ void ExportPartitionManifestUpdatingTask::poll() const auto now = time(nullptr); - auto & entries_by_key = storage.export_merge_tree_partition_task_entries_by_key; - /// Load new entries /// If we have the cleanup lock, also remove stale entries from zk and local /// Upload dangling commit files if any @@ -498,9 +490,13 @@ void ExportPartitionManifestUpdatingTask::poll() /// Remove entries that were deleted by someone else removeStaleEntries(zk_children, entries_by_key); - publishReadModel(); + const auto entries_count = entries_by_key.size(); - LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); + /// Publish the updated copy atomically. `working_model` is moved out here, so + /// `entries_by_key` (a reference into it) must not be used afterwards. + storage.export_read_model.set(std::move(working_model)); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_count); } /// M_task released here. The catalog round-trips below run without any lock, so readers and /// handleStatusChanges() are not blocked by them. @@ -511,11 +507,11 @@ void ExportPartitionManifestUpdatingTask::poll() /// peer replicas will not race us on the same `commit()` calls below. /// /// Shutdown safety: this function runs on a BackgroundSchedulePool task that - /// `StorageReplicatedMergeTree::shutdown()` deactivates before clearing the entry - /// container. Deactivation waits for the currently-running invocation (this very call) + /// `StorageReplicatedMergeTree::shutdown()` deactivates before publishing an empty + /// `export_read_model`. Deactivation waits for the currently-running invocation (this very call) /// to return before proceeding, so the deferred commits below complete (or throw) before - /// any teardown observes empty `export_merge_tree_partition_task_entries`. All work - /// items capture their inputs by value, so they are independent from container state. + /// any teardown observes an empty read-model. All work items capture their inputs by value, so + /// they are independent from container state. const auto log_ptr = storage.log.load(); @@ -637,8 +633,10 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() try { - /// M_task: serializes this against poll() and guards the authoritative container across the - /// ZooKeeper reads below. Readers use the read-model we republish at the end of this batch. + /// M_task: serializes this against poll(). We copy the current read-model into a private + /// mutable container, apply this batch's status transitions to that copy across the ZooKeeper + /// reads below, and publish it atomically via export_read_model.set() at the end. Readers + /// never see partial updates. std::lock_guard task_guard(background_task_serialization_mutex); auto zk = storage.getZooKeeper(); @@ -646,6 +644,12 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", local_status_changes.size()); + const auto current_model = storage.export_read_model.get(); + auto working_model = current_model + ? std::make_unique(*current_model) + : std::make_unique(); + auto & entries_by_key = working_model->get(); + while (!local_status_changes.empty()) { const auto & key = local_status_changes.front(); @@ -657,10 +661,8 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() "Failpoint: simulating exception during status change handling for key {}", key); }); - /// Holding M_task (sole mutator), the iterator stays valid across the ZooKeeper reads - /// and killExportPart below (killExportPart touches only export_manifests_mutex). - const auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); - if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) + const auto it = entries_by_key.find(key); + if (it == entries_by_key.end()) { local_status_changes.pop(); continue; @@ -725,9 +727,10 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() local_status_changes.pop(); } - /// Publish this batch's status transitions to readers. + /// Publish this batch's status transitions to readers. `working_model` is moved out here, + /// so `entries_by_key` (a reference into it) must not be used afterwards. if (had_changes) - publishReadModel(); + storage.export_read_model.set(std::move(working_model)); } catch (...) { diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h index 6acdf8df3190..e0e2975ebf5c 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h @@ -43,16 +43,13 @@ class ExportPartitionManifestUpdatingTask auto & entries_by_key ); - /// Republish export_read_model as a fresh immutable copy (part_references stripped). Called - /// under M_task at the end of each poll() / handleStatusChanges() batch. - void publishReadModel(); - std::mutex status_changes_mutex; std::queue status_changes; - /// M_task: serializes poll() and handleStatusChanges() and is the sole guard of the - /// writer-private authoritative container. Held across ZooKeeper I/O; no reader takes it - /// (readers use export_read_model). Lock ordering: this -> export_manifests_mutex. + /// M_task: serializes poll() and handleStatusChanges(). Each builds a private mutable copy of + /// the current read-model, mutates it, and atomically publishes it via export_read_model.set(). + /// Held across ZooKeeper I/O; no reader takes it (readers use export_read_model.get()). + /// Lock ordering: this -> export_manifests_mutex. std::mutex background_task_serialization_mutex; }; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 5a8d3c3d662b..aaeda9975097 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -477,9 +477,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , merge_strategy_picker(*this) , queue(*this, merge_strategy_picker) , fetcher(*this) - , export_merge_tree_partition_task_entries_by_key(export_merge_tree_partition_task_entries.get()) - , export_merge_tree_partition_task_entries_by_transaction_id(export_merge_tree_partition_task_entries.get()) - , export_merge_tree_partition_task_entries_by_create_time(export_merge_tree_partition_task_entries.get()) , export_read_model(std::make_unique()) , cleanup_thread(*this) , deduplication_hashes_cache(*this, "deduplication_hashes") @@ -6132,12 +6129,7 @@ void StorageReplicatedMergeTree::shutdown(bool) std::lock_guard lock(data_parts_exchange_ptr->rwlock); } - { - /// Export tasks were deactivate()d above, so no writer is running. Clear the container and - /// publish an empty read-model so late readers observe no tasks. - export_merge_tree_partition_task_entries.clear(); - export_read_model.set(std::make_unique()); - } + export_read_model.set(std::make_unique()); { std::lock_guard lock(export_manifests_mutex); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index f043a0985242..7a6d0ebee4d6 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -528,15 +528,6 @@ class StorageReplicatedMergeTree final : public MergeTreeData BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; - /// Writer-private mirror, mutated only by poll() / handleStatusChanges() under M_task. Readers - /// use export_read_model instead. part_references pin PENDING tasks' data parts and live here only. - ExportPartitionTaskEntriesContainer export_merge_tree_partition_task_entries; - - // Convenience references to indexes (writer-private, see above) - ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_key; - ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_transaction_id; - ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_create_time; - /// Immutable snapshot republished after each writer batch (part_references stripped). Readers /// (system table, scheduler, KILL) get() a consistent version with no lock and no ZooKeeper. MultiVersion export_read_model; From 4e9566ab0fd213b0eb88c08018cf260353ee226d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 13:23:59 -0300 Subject: [PATCH 14/17] rmv annoying comments --- .../ExportPartitionManifestUpdatingTask.cpp | 77 +++---------------- 1 file changed, 10 insertions(+), 67 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index 3aea12a5b478..fa54c975e9b1 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -37,9 +37,7 @@ namespace FailPoints namespace { - /// Work item describing a commit-recovery attempt that has been deferred out of - /// the `poll()` critical section. Captures everything by value so it can be - /// executed safely after the M_task critical section has been released. + /// Describes pending commits struct CommitRecoveryWork { ExportReplicatedMergeTreePartitionManifest metadata; @@ -47,17 +45,9 @@ namespace StoragePtr destination_storage; ContextPtr context; }; + /// Fetch all per-replica last_exception leaves under /last_exception and build - /// a fresh map keyed by replica name. The map key prefers the unescaped `replica` field - /// embedded in the JSON payload; if it is missing or empty, the leaf name is unescaped as - /// a fallback. - /// - /// An empty result means "nothing actionable": either the parent getChildren failed (ZK - /// glitch), the container has no children yet (no replica has reported), or every leaf - /// fetch came back ZNONODE / malformed. Callers MUST skip the assignment in that case to - /// preserve the in-memory mirror across transient errors. This is safe because per-replica - /// leaves are never individually removed — the entire entry path is wiped recursively when - /// a task is cleaned up, which is handled separately by removeStaleEntries. + /// a fresh map keyed by replica name. std::map readLastExceptionPerReplica( const zkutil::ZooKeeperPtr & zk, const std::filesystem::path & entry_path, @@ -85,8 +75,6 @@ namespace for (const auto & child : children) paths.emplace_back(container_path / child); - /// One MULTI_READ when supported, parallel async gets otherwise. See - /// ZooKeeper::multiRead in src/Common/ZooKeeper/ZooKeeper.h. ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests); ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperGet, paths.size()); auto responses = zk->tryGet(paths); @@ -127,19 +115,8 @@ namespace return out; } - /* - Enforce the PENDING task timeout and recover non-committed exports that have already - exported all parts. Entries are never removed for age — `system.replicated_partition_exports` - is append-only history, so the entry always stays in the in-memory container: a KILLED - transition is driven by the status watch, and a deferred commit is handled by the caller - after the lock is released. - - Side outputs: - - `deferred_commits`: when a PENDING entry has all parts processed but the export was - never committed, this function appends a CommitRecoveryWork item to be executed by - the caller after releasing the storage-wide mutex. The actual commit() call (which - performs network I/O to the destination catalog and S3) MUST NOT run under the lock. - */ + + /// collects pending commits and kills tasks that have timed out void tryCleanup( const zkutil::ZooKeeperPtr & zk, const std::string & entry_path, @@ -245,14 +222,7 @@ namespace return; } - /// A replica exported the last part but the commit never landed. Run commit() - /// outside M_task: it does network I/O (REST catalog + S3, up to - /// MAX_TRANSACTION_RETRIES=100) and must not block the other background task. - /// - /// The outer poll() loop stays on the normal path: it will call addTask() so the - /// in-memory container reflects the PENDING entry. The status watch registered by - /// poll() will transition the local entry to COMPLETED/FAILED once the deferred - /// commit (or a peer's commit) updates /status in ZooKeeper. + /// A replica exported the last part but the commit never landed deferred_commits.push_back(CommitRecoveryWork{ .metadata = metadata, .entry_path = entry_path, @@ -329,10 +299,7 @@ void ExportPartitionManifestUpdatingTask::poll() /// across replicas: only the replica holding it walks `tryCleanup` (task-timeout /// enforcement + commit recovery). It MUST outlive the deferred-commit loop below; otherwise a peer /// replica's next poll() could acquire it and race us on the same commit-recovery work, - /// duplicating REST-catalog round-trips and snapshot writes. The EphemeralNodeHolder - /// destructor removes the node, so we declare it at function scope and let it die - /// at the end of poll() - after all deferred commits have completed. - /// Acquired here (no mutex needed - it is just a ZK ephemeral create). + /// duplicating REST-catalog round-trips and snapshot writes. auto cleanup_lock = zkutil::EphemeralNodeHolder::tryCreate(cleanup_lock_path, *zk, storage.replica_name); if (cleanup_lock) { @@ -385,7 +352,6 @@ void ExportPartitionManifestUpdatingTask::poll() auto last_exception_per_replica = readLastExceptionPerReplica( zk, fs::path(entry_path), key, storage.log.load()); - /// Plain lookup is safe: poll() is the sole mutator while holding M_task. /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough /// we need to make sure it is the same transaction id. If it is not, it needs to be replaced. const auto local_entry = entries_by_key.find(key); @@ -487,7 +453,6 @@ void ExportPartitionManifestUpdatingTask::poll() } - /// Remove entries that were deleted by someone else removeStaleEntries(zk_children, entries_by_key); const auto entries_count = entries_by_key.size(); @@ -498,23 +463,10 @@ void ExportPartitionManifestUpdatingTask::poll() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_count); } - /// M_task released here. The catalog round-trips below run without any lock, so readers and - /// handleStatusChanges() are not blocked by them. - /// - /// `cleanup_lock` (the ZK ephemeral node) is INTENTIONALLY still held here and is only - /// destructed at end of function. This preserves the existing cross-replica invariant: - /// at any moment only one replica is performing commit recovery for a given table, so - /// peer replicas will not race us on the same `commit()` calls below. - /// - /// Shutdown safety: this function runs on a BackgroundSchedulePool task that - /// `StorageReplicatedMergeTree::shutdown()` deactivates before publishing an empty - /// `export_read_model`. Deactivation waits for the currently-running invocation (this very call) - /// to return before proceeding, so the deferred commits below complete (or throw) before - /// any teardown observes an empty read-model. All work items capture their inputs by value, so - /// they are independent from container state. const auto log_ptr = storage.log.load(); + /// Execute pending commits for (const auto & work : deferred_commits) { /// A replica exported the last part but the commit never landed. Try to fix it. @@ -529,10 +481,7 @@ void ExportPartitionManifestUpdatingTask::poll() "Caught exception while committing export for {}: {}", work.entry_path, e.message()); - /// Bump commit-attempts counter; transition to FAILED once the budget is exhausted. - /// This is the primary retry path for the commit phase — handlePartExportSuccess - /// only fires once (on the last part's completion); subsequent retries come from here. - const bool became_failed = ExportPartitionUtils::handleCommitFailure( + const bool exceeded_commimt_max_retries = ExportPartitionUtils::handleCommitFailure( zk, work.entry_path, work.metadata.max_retries, @@ -540,7 +489,7 @@ void ExportPartitionManifestUpdatingTask::poll() e.message(), log_ptr); - if (became_failed) + if (exceeded_commimt_max_retries) { LOG_WARNING(log_ptr, "ExportPartition Manifest Updating Task: " @@ -689,12 +638,6 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: status changed for task {}. New status: {}", key, magic_enum::enum_name(*new_status).data()); - /// Refresh last_exception leaves too (ZooKeeper read, lock-free). Status transitions to - /// FAILED (via commit budget) and KILLED (via timeout) atomically write a per-replica - /// leaf in the same multi, so reading them here ensures the system table surfaces the - /// cause together with the visible state change. No new watch is added — this piggybacks - /// on the existing status watch. An empty result means "nothing actionable" and leaves - /// the previous snapshot intact. auto fetched = readLastExceptionPerReplica( zk, fs::path(storage.zookeeper_path) / "exports" / key, key, storage.log.load()); From 027578b0304bfcbcb57ca3029be8145c825c0b95 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 13:27:08 -0300 Subject: [PATCH 15/17] rmv useless scope --- src/Storages/StorageReplicatedMergeTree.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index aaeda9975097..7a393bdd96c6 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -10109,17 +10109,16 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & bool local_entry_found = false; bool local_entry_pending = false; std::string local_composite_key; + + if (const auto model = export_read_model.get()) { - if (const auto model = export_read_model.get()) + const auto & by_transaction_id = model->get(); + const auto entry = by_transaction_id.find(transaction_id); + if (entry != by_transaction_id.end()) { - const auto & by_transaction_id = model->get(); - const auto entry = by_transaction_id.find(transaction_id); - if (entry != by_transaction_id.end()) - { - local_entry_found = true; - local_entry_pending = entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING; - local_composite_key = entry->getCompositeKey(); - } + local_entry_found = true; + local_entry_pending = entry->status == ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING; + local_composite_key = entry->getCompositeKey(); } } From 6249c116d67e6c87a3ec61d48fdf780776d5137e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 14:23:48 -0300 Subject: [PATCH 16/17] requeue the events in case of exception --- .../ExportPartitionManifestUpdatingTask.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index fa54c975e9b1..e7d7d2ac7065 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -580,6 +580,9 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() std::swap(status_changes, local_status_changes); } + /// Take a snapshot of all status changes. If an exception is thrown, we will requeue the whole batch. + const std::queue batch = local_status_changes; + try { /// M_task: serializes this against poll(). We copy the current read-model into a private @@ -679,26 +682,24 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() { tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); - LOG_INFO(storage.log, "ExportPartition Manifest Updating task: exception thrown while handling status changes, enqueuing remaining status changes back to the status_changes queue. Number of remaining status changes: {}", local_status_changes.size()); + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: exception thrown while handling status changes; nothing was published, requeuing the whole batch. Batch size: {}", batch.size()); std::lock_guard lock(status_changes_mutex); - /// It is possible that an exception is thrown while handling the status. In this scenario - /// we need to enqueue the remaining status changes back to the status_changes queue not to lose them. - /// The other solution to this problem would be to ignore it and schedule a poll - maybe it is simpler? - if (!local_status_changes.empty()) + /// upon exception, requeue the whole batch + if (!batch.empty()) { - // Prepend remaining items before any newly-arrived items + std::queue requeued = batch; while (!status_changes.empty()) { - local_status_changes.push(std::move(status_changes.front())); + requeued.push(std::move(status_changes.front())); status_changes.pop(); } - std::swap(status_changes, local_status_changes); + std::swap(status_changes, requeued); } - LOG_INFO(storage.log, "ExportPartition Manifest Updating task: The new number of pending status after enqueueing unprocessed ones is {}", status_changes.size()); + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: pending status changes after requeue: {}", status_changes.size()); throw; } From e74eed633472882e33e7077c4690125323189a97 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 24 Jun 2026 18:10:43 -0300 Subject: [PATCH 17/17] rename variable --- .../MergeTree/ExportPartitionManifestUpdatingTask.cpp | 10 +++++----- .../MergeTree/ExportPartitionManifestUpdatingTask.h | 1 - .../MergeTree/ExportPartitionTaskScheduler.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 6 +++--- src/Storages/StorageReplicatedMergeTree.h | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp index e7d7d2ac7065..0322f1cf19ee 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -241,7 +241,7 @@ ExportPartitionManifestUpdatingTask::ExportPartitionManifestUpdatingTask(Storage std::vector ExportPartitionManifestUpdatingTask::getPartitionExportsInfo() const { - const auto model = storage.export_read_model.get(); + const auto model = storage.export_partition_manifests.get(); if (!model) return {}; @@ -312,7 +312,7 @@ void ExportPartitionManifestUpdatingTask::poll() /// it atomically via export_read_model.set() at the end. Readers never see partial updates. std::lock_guard task_guard(background_task_serialization_mutex); - const auto current_model = storage.export_read_model.get(); + const auto current_model = storage.export_partition_manifests.get(); auto working_model = current_model ? std::make_unique(*current_model) @@ -459,7 +459,7 @@ void ExportPartitionManifestUpdatingTask::poll() /// Publish the updated copy atomically. `working_model` is moved out here, so /// `entries_by_key` (a reference into it) must not be used afterwards. - storage.export_read_model.set(std::move(working_model)); + storage.export_partition_manifests.set(std::move(working_model)); LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_count); } @@ -596,7 +596,7 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", local_status_changes.size()); - const auto current_model = storage.export_read_model.get(); + const auto current_model = storage.export_partition_manifests.get(); auto working_model = current_model ? std::make_unique(*current_model) : std::make_unique(); @@ -676,7 +676,7 @@ void ExportPartitionManifestUpdatingTask::handleStatusChanges() /// Publish this batch's status transitions to readers. `working_model` is moved out here, /// so `entries_by_key` (a reference into it) must not be used afterwards. if (had_changes) - storage.export_read_model.set(std::move(working_model)); + storage.export_partition_manifests.set(std::move(working_model)); } catch (...) { diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h index e0e2975ebf5c..3bb4e7ac92ab 100644 --- a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h @@ -49,7 +49,6 @@ class ExportPartitionManifestUpdatingTask /// M_task: serializes poll() and handleStatusChanges(). Each builds a private mutable copy of /// the current read-model, mutates it, and atomically publishes it via export_read_model.set(). /// Held across ZooKeeper I/O; no reader takes it (readers use export_read_model.get()). - /// Lock ordering: this -> export_manifests_mutex. std::mutex background_task_serialization_mutex; }; diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp index 293f3bdb1708..c54dd3f9c52d 100644 --- a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp @@ -80,7 +80,7 @@ void ExportPartitionTaskScheduler::run() /// Hold the published snapshot for the whole pass and iterate it directly (sorted by /// create_time). It is immutable and the shared_ptr copy never blocks the writer. The scheduler /// is a pure reader; status converges via the status watch -> handleStatusChanges and poll(). - const auto model = storage.export_read_model.get(); + const auto model = storage.export_partition_manifests.get(); if (!model) return; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 7a393bdd96c6..924c840462e2 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -477,7 +477,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , merge_strategy_picker(*this) , queue(*this, merge_strategy_picker) , fetcher(*this) - , export_read_model(std::make_unique()) + , export_partition_manifests(std::make_unique()) , cleanup_thread(*this) , deduplication_hashes_cache(*this, "deduplication_hashes") , async_block_ids_cache(*this, "async_blocks") @@ -6129,7 +6129,7 @@ void StorageReplicatedMergeTree::shutdown(bool) std::lock_guard lock(data_parts_exchange_ptr->rwlock); } - export_read_model.set(std::make_unique()); + export_partition_manifests.set(std::make_unique()); { std::lock_guard lock(export_manifests_mutex); @@ -10110,7 +10110,7 @@ CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & bool local_entry_pending = false; std::string local_composite_key; - if (const auto model = export_read_model.get()) + if (const auto model = export_partition_manifests.get()) { const auto & by_transaction_id = model->get(); const auto entry = by_transaction_id.find(transaction_id); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 7a6d0ebee4d6..adc821b7862d 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -530,7 +530,7 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// Immutable snapshot republished after each writer batch (part_references stripped). Readers /// (system table, scheduler, KILL) get() a consistent version with no lock and no ZooKeeper. - MultiVersion export_read_model; + MultiVersion export_partition_manifests; /// A thread that removes old parts, log entries, and blocks. ReplicatedMergeTreeCleanupThread cleanup_thread;