Skip to content

Commit 306a5ee

Browse files
author
ivanmorozov
committed
KIKIMR-19216: improve merging
1 parent b8a327d commit 306a5ee

File tree

5 files changed

+156
-107
lines changed

5 files changed

+156
-107
lines changed

ydb/core/tx/columnshard/engines/changes/general_compaction.cpp

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -60,9 +60,9 @@ TConclusionStatus TGeneralCompactColumnEngineChanges::DoConstructBlobs(TConstruc
6060
batch = NArrow::TStatusValidator::GetValid(batch->AddColumn(batch->num_columns(), portionRecordIndexField, column->BuildArray(batch->num_rows())));
6161
}
6262
Y_VERIFY_DEBUG(NArrow::IsSortedAndUnique(batch, resultSchema->GetIndexInfo().GetReplaceKey()));
63-
mergeStream.AddPoolSource({}, batch, nullptr);
63+
mergeStream.AddSource(batch, nullptr);
6464
}
65-
batchResults = mergeStream.DrainAllParts(CheckPoints, indexFields, true);
65+
batchResults = mergeStream.DrainAllParts(CheckPoints, indexFields);
6666
}
6767
Y_ABORT_UNLESS(batchResults.size());
6868

@@ -208,11 +208,8 @@ NColumnShard::ECumulativeCounters TGeneralCompactColumnEngineChanges::GetCounter
208208
return isSuccess ? NColumnShard::COUNTER_COMPACTION_SUCCESS : NColumnShard::COUNTER_COMPACTION_FAIL;
209209
}
210210

211-
void TGeneralCompactColumnEngineChanges::AddCheckPoint(const NIndexedReader::TSortableBatchPosition& position) {
212-
if (CheckPoints.size()) {
213-
AFL_VERIFY(CheckPoints.back().Compare(position) == std::partial_ordering::less);
214-
}
215-
CheckPoints.emplace_back(position);
211+
void TGeneralCompactColumnEngineChanges::AddCheckPoint(const NIndexedReader::TSortableBatchPosition& position, const bool include) {
212+
AFL_VERIFY(CheckPoints.emplace(position, include).second);
216213
}
217214

218215
}

ydb/core/tx/columnshard/engines/changes/general_compaction.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ class TGeneralCompactColumnEngineChanges: public TCompactColumnEngineChanges {
88
private:
99
using TBase = TCompactColumnEngineChanges;
1010
virtual void DoWriteIndexComplete(NColumnShard::TColumnShard& self, TWriteIndexCompleteContext& context) override;
11-
std::vector<NIndexedReader::TSortableBatchPosition> CheckPoints;
11+
std::map<NIndexedReader::TSortableBatchPosition, bool> CheckPoints;
1212
protected:
1313
virtual TConclusionStatus DoConstructBlobs(TConstructionContext& context) noexcept override;
1414
virtual TPortionMeta::EProduced GetResultProducedClass() const override {
@@ -19,7 +19,7 @@ class TGeneralCompactColumnEngineChanges: public TCompactColumnEngineChanges {
1919
public:
2020
using TBase::TBase;
2121

22-
void AddCheckPoint(const NIndexedReader::TSortableBatchPosition& position);
22+
void AddCheckPoint(const NIndexedReader::TSortableBatchPosition& position, const bool include = true);
2323

2424
virtual TString TypeString() const override {
2525
return StaticTypeName();

ydb/core/tx/columnshard/engines/reader/plain_reader/interval.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ void TFetchingInterval::ConstructResult() {
1616
if (i->GetStart().Compare(Start) == std::partial_ordering::equivalent && !i->IsMergingStarted()) {
1717
auto rb = i->GetBatch();
1818
if (rb) {
19-
Merger->AddPoolSource({}, rb, i->GetFilterStageData().GetNotAppliedEarlyFilter());
19+
Merger->AddSource(rb, i->GetFilterStageData().GetNotAppliedEarlyFilter());
2020
}
2121
i->StartMerging();
2222
}

ydb/core/tx/columnshard/engines/reader/read_filter_merger.cpp

Lines changed: 25 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -5,52 +5,37 @@ namespace NKikimr::NOlap::NIndexedReader {
55

66
void TMergePartialStream::PutControlPoint(std::shared_ptr<TSortableBatchPosition> point) {
77
Y_ABORT_UNLESS(point);
8-
Y_ABORT_UNLESS(point->IsSameSortingSchema(SortSchema));
8+
AFL_VERIFY(point->IsSameSortingSchema(SortSchema))("point", point->DebugJson())("schema", SortSchema->ToString());
99
Y_ABORT_UNLESS(point->IsReverseSort() == Reverse);
1010
Y_ABORT_UNLESS(++ControlPoints == 1);
1111

12-
SortHeap.emplace_back(TBatchIterator(*point));
13-
std::push_heap(SortHeap.begin(), SortHeap.end());
12+
SortHeap.Push(TBatchIterator(*point));
1413
}
1514

16-
void TMergePartialStream::AddPoolSource(const std::optional<ui32> poolId, std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter) {
15+
void TMergePartialStream::AddSource(std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter) {
1716
if (!batch || !batch->num_rows()) {
1817
return;
1918
}
2019
Y_VERIFY_DEBUG(NArrow::IsSorted(batch, SortSchema));
21-
if (!poolId) {
22-
AddNewToHeap(poolId, batch, filter, true);
23-
} else {
24-
auto it = BatchPools.find(*poolId);
25-
if (it == BatchPools.end()) {
26-
it = BatchPools.emplace(*poolId, std::deque<TIteratorData>()).first;
27-
}
28-
it->second.emplace_back(batch, filter);
29-
if (it->second.size() == 1) {
30-
AddNewToHeap(poolId, batch, filter, true);
31-
}
32-
}
20+
AddNewToHeap(batch, filter);
3321
}
3422

35-
void TMergePartialStream::AddNewToHeap(const std::optional<ui32> poolId, std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter, const bool restoreHeap) {
23+
void TMergePartialStream::AddNewToHeap(std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter) {
3624
if (!filter || filter->IsTotalAllowFilter()) {
37-
SortHeap.emplace_back(TBatchIterator(batch, nullptr, SortSchema->field_names(), DataSchema ? DataSchema->field_names() : std::vector<std::string>(), Reverse, poolId));
25+
SortHeap.Push(TBatchIterator(batch, nullptr, SortSchema->field_names(), DataSchema ? DataSchema->field_names() : std::vector<std::string>(), Reverse));
3826
} else if (filter->IsTotalDenyFilter()) {
3927
return;
4028
} else {
41-
SortHeap.emplace_back(TBatchIterator(batch, filter, SortSchema->field_names(), DataSchema ? DataSchema->field_names() : std::vector<std::string>(), Reverse, poolId));
42-
}
43-
if (restoreHeap) {
44-
std::push_heap(SortHeap.begin(), SortHeap.end());
29+
SortHeap.Push(TBatchIterator(batch, filter, SortSchema->field_names(), DataSchema ? DataSchema->field_names() : std::vector<std::string>(), Reverse));
4530
}
4631
}
4732

4833
void TMergePartialStream::RemoveControlPoint() {
4934
Y_ABORT_UNLESS(ControlPoints == 1);
5035
Y_ABORT_UNLESS(ControlPointEnriched());
5136
Y_ABORT_UNLESS(-- ControlPoints == 0);
52-
std::pop_heap(SortHeap.begin(), SortHeap.end());
53-
SortHeap.pop_back();
37+
Y_ABORT_UNLESS(SortHeap.Current().IsControlPoint());
38+
SortHeap.RemoveTop();
5439
}
5540

5641
void TMergePartialStream::CheckSequenceInDebug(const TSortableBatchPosition& nextKeyColumnsPosition) {
@@ -73,11 +58,11 @@ bool TMergePartialStream::DrainCurrentTo(TRecordBatchBuilder& builder, const TSo
7358
Y_ABORT_UNLESS((ui32)DataSchema->num_fields() == builder.GetBuildersCount());
7459
PutControlPoint(std::make_shared<TSortableBatchPosition>(readTo));
7560
bool cpReachedFlag = false;
76-
while (SortHeap.size() && !cpReachedFlag) {
77-
if (SortHeap.front().IsControlPoint()) {
61+
while (SortHeap.Size() && !cpReachedFlag) {
62+
if (SortHeap.Current().IsControlPoint()) {
7863
RemoveControlPoint();
7964
cpReachedFlag = true;
80-
if (SortHeap.empty() || !includeFinish || SortHeap.front().GetKeyColumns().Compare(readTo) == std::partial_ordering::greater) {
65+
if (SortHeap.Empty() || !includeFinish || SortHeap.Current().GetKeyColumns().Compare(readTo) == std::partial_ordering::greater) {
8166
return true;
8267
}
8368
}
@@ -92,7 +77,7 @@ bool TMergePartialStream::DrainCurrentTo(TRecordBatchBuilder& builder, const TSo
9277

9378
bool TMergePartialStream::DrainAll(TRecordBatchBuilder& builder) {
9479
Y_ABORT_UNLESS((ui32)DataSchema->num_fields() == builder.GetBuildersCount());
95-
while (SortHeap.size()) {
80+
while (SortHeap.Size()) {
9681
if (auto currentPosition = DrainCurrentPosition()) {
9782
CheckSequenceInDebug(*currentPosition);
9883
builder.AddRecord(*currentPosition);
@@ -102,19 +87,19 @@ bool TMergePartialStream::DrainAll(TRecordBatchBuilder& builder) {
10287
}
10388

10489
std::optional<TSortableBatchPosition> TMergePartialStream::DrainCurrentPosition() {
105-
Y_ABORT_UNLESS(SortHeap.size());
106-
Y_ABORT_UNLESS(!SortHeap.front().IsControlPoint());
107-
TSortableBatchPosition result = SortHeap.front().GetKeyColumns();
108-
TSortableBatchPosition resultVersion = SortHeap.front().GetVersionColumns();
90+
Y_ABORT_UNLESS(SortHeap.Size());
91+
Y_ABORT_UNLESS(!SortHeap.Current().IsControlPoint());
92+
TSortableBatchPosition result = SortHeap.Current().GetKeyColumns();
93+
TSortableBatchPosition resultVersion = SortHeap.Current().GetVersionColumns();
10994
bool isFirst = true;
110-
const bool deletedFlag = SortHeap.front().IsDeleted();
111-
while (SortHeap.size() && (isFirst || result.Compare(SortHeap.front().GetKeyColumns()) == std::partial_ordering::equivalent)) {
112-
auto& anotherIterator = SortHeap.front();
95+
const bool deletedFlag = SortHeap.Current().IsDeleted();
96+
while (SortHeap.Size() && (isFirst || result.Compare(SortHeap.Current().GetKeyColumns()) == std::partial_ordering::equivalent)) {
97+
auto& anotherIterator = SortHeap.Current();
11398
if (!isFirst) {
114-
AFL_VERIFY(resultVersion.Compare(anotherIterator.GetVersionColumns()) == std::partial_ordering::greater)("r", resultVersion.DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())
99+
AFL_VERIFY(resultVersion.Compare(anotherIterator.GetVersionColumns()) != std::partial_ordering::less)("r", resultVersion.DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())
115100
("key", result.DebugJson());
116101
}
117-
NextInHeap(true);
102+
SortHeap.Next();
118103
isFirst = false;
119104
}
120105
if (deletedFlag) {
@@ -123,13 +108,13 @@ std::optional<TSortableBatchPosition> TMergePartialStream::DrainCurrentPosition(
123108
return result;
124109
}
125110

126-
std::vector<std::shared_ptr<arrow::RecordBatch>> TMergePartialStream::DrainAllParts(const std::vector<TSortableBatchPosition>& positions,
127-
const std::vector<std::shared_ptr<arrow::Field>>& resultFields, const bool includePositions)
111+
std::vector<std::shared_ptr<arrow::RecordBatch>> TMergePartialStream::DrainAllParts(const std::map<TSortableBatchPosition, bool>& positions,
112+
const std::vector<std::shared_ptr<arrow::Field>>& resultFields)
128113
{
129114
std::vector<std::shared_ptr<arrow::RecordBatch>> result;
130115
for (auto&& i : positions) {
131116
NIndexedReader::TRecordBatchBuilder indexesBuilder(resultFields);
132-
DrainCurrentTo(indexesBuilder, i, includePositions);
117+
DrainCurrentTo(indexesBuilder, i.first, i.second);
133118
result.emplace_back(indexesBuilder.Finalize());
134119
if (result.back()->num_rows() == 0) {
135120
result.pop_back();
@@ -147,11 +132,6 @@ std::vector<std::shared_ptr<arrow::RecordBatch>> TMergePartialStream::DrainAllPa
147132
NJson::TJsonValue TMergePartialStream::TBatchIterator::DebugJson() const {
148133
NJson::TJsonValue result;
149134
result["is_cp"] = IsControlPoint();
150-
if (PoolId) {
151-
result["pool_id"] = *PoolId;
152-
} else {
153-
result["pool_id"] = "absent";
154-
}
155135
result["key"] = KeyColumns.DebugJson();
156136
return result;
157137
}

0 commit comments

Comments
 (0)