mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 12:25:35 +02:00
This CL changes the chunking algorithm from "normalized chunking" to simple "regression chunking", and changes the hash criteria from 'hash&mask' to 'hash<=threshold'. These are all ideas taken from testing and analysis done at https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst Regression chunking was introduced in https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf The algorithm uses an arbitrary number of regressions using power-of-2 regression target lengths. This means we can use a simple bitmask for the regression hash criteria. Regression chunking yields high deduplication rates even for lower max chunk sizes, so that the cdc_stream max chunk can be reduced to 512K from 1024K. This fixes potential latency spikes from large chunks.
121 lines
3.8 KiB
C++
121 lines
3.8 KiB
C++
// Copyright 2022 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "fastcdc/fastcdc.h"
|
|
|
|
#include "gtest/gtest.h"
|
|
|
|
namespace cdc_ft {
|
|
namespace fastcdc {
|
|
|
|
// This gear function has the following properties:
|
|
// - data like {0, 0, 0, ...} results in a continuously zero rolling hash, thus
|
|
// is always identified as a chunk boundary.
|
|
// - data like {1, 1, 1, ...} results in a continuously all-ones rolling hash,
|
|
// thus is never identified as a chunk boundary.
|
|
// Only index 1 is set explicitly; aggregate initialization zero-fills the rest.
static const uint64_t testgear64[256] = {0, 1};  // 0, 1, 0, 0, 0, ...
|
|
|
|
template <const uint64_t gear[256] = testgear64>
|
|
using TestChunker = Chunker64<gear>;
|
|
|
|
// Fixture shared by the chunker tests below. It holds no state of its own; it
// only gives the TEST_F cases a common fixture type.
class ChunkerTest : public ::testing::Test {
 public:
  ChunkerTest() = default;
};
|
|
|
|
// Tests that the threshold for hash comparison is set correctly.
|
|
TEST_F(ChunkerTest, ValidateThreshold) {
|
|
// Sizes: 128/256/512 bytes
|
|
Config cfg(128, 256, 512);
|
|
TestChunker<> chunker(cfg, nullptr);
|
|
EXPECT_EQ(0x1fc07f01fc07f01, chunker.Threshold());
|
|
}
|
|
|
|
// Tests that the minimum chunk size is not undercut.
|
|
TEST_F(ChunkerTest, MinChunkSize) {
|
|
Config cfg(64, 96, 128);
|
|
std::vector<size_t> chunk_sizes;
|
|
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
|
|
chunk_sizes.push_back(len);
|
|
});
|
|
// All-zero data matches a chunk boundary everywhere.
|
|
std::vector<uint8_t> data(cfg.max_size, 0);
|
|
chunker.Process(data.data(), data.size());
|
|
chunker.Finalize();
|
|
EXPECT_EQ(chunk_sizes.size(), 2);
|
|
for (size_t size : chunk_sizes) {
|
|
EXPECT_EQ(size, cfg.min_size);
|
|
}
|
|
}
|
|
|
|
// Tests that maximum chunk size is not exceeded.
|
|
TEST_F(ChunkerTest, MaxChunkSize) {
|
|
Config cfg(0, 64, 128);
|
|
std::vector<size_t> chunk_sizes;
|
|
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
|
|
chunk_sizes.push_back(len);
|
|
});
|
|
// All-ones data never matches a chunk boundary.
|
|
std::vector<uint8_t> data(4 * cfg.max_size, 1);
|
|
chunker.Process(data.data(), data.size());
|
|
chunker.Finalize();
|
|
EXPECT_EQ(chunk_sizes.size(), 4);
|
|
for (size_t size : chunk_sizes) {
|
|
EXPECT_EQ(size, cfg.max_size);
|
|
}
|
|
}
|
|
|
|
// Tests that Finalize() returns the remaining data as a chunk.
|
|
TEST_F(ChunkerTest, FinalizeChunk) {
|
|
Config cfg(32, 64, 128);
|
|
std::vector<size_t> chunk_sizes;
|
|
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
|
|
chunk_sizes.push_back(len);
|
|
});
|
|
std::vector<uint8_t> data(1, 0);
|
|
chunker.Process(data.data(), data.size());
|
|
EXPECT_EQ(chunk_sizes.size(), 0);
|
|
chunker.Finalize();
|
|
EXPECT_EQ(chunk_sizes.size(), 1);
|
|
EXPECT_EQ(chunk_sizes[0], 1);
|
|
}
|
|
|
|
// Tests that Finalize() works when no data is left.
|
|
TEST_F(ChunkerTest, FinalizeEmptyChunk) {
|
|
Config cfg(32, 64, 128);
|
|
std::vector<size_t> chunk_sizes;
|
|
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
|
|
chunk_sizes.push_back(len);
|
|
});
|
|
std::vector<uint8_t> data(1, 0);
|
|
chunker.Process(data.data(), 0);
|
|
EXPECT_EQ(chunk_sizes.size(), 0);
|
|
chunker.Finalize();
|
|
EXPECT_EQ(chunk_sizes.size(), 0);
|
|
}
|
|
|
|
// Tests that Finalize() works when Process() was not called.
|
|
TEST_F(ChunkerTest, FinalizeWithoutProcess) {
|
|
Config cfg(32, 64, 128);
|
|
std::vector<size_t> chunk_sizes;
|
|
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
|
|
chunk_sizes.push_back(len);
|
|
});
|
|
chunker.Finalize();
|
|
EXPECT_EQ(chunk_sizes.size(), 0);
|
|
}
|
|
|
|
} // namespace fastcdc
|
|
} // namespace cdc_ft
|