mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 10:35:37 +02:00
Change fastcdc to a better and simpler algorithm. (#79)
This CL changes the chunking algorithm from "normalized chunking" to simple "regression chunking", and changes the hash criteria from 'hash&mask' to 'hash<=threshold'. These ideas are taken from the testing and analysis done at https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst. Regression chunking was introduced in https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf. The algorithm uses an arbitrary number of regressions with power-of-2 regression target lengths, which means a simple bitmask can be used for the regression hash criteria. Regression chunking yields high deduplication rates even for lower max chunk sizes, so the cdc_stream max chunk size can be reduced from 1024K to 512K. This fixes potential latency spikes from large chunks.
This commit is contained in:
@@ -201,8 +201,11 @@ bool ManifestTestBase::InProgress(const ContentIdProto& manifest_id,
|
||||
|
||||
void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path,
|
||||
bool expect_contained) {
|
||||
Buffer file;
|
||||
EXPECT_OK(path::ReadFile(path::Join(cfg_.src_dir, rel_path), &file));
|
||||
|
||||
uint64_t offset = 0;
|
||||
auto handler = [&offset, &rel_path, file_chunks = &file_chunks_,
|
||||
auto handler = [&file, &offset, &rel_path, file_chunks = &file_chunks_,
|
||||
expect_contained](const void* data, size_t size) {
|
||||
ContentIdProto id = ContentId::FromArray(data, size);
|
||||
|
||||
@@ -214,8 +217,14 @@ void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path,
|
||||
expect_contained);
|
||||
if (expect_contained) {
|
||||
EXPECT_EQ(lookup_path, rel_path);
|
||||
EXPECT_EQ(lookup_offset, offset);
|
||||
EXPECT_EQ(lookup_size, size);
|
||||
|
||||
// The offset can be ambiguous since the file might contain duplicate
|
||||
// data. Make sure that the actual data is the same.
|
||||
EXPECT_LE(offset + size, file.size());
|
||||
EXPECT_LE(lookup_offset + size, file.size());
|
||||
EXPECT_EQ(memcmp(file.data() + offset, file.data() + lookup_offset, size),
|
||||
0);
|
||||
}
|
||||
|
||||
offset += size;
|
||||
@@ -224,9 +233,7 @@ void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path,
|
||||
cfg_.max_chunk_size);
|
||||
fastcdc::Chunker chunker(cdc_cfg, handler);
|
||||
|
||||
Buffer b;
|
||||
EXPECT_OK(path::ReadFile(path::Join(cfg_.src_dir, rel_path), &b));
|
||||
chunker.Process(reinterpret_cast<uint8_t*>(b.data()), b.size());
|
||||
chunker.Process(reinterpret_cast<uint8_t*>(file.data()), file.size());
|
||||
chunker.Finalize();
|
||||
}
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ struct UpdaterConfig {
|
||||
size_t avg_chunk_size = 256 << 10;
|
||||
|
||||
// Maximum allowed chunk size.
|
||||
size_t max_chunk_size = 1024 << 10;
|
||||
size_t max_chunk_size = 512 << 10;
|
||||
|
||||
// Size of the chunker thread pool. Defaults to the number of available CPUs.
|
||||
uint32_t num_threads = 0;
|
||||
|
||||
@@ -180,19 +180,19 @@ TEST_F(ManifestUpdaterTest, UpdateAll_PrunesUnreferencedChunks) {
|
||||
EXPECT_OK(updater.Update(
|
||||
MakeUpdateOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
|
||||
&file_chunks_, nullptr));
|
||||
// 1 for manifest id, 1 for manifest, 5 indirect assets.
|
||||
// 1 for manifest id, 1 for manifest, 6 indirect assets.
|
||||
// 2 additional chunks from the first Update() that are now unreferenced.
|
||||
// -1, because the indirect asset for "a.txt" is deduplicated
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 8)
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 9)
|
||||
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
|
||||
<< std::endl
|
||||
<< DumpDataStoreProtos();
|
||||
|
||||
EXPECT_OK(updater.UpdateAll(&file_chunks_));
|
||||
EXPECT_OK(updater.UpdateAll(&file_chunks_));
|
||||
// 1 for manifest id, 1 for manifest, 5 indirect assets.
|
||||
// 1 for manifest id, 1 for manifest, 6 indirect assets.
|
||||
// Pruning has removed the 2 unreferenced ones.
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 7)
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 8)
|
||||
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
|
||||
<< std::endl
|
||||
<< DumpDataStoreProtos();
|
||||
@@ -224,9 +224,9 @@ TEST_F(ManifestUpdaterTest, UpdateAll_RecoversFromMissingChunks) {
|
||||
}
|
||||
|
||||
EXPECT_OK(updater.UpdateAll(&file_chunks_));
|
||||
// 1 for manifest id, 1 for manifest, 5 indirect assets.
|
||||
// There would be 8 chunks without the removal above, see UpdateAll_Prune.
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 7)
|
||||
// 1 for manifest id, 1 for manifest, 6 indirect assets.
|
||||
// There would be 9 chunks without the removal above, see UpdateAll_Prune.
|
||||
EXPECT_EQ(data_store_.Chunks().size(), 8)
|
||||
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
|
||||
<< std::endl
|
||||
<< DumpDataStoreProtos();
|
||||
|
||||
Reference in New Issue
Block a user