Change fastcdc to a better and simpler algorithm. (#79)

This CL changes the chunking algorithm from "normalized chunking" to
simple "regression chunking", and changes the hash criteria from
'hash&mask' to 'hash<=threshold'. These are all ideas taken from
testing and analysis done at
  https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst
Regression chunking was introduced in
  https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf

The algorithm uses an arbitrary number of regressions using power-of-2
regression target lengths. This means we can use a simple bitmask for
the regression hash criteria.

Regression chunking yields high deduplication rates even for lower max
chunk sizes, so that the cdc_stream max chunk can be reduced to 512K
from 1024K. This fixes potential latency spikes from large chunks.
This commit is contained in:
Donovan Baarda
2023-02-09 01:06:41 +11:00
committed by GitHub
parent 24906eb36e
commit fcc4cbc3f3
10 changed files with 121 additions and 331 deletions

View File

@@ -180,19 +180,19 @@ TEST_F(ManifestUpdaterTest, UpdateAll_PrunesUnreferencedChunks) {
EXPECT_OK(updater.Update(
MakeUpdateOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
&file_chunks_, nullptr));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// 1 for manifest id, 1 for manifest, 6 indirect assets.
// 2 additional chunks from the first Update() that are now unreferenced.
// -1, because the indirect asset for "a.txt" is deduplicated
EXPECT_EQ(data_store_.Chunks().size(), 8)
EXPECT_EQ(data_store_.Chunks().size(), 9)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
EXPECT_OK(updater.UpdateAll(&file_chunks_));
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// 1 for manifest id, 1 for manifest, 6 indirect assets.
// Pruning has removed the 2 unreferenced ones.
EXPECT_EQ(data_store_.Chunks().size(), 7)
EXPECT_EQ(data_store_.Chunks().size(), 8)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
@@ -224,9 +224,9 @@ TEST_F(ManifestUpdaterTest, UpdateAll_RecoversFromMissingChunks) {
}
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// There would be 8 chunks without the removal above, see UpdateAll_Prune.
EXPECT_EQ(data_store_.Chunks().size(), 7)
// 1 for manifest id, 1 for manifest, 6 indirect assets.
// There would be 9 chunks without the removal above, see UpdateAll_Prune.
EXPECT_EQ(data_store_.Chunks().size(), 8)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();