Fix #76 fastcdc chunk boundary off-by-one. (#78)

* Fix #76 fastcdc chunk boundary off-by-one.

This ensures that the last byte included in the gear-hash that identified the
chunk boundary is also included in the chunk, so that chunks are still matched
when the byte immediately after them is changed.

* Init gear hash to all 1's to prevent zero-length chunks with min_size=0.

Also change the `MaxChunkSize` test to use min_size=0 to verify that this works.
This commit is contained in:
Donovan Baarda
2023-01-24 00:39:02 +11:00
committed by GitHub
parent efca9855e7
commit 9cf71cae65
2 changed files with 4 additions and 3 deletions

View File

@@ -239,7 +239,8 @@ class ChunkerTmpl {
len = cfg_.max_size;
}
uint64_t hash = 0;
// Init hash to all 1's to avoid zero-length chunks with min_size=0.
uint64_t hash = (uint64_t)-1;
// Skip the first min_size bytes, but "warm up" the rolling hash for 64
// rounds to make sure the 64-bit hash has gathered full "content history".
size_t i = cfg_.min_size > 64 ? cfg_.min_size - 64 : 0;
@@ -250,10 +251,10 @@ class ChunkerTmpl {
uint64_t mask = stages_[stg].mask;
size_t barrier = std::min(len, stages_[stg].barrier);
for (/*empty*/; i < barrier; ++i) {
hash = (hash << 1) + gear[data[i]];
if (!(hash & mask)) {
return i;
}
hash = (hash << 1) + gear[data[i]];
}
}
return i;

View File

@@ -195,7 +195,7 @@ TEST_F(ChunkerTest, MinChunkSize) {
// Tests that maximum chunk size is not exceeded.
TEST_F(ChunkerTest, MaxChunkSize) {
Config cfg(32, 64, 128);
Config cfg(0, 64, 128);
std::vector<size_t> chunk_sizes;
TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) {
chunk_sizes.push_back(len);