Fix #76 fastcdc chunk boundary off-by-one. (#78)

* Fix #76 fastcdc chunk boundary off-by-one.

A chunk now includes the last byte that was fed into the gear hash which
identified the chunk boundary. As a result, chunks are still matched when the
byte immediately after them is changed.

* Init gear hash to all 1's to prevent zero-length chunks with min_size=0.

Also change the `MaxChunkSize` test to use min_size=0 to verify that this works.
This commit is contained in:
Donovan Baarda
2023-01-24 00:39:02 +11:00
committed by GitHub
parent efca9855e7
commit 9cf71cae65
2 changed files with 4 additions and 3 deletions

View File

@@ -239,7 +239,8 @@ class ChunkerTmpl {
len = cfg_.max_size;
}
uint64_t hash = 0;
// Init hash to all 1's to avoid zero-length chunks with min_size=0.
uint64_t hash = (uint64_t)-1;
// Skip the first min_size bytes, but "warm up" the rolling hash for 64
// rounds to make sure the 64-bit hash has gathered full "content history".
size_t i = cfg_.min_size > 64 ? cfg_.min_size - 64 : 0;
@@ -250,10 +251,10 @@ class ChunkerTmpl {
uint64_t mask = stages_[stg].mask;
size_t barrier = std::min(len, stages_[stg].barrier);
for (/*empty*/; i < barrier; ++i) {
hash = (hash << 1) + gear[data[i]];
if (!(hash & mask)) {
return i;
}
hash = (hash << 1) + gear[data[i]];
}
}
return i;