diff --git a/cdc_indexer/README.md b/cdc_indexer/README.md index 3a57b1c..2745fb9 100644 --- a/cdc_indexer/README.md +++ b/cdc_indexer/README.md @@ -14,7 +14,7 @@ experimentation. See the file `indexer.h` for preprocessor macros that can be enabled, for example: ``` -bazel build -c opt --copt=-DCDC_GEAR_TABLE=1 //cdc_indexer +bazel build -c opt --copt=-DCDC_GEAR_BITS=32 //cdc_indexer ``` At the end of the operation, the indexer outputs a summary of the results such @@ -25,7 +25,7 @@ as the following: Operation succeeded. Chunk size (min/avg/max): 128 KB / 256 KB / 1024 KB | Threads: 12 -gear_table: 64 bit | mask_s: 0x49249249249249 | mask_l: 0x1249249249 +gear_table: 64 bit | threshold: 0x7fffc0001fff Duration: 00:03 Total files: 2 Total chunks: 39203 diff --git a/cdc_indexer/indexer.cc b/cdc_indexer/indexer.cc index 8dbf38f..65e0254 100644 --- a/cdc_indexer/indexer.cc +++ b/cdc_indexer/indexer.cc @@ -140,8 +140,7 @@ Indexer::Impl::Impl(const IndexerConfig& cfg, fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size, cfg_.max_chunk_size); Indexer::Chunker chunker(ccfg, nullptr); - cfg_.mask_s = chunker.Stage(0).mask; - cfg_.mask_l = chunker.Stage(chunker.StagesCount() - 1).mask; + cfg_.threshold = chunker.Threshold(); // Collect inputs. for (auto it = inputs.begin(); it != inputs.end(); ++it) { inputs_.push(*it); @@ -368,8 +367,7 @@ IndexerConfig::IndexerConfig() max_chunk_size(0), max_chunk_size_step(0), num_threads(0), - mask_s(0), - mask_l(0) {} + threshold(0) {} Indexer::Indexer() : impl_(nullptr) {} diff --git a/cdc_indexer/indexer.h b/cdc_indexer/indexer.h index 2a10a86..3ae2a87 100644 --- a/cdc_indexer/indexer.h +++ b/cdc_indexer/indexer.h @@ -27,16 +27,10 @@ #include "fastcdc/fastcdc.h" // Compile-time parameters for the FastCDC algorithm. 
-#define CDC_GEAR_32BIT 1 -#define CDC_GEAR_64BIT 2 -#ifndef CDC_GEAR_TABLE -#define CDC_GEAR_TABLE CDC_GEAR_64BIT -#endif -#ifndef CDC_MASK_STAGES -#define CDC_MASK_STAGES 7 -#endif -#ifndef CDC_MASK_BIT_LSHIFT_AMOUNT -#define CDC_MASK_BIT_LSHIFT_AMOUNT 3 +#define CDC_GEAR_32BIT 32 +#define CDC_GEAR_64BIT 64 +#ifndef CDC_GEAR_BITS +#define CDC_GEAR_BITS CDC_GEAR_64BIT #endif namespace cdc_ft { @@ -66,23 +60,20 @@ struct IndexerConfig { uint32_t num_threads; // Which hash function to use. HashType hash_type; - // The masks will be populated by the indexer, setting them here has no - // effect. They are in this struct so that they can be conveniently accessed - // when printing the operation summary (and since they are derived from the - // configuration, they are technically part of it). - uint64_t mask_s; - uint64_t mask_l; + // The threshold will be populated by the indexer, setting it here has no + // effect. It is in this struct so that it can be conveniently accessed + // when printing the operation summary (and since it is derived from the + // configuration, it is technically part of it). + uint64_t threshold; }; class Indexer { public: using hash_t = std::string; -#if CDC_GEAR_TABLE == CDC_GEAR_32BIT - typedef fastcdc::Chunker32 - Chunker; -#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT - typedef fastcdc::Chunker64 - Chunker; +#if CDC_GEAR_BITS == CDC_GEAR_32BIT + typedef fastcdc::Chunker32<> Chunker; +#elif CDC_GEAR_BITS == CDC_GEAR_64BIT + typedef fastcdc::Chunker64<> Chunker; #else #error "Unknown gear table" #endif diff --git a/cdc_indexer/main.cc b/cdc_indexer/main.cc index ebd4721..8fe1a34 100644 --- a/cdc_indexer/main.cc +++ b/cdc_indexer/main.cc @@ -64,9 +64,9 @@ namespace { const char* GearTable() { // The following macros are defined in indexer.h. 
-#if CDC_GEAR_TABLE == CDC_GEAR_32BIT +#if CDC_GEAR_BITS == CDC_GEAR_32BIT return "32 bit"; -#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT +#elif CDC_GEAR_BITS == CDC_GEAR_64BIT return "64 bit"; #else #error "Unknown gear table" @@ -165,9 +165,8 @@ void ShowSummary(const IndexerConfig& cfg, const Indexer::OpStats& stats, << HumanBytes(cfg.max_chunk_size) << " | Hash: " << HashTypeToString(cfg.hash_type) << " | Threads: " << cfg.num_threads << std::endl; - std::cout << "gear_table: " << GearTable() << " | mask_s: 0x" << std::hex - << cfg.mask_s << " | mask_l: 0x" << cfg.mask_l << std::dec - << std::endl; + std::cout << "gear_table: " << GearTable() << " | threshold: 0x" << std::hex + << cfg.threshold << std::dec << std::endl; std::cout << std::setw(title_w) << "Duration:" << std::setw(num_w) << HumanDuration(elapsed) << std::endl; std::cout << std::setw(title_w) << "Total files:" << std::setw(num_w) @@ -279,11 +278,10 @@ absl::Status WriteResultsFile(const std::string& filepath, path::FileCloser closer(fout); - static constexpr int num_columns = 15; + static constexpr int num_columns = 14; static const char* columns[num_columns] = { "gear_table", - "mask_s", - "mask_l", + "threshold", "Min chunk size [KiB]", "Avg chunk size [KiB]", "Max chunk size [KiB]", @@ -332,7 +330,7 @@ absl::Status WriteResultsFile(const std::string& filepath, // Write user-supplied description if (!description.empty()) std::fprintf(fout, "%s,", description.c_str()); // Write chunking params. - std::fprintf(fout, "%s,0x%zx,0x%zx,", GearTable(), cfg.mask_s, cfg.mask_l); + std::fprintf(fout, "%s,0x%zx,", GearTable(), cfg.threshold); std::fprintf(fout, "%zu,%zu,%zu,", cfg.min_chunk_size >> 10, cfg.avg_chunk_size >> 10, cfg.max_chunk_size >> 10); // Write speed, files, chunks. 
diff --git a/cdc_stream/multi_session_test.cc b/cdc_stream/multi_session_test.cc index c8d6e33..b789df4 100644 --- a/cdc_stream/multi_session_test.cc +++ b/cdc_stream/multi_session_test.cc @@ -158,7 +158,7 @@ class MultiSessionTest : public ManifestTestBase { EXPECT_EQ(data->file_count, file_count); EXPECT_EQ(data->min_chunk_size, 128 << 10); EXPECT_EQ(data->avg_chunk_size, 256 << 10); - EXPECT_EQ(data->max_chunk_size, 1024 << 10); + EXPECT_EQ(data->max_chunk_size, 512 << 10); } metrics::ManifestUpdateData GetManifestUpdateData( diff --git a/fastcdc/fastcdc.h b/fastcdc/fastcdc.h index 5a34cba..16c6b7f 100644 --- a/fastcdc/fastcdc.h +++ b/fastcdc/fastcdc.h @@ -24,14 +24,12 @@ #include #include #include +#include #include namespace cdc_ft { namespace fastcdc { -static constexpr uint32_t default_mask_stages = 7; -static constexpr uint32_t default_mask_lshift = 3; - // Configures the chunk sizes that the ChunkerTmpl class produces. All sizes are // given in bytes. struct Config { @@ -41,9 +39,12 @@ struct Config { // that this size can still be undercut for the last chunk after processing // the input data. size_t min_size; - // The average chunk size is the target size for chunks. Sizes will show a - // normal distribution around the average size, depending on the template - // parameters of the ChunkerTmpl class. + // The average chunk size is the target size for chunks, not including the + // effects of max_size regression. Before regression, sizes will show an + // offset exponential distribution decaying after min_size with the desired + // average size. Regression will "reflect-back" the exponential + // distribution past max_size, which reduces the actual average size and + // gives a very flat distribution when max_size is small. size_t avg_size; // The maximum size is the upper bound for generating chunks. This limit is // never exceeded. 
If a chunk boundary was not detected based on the content @@ -57,53 +58,57 @@ using ChunkFoundHandler = std::function; // Implements a very fast content-defined chunking algorithm. // // FastCDC [1] identifies chunk boundaries based on a simple yet efficient -// rolling hash. This library implements a modified version of this algorithm to -// achieve better normalization of the chunk sizes around the target average -// size. This behavior can be tweaked with several parameters. +// "gear" rolling hash, a "normalized chunking" algorithm using a stepped +// chunk probability with a pair of spread-out bitmasks for the '!(hash&mask)' +// "hash criteria". +// +// This library implements a modified version based on rollsum-chunking [2] +// tests and analysis that showed simple "exponential chunking" gives better +// deduplication, and a 'hash<=threshold' "hash criteria" works better for +// the gear rollsum and can support arbitrary non-power-of-two sizes. +// +// For limiting block sizes it uses a modified version of "Regression +// Chunking"[3] with an arbitrary number of regressions using power-of-2 +// target block lengths (not multiples of the target block length, which +// doesn't have to be a power-of-2). This means we can use a bitmask for the +// most significant bits for the regression hash criteria. // // The Config struct passed in during construction defines the minimum, average, // and maximum allowed chunk sizes. Those are runtime parameters. // // The template allows additional compile-time configuration: -// - T, gear: an array of random numbers that serves as a look-up table to +// +// - T : The type used for the hash. Should be an unsigned integer type, +// ideally uint32_t or uint64_t. The number of bits of this type determines +// the "sliding window" size of the gear hash. A smaller type is likely to be +// faster at the expense of reduced deduplication.
+// +// - gear: an array of random numbers that serves as a look-up table to // modify be added to the rolling hash in each round based on the input data. -// This library comes with two different tables, one of type uint32_t and one of -// uint64_t. Both showed good results in our experiments, yet the 64-bit version -// provided slightly better deduplication. -// - mask_stages: the number of stages in which the requirements for identifying -// a chunk boundary is gradually losened as the amount of data processed is -// approaching the maximum chunk size. More stages result in a smoother normal -// distribution of chunk sizes around the configured average size. Our -// experiments showed good normalization with stages between 5 and 9. -// - mask_lshift: defines how much the bits set in the mask that identifies the -// chunk boundary are spread apart. Our experiments showed a better -// deduplication rate with a small amount of lshift (between 2 and 4). +// This library comes with two different tables, one of type uint32_t and one +// of uint64_t. Both showed good results in our experiments, yet the 64-bit +// version provided slightly better deduplication. // // [1] https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf. +// [2] https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst +// [3] https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf // // TODO: Remove template parameters. -template +template class ChunkerTmpl { public: - struct MaskStage { - size_t barrier; - uint64_t mask; - }; - // Constructor. 
ChunkerTmpl(const Config& cfg, ChunkFoundHandler handler) : cfg_(cfg), handler_(handler) { - static_assert(mask_stages > 0 && mask_stages <= 64, - "mask_stages must be between 1 and 64"); - static_assert(mask_lshift > 0 && mask_lshift <= 31, - "mask_lshift must be between 1 and 31"); + assert(cfg_.avg_size >= 1); assert(cfg_.min_size <= cfg_.avg_size); assert(cfg_.avg_size <= cfg_.max_size); + // Calculate the threshold the hash must be <= to for a 1/(avg-min+1) + // chance of a chunk boundary. + threshold_ = + std::numeric_limits::max() / (cfg_.avg_size - cfg_.min_size + 1); data_.reserve(cfg_.max_size << 1); - InitStages(); } // Slices the given data block into chunks and calls the specified handler @@ -145,92 +150,10 @@ class ChunkerTmpl { // be smaller than the specified minimum chunk size. void Finalize() { Process(nullptr, 0); } - // Returns the number of mask stages used for determening chunk boundaries. - uint32_t StagesCount() { return mask_stages; } - - // Returns the mask stage with the given index. - const MaskStage& Stage(uint32_t i) { - assert(i < mask_stages); - return stages_[i]; - } + // Returns the threshold for the hash <= threshold chunk boundary. + T Threshold() { return threshold_; } private: - // Returns approximately log_2 of the given size, slightly adjusted to better - // achieve the average chunk size. - static uint32_t Bits(size_t size) { - uint32_t bits = 0; - for (; size > 0; size >>= 1) bits++; - // Adjust number of bits to better hit the target chunk size (evaluated via - // experiments). - return bits > 3 ? bits - 3 : 1; - } - - // Returns a bitmask with the given number of bits set to 1. - static uint64_t Mask(const uint32_t bits) { - assert(bits > 0 && bits < 64); - uint64_t mask = 0; - - // Check which bit pattern we need to make the 1s fit into 64 bit: - // 10..10..10... vs. 110..110..110... vs. 1110..1110..1110... 
- uint64_t pattern = 1ull; - uint32_t actual_lshift = mask_lshift; - for (uint32_t num_ones = 1; num_ones <= 32; num_ones++) { - // Round up integer division: (bits + num_ones - 1) / num_ones - if (((bits + num_ones - 1) / num_ones) * actual_lshift < 64) { - // The number of rounds needed depends on the number of 1s in "pattern". - uint32_t num_shifts = bits / num_ones; - for (uint32_t j = 0; j < num_shifts; j++) { - mask = (mask << actual_lshift) | pattern; - } - // Append any missing 1s to the end. - for (uint32_t j = num_shifts * num_ones; j < bits; j++) { - mask = (mask << 1) | 1ull; - } - return mask; - } - // Switch to the next denser pattern (e.g. 100100... => 11001100...). - pattern = (pattern << 1) | 1ull; - actual_lshift++; - } - // If we came here it's likely an error. - assert(bits == 0 || mask != 0); - return mask; - } - - void InitStages() { - constexpr uint32_t mask_stages_left = mask_stages / 2; - constexpr uint32_t mask_stages_right = mask_stages - mask_stages_left; - const uint32_t avg_bits = Bits(cfg_.avg_size); - - // Minimum distance from the average size to the extremes. - size_t dist = - std::min(cfg_.avg_size - cfg_.min_size, cfg_.max_size - cfg_.avg_size); - int stg = 0; - // Decrease mask bits by one in each stage from (bits + n) downto (bits + - // 1), barriers at 1/2, 1/3, ... 1/(n+1) of dist. - for (uint32_t i = 0; i < mask_stages_left; i++) { - // Bitmasks require at least one bit set. - uint32_t bits = - avg_bits + mask_stages_left > i ? avg_bits + mask_stages_left - i : 1; - stages_[stg].mask = Mask(bits); - stages_[stg].barrier = cfg_.avg_size - dist / (i + 2); - stg++; - } - // Decrease mask bits by one in each stage from (bits) downto (bits - n), - // barriers at 1/(n+1), 1/n, ..., 1/2 of dist. - for (int i = mask_stages_right; i > 0; i--) { - // Bitmasks require at least one bit set. - uint32_t bits = avg_bits + i > mask_stages_right - ? 
avg_bits + i - mask_stages_right - : 1; - stages_[stg].mask = Mask(bits); - stages_[stg].barrier = cfg_.avg_size + dist / i; - stg++; - } - // Adjust the final barrier to the max. chunk size. - stages_[mask_stages - 1].barrier = cfg_.max_size; - } - size_t FindChunkBoundary(const uint8_t* data, size_t len) { if (len <= cfg_.min_size) { return len; @@ -239,30 +162,41 @@ class ChunkerTmpl { len = cfg_.max_size; } + // Initialize the regression length to len (the end) and the regression + // mask to an empty bitmask (match any hash). + size_t rc_len = len; + T rc_mask = 0; + // Init hash to all 1's to avoid zero-length chunks with min_size=0. - uint64_t hash = UINT64_MAX; - // Skip the first min_size bytes, but "warm up" the rolling hash for 64 - // rounds to make sure the 64-bit hash has gathered full "content history". - size_t i = cfg_.min_size > 64 ? cfg_.min_size - 64 : 0; + T hash = std::numeric_limits::max(); + // Skip the first min_size bytes, but "warm up" the rolling hash for enough + // rounds to make sure the hash has gathered full "content history". + size_t i = cfg_.min_size > kHashBits ? cfg_.min_size - kHashBits : 0; for (/*empty*/; i < cfg_.min_size; ++i) { hash = (hash << 1) + gear[data[i]]; } - for (uint32_t stg = 0; stg < mask_stages && i < len; stg++) { - uint64_t mask = stages_[stg].mask; - size_t barrier = std::min(len, stages_[stg].barrier); - for (/*empty*/; i < barrier; ++i) { - if (!(hash & mask)) { + for (/*empty*/; i < len; ++i) { + if (!(hash & rc_mask)) { + if (hash <= threshold_) { + // This hash matches the target length hash criteria, return it. return i; } - hash = (hash << 1) + gear[data[i]]; + // This is a better regression point. Set it as the new rc_len and + // update rc_mask to check as many MSBits as this hash would pass. 
+ rc_len = i; + rc_mask = std::numeric_limits::max(); + while (hash & rc_mask) rc_mask <<= 1; } + hash = (hash << 1) + gear[data[i]]; } - return i; + // Return best regression point we found or the end if it's better. + return (hash & rc_mask) ? rc_len : i; } + static constexpr size_t kHashBits = sizeof(T) * 8; const Config cfg_; const ChunkFoundHandler handler_; - MaskStage stages_[mask_stages]; + T threshold_; std::vector data_; }; @@ -404,16 +338,12 @@ static constexpr uint64_t gear64[256] = { }; // namespace internal // Chunker template with a 32-bit gear table. -template -using Chunker32 = - ChunkerTmpl; +template +using Chunker32 = ChunkerTmpl; // Chunker template with a 64-bit gear table. -template -using Chunker64 = - ChunkerTmpl; +template +using Chunker64 = ChunkerTmpl; // Default chunker class using params that are known to work well. using Chunker = Chunker64<>; diff --git a/fastcdc/fastcdc_test.cc b/fastcdc/fastcdc_test.cc index 1fc0b20..3d2aab3 100644 --- a/fastcdc/fastcdc_test.cc +++ b/fastcdc/fastcdc_test.cc @@ -25,160 +25,26 @@ namespace fastcdc { // - data like {1, 1, 1, ...} results in a continuously all-ones rolling hash, // thus is never identified as a chunk boundary. static const uint64_t testgear64[256]{0, 1}; // 0, 1, 0, 0, 0, ... -static constexpr uint32_t test_mask_stages = 5; -static constexpr uint32_t test_mask_lshift = 1; -template -using TestChunker = ChunkerTmpl; - -// Returns the number of bits set to 1 in the given mask. 
-uint32_t BitCount(uint64_t mask) { - uint32_t count = 0; - for (; mask; mask >>= 1) { - count += mask & 1u; - } - return count; -} +template +using TestChunker = Chunker64; class ChunkerTest : public ::testing::Test { public: ChunkerTest() {} - - protected: - template - static void ValidateStagesTmpl(const Config& cfg); - - template - static void ValidateLshiftTmpl(const Config& cfg); }; -template -void ChunkerTest::ValidateStagesTmpl(const Config& cfg) { - TestChunker chunker(cfg, nullptr); - EXPECT_EQ(chunker.StagesCount(), mask_stages); - - for (uint32_t i = 1; i < chunker.StagesCount(); i++) { - auto prev_stg = chunker.Stage(i - 1); - auto stg = chunker.Stage(i); - EXPECT_LT(prev_stg.barrier, stg.barrier) - << "Stage " << i + 1 << " of " << mask_stages - << ": barriers should be at increasing positions"; - if (prev_stg.mask > 1) { - EXPECT_EQ(BitCount(prev_stg.mask), BitCount(stg.mask) + 1) - << "Stage " << i + 1 << " of " << mask_stages - << ": number of bits in adjacent stages should differ by 1"; - } else { - EXPECT_EQ(1, BitCount(stg.mask)) - << "Stage " << i + 1 << " of " << mask_stages - << ": number of bits in last bitmasks should be 1"; - } - } - - EXPECT_EQ(chunker.Stage(mask_stages - 1).barrier, cfg.max_size) - << "final stage barrier must match the maximum chunk size"; -} - -// Tests that the stages to apply different bitmasks are initialized properly -TEST_F(ChunkerTest, ValidateStages) { +// Tests that the threshold for hash comparison is set correctly. 
+TEST_F(ChunkerTest, ValidateThreshold) { // Sizes: 128/256/512 bytes Config cfg(128, 256, 512); - ValidateStagesTmpl<1>(cfg); - ValidateStagesTmpl<2>(cfg); - ValidateStagesTmpl<3>(cfg); - ValidateStagesTmpl<4>(cfg); - ValidateStagesTmpl<5>(cfg); - ValidateStagesTmpl<6>(cfg); - ValidateStagesTmpl<7>(cfg); - ValidateStagesTmpl<8>(cfg); - - // Sizes: 128/256/512 KiB - cfg = Config(128 << 10, 256 << 10, 512 << 10); - ValidateStagesTmpl<1>(cfg); - ValidateStagesTmpl<2>(cfg); - ValidateStagesTmpl<3>(cfg); - ValidateStagesTmpl<4>(cfg); - ValidateStagesTmpl<5>(cfg); - ValidateStagesTmpl<6>(cfg); - ValidateStagesTmpl<7>(cfg); - ValidateStagesTmpl<8>(cfg); - ValidateStagesTmpl<16>(cfg); - ValidateStagesTmpl<32>(cfg); - ValidateStagesTmpl<64>(cfg); - - // Sizes: 128/256/512 MiB - cfg = Config(128 << 20, 256 << 20, 512 << 20); - ValidateStagesTmpl<1>(cfg); - ValidateStagesTmpl<2>(cfg); - ValidateStagesTmpl<3>(cfg); - ValidateStagesTmpl<4>(cfg); - ValidateStagesTmpl<5>(cfg); - ValidateStagesTmpl<6>(cfg); - ValidateStagesTmpl<7>(cfg); - ValidateStagesTmpl<8>(cfg); - ValidateStagesTmpl<16>(cfg); - ValidateStagesTmpl<32>(cfg); - ValidateStagesTmpl<64>(cfg); - - // Sizes: 0/512/1024 KiB - cfg = Config(0, 512 << 10, 1024 << 10); - ValidateStagesTmpl<1>(cfg); - ValidateStagesTmpl<2>(cfg); - ValidateStagesTmpl<3>(cfg); - ValidateStagesTmpl<4>(cfg); - ValidateStagesTmpl<5>(cfg); - ValidateStagesTmpl<6>(cfg); - ValidateStagesTmpl<7>(cfg); - ValidateStagesTmpl<8>(cfg); - ValidateStagesTmpl<16>(cfg); - ValidateStagesTmpl<32>(cfg); - ValidateStagesTmpl<64>(cfg); - - // Sizes: 0/512/1024 MiB - cfg = Config(0, 512 << 20, 1024 << 20); - ValidateStagesTmpl<1>(cfg); - ValidateStagesTmpl<2>(cfg); - ValidateStagesTmpl<3>(cfg); - ValidateStagesTmpl<4>(cfg); - ValidateStagesTmpl<5>(cfg); - ValidateStagesTmpl<6>(cfg); - ValidateStagesTmpl<7>(cfg); - ValidateStagesTmpl<8>(cfg); - ValidateStagesTmpl<16>(cfg); - ValidateStagesTmpl<32>(cfg); - ValidateStagesTmpl<64>(cfg); -} - -template -void 
ChunkerTest::ValidateLshiftTmpl(const Config& cfg) { - TestChunker<1, mask_lshift> chunker(cfg, nullptr); - uint64_t mask = chunker.Stage(0).mask; - uint64_t expected = BitCount(mask); - EXPECT_GE(expected, 1) << "no bits were set in the bit mask for lshift " - << mask_lshift; - // Compare no. of all 1-bits to no. of 1-bits with the given shift amount. - uint32_t actual = 0; - for (; mask; mask >>= mask_lshift) { - actual += mask & 1u; - } - EXPECT_EQ(expected, actual) - << "number of bits set is different with lshift " << mask_lshift; -} - -// Tests that the bitmasks for each stage honor the mask_lshift template -// parameter correctly. -TEST_F(ChunkerTest, ValidateLshift) { - Config cfg(32, 64, 128); - ValidateLshiftTmpl<1>(cfg); - ValidateLshiftTmpl<2>(cfg); - ValidateLshiftTmpl<3>(cfg); - ValidateLshiftTmpl<4>(cfg); - ValidateLshiftTmpl<5>(cfg); + TestChunker<> chunker(cfg, nullptr); + EXPECT_EQ(0x1fc07f01fc07f01, chunker.Threshold()); } // Tests that the minimum chunk size is not undercut. 
TEST_F(ChunkerTest, MinChunkSize) { - Config cfg(32, 64, 128); + Config cfg(64, 96, 128); std::vector chunk_sizes; TestChunker<> chunker(cfg, [&](const uint8_t* /* data */, size_t len) { chunk_sizes.push_back(len); @@ -187,7 +53,7 @@ TEST_F(ChunkerTest, MinChunkSize) { std::vector data(cfg.max_size, 0); chunker.Process(data.data(), data.size()); chunker.Finalize(); - EXPECT_EQ(chunk_sizes.size(), 4); + EXPECT_EQ(chunk_sizes.size(), 2); for (size_t size : chunk_sizes) { EXPECT_EQ(size, cfg.min_size); } diff --git a/manifest/manifest_test_base.cc b/manifest/manifest_test_base.cc index 16545b2..9543592 100644 --- a/manifest/manifest_test_base.cc +++ b/manifest/manifest_test_base.cc @@ -201,8 +201,11 @@ bool ManifestTestBase::InProgress(const ContentIdProto& manifest_id, void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path, bool expect_contained) { + Buffer file; + EXPECT_OK(path::ReadFile(path::Join(cfg_.src_dir, rel_path), &file)); + uint64_t offset = 0; - auto handler = [&offset, &rel_path, file_chunks = &file_chunks_, + auto handler = [&file, &offset, &rel_path, file_chunks = &file_chunks_, expect_contained](const void* data, size_t size) { ContentIdProto id = ContentId::FromArray(data, size); @@ -214,8 +217,14 @@ void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path, expect_contained); if (expect_contained) { EXPECT_EQ(lookup_path, rel_path); - EXPECT_EQ(lookup_offset, offset); EXPECT_EQ(lookup_size, size); + + // The offset can be ambiguous since the file might contain duplicate + // data. Make sure that the actual data is the same. 
+ EXPECT_LE(offset + size, file.size()); + EXPECT_LE(lookup_offset + size, file.size()); + EXPECT_EQ(memcmp(file.data() + offset, file.data() + lookup_offset, size), + 0); } offset += size; @@ -224,9 +233,7 @@ void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path, cfg_.max_chunk_size); fastcdc::Chunker chunker(cdc_cfg, handler); - Buffer b; - EXPECT_OK(path::ReadFile(path::Join(cfg_.src_dir, rel_path), &b)); - chunker.Process(reinterpret_cast(b.data()), b.size()); + chunker.Process(reinterpret_cast(file.data()), file.size()); chunker.Finalize(); } diff --git a/manifest/manifest_updater.h b/manifest/manifest_updater.h index 6a1ed48..cff0e07 100644 --- a/manifest/manifest_updater.h +++ b/manifest/manifest_updater.h @@ -51,7 +51,7 @@ struct UpdaterConfig { size_t avg_chunk_size = 256 << 10; // Maximum allowed chunk size. - size_t max_chunk_size = 1024 << 10; + size_t max_chunk_size = 512 << 10; // Size of the chunker thread pool. Defaults to the number of available CPUs. uint32_t num_threads = 0; diff --git a/manifest/manifest_updater_test.cc b/manifest/manifest_updater_test.cc index 3346acc..5c5cfb0 100644 --- a/manifest/manifest_updater_test.cc +++ b/manifest/manifest_updater_test.cc @@ -180,19 +180,19 @@ TEST_F(ManifestUpdaterTest, UpdateAll_PrunesUnreferencedChunks) { EXPECT_OK(updater.Update( MakeUpdateOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}), &file_chunks_, nullptr)); - // 1 for manifest id, 1 for manifest, 5 indirect assets. + // 1 for manifest id, 1 for manifest, 6 indirect assets. // 2 additional chunks from the first Update() that are now unreferenced. 
// -1, because the indirect asset for "a.txt" is deduplicated - EXPECT_EQ(data_store_.Chunks().size(), 8) + EXPECT_EQ(data_store_.Chunks().size(), 9) << "Manifest: " << ContentId::ToHexString(updater.ManifestId()) << std::endl << DumpDataStoreProtos(); EXPECT_OK(updater.UpdateAll(&file_chunks_)); EXPECT_OK(updater.UpdateAll(&file_chunks_)); - // 1 for manifest id, 1 for manifest, 5 indirect assets. + // 1 for manifest id, 1 for manifest, 6 indirect assets. // Pruning has removed the 2 unreferenced ones. - EXPECT_EQ(data_store_.Chunks().size(), 7) + EXPECT_EQ(data_store_.Chunks().size(), 8) << "Manifest: " << ContentId::ToHexString(updater.ManifestId()) << std::endl << DumpDataStoreProtos(); @@ -224,9 +224,9 @@ TEST_F(ManifestUpdaterTest, UpdateAll_RecoversFromMissingChunks) { } EXPECT_OK(updater.UpdateAll(&file_chunks_)); - // 1 for manifest id, 1 for manifest, 5 indirect assets. - // There would be 8 chunks without the removal above, see UpdateAll_Prune. - EXPECT_EQ(data_store_.Chunks().size(), 7) + // 1 for manifest id, 1 for manifest, 6 indirect assets. + // There would be 9 chunks without the removal above, see UpdateAll_Prune. + EXPECT_EQ(data_store_.Chunks().size(), 8) << "Manifest: " << ContentId::ToHexString(updater.ManifestId()) << std::endl << DumpDataStoreProtos();