mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 10:35:37 +02:00
Change fastcdc to a better and simpler algorithm. (#79)
This CL changes the chunking algorithm from "normalized chunking" to simple "regression chunking", and changes the hash criteria from 'hash&mask' to 'hash<=threshold'. These are all ideas taken from testing and analysis done at https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst Regression chunking was introduced in https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf The algorithm uses an arbitrary number of regressions using power-of-2 regression target lengths. This means we can use a simple bitmask for the regression hash criteria. Regression chunking yields high deduplication rates even for lower max chunk sizes, so that the cdc_stream max chunk can be reduced to 512K from 1024K. This fixes potential latency spikes from large chunks.
This commit is contained in:
@@ -14,7 +14,7 @@ experimentation. See the file `indexer.h` for preprocessor macros that can be
|
||||
enabled, for example:
|
||||
|
||||
```
|
||||
bazel build -c opt --copt=-DCDC_GEAR_TABLE=1 //cdc_indexer
|
||||
bazel build -c opt --copt=-DCDC_GEAR_BITS=32 //cdc_indexer
|
||||
```
|
||||
|
||||
At the end of the operation, the indexer outputs a summary of the results such
|
||||
@@ -25,7 +25,7 @@ as the following:
|
||||
Operation succeeded.
|
||||
|
||||
Chunk size (min/avg/max): 128 KB / 256 KB / 1024 KB | Threads: 12
|
||||
gear_table: 64 bit | mask_s: 0x49249249249249 | mask_l: 0x1249249249
|
||||
gear_table: 64 bit | threshold: 0x7fffc0001fff
|
||||
Duration: 00:03
|
||||
Total files: 2
|
||||
Total chunks: 39203
|
||||
|
||||
@@ -140,8 +140,7 @@ Indexer::Impl::Impl(const IndexerConfig& cfg,
|
||||
fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
|
||||
cfg_.max_chunk_size);
|
||||
Indexer::Chunker chunker(ccfg, nullptr);
|
||||
cfg_.mask_s = chunker.Stage(0).mask;
|
||||
cfg_.mask_l = chunker.Stage(chunker.StagesCount() - 1).mask;
|
||||
cfg_.threshold = chunker.Threshold();
|
||||
// Collect inputs.
|
||||
for (auto it = inputs.begin(); it != inputs.end(); ++it) {
|
||||
inputs_.push(*it);
|
||||
@@ -368,8 +367,7 @@ IndexerConfig::IndexerConfig()
|
||||
max_chunk_size(0),
|
||||
max_chunk_size_step(0),
|
||||
num_threads(0),
|
||||
mask_s(0),
|
||||
mask_l(0) {}
|
||||
threshold(0) {}
|
||||
|
||||
Indexer::Indexer() : impl_(nullptr) {}
|
||||
|
||||
|
||||
@@ -27,16 +27,10 @@
|
||||
#include "fastcdc/fastcdc.h"
|
||||
|
||||
// Compile-time parameters for the FastCDC algorithm.
|
||||
#define CDC_GEAR_32BIT 1
|
||||
#define CDC_GEAR_64BIT 2
|
||||
#ifndef CDC_GEAR_TABLE
|
||||
#define CDC_GEAR_TABLE CDC_GEAR_64BIT
|
||||
#endif
|
||||
#ifndef CDC_MASK_STAGES
|
||||
#define CDC_MASK_STAGES 7
|
||||
#endif
|
||||
#ifndef CDC_MASK_BIT_LSHIFT_AMOUNT
|
||||
#define CDC_MASK_BIT_LSHIFT_AMOUNT 3
|
||||
#define CDC_GEAR_32BIT 32
|
||||
#define CDC_GEAR_64BIT 64
|
||||
#ifndef CDC_GEAR_BITS
|
||||
#define CDC_GEAR_BITS CDC_GEAR_64BIT
|
||||
#endif
|
||||
|
||||
namespace cdc_ft {
|
||||
@@ -66,23 +60,20 @@ struct IndexerConfig {
|
||||
uint32_t num_threads;
|
||||
// Which hash function to use.
|
||||
HashType hash_type;
|
||||
// The masks will be populated by the indexer, setting them here has no
|
||||
// effect. They are in this struct so that they can be conveniently accessed
|
||||
// when printing the operation summary (and since they are derived from the
|
||||
// configuration, they are technically part of it).
|
||||
uint64_t mask_s;
|
||||
uint64_t mask_l;
|
||||
// The threshold will be populated by the indexer, setting it here has no
|
||||
// effect. It is in this struct so that it can be conveniently accessed
|
||||
// when printing the operation summary (and since it is derived from the
|
||||
// configuration, it is technically part of it).
|
||||
uint64_t threshold;
|
||||
};
|
||||
|
||||
class Indexer {
|
||||
public:
|
||||
using hash_t = std::string;
|
||||
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
|
||||
typedef fastcdc::Chunker32<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
|
||||
Chunker;
|
||||
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
|
||||
typedef fastcdc::Chunker64<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
|
||||
Chunker;
|
||||
#if CDC_GEAR_BITS == CDC_GEAR_32BIT
|
||||
typedef fastcdc::Chunker32<> Chunker;
|
||||
#elif CDC_GEAR_BITS == CDC_GEAR_64BIT
|
||||
typedef fastcdc::Chunker64<> Chunker;
|
||||
#else
|
||||
#error "Unknown gear table"
|
||||
#endif
|
||||
|
||||
@@ -64,9 +64,9 @@ namespace {
|
||||
|
||||
const char* GearTable() {
|
||||
// The following macros are defined in indexer.h.
|
||||
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
|
||||
#if CDC_GEAR_BITS == CDC_GEAR_32BIT
|
||||
return "32 bit";
|
||||
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
|
||||
#elif CDC_GEAR_BITS == CDC_GEAR_64BIT
|
||||
return "64 bit";
|
||||
#else
|
||||
#error "Unknown gear table"
|
||||
@@ -165,9 +165,8 @@ void ShowSummary(const IndexerConfig& cfg, const Indexer::OpStats& stats,
|
||||
<< HumanBytes(cfg.max_chunk_size)
|
||||
<< " | Hash: " << HashTypeToString(cfg.hash_type)
|
||||
<< " | Threads: " << cfg.num_threads << std::endl;
|
||||
std::cout << "gear_table: " << GearTable() << " | mask_s: 0x" << std::hex
|
||||
<< cfg.mask_s << " | mask_l: 0x" << cfg.mask_l << std::dec
|
||||
<< std::endl;
|
||||
std::cout << "gear_table: " << GearTable() << " | threshold: 0x" << std::hex
|
||||
<< cfg.threshold << std::dec << std::endl;
|
||||
std::cout << std::setw(title_w) << "Duration:" << std::setw(num_w)
|
||||
<< HumanDuration(elapsed) << std::endl;
|
||||
std::cout << std::setw(title_w) << "Total files:" << std::setw(num_w)
|
||||
@@ -279,11 +278,10 @@ absl::Status WriteResultsFile(const std::string& filepath,
|
||||
|
||||
path::FileCloser closer(fout);
|
||||
|
||||
static constexpr int num_columns = 15;
|
||||
static constexpr int num_columns = 14;
|
||||
static const char* columns[num_columns] = {
|
||||
"gear_table",
|
||||
"mask_s",
|
||||
"mask_l",
|
||||
"threshold",
|
||||
"Min chunk size [KiB]",
|
||||
"Avg chunk size [KiB]",
|
||||
"Max chunk size [KiB]",
|
||||
@@ -332,7 +330,7 @@ absl::Status WriteResultsFile(const std::string& filepath,
|
||||
// Write user-supplied description
|
||||
if (!description.empty()) std::fprintf(fout, "%s,", description.c_str());
|
||||
// Write chunking params.
|
||||
std::fprintf(fout, "%s,0x%zx,0x%zx,", GearTable(), cfg.mask_s, cfg.mask_l);
|
||||
std::fprintf(fout, "%s,0x%zx,", GearTable(), cfg.threshold);
|
||||
std::fprintf(fout, "%zu,%zu,%zu,", cfg.min_chunk_size >> 10,
|
||||
cfg.avg_chunk_size >> 10, cfg.max_chunk_size >> 10);
|
||||
// Write speed, files, chunks.
|
||||
|
||||
Reference in New Issue
Block a user