mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 14:35:37 +02:00
This CL changes the chunking algorithm from "normalized chunking" to simple "regression chunking", and changes the has criteria from 'hash&mask' to 'hash<=threshold'. These are all ideas taken from testing and analysis done at https://github.com/dbaarda/rollsum-chunking/blob/master/RESULTS.rst Regression chunking was introduced in https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf The algorithm uses an arbitrary number of regressions using power-of-2 regression target lengths. This means we can use a simple bitmask for the regression hash criteria. Regression chunking yields high deduplication rates even for lower max chunk sizes, so that the cdc_stream max chunk can be reduced to 512K from 1024K. This fixes potential latency spikes from large chunks.
432 lines
12 KiB
C++
432 lines
12 KiB
C++
// Copyright 2022 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "cdc_indexer/indexer.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstdio>
|
|
#include <fstream>
|
|
#include <mutex>
|
|
#include <queue>
|
|
#include <thread>
|
|
|
|
#include "absl/functional/bind_front.h"
|
|
#include "absl/strings/str_format.h"
|
|
#include "absl/time/clock.h"
|
|
#include "blake3.h"
|
|
#include "common/dir_iter.h"
|
|
#include "common/errno_mapping.h"
|
|
#include "common/path.h"
|
|
#include "common/status_macros.h"
|
|
|
|
namespace cdc_ft {
|
|
|
|
struct IndexerJob {
|
|
std::string filepath;
|
|
};
|
|
|
|
class Indexer::Impl {
|
|
public:
|
|
Impl(const IndexerConfig& cfg, const std::vector<std::string>& inputs);
|
|
const IndexerConfig& Config() const;
|
|
|
|
// Calls the given `progress` function periodically until `SetDone(true)` is
|
|
// called.
|
|
void TriggerProgress(ProgressFn fn);
|
|
bool GetNextJob(IndexerJob* job);
|
|
|
|
bool HasError() const;
|
|
absl::Status Error() const;
|
|
void SetError(absl::Status err);
|
|
|
|
void SetDone(bool done);
|
|
|
|
inline const IndexerConfig& Cfg() const { return cfg_; }
|
|
inline Indexer::OpStats Stats() const;
|
|
inline Indexer::ChunkSizeMap ChunkSizes() const;
|
|
void AddChunk(const uint8_t* data, size_t len);
|
|
void AddFile();
|
|
|
|
private:
|
|
friend class Indexer;
|
|
// Calculates a hash value for the given data.
|
|
inline hash_t Hash(const uint8_t* data, size_t len);
|
|
inline hash_t HashBlake3(const uint8_t* data, size_t len);
|
|
inline hash_t HashXxhash(const uint8_t* data, size_t len);
|
|
// Finds the smallest power of 2 such that the result is <= size. If size is >
|
|
// 2^31, then UINT64_MAX is returned.
|
|
inline size_t SizeBucket(size_t size) const;
|
|
|
|
IndexerConfig cfg_;
|
|
bool done_;
|
|
// The following members are all guarded by jobs_mutex_.
|
|
std::queue<std::string> inputs_;
|
|
DirectoryIterator dir_iter_;
|
|
std::mutex jobs_mutex_;
|
|
// Guarded by chunks_mutex_
|
|
Indexer::ChunkMap chunks_;
|
|
std::mutex chunks_mutex_;
|
|
// Guarded by stats_mutex_.
|
|
Indexer::OpStats stats_;
|
|
mutable std::mutex stats_mutex_;
|
|
// Guarded by chunk_sizes_mutex_;
|
|
Indexer::ChunkSizeMap chunk_sizes_;
|
|
mutable std::mutex chunk_sizes_mutex_;
|
|
// Guarded by result_mutex_
|
|
absl::Status result_;
|
|
mutable std::mutex result_mutex_;
|
|
};
|
|
|
|
class Indexer::Worker {
|
|
public:
|
|
Worker(Impl* impl);
|
|
void Run();
|
|
|
|
private:
|
|
absl::Status IndexFile(const std::string& filepath);
|
|
|
|
Impl* impl_;
|
|
absl::Cord buf_;
|
|
const fastcdc::Config cdc_cfg_;
|
|
};
|
|
|
|
// This class holds a `Worker` object and the associated `std::thread` object
|
|
// that executes it.
|
|
class Indexer::WorkerThread {
|
|
public:
|
|
WorkerThread() : worker(nullptr), thrd(nullptr) {}
|
|
~WorkerThread() {
|
|
if (thrd) {
|
|
if (thrd->joinable()) thrd->join();
|
|
delete thrd;
|
|
}
|
|
if (worker) {
|
|
delete worker;
|
|
}
|
|
}
|
|
Worker* worker;
|
|
std::thread* thrd;
|
|
};
|
|
|
|
Indexer::Impl::Impl(const IndexerConfig& cfg,
|
|
const std::vector<std::string>& inputs)
|
|
: cfg_(cfg), done_(false) {
|
|
// Perform some sanity checks on the config.
|
|
if (cfg_.num_threads == 0)
|
|
cfg_.num_threads = std::thread::hardware_concurrency();
|
|
if (cfg_.read_block_size == 0) cfg_.read_block_size = 4 << 10;
|
|
if (cfg_.avg_chunk_size == 0) cfg_.avg_chunk_size = 256 << 10;
|
|
if (cfg_.min_chunk_size == 0 || cfg_.min_chunk_size > cfg_.avg_chunk_size)
|
|
cfg_.min_chunk_size = cfg_.avg_chunk_size >> 1;
|
|
if (cfg_.max_chunk_size == 0 || cfg_.max_chunk_size < cfg_.avg_chunk_size)
|
|
cfg_.max_chunk_size = cfg_.avg_chunk_size << 1;
|
|
if (cfg_.max_chunk_size_step == 0)
|
|
cfg_.max_chunk_size_step =
|
|
cfg_.min_chunk_size > 0 ? cfg_.min_chunk_size : 128u;
|
|
// Populate the CDC bitmasks which the Chunker creates. Only done here for
|
|
// being able to write it to the output, setting them in the IndexerConfig has
|
|
// no effect.
|
|
fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
|
|
cfg_.max_chunk_size);
|
|
Indexer::Chunker chunker(ccfg, nullptr);
|
|
cfg_.threshold = chunker.Threshold();
|
|
// Collect inputs.
|
|
for (auto it = inputs.begin(); it != inputs.end(); ++it) {
|
|
inputs_.push(*it);
|
|
}
|
|
}
|
|
|
|
const IndexerConfig& Indexer::Impl::Config() const { return cfg_; }
|
|
|
|
// Executes the `progress` function in a loop, approximately every 200ms. Call
|
|
// `SetDone(true)` to stop this function.
|
|
void Indexer::Impl::TriggerProgress(Indexer::ProgressFn fn) {
|
|
if (!fn) return;
|
|
const int64_t interval = 200;
|
|
absl::Time started = absl::Now();
|
|
// Keeping going until we're done or an error occured.
|
|
while (!done_ && !HasError()) {
|
|
absl::Time loop_started = absl::Now();
|
|
stats_mutex_.lock();
|
|
stats_.elapsed = loop_started - started;
|
|
stats_mutex_.unlock();
|
|
|
|
fn(Stats());
|
|
// Aim for one update every interval.
|
|
auto loop_elapsed = absl::ToInt64Milliseconds(loop_started - absl::Now());
|
|
if (loop_elapsed < interval)
|
|
std::this_thread::sleep_for(
|
|
std::chrono::milliseconds(interval - loop_elapsed));
|
|
}
|
|
}
|
|
|
|
bool Indexer::Impl::GetNextJob(IndexerJob* job) {
|
|
// Stop if an error occured.
|
|
if (HasError()) return false;
|
|
const std::lock_guard<std::mutex> lock(jobs_mutex_);
|
|
|
|
DirectoryEntry dent;
|
|
while (!dent.Valid()) {
|
|
// Open the next directory, if needed.
|
|
if (!dir_iter_.Valid()) {
|
|
if (inputs_.empty()) {
|
|
// We are done.
|
|
return false;
|
|
} else {
|
|
std::string input = inputs_.front();
|
|
std::string uinput = path::ToUnix(input);
|
|
inputs_.pop();
|
|
// Return files as jobs.
|
|
if (path::FileExists(uinput)) {
|
|
job->filepath = uinput;
|
|
return true;
|
|
}
|
|
// Otherwise read the directory.
|
|
if (!dir_iter_.Open(input, DirectorySearchFlags::kFiles)) {
|
|
// Ignore permission errors.
|
|
if (absl::IsPermissionDenied(dir_iter_.Status())) {
|
|
continue;
|
|
}
|
|
if (!dir_iter_.Status().ok()) {
|
|
SetError(dir_iter_.Status());
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
if (dir_iter_.NextEntry(&dent)) {
|
|
break;
|
|
} else if (!dir_iter_.Status().ok()) {
|
|
SetError(dir_iter_.Status());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
path::Join(&job->filepath, dir_iter_.Path(), dent.RelPathName());
|
|
return true;
|
|
}
|
|
|
|
void Indexer::Impl::SetDone(bool done) { done_ = done; }
|
|
|
|
inline size_t Indexer::Impl::SizeBucket(size_t size) const {
|
|
size_t bucket = 1024;
|
|
// Go in steps of powers of two until min. chunk size is reached.
|
|
while (bucket < size && bucket < cfg_.min_chunk_size && bucket < (1llu << 63))
|
|
bucket <<= 1;
|
|
// Go in steps of the configurable step size afterwards.
|
|
while (bucket < size && bucket < (1llu << 63))
|
|
bucket += cfg_.max_chunk_size_step;
|
|
return bucket >= size ? bucket : UINT64_MAX;
|
|
}
|
|
|
|
inline Indexer::OpStats Indexer::Impl::Stats() const {
|
|
const std::lock_guard<std::mutex> lock(stats_mutex_);
|
|
return stats_;
|
|
}
|
|
|
|
inline Indexer::ChunkSizeMap Indexer::Impl::ChunkSizes() const {
|
|
const std::lock_guard<std::mutex> lock(chunk_sizes_mutex_);
|
|
return chunk_sizes_;
|
|
}
|
|
|
|
Indexer::hash_t Indexer::Impl::HashBlake3(const uint8_t* data, size_t len) {
|
|
blake3_hasher state;
|
|
uint8_t out[BLAKE3_OUT_LEN];
|
|
blake3_hasher_init(&state);
|
|
blake3_hasher_update(&state, data, len);
|
|
blake3_hasher_finalize(&state, out, BLAKE3_OUT_LEN);
|
|
return Indexer::hash_t(reinterpret_cast<const char*>(out), BLAKE3_OUT_LEN);
|
|
}
|
|
|
|
Indexer::hash_t Indexer::Impl::Hash(const uint8_t* data, size_t len) {
|
|
switch (cfg_.hash_type) {
|
|
case IndexerConfig::HashType::kNull:
|
|
return hash_t();
|
|
case IndexerConfig::HashType::kBlake3:
|
|
return HashBlake3(data, len);
|
|
case IndexerConfig::HashType::kUndefined:
|
|
break;
|
|
}
|
|
std::cerr << "Unknown hash type" << std::endl;
|
|
return std::string();
|
|
}
|
|
|
|
void Indexer::Impl::AddChunk(const uint8_t* data, size_t len) {
|
|
std::string hash = Hash(data, len);
|
|
// See if the chunk already exists, insert it if not.
|
|
chunks_mutex_.lock();
|
|
bool new_chunk = chunks_.find(hash) == chunks_.end();
|
|
if (new_chunk) {
|
|
chunks_.emplace(hash, Chunk{hash, len});
|
|
}
|
|
chunks_mutex_.unlock();
|
|
|
|
// Update the stats.
|
|
stats_mutex_.lock();
|
|
stats_.total_bytes += len;
|
|
++stats_.total_chunks;
|
|
if (new_chunk) {
|
|
stats_.unique_bytes += len;
|
|
++stats_.unique_chunks;
|
|
}
|
|
stats_mutex_.unlock();
|
|
|
|
// Update chunk sizes distribution.
|
|
if (new_chunk) {
|
|
size_t bucket = SizeBucket(len);
|
|
chunk_sizes_mutex_.lock();
|
|
chunk_sizes_[bucket]++;
|
|
chunk_sizes_mutex_.unlock();
|
|
}
|
|
}
|
|
|
|
void Indexer::Impl::AddFile() {
|
|
const std::lock_guard<std::mutex> lock(stats_mutex_);
|
|
++stats_.total_files;
|
|
}
|
|
|
|
bool Indexer::Impl::HasError() const {
|
|
const std::lock_guard<std::mutex> lock(result_mutex_);
|
|
return !result_.ok();
|
|
}
|
|
|
|
absl::Status Indexer::Impl::Error() const {
|
|
const std::lock_guard<std::mutex> lock(result_mutex_);
|
|
return result_;
|
|
}
|
|
|
|
void Indexer::Impl::SetError(absl::Status err) {
|
|
// Ignore attempts to set a non-error.
|
|
if (err.ok()) return;
|
|
const std::lock_guard<std::mutex> lock(result_mutex_);
|
|
// Don't overwrite any previous error.
|
|
if (result_.ok()) result_ = err;
|
|
}
|
|
|
|
Indexer::Worker::Worker(Indexer::Impl* impl)
|
|
: impl_(impl),
|
|
cdc_cfg_(impl_->Cfg().min_chunk_size, impl_->Cfg().avg_chunk_size,
|
|
impl_->Cfg().max_chunk_size) {}
|
|
|
|
void Indexer::Worker::Run() {
|
|
IndexerJob job;
|
|
while (impl_->GetNextJob(&job)) {
|
|
absl::Status err = IndexFile(job.filepath);
|
|
if (!err.ok()) {
|
|
impl_->SetError(err);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
absl::Status Indexer::Worker::IndexFile(const std::string& filepath) {
|
|
std::FILE* fin = std::fopen(filepath.c_str(), "rb");
|
|
if (!fin) {
|
|
return ErrnoToCanonicalStatus(errno, "failed to open file '%s'", filepath);
|
|
}
|
|
path::FileCloser closer(fin);
|
|
std::fseek(fin, 0, SEEK_SET);
|
|
|
|
auto hdlr = absl::bind_front(&Indexer::Impl::AddChunk, impl_);
|
|
Indexer::Chunker chunker(cdc_cfg_, hdlr);
|
|
|
|
std::vector<uint8_t> buf(impl_->Cfg().read_block_size, 0);
|
|
int err = 0;
|
|
while (!std::feof(fin)) {
|
|
size_t cnt = std::fread(buf.data(), sizeof(uint8_t), buf.size(), fin);
|
|
err = std::ferror(fin);
|
|
if (err) {
|
|
return ErrnoToCanonicalStatus(err, "failed to read from file '%s'",
|
|
filepath);
|
|
}
|
|
if (cnt) {
|
|
chunker.Process(buf.data(), cnt);
|
|
}
|
|
}
|
|
chunker.Finalize();
|
|
impl_->AddFile();
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
IndexerConfig::IndexerConfig()
|
|
: read_block_size(32 << 10),
|
|
min_chunk_size(0),
|
|
avg_chunk_size(0),
|
|
max_chunk_size(0),
|
|
max_chunk_size_step(0),
|
|
num_threads(0),
|
|
threshold(0) {}
|
|
|
|
Indexer::Indexer() : impl_(nullptr) {}
|
|
|
|
Indexer::~Indexer() {
|
|
if (impl_) delete impl_;
|
|
}
|
|
|
|
absl::Status Indexer::Run(const IndexerConfig& cfg,
|
|
const std::vector<std::string>& inputs,
|
|
Indexer::ProgressFn fn) {
|
|
if (impl_) delete impl_;
|
|
impl_ = new Impl(cfg, inputs);
|
|
|
|
// Start the file creation workers.
|
|
std::vector<WorkerThread> workers(impl_->Config().num_threads);
|
|
for (auto it = workers.begin(); it != workers.end(); ++it) {
|
|
auto worker = new Worker(impl_);
|
|
it->worker = worker;
|
|
it->thrd = new std::thread(&Worker::Run, worker);
|
|
}
|
|
// Start the progress function worker.
|
|
std::thread prog(&Impl::TriggerProgress, impl_, fn);
|
|
|
|
// Wait for the workers to finish.
|
|
for (auto it = workers.begin(); it != workers.end(); ++it) {
|
|
it->thrd->join();
|
|
}
|
|
// Wait for the progress worker to finish.
|
|
impl_->SetDone(true);
|
|
prog.join();
|
|
|
|
return Error();
|
|
}
|
|
|
|
absl::Status Indexer::Error() const {
|
|
return impl_ ? impl_->Error() : absl::Status();
|
|
}
|
|
|
|
IndexerConfig Indexer::Config() const {
|
|
if (impl_) return impl_->Cfg();
|
|
return IndexerConfig();
|
|
}
|
|
|
|
Indexer::OpStats Indexer::Stats() const {
|
|
if (impl_) return impl_->Stats();
|
|
return Stats();
|
|
}
|
|
|
|
Indexer::ChunkSizeMap Indexer::ChunkSizes() const {
|
|
if (impl_) return impl_->ChunkSizes();
|
|
return Indexer::ChunkSizeMap();
|
|
}
|
|
|
|
inline Indexer::OpStats::OpStats()
|
|
: total_files(0),
|
|
total_chunks(0),
|
|
unique_chunks(0),
|
|
total_bytes(0),
|
|
unique_bytes(0) {}
|
|
|
|
}; // namespace cdc_ft
|