Mirror of https://github.com/nestriness/cdc-file-transfer.git (synced 2026-01-30 10:35:37 +02:00)
Releasing the former Stadia file transfer tools

These tools enable fast, efficient synchronization of large directory trees from a Windows workstation to a Linux target machine. The cdc_rsync* binaries support efficient copying of files by using content-defined chunking (CDC) to identify chunks within files that can be reused. asset_stream_manager + cdc_fuse_fs support efficient streaming of a local directory to a remote virtual file system based on FUSE; it likewise employs CDC to identify and reuse unchanged data chunks.
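For background, the core CDC idea can be sketched in a few lines of C++. This is an illustrative gear-based chunker only (a hypothetical gear table and a single mask, with no min/max chunk-size enforcement), not the exact FastCDC variant these tools ship:

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Returns the chunk boundaries ("cut points") for a buffer. The rolling gear
// hash depends only on recently seen bytes, so an edit early in a file shifts
// nearby boundaries but leaves later chunks byte-identical. Identical chunks
// hash to identical digests and can therefore be reused instead of re-sent.
std::vector<size_t> FindCutPoints(const uint8_t* data, size_t len,
                                  const uint64_t gear[256], uint64_t mask) {
  std::vector<size_t> cuts;
  uint64_t hash = 0;
  for (size_t i = 0; i < len; ++i) {
    hash = (hash << 1) + gear[data[i]];  // Roll one byte into the hash.
    if ((hash & mask) == 0) {            // Content-defined boundary found.
      cuts.push_back(i + 1);
      hash = 0;
    }
  }
  if (cuts.empty() || cuts.back() != len) cuts.push_back(len);
  return cuts;
}
```

The chunks between consecutive cut points are then hashed (BLAKE3 in the indexer below) and looked up in a chunk store; matching digests mean the data can be reused rather than transferred again.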
cdc_indexer/BUILD (new file, 35 lines)
@@ -0,0 +1,35 @@
package(default_visibility = ["//visibility:public"])

cc_binary(
    name = "cdc_indexer",
    srcs = ["main.cc"],
    deps = [
        ":indexer_lib",
        "//absl_helper:jedec_size_flag",
        "//common:path",
        "@com_google_absl//absl/flags:config",
        "@com_google_absl//absl/flags:flag",
        "@com_google_absl//absl/flags:parse",
        "@com_google_absl//absl/flags:usage",
        "@com_google_absl//absl/random",
        "@com_google_absl//absl/time",
    ],
)

cc_library(
    name = "indexer_lib",
    srcs = ["indexer.cc"],
    hdrs = ["indexer.h"],
    deps = [
        "//common:dir_iter",
        "//common:path",
        "//common:status_macros",
        "//fastcdc",
        "@com_github_blake3//:blake3",
        "@com_google_absl//absl/functional:bind_front",
        "@com_google_absl//absl/random",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/time",
    ],
)
cdc_indexer/README.md (new file, 72 lines)
@@ -0,0 +1,72 @@
# CDC Indexer

This directory contains a CDC indexer based on our implementation of
[FastCDC](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf).

Run the sample with Bazel:

```
bazel run -c opt //cdc_indexer -- --inputs '/path/to/files'
```

The CDC algorithm can be tweaked with a few compile-time constants for
experimentation. See the file `indexer.h` for preprocessor macros that can be
enabled, for example:

```
bazel build -c opt --copt=-DCDC_GEAR_TABLE=1 //cdc_indexer
```
At the end of the operation, the indexer outputs a summary of the results such
as the following:

```
00:02 7.44 GB in 2 files processed at 3.1 GB/s, 50% deduplication
Operation succeeded.

Chunk size (min/avg/max): 128 KB / 256 KB / 1024 KB | Threads: 12
gear_table: 64 bit | mask_s: 0x49249249249249 | mask_l: 0x1249249249
           Duration:           00:03
        Total files:               2
       Total chunks:           39203
      Unique chunks:           20692
         Total data:         9.25 GB
        Unique data:         4.88 GB
         Throughput:       3.07 GB/s
    Avg. chunk size:          247 KB
      Deduplication:           47.2%

 160 KB #########                                       1419 ( 7%)
 192 KB ########                                        1268 ( 6%)
 224 KB ###################                             2996 (14%)
 256 KB ########################################        6353 (31%)
 288 KB ######################                          3466 (17%)
 320 KB ##########################                      4102 (20%)
 352 KB ######                                           946 ( 5%)
 384 KB                                                   75 ( 0%)
 416 KB                                                   27 ( 0%)
 448 KB                                                    7 ( 0%)
 480 KB                                                    5 ( 0%)
 512 KB                                                    1 ( 0%)
 544 KB                                                    4 ( 0%)
 576 KB                                                    2 ( 0%)
 608 KB                                                    3 ( 0%)
 640 KB                                                    3 ( 0%)
 672 KB                                                    3 ( 0%)
 704 KB                                                    2 ( 0%)
 736 KB                                                    0 ( 0%)
 768 KB                                                    0 ( 0%)
 800 KB                                                    1 ( 0%)
 832 KB                                                    0 ( 0%)
 864 KB                                                    0 ( 0%)
 896 KB                                                    0 ( 0%)
 928 KB                                                    0 ( 0%)
 960 KB                                                    0 ( 0%)
 992 KB                                                    0 ( 0%)
1024 KB                                                    9 ( 0%)
```
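The deduplication percentage in this summary is derived as (total data − unique data) / total data; with the numbers above, (9.25 GB − 4.88 GB) / 9.25 GB ≈ 47.2%. Likewise, the reported average chunk size is unique data divided by unique chunks: 4.88 GB / 20692 ≈ 247 KB.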
For testing multiple combinations and comparing the results, the indexer also
features a flag `--results_file="results.csv"` which appends the raw data to the
given file in CSV format. Combine this flag with `--description` to label each
experiment with additional columns.
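For example, two runs with different average chunk sizes could be appended to the same file like this (the paths and labels are placeholders):

```
bazel run -c opt //cdc_indexer -- --inputs '/path/to/files' --avg_chunk_size=128K \
    --results_file="results.csv" --description="128K avg,run 1"
bazel run -c opt //cdc_indexer -- --inputs '/path/to/files' --avg_chunk_size=256K \
    --results_file="results.csv" --description="256K avg,run 1"
```

A comma inside the description occupies two CSV columns; the header row written to a fresh file accounts for this.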
cdc_indexer/indexer.cc (new file, 434 lines)
@@ -0,0 +1,434 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cdc_indexer/indexer.h"

#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

#include "absl/functional/bind_front.h"
#include "absl/strings/cord.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "blake3.h"
#include "common/dir_iter.h"
#include "common/errno_mapping.h"
#include "common/path.h"
#include "common/status_macros.h"

namespace cdc_ft {

struct IndexerJob {
  std::string filepath;
};

class Indexer::Impl {
 public:
  Impl(const IndexerConfig& cfg, const std::vector<std::string>& inputs);
  const IndexerConfig& Config() const;

  // Calls the given `progress` function periodically until `SetDone(true)` is
  // called.
  void TriggerProgress(ProgressFn fn);
  bool GetNextJob(IndexerJob* job);

  bool HasError() const;
  absl::Status Error() const;
  void SetError(absl::Status err);

  void SetDone(bool done);

  inline const IndexerConfig& Cfg() const { return cfg_; }
  inline Indexer::OpStats Stats() const;
  inline Indexer::ChunkSizeMap ChunkSizes() const;
  void AddChunk(const uint8_t* data, size_t len);
  void AddFile();

 private:
  friend class Indexer;
  // Calculates a hash value for the given data.
  inline hash_t Hash(const uint8_t* data, size_t len);
  inline hash_t HashBlake3(const uint8_t* data, size_t len);
  inline hash_t HashXxhash(const uint8_t* data, size_t len);
  // Finds the smallest bucket such that size <= bucket. Buckets grow in
  // powers of 2 up to the min. chunk size, then in steps of
  // max_chunk_size_step. Returns UINT64_MAX if no such bucket exists.
  inline size_t SizeBucket(size_t size) const;

  IndexerConfig cfg_;
  bool done_;
  // The following members are all guarded by jobs_mutex_.
  std::queue<std::string> inputs_;
  DirectoryIterator dir_iter_;
  std::mutex jobs_mutex_;
  // Guarded by chunks_mutex_.
  Indexer::ChunkMap chunks_;
  std::mutex chunks_mutex_;
  // Guarded by stats_mutex_.
  Indexer::OpStats stats_;
  mutable std::mutex stats_mutex_;
  // Guarded by chunk_sizes_mutex_.
  Indexer::ChunkSizeMap chunk_sizes_;
  mutable std::mutex chunk_sizes_mutex_;
  // Guarded by result_mutex_.
  absl::Status result_;
  mutable std::mutex result_mutex_;
};

class Indexer::Worker {
 public:
  Worker(Impl* impl);
  void Run();

 private:
  absl::Status IndexFile(const std::string& filepath);

  Impl* impl_;
  absl::Cord buf_;
  const fastcdc::Config cdc_cfg_;
};

// This class holds a `Worker` object and the associated `std::thread` object
// that executes it.
class Indexer::WorkerThread {
 public:
  WorkerThread() : worker(nullptr), thrd(nullptr) {}
  ~WorkerThread() {
    if (thrd) {
      if (thrd->joinable()) thrd->join();
      delete thrd;
    }
    if (worker) {
      delete worker;
    }
  }
  Worker* worker;
  std::thread* thrd;
};

Indexer::Impl::Impl(const IndexerConfig& cfg,
                    const std::vector<std::string>& inputs)
    : cfg_(cfg), done_(false) {
  // Perform some sanity checks on the config.
  if (cfg_.num_threads == 0)
    cfg_.num_threads = std::thread::hardware_concurrency();
  if (cfg_.read_block_size == 0) cfg_.read_block_size = 4 << 10;
  if (cfg_.avg_chunk_size == 0) cfg_.avg_chunk_size = 256 << 10;
  if (cfg_.min_chunk_size == 0 || cfg_.min_chunk_size > cfg_.avg_chunk_size)
    cfg_.min_chunk_size = cfg_.avg_chunk_size >> 1;
  if (cfg_.max_chunk_size == 0 || cfg_.max_chunk_size < cfg_.avg_chunk_size)
    cfg_.max_chunk_size = cfg_.avg_chunk_size << 1;
  if (cfg_.max_chunk_size_step == 0)
    cfg_.max_chunk_size_step =
        cfg_.min_chunk_size > 0 ? cfg_.min_chunk_size : 128u;
  // Populate the CDC bitmasks that the Chunker computes. This is only done so
  // that they can be written to the output; setting them in the IndexerConfig
  // has no effect.
  fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
                       cfg_.max_chunk_size);
  Indexer::Chunker chunker(ccfg, nullptr);
  cfg_.mask_s = chunker.Stage(0).mask;
  cfg_.mask_l = chunker.Stage(chunker.StagesCount() - 1).mask;
  // Collect inputs.
  for (auto it = inputs.begin(); it != inputs.end(); ++it) {
    inputs_.push(*it);
  }
}

const IndexerConfig& Indexer::Impl::Config() const { return cfg_; }

// Executes the `progress` function in a loop, approximately every 200 ms.
// Call `SetDone(true)` to stop this function.
void Indexer::Impl::TriggerProgress(Indexer::ProgressFn fn) {
  if (!fn) return;
  const int64_t interval = 200;
  absl::Time started = absl::Now();
  // Keep going until we're done or an error occurred.
  while (!done_ && !HasError()) {
    absl::Time loop_started = absl::Now();
    stats_mutex_.lock();
    stats_.elapsed = loop_started - started;
    stats_mutex_.unlock();

    fn(Stats());
    // Aim for one update every interval.
    auto loop_elapsed = absl::ToInt64Milliseconds(absl::Now() - loop_started);
    if (loop_elapsed < interval)
      std::this_thread::sleep_for(
          std::chrono::milliseconds(interval - loop_elapsed));
  }
}

bool Indexer::Impl::GetNextJob(IndexerJob* job) {
  // Stop if an error occurred.
  if (HasError()) return false;
  const std::lock_guard<std::mutex> lock(jobs_mutex_);

  DirectoryEntry dent;
  while (!dent.Valid()) {
    // Open the next directory, if needed.
    if (!dir_iter_.Valid()) {
      if (inputs_.empty()) {
        // We are done.
        return false;
      } else {
        std::string input = inputs_.front();
        std::string uinput = path::ToUnix(input);
        inputs_.pop();
        // Return files as jobs.
        if (path::FileExists(uinput)) {
          job->filepath = uinput;
          return true;
        }
        // Otherwise read the directory.
        if (!dir_iter_.Open(input, DirectorySearchFlags::kFiles)) {
          // Ignore permission errors.
          if (absl::IsPermissionDenied(dir_iter_.Status())) {
            continue;
          }
          if (!dir_iter_.Status().ok()) {
            SetError(dir_iter_.Status());
          }
          return false;
        }
      }
    }
    if (dir_iter_.NextEntry(&dent)) {
      break;
    } else if (!dir_iter_.Status().ok()) {
      SetError(dir_iter_.Status());
      return false;
    }
  }

  path::Join(&job->filepath, dir_iter_.Path(), dent.RelPathName());
  return true;
}

void Indexer::Impl::SetDone(bool done) { done_ = done; }

inline size_t Indexer::Impl::SizeBucket(size_t size) const {
  size_t bucket = 1024;
  // Go in steps of powers of two until the min. chunk size is reached.
  while (bucket < size && bucket < cfg_.min_chunk_size && bucket < (1llu << 63))
    bucket <<= 1;
  // Go in steps of the configurable step size afterwards.
  while (bucket < size && bucket < (1llu << 63))
    bucket += cfg_.max_chunk_size_step;
  return bucket >= size ? bucket : UINT64_MAX;
}

inline Indexer::OpStats Indexer::Impl::Stats() const {
  const std::lock_guard<std::mutex> lock(stats_mutex_);
  return stats_;
}

inline Indexer::ChunkSizeMap Indexer::Impl::ChunkSizes() const {
  const std::lock_guard<std::mutex> lock(chunk_sizes_mutex_);
  return chunk_sizes_;
}

Indexer::hash_t Indexer::Impl::HashBlake3(const uint8_t* data, size_t len) {
  blake3_hasher state;
  uint8_t out[BLAKE3_OUT_LEN];
  blake3_hasher_init(&state);
  blake3_hasher_update(&state, data, len);
  blake3_hasher_finalize(&state, out, BLAKE3_OUT_LEN);
  return Indexer::hash_t(reinterpret_cast<const char*>(out), BLAKE3_OUT_LEN);
}

Indexer::hash_t Indexer::Impl::Hash(const uint8_t* data, size_t len) {
  switch (cfg_.hash_type) {
    case IndexerConfig::HashType::kNull:
      return hash_t();
    case IndexerConfig::HashType::kBlake3:
      return HashBlake3(data, len);
    case IndexerConfig::HashType::kUndefined:
      break;
  }
  std::cerr << "Unknown hash type" << std::endl;
  return std::string();
}

void Indexer::Impl::AddChunk(const uint8_t* data, size_t len) {
  std::string hash = Hash(data, len);
  // See if the chunk already exists, insert it if not.
  chunks_mutex_.lock();
  bool new_chunk = chunks_.find(hash) == chunks_.end();
  if (new_chunk) {
    chunks_.emplace(hash, Chunk{hash, len});
  }
  chunks_mutex_.unlock();

  // Update the stats.
  stats_mutex_.lock();
  stats_.total_bytes += len;
  ++stats_.total_chunks;
  if (new_chunk) {
    stats_.unique_bytes += len;
    ++stats_.unique_chunks;
  }
  stats_mutex_.unlock();

  // Update the chunk size distribution.
  if (new_chunk) {
    size_t bucket = SizeBucket(len);
    chunk_sizes_mutex_.lock();
    chunk_sizes_[bucket]++;
    chunk_sizes_mutex_.unlock();
  }
}

void Indexer::Impl::AddFile() {
  const std::lock_guard<std::mutex> lock(stats_mutex_);
  ++stats_.total_files;
}

bool Indexer::Impl::HasError() const {
  const std::lock_guard<std::mutex> lock(result_mutex_);
  return !result_.ok();
}

absl::Status Indexer::Impl::Error() const {
  const std::lock_guard<std::mutex> lock(result_mutex_);
  return result_;
}

void Indexer::Impl::SetError(absl::Status err) {
  // Ignore attempts to set a non-error.
  if (err.ok()) return;
  const std::lock_guard<std::mutex> lock(result_mutex_);
  // Don't overwrite any previous error.
  if (result_.ok()) result_ = err;
}

Indexer::Worker::Worker(Indexer::Impl* impl)
    : impl_(impl),
      cdc_cfg_(impl_->Cfg().min_chunk_size, impl_->Cfg().avg_chunk_size,
               impl_->Cfg().max_chunk_size) {}

void Indexer::Worker::Run() {
  IndexerJob job;
  while (impl_->GetNextJob(&job)) {
    absl::Status err = IndexFile(job.filepath);
    if (!err.ok()) {
      impl_->SetError(err);
      return;
    }
  }
}

absl::Status Indexer::Worker::IndexFile(const std::string& filepath) {
  std::FILE* fin = std::fopen(filepath.c_str(), "rb");
  if (!fin) {
    return ErrnoToCanonicalStatus(
        errno, absl::StrFormat("failed to open file '%s'", filepath));
  }
  path::FileCloser closer(fin);
  std::fseek(fin, 0, SEEK_SET);

  auto hdlr = absl::bind_front(&Indexer::Impl::AddChunk, impl_);
  Indexer::Chunker chunker(cdc_cfg_, hdlr);

  std::vector<uint8_t> buf(impl_->Cfg().read_block_size, 0);
  int err = 0;
  while (!std::feof(fin)) {
    size_t cnt = std::fread(buf.data(), sizeof(uint8_t), buf.size(), fin);
    err = std::ferror(fin);
    if (err) {
      return ErrnoToCanonicalStatus(
          err, absl::StrFormat("failed to read from file '%s'", filepath));
    }
    if (cnt) {
      chunker.Process(buf.data(), cnt);
    }
  }
  chunker.Finalize();
  impl_->AddFile();

  return absl::OkStatus();
}

IndexerConfig::IndexerConfig()
    : read_block_size(32 << 10),
      min_chunk_size(0),
      avg_chunk_size(0),
      max_chunk_size(0),
      max_chunk_size_step(0),
      num_threads(0),
      hash_type(HashType::kUndefined),
      mask_s(0),
      mask_l(0) {}

Indexer::Indexer() : impl_(nullptr) {}

Indexer::~Indexer() {
  if (impl_) delete impl_;
}

absl::Status Indexer::Run(const IndexerConfig& cfg,
                          const std::vector<std::string>& inputs,
                          Indexer::ProgressFn fn) {
  if (impl_) delete impl_;
  impl_ = new Impl(cfg, inputs);

  // Start the indexing workers.
  std::vector<WorkerThread> workers(impl_->Config().num_threads);
  for (auto it = workers.begin(); it != workers.end(); ++it) {
    auto worker = new Worker(impl_);
    it->worker = worker;
    it->thrd = new std::thread(&Worker::Run, worker);
  }
  // Start the progress function worker.
  std::thread prog(&Impl::TriggerProgress, impl_, fn);

  // Wait for the workers to finish.
  for (auto it = workers.begin(); it != workers.end(); ++it) {
    it->thrd->join();
  }
  // Wait for the progress worker to finish.
  impl_->SetDone(true);
  prog.join();

  return Error();
}

absl::Status Indexer::Error() const {
  return impl_ ? impl_->Error() : absl::Status();
}

IndexerConfig Indexer::Config() const {
  if (impl_) return impl_->Cfg();
  return IndexerConfig();
}

Indexer::OpStats Indexer::Stats() const {
  if (impl_) return impl_->Stats();
  return OpStats();
}

Indexer::ChunkSizeMap Indexer::ChunkSizes() const {
  if (impl_) return impl_->ChunkSizes();
  return Indexer::ChunkSizeMap();
}

inline Indexer::OpStats::OpStats()
    : total_files(0),
      total_chunks(0),
      unique_chunks(0),
      total_bytes(0),
      unique_bytes(0) {}

}  // namespace cdc_ft
cdc_indexer/indexer.h (new file, 145 lines)
@@ -0,0 +1,145 @@
/*
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef CDC_INDEXER_INDEXER_H_
#define CDC_INDEXER_INDEXER_H_

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "absl/status/status.h"
#include "absl/time/time.h"
#include "fastcdc/fastcdc.h"

// Compile-time parameters for the FastCDC algorithm.
#define CDC_GEAR_32BIT 1
#define CDC_GEAR_64BIT 2
#ifndef CDC_GEAR_TABLE
#define CDC_GEAR_TABLE CDC_GEAR_64BIT
#endif
#ifndef CDC_MASK_STAGES
#define CDC_MASK_STAGES 7
#endif
#ifndef CDC_MASK_BIT_LSHIFT_AMOUNT
#define CDC_MASK_BIT_LSHIFT_AMOUNT 3
#endif

namespace cdc_ft {

struct IndexerConfig {
  // The hash function to use.
  enum class HashType {
    kUndefined = 0,
    // No hashing performed, always returns an empty string.
    kNull,
    // Use BLAKE3 (cryptographic).
    kBlake3,
  };
  IndexerConfig();
  // Block size used for reading file contents from disk, defaults to 32K.
  size_t read_block_size;
  // The minimum allowed chunk size, defaults to avg_chunk_size/2.
  size_t min_chunk_size;
  // The target average chunk size.
  size_t avg_chunk_size;
  // The maximum allowed chunk size, defaults to 2*avg_chunk_size.
  size_t max_chunk_size;
  // Max. step size for bucketing the chunk size distribution.
  size_t max_chunk_size_step;
  // How many operations to run in parallel. If this value is zero, then
  // `std::thread::hardware_concurrency()` is used.
  uint32_t num_threads;
  // Which hash function to use.
  HashType hash_type;
  // The masks will be populated by the indexer, setting them here has no
  // effect. They are in this struct so that they can be conveniently accessed
  // when printing the operation summary (and since they are derived from the
  // configuration, they are technically part of it).
  uint64_t mask_s;
  uint64_t mask_l;
};

class Indexer {
 public:
  using hash_t = std::string;
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
  typedef fastcdc::Chunker32<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
      Chunker;
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
  typedef fastcdc::Chunker64<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
      Chunker;
#else
#error "Unknown gear table"
#endif

  // Represents a chunk.
  struct Chunk {
    hash_t hash;
    size_t size;
  };

  // Chunk storage, keyed by hash. The hash value is mapped to a uint64_t
  // value here, which is only acceptable for an experimental program like
  // this.
  typedef std::unordered_map<hash_t, Chunk> ChunkMap;
  // Used for counting the number of chunks in each size bucket.
  typedef std::unordered_map<size_t, uint64_t> ChunkSizeMap;

  // Statistics about the current operation.
  struct OpStats {
    OpStats();
    size_t total_files;
    size_t total_chunks;
    size_t unique_chunks;
    size_t total_bytes;
    size_t unique_bytes;
    absl::Duration elapsed;
  };

  // Defines a callback function that can be used to display progress updates
  // while the Indexer is busy.
  typedef void(ProgressFn)(const OpStats& stats);

  Indexer();
  ~Indexer();

  // Starts the indexing operation for the given configuration `cfg` and
  // `inputs`. The optional callback function `fn` is called periodically with
  // statistics about the ongoing operation.
  absl::Status Run(const IndexerConfig& cfg,
                   const std::vector<std::string>& inputs, ProgressFn fn);
  // Returns the status of the ongoing or completed operation.
  absl::Status Error() const;
  // Returns the configuration that was passed to Run().
  IndexerConfig Config() const;
  // Returns the statistics about the ongoing or completed operation.
  OpStats Stats() const;
  // Returns a map of chunk sizes to the number of occurrences. The sizes are
  // combined into buckets according to the `IndexerConfig` given to the Run()
  // operation.
  ChunkSizeMap ChunkSizes() const;

 private:
  class Impl;
  class Worker;
  class WorkerThread;
  Impl* impl_;
};

}  // namespace cdc_ft

#endif  // CDC_INDEXER_INDEXER_H_
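To make the API shape concrete, a minimal hypothetical driver for this header might look as follows (the input path is a placeholder; the real program is main.cc below):

```
#include <iostream>
#include <vector>

#include "cdc_indexer/indexer.h"

// A free function matching Indexer::ProgressFn; Run() invokes it periodically.
void PrintProgress(const cdc_ft::Indexer::OpStats& stats) {
  std::cout << '\r' << stats.total_files << " files, " << stats.total_chunks
            << " chunks" << std::flush;
}

int main() {
  cdc_ft::IndexerConfig cfg;       // Sizes left at 0 are filled with defaults.
  cfg.avg_chunk_size = 256 << 10;  // Target a 256 KB average chunk size.
  cfg.hash_type = cdc_ft::IndexerConfig::HashType::kBlake3;

  cdc_ft::Indexer idx;
  absl::Status status = idx.Run(cfg, {"/path/to/files"}, PrintProgress);
  std::cout << std::endl;
  if (!status.ok()) std::cerr << status.message() << std::endl;
  return static_cast<int>(status.code());
}
```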
cdc_indexer/main.cc (new file, 435 lines)
@@ -0,0 +1,435 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cmath>
#include <cstdio>
#include <iomanip>
#include <iostream>

#include "absl/flags/flag.h"
#include "absl/flags/parse.h"
#include "absl/flags/usage.h"
#include "absl/flags/usage_config.h"
#include "absl/random/random.h"
#include "absl/status/status.h"
#include "absl/strings/match.h"
#include "absl/strings/str_format.h"
#include "absl_helper/jedec_size_flag.h"
#include "cdc_indexer/indexer.h"
#include "common/errno_mapping.h"
#include "common/path.h"

ABSL_FLAG(std::vector<std::string>, inputs, std::vector<std::string>(),
          "List of input files or directories to read from.");
ABSL_FLAG(uint32_t, num_threads, 0,
          "How many threads should read files in parallel, use 0 to "
          "auto-determine the best concurrency for this machine.");
ABSL_FLAG(cdc_ft::JedecSize, min_chunk_size, cdc_ft::JedecSize(0),
          "The minimum chunk size to split the files into. Defaults to half "
          "of the average chunk size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, avg_chunk_size, cdc_ft::JedecSize(256 << 10),
          "The average chunk size to split the files into. Supports common "
          "unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, max_chunk_size, cdc_ft::JedecSize(0),
          "The maximum chunk size to split the files into. Defaults to twice "
          "the average chunk size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, read_block_size, cdc_ft::JedecSize(0),
          "The block size to read the input file(s) from disk. Defaults to the "
          "value of --max_chunk_size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(std::string, hash, "blake3",
          "Which hash function to use. Supported values are \"blake3\" and "
          "\"null\".");
ABSL_FLAG(std::string, results_file, "",
          "File name to append results to in CSV format.");
ABSL_FLAG(std::string, description, "",
          "A descriptive string of the experiment that was run. If given, this "
          "will be prepended literally to the results_file. Multiple columns "
          "can be separated with commas.");

namespace cdc_ft {
namespace {

const char* GearTable() {
  // The following macros are defined in indexer.h.
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
  return "32 bit";
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
  return "64 bit";
#else
#error "Unknown gear table"
  return "unknown";
#endif
}

void SetupFlagsHelp() {
  absl::SetProgramUsageMessage(
      "CDC indexer to measure and report data redundancy.");
  absl::FlagsUsageConfig fuc;
  // Filter flags to show when the --help flag is set.
  fuc.contains_help_flags = [](absl::string_view f) {
    return absl::EndsWith(f, "main.cc");
  };
  absl::SetFlagsUsageConfig(fuc);
}

// Returns a human-readable representation of the given size, such as "4 KB".
template <typename T>
std::string HumanBytes(T size, int precision = 0) {
  const size_t threshold = 2048;
  if (size < 1024)
    return absl::StrFormat("%d bytes", static_cast<size_t>(size));
  double s = static_cast<double>(size) / 1024;
  std::string units = "KB";
  if (s > threshold) {
    s /= 1024;
    units = "MB";
  }
  if (s > threshold) {
    s /= 1024;
    units = "GB";
  }
  if (s > threshold) {
    s /= 1024;
    units = "TB";
  }
  if (s > threshold) {
    s /= 1024;
    units = "PB";
  }
  return absl::StrFormat("%.*f %s", precision, s, units);
}

// Returns a human-readable representation of a duration as minutes and
// seconds in the format "mm:ss".
std::string HumanDuration(const absl::Duration& d) {
  auto sec = absl::ToInt64Seconds(d);
  return absl::StrFormat("%02d:%02d", sec / 60, std::abs(sec) % 60);
}

std::string HashTypeToString(IndexerConfig::HashType type) {
  switch (type) {
    case IndexerConfig::HashType::kNull:
      return "(no hashing)";
    case IndexerConfig::HashType::kBlake3:
      return "BLAKE3";
    default:
      return "unknown";
  }
}

// Prints progress information on stdout.
void ShowProgress(const Indexer::OpStats& stats) {
  static absl::Time op_start = absl::Now();
  static absl::Time last_progress = op_start;
  static size_t last_total_bytes = 0;

  auto now = absl::Now();
  auto elapsed = now - last_progress;
  if (elapsed < absl::Milliseconds(500)) return;

  double bps =
      (stats.total_bytes - last_total_bytes) / absl::ToDoubleSeconds(elapsed);
  double dedup_pct = (stats.total_bytes - stats.unique_bytes) /
                     static_cast<double>(stats.total_bytes) * 100.0;
  std::cout << '\r' << HumanDuration(now - op_start) << " " << std::setw(2)
            << HumanBytes(stats.total_bytes, 2) << " in " << stats.total_files
            << " files processed at " << HumanBytes(bps, 1) << "/s"
            << ", " << static_cast<int>(dedup_pct) << "% deduplication"
            << std::flush;
  last_progress = now;
  last_total_bytes = stats.total_bytes;
}

void ShowSummary(const IndexerConfig& cfg, const Indexer::OpStats& stats,
                 absl::Duration elapsed) {
  const int title_w = 20;
  const int num_w = 16;
  double dedup_pct = (stats.total_bytes - stats.unique_bytes) /
                     static_cast<double>(stats.total_bytes) * 100.0;
  double bps = stats.total_bytes / absl::ToDoubleSeconds(elapsed);
  std::cout << "Chunk size (min/avg/max): " << HumanBytes(cfg.min_chunk_size)
            << " / " << HumanBytes(cfg.avg_chunk_size) << " / "
            << HumanBytes(cfg.max_chunk_size)
            << " | Hash: " << HashTypeToString(cfg.hash_type)
            << " | Threads: " << cfg.num_threads << std::endl;
  std::cout << "gear_table: " << GearTable() << " | mask_s: 0x" << std::hex
            << cfg.mask_s << " | mask_l: 0x" << cfg.mask_l << std::dec
            << std::endl;
  std::cout << std::setw(title_w) << "Duration:" << std::setw(num_w)
            << HumanDuration(elapsed) << std::endl;
  std::cout << std::setw(title_w) << "Total files:" << std::setw(num_w)
            << stats.total_files << std::endl;
  std::cout << std::setw(title_w) << "Total chunks:" << std::setw(num_w)
            << stats.total_chunks << std::endl;
  std::cout << std::setw(title_w) << "Unique chunks:" << std::setw(num_w)
            << stats.unique_chunks << std::endl;
  std::cout << std::setw(title_w) << "Total data:" << std::setw(num_w)
            << HumanBytes(stats.total_bytes, 2) << std::endl;
  std::cout << std::setw(title_w) << "Unique data:" << std::setw(num_w)
            << HumanBytes(stats.unique_bytes, 2) << std::endl;
  std::cout << std::setw(title_w) << "Throughput:" << std::setw(num_w - 2)
            << HumanBytes(bps, 2) << "/s" << std::endl;
  std::cout << std::setw(title_w) << "Avg. chunk size:" << std::setw(num_w)
            << HumanBytes(static_cast<double>(stats.unique_bytes) /
                          stats.unique_chunks)
            << std::endl;
  std::cout << std::setw(title_w) << "Deduplication:" << std::setw(num_w - 1)
            << std::setprecision(4) << dedup_pct << "%" << std::endl;
}

void ShowChunkSize(size_t size, uint64_t cnt, uint64_t max_count,
                   uint64_t total_count) {
  const int key_w = 7;
  const int hbar_w = 40;
  const int num_w = 10;
  const int pct_w = 2;

  double pct = 100.0 * static_cast<double>(cnt) / total_count;
  double hscale = static_cast<double>(cnt) / max_count;
  int blocks = std::round(hscale * hbar_w);

  std::cout << std::setw(key_w) << HumanBytes(size) << " ";
  for (int i = 0; i < blocks; i++) std::cout << "#";
  for (int i = hbar_w - blocks; i > 0; i--) std::cout << " ";
  std::cout << " " << std::setw(num_w) << cnt << " (" << std::setw(pct_w)
            << std::round(pct) << "%)" << std::endl;
}

std::vector<size_t> ChunkSizeBuckets(const IndexerConfig& cfg,
                                     const Indexer::ChunkSizeMap& sizes,
                                     size_t fixed_min_size,
                                     size_t fixed_max_size,
                                     uint64_t* max_count_out,
                                     uint64_t* total_count_out) {
  size_t min_size = 1u << 31;
  size_t max_size = 0;
  uint64_t max_count = 0;
  uint64_t total_count = 0, found_count = 0;
  uint64_t outside_min_max_count = 0;
  std::vector<size_t> buckets;
  // Find out the min/max chunk sizes.
  for (auto [chunk_size, count] : sizes) {
    if (chunk_size < min_size) min_size = chunk_size;
    if (chunk_size > max_size) max_size = chunk_size;
    if (count > max_count) max_count = count;
    if (chunk_size < fixed_min_size) outside_min_max_count += count;
    if (fixed_max_size > 0 && chunk_size > fixed_max_size)
      outside_min_max_count += count;
    total_count += count;
  }
  if (fixed_min_size > 0) min_size = fixed_min_size;
  // Use steps of powers of two until the min. chunk size is reached.
  uint64_t size;
  uint64_t pow_end_size = std::min(cfg.min_chunk_size, max_size);
  for (size = min_size; size < pow_end_size; size <<= 1) {
    buckets.push_back(size);
    auto it = sizes.find(size);
    if (it != sizes.end()) found_count += it->second;
  }
  if (fixed_max_size > max_size) max_size = fixed_max_size;
  // Use step increments of max_chunk_size_step afterwards.
  for (; size <= max_size; size += cfg.max_chunk_size_step) {
    buckets.push_back(size);
    auto it = sizes.find(size);
    if (it != sizes.end()) found_count += it->second;
  }
  // Make sure we found every bucket.
  assert(total_count == found_count + outside_min_max_count);
  if (max_count_out) *max_count_out = max_count;
  if (total_count_out) *total_count_out = total_count;
  return buckets;
}

void ShowChunkSizes(const IndexerConfig& cfg,
                    const Indexer::ChunkSizeMap& sizes) {
  uint64_t max_count = 0;
  uint64_t total_count = 0;
  auto buckets = ChunkSizeBuckets(cfg, sizes, 0, 0, &max_count, &total_count);
  for (auto size : buckets) {
    auto it = sizes.find(size);
    uint64_t cnt = it != sizes.end() ? it->second : 0;
    ShowChunkSize(size, cnt, max_count, total_count);
  }
}

absl::Status WriteResultsFile(const std::string& filepath,
                              const std::string& description,
                              const IndexerConfig& cfg,
                              const Indexer::OpStats& stats,
                              const Indexer::ChunkSizeMap& sizes) {
  bool exists = path::FileExists(filepath);
  std::FILE* fout = std::fopen(filepath.c_str(), "a");
  if (!fout) {
    return ErrnoToCanonicalStatus(
        errno, absl::StrFormat("Couldn't write to file '%s'", filepath));
  }

  path::FileCloser closer(fout);

  static constexpr int num_columns = 15;
  static const char* columns[num_columns] = {
      "gear_table",
      "mask_s",
      "mask_l",
      "Min chunk size [KiB]",
      "Avg chunk size [KiB]",
      "Max chunk size [KiB]",
      "Read speed [MiB/s]",
      "Files",
      "Total chunks",
      "Unique chunks",
      "Total size [MiB]",
      "Unique size [MiB]",
      "Dedup size [MiB]",
      "Dedup ratio",
      "Res avg chunk size [KiB]",
  };

  auto buckets = ChunkSizeBuckets(cfg, sizes, cfg.min_chunk_size,
                                  cfg.max_chunk_size, nullptr, nullptr);
  // Write column headers if this is a new file.
  if (!exists) {
    // Write one header column per comma-separated description column.
    int desc_cols = description.empty() ? 0 : 1;
    desc_cols += std::count(description.begin(), description.end(), ',');
    for (int i = 0; i < desc_cols; i++) {
      std::fprintf(fout, i == 0 ? "Description," : ",");
    }
    // Write fixed column headers.
    for (int i = 0; i < num_columns; i++) {
      std::fprintf(fout, "%s,", columns[i]);
    }
    // Write chunk distribution column headers.
    for (auto size : buckets) {
      std::fprintf(fout, "%s,", HumanBytes(size).c_str());
    }
    std::fprintf(fout, "\n");
  }

  // Count all chunks below min_chunk_size and above max_chunk_size, as they
  // won't be included in the buckets list automatically.
  uint64_t below_min_cnt = 0, above_max_cnt = 0;
  for (auto [chunk_size, count] : sizes) {
    if (chunk_size < cfg.min_chunk_size) below_min_cnt += count;
    if (chunk_size > cfg.max_chunk_size) above_max_cnt += count;
  }

  static constexpr double mib = static_cast<double>(1 << 20);

  // Write the user-supplied description.
  if (!description.empty()) std::fprintf(fout, "%s,", description.c_str());
  // Write chunking params.
  std::fprintf(fout, "%s,0x%zx,0x%zx,", GearTable(), cfg.mask_s, cfg.mask_l);
  std::fprintf(fout, "%zu,%zu,%zu,", cfg.min_chunk_size >> 10,
               cfg.avg_chunk_size >> 10, cfg.max_chunk_size >> 10);
  // Write speed, files, chunks.
  double mibps =
      (stats.total_bytes / mib) / absl::ToDoubleSeconds(stats.elapsed);
  std::fprintf(fout, "%f,%zu,%zu,%zu,", mibps, stats.total_files,
               stats.total_chunks, stats.unique_chunks);
  // Write total and unique sizes.
  std::fprintf(fout, "%f,%f,%f,", stats.total_bytes / mib,
               stats.unique_bytes / mib,
               (stats.total_bytes - stats.unique_bytes) / mib);
  // Write dedup ratio and avg. chunk size.
  double dedup_ratio = (stats.total_bytes - stats.unique_bytes) /
                       static_cast<double>(stats.total_bytes);
  size_t avg_size = stats.unique_bytes / stats.unique_chunks;
  std::fprintf(fout, "%f,%zu,", dedup_ratio, avg_size >> 10);
  // Write the chunk distribution.
  size_t index = 0;
  for (auto size : buckets) {
    auto it = sizes.find(size);
    uint64_t cnt = it != sizes.end() ? it->second : 0;
    if (index == 0) {
      cnt += below_min_cnt;
    } else if (index + 1 == buckets.size()) {
      cnt += above_max_cnt;
    }
    ++index;
    std::fprintf(fout, "%f,", static_cast<double>(cnt) / stats.unique_chunks);
  }
  std::fprintf(fout, "\n");
  return absl::OkStatus();
}

IndexerConfig::HashType GetHashType(const std::string& name) {
  if (name == "null") return IndexerConfig::HashType::kNull;
  if (name == "blake3") return IndexerConfig::HashType::kBlake3;
  std::cerr << "Unknown hash type: \"" << name << "\"" << std::endl;
  return IndexerConfig::HashType::kUndefined;
}

}  // namespace
}  // namespace cdc_ft

int main(int argc, char* argv[]) {
  cdc_ft::SetupFlagsHelp();
  absl::ParseCommandLine(argc, argv);

  std::vector<std::string> inputs = absl::GetFlag(FLAGS_inputs);

  if (inputs.empty()) {
    std::cout << "Execute the following command to get help on the usage:"
              << std::endl
              << argv[0] << " --help" << std::endl;
    return 0;
  }

  cdc_ft::IndexerConfig cfg;
  cfg.num_threads = absl::GetFlag(FLAGS_num_threads);
  cfg.min_chunk_size = absl::GetFlag(FLAGS_min_chunk_size).Size();
  cfg.avg_chunk_size = absl::GetFlag(FLAGS_avg_chunk_size).Size();
  cfg.max_chunk_size = absl::GetFlag(FLAGS_max_chunk_size).Size();
  cfg.read_block_size = absl::GetFlag(FLAGS_read_block_size).Size();
  cfg.hash_type = cdc_ft::GetHashType(absl::GetFlag(FLAGS_hash));

  if (!cfg.min_chunk_size) cfg.min_chunk_size = cfg.avg_chunk_size >> 1;
  if (!cfg.max_chunk_size) cfg.max_chunk_size = cfg.avg_chunk_size << 1;
  if (!cfg.read_block_size) cfg.read_block_size = cfg.max_chunk_size;
  cfg.max_chunk_size_step = std::max<size_t>(cfg.min_chunk_size >> 2, 1024u);
  assert(cfg.avg_chunk_size > 0);
  assert(cfg.avg_chunk_size > cfg.min_chunk_size);
  assert(cfg.avg_chunk_size < cfg.max_chunk_size);
  assert(cfg.hash_type != cdc_ft::IndexerConfig::HashType::kUndefined);

  cdc_ft::Indexer idx;
  std::cout << "Starting indexer on " << inputs.size() << " inputs."
            << std::endl;
  static absl::Time start = absl::Now();
  absl::Status res = idx.Run(cfg, inputs, cdc_ft::ShowProgress);
  auto elapsed = absl::Now() - start;
  std::cout << std::endl;
  if (res.ok()) {
    std::cout << "Operation succeeded." << std::endl << std::endl;
    cdc_ft::ShowSummary(idx.Config(), idx.Stats(), elapsed);
    std::cout << std::endl;
    cdc_ft::ShowChunkSizes(idx.Config(), idx.ChunkSizes());
    std::string results_file = absl::GetFlag(FLAGS_results_file);
    if (!results_file.empty()) {
      res = cdc_ft::WriteResultsFile(
          results_file, absl::GetFlag(FLAGS_description), idx.Config(),
          idx.Stats(), idx.ChunkSizes());
      if (!res.ok())
        std::cerr << "Failed to write results to '" << results_file
                  << "': " << res.message() << std::endl;
    }
  } else {
    std::cerr << "Error: (" << res.code() << ") " << res.message() << std::endl;
  }

  return static_cast<int>(res.code());
}
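To illustrate WriteResultsFile, a fresh results file written with --description="baseline" and the default chunk sizes (128K/256K/512K) would begin with a header row shaped roughly like this (later bucket columns omitted; every field, including the last, is followed by a comma, exactly as the fprintf calls emit it):

```
Description,gear_table,mask_s,mask_l,Min chunk size [KiB],Avg chunk size [KiB],Max chunk size [KiB],Read speed [MiB/s],Files,Total chunks,Unique chunks,Total size [MiB],Unique size [MiB],Dedup size [MiB],Dedup ratio,Res avg chunk size [KiB],128 KB,160 KB,192 KB,
```

Each subsequent row then holds one experiment: the description columns, the chunking parameters, throughput and dedup figures, and the per-bucket chunk counts normalized by the number of unique chunks.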