Files
netris-cdc-file-transfer/cdc_indexer/indexer.cc
ljusten 23fcd5ef1d Improve cdc_fuse_fs and path (#2)
Improve cdc_fuse_fs and path

Improves the error handling in path so that std:error_codes are not
assumed to be of system category, and also that their messages are
displayed. Also improves debug messages in GameletComponent.
2022-11-15 12:53:02 +01:00

434 lines
12 KiB
C++

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cdc_indexer/indexer.h"
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <mutex>
#include <queue>
#include <thread>
#include "absl/functional/bind_front.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "blake3.h"
#include "common/dir_iter.h"
#include "common/errno_mapping.h"
#include "common/path.h"
#include "common/status_macros.h"
namespace cdc_ft {
struct IndexerJob {
std::string filepath;
};
class Indexer::Impl {
public:
Impl(const IndexerConfig& cfg, const std::vector<std::string>& inputs);
const IndexerConfig& Config() const;
// Calls the given `progress` function periodically until `SetDone(true)` is
// called.
void TriggerProgress(ProgressFn fn);
bool GetNextJob(IndexerJob* job);
bool HasError() const;
absl::Status Error() const;
void SetError(absl::Status err);
void SetDone(bool done);
inline const IndexerConfig& Cfg() const { return cfg_; }
inline Indexer::OpStats Stats() const;
inline Indexer::ChunkSizeMap ChunkSizes() const;
void AddChunk(const uint8_t* data, size_t len);
void AddFile();
private:
friend class Indexer;
// Calculates a hash value for the given data.
inline hash_t Hash(const uint8_t* data, size_t len);
inline hash_t HashBlake3(const uint8_t* data, size_t len);
inline hash_t HashXxhash(const uint8_t* data, size_t len);
// Finds the smallest power of 2 such that the result is <= size. If size is >
// 2^31, then UINT64_MAX is returned.
inline size_t SizeBucket(size_t size) const;
IndexerConfig cfg_;
bool done_;
// The following members are all guarded by jobs_mutex_.
std::queue<std::string> inputs_;
DirectoryIterator dir_iter_;
std::mutex jobs_mutex_;
// Guarded by chunks_mutex_
Indexer::ChunkMap chunks_;
std::mutex chunks_mutex_;
// Guarded by stats_mutex_.
Indexer::OpStats stats_;
mutable std::mutex stats_mutex_;
// Guarded by chunk_sizes_mutex_;
Indexer::ChunkSizeMap chunk_sizes_;
mutable std::mutex chunk_sizes_mutex_;
// Guarded by result_mutex_
absl::Status result_;
mutable std::mutex result_mutex_;
};
class Indexer::Worker {
public:
Worker(Impl* impl);
void Run();
private:
absl::Status IndexFile(const std::string& filepath);
Impl* impl_;
absl::Cord buf_;
const fastcdc::Config cdc_cfg_;
};
// This class holds a `Worker` object and the associated `std::thread` object
// that executes it.
class Indexer::WorkerThread {
public:
WorkerThread() : worker(nullptr), thrd(nullptr) {}
~WorkerThread() {
if (thrd) {
if (thrd->joinable()) thrd->join();
delete thrd;
}
if (worker) {
delete worker;
}
}
Worker* worker;
std::thread* thrd;
};
Indexer::Impl::Impl(const IndexerConfig& cfg,
const std::vector<std::string>& inputs)
: cfg_(cfg), done_(false) {
// Perform some sanity checks on the config.
if (cfg_.num_threads == 0)
cfg_.num_threads = std::thread::hardware_concurrency();
if (cfg_.read_block_size == 0) cfg_.read_block_size = 4 << 10;
if (cfg_.avg_chunk_size == 0) cfg_.avg_chunk_size = 256 << 10;
if (cfg_.min_chunk_size == 0 || cfg_.min_chunk_size > cfg_.avg_chunk_size)
cfg_.min_chunk_size = cfg_.avg_chunk_size >> 1;
if (cfg_.max_chunk_size == 0 || cfg_.max_chunk_size < cfg_.avg_chunk_size)
cfg_.max_chunk_size = cfg_.avg_chunk_size << 1;
if (cfg_.max_chunk_size_step == 0)
cfg_.max_chunk_size_step =
cfg_.min_chunk_size > 0 ? cfg_.min_chunk_size : 128u;
// Populate the CDC bitmasks which the Chunker creates. Only done here for
// being able to write it to the output, setting them in the IndexerConfig has
// no effect.
fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
cfg_.max_chunk_size);
Indexer::Chunker chunker(ccfg, nullptr);
cfg_.mask_s = chunker.Stage(0).mask;
cfg_.mask_l = chunker.Stage(chunker.StagesCount() - 1).mask;
// Collect inputs.
for (auto it = inputs.begin(); it != inputs.end(); ++it) {
inputs_.push(*it);
}
}
const IndexerConfig& Indexer::Impl::Config() const { return cfg_; }
// Executes the `progress` function in a loop, approximately every 200ms. Call
// `SetDone(true)` to stop this function.
void Indexer::Impl::TriggerProgress(Indexer::ProgressFn fn) {
if (!fn) return;
const int64_t interval = 200;
absl::Time started = absl::Now();
// Keeping going until we're done or an error occured.
while (!done_ && !HasError()) {
absl::Time loop_started = absl::Now();
stats_mutex_.lock();
stats_.elapsed = loop_started - started;
stats_mutex_.unlock();
fn(Stats());
// Aim for one update every interval.
auto loop_elapsed = absl::ToInt64Milliseconds(loop_started - absl::Now());
if (loop_elapsed < interval)
std::this_thread::sleep_for(
std::chrono::milliseconds(interval - loop_elapsed));
}
}
bool Indexer::Impl::GetNextJob(IndexerJob* job) {
// Stop if an error occured.
if (HasError()) return false;
const std::lock_guard<std::mutex> lock(jobs_mutex_);
DirectoryEntry dent;
while (!dent.Valid()) {
// Open the next directory, if needed.
if (!dir_iter_.Valid()) {
if (inputs_.empty()) {
// We are done.
return false;
} else {
std::string input = inputs_.front();
std::string uinput = path::ToUnix(input);
inputs_.pop();
// Return files as jobs.
if (path::FileExists(uinput)) {
job->filepath = uinput;
return true;
}
// Otherwise read the directory.
if (!dir_iter_.Open(input, DirectorySearchFlags::kFiles)) {
// Ignore permission errors.
if (absl::IsPermissionDenied(dir_iter_.Status())) {
continue;
}
if (!dir_iter_.Status().ok()) {
SetError(dir_iter_.Status());
}
return false;
}
}
}
if (dir_iter_.NextEntry(&dent)) {
break;
} else if (!dir_iter_.Status().ok()) {
SetError(dir_iter_.Status());
return false;
}
}
path::Join(&job->filepath, dir_iter_.Path(), dent.RelPathName());
return true;
}
void Indexer::Impl::SetDone(bool done) { done_ = done; }
inline size_t Indexer::Impl::SizeBucket(size_t size) const {
size_t bucket = 1024;
// Go in steps of powers of two until min. chunk size is reached.
while (bucket < size && bucket < cfg_.min_chunk_size && bucket < (1llu << 63))
bucket <<= 1;
// Go in steps of the configurable step size afterwards.
while (bucket < size && bucket < (1llu << 63))
bucket += cfg_.max_chunk_size_step;
return bucket >= size ? bucket : UINT64_MAX;
}
inline Indexer::OpStats Indexer::Impl::Stats() const {
const std::lock_guard<std::mutex> lock(stats_mutex_);
return stats_;
}
inline Indexer::ChunkSizeMap Indexer::Impl::ChunkSizes() const {
const std::lock_guard<std::mutex> lock(chunk_sizes_mutex_);
return chunk_sizes_;
}
Indexer::hash_t Indexer::Impl::HashBlake3(const uint8_t* data, size_t len) {
blake3_hasher state;
uint8_t out[BLAKE3_OUT_LEN];
blake3_hasher_init(&state);
blake3_hasher_update(&state, data, len);
blake3_hasher_finalize(&state, out, BLAKE3_OUT_LEN);
return Indexer::hash_t(reinterpret_cast<const char*>(out), BLAKE3_OUT_LEN);
}
Indexer::hash_t Indexer::Impl::Hash(const uint8_t* data, size_t len) {
switch (cfg_.hash_type) {
case IndexerConfig::HashType::kNull:
return hash_t();
case IndexerConfig::HashType::kBlake3:
return HashBlake3(data, len);
case IndexerConfig::HashType::kUndefined:
break;
}
std::cerr << "Unknown hash type" << std::endl;
return std::string();
}
void Indexer::Impl::AddChunk(const uint8_t* data, size_t len) {
std::string hash = Hash(data, len);
// See if the chunk already exists, insert it if not.
chunks_mutex_.lock();
bool new_chunk = chunks_.find(hash) == chunks_.end();
if (new_chunk) {
chunks_.emplace(hash, Chunk{hash, len});
}
chunks_mutex_.unlock();
// Update the stats.
stats_mutex_.lock();
stats_.total_bytes += len;
++stats_.total_chunks;
if (new_chunk) {
stats_.unique_bytes += len;
++stats_.unique_chunks;
}
stats_mutex_.unlock();
// Update chunk sizes distribution.
if (new_chunk) {
size_t bucket = SizeBucket(len);
chunk_sizes_mutex_.lock();
chunk_sizes_[bucket]++;
chunk_sizes_mutex_.unlock();
}
}
void Indexer::Impl::AddFile() {
const std::lock_guard<std::mutex> lock(stats_mutex_);
++stats_.total_files;
}
bool Indexer::Impl::HasError() const {
const std::lock_guard<std::mutex> lock(result_mutex_);
return !result_.ok();
}
absl::Status Indexer::Impl::Error() const {
const std::lock_guard<std::mutex> lock(result_mutex_);
return result_;
}
void Indexer::Impl::SetError(absl::Status err) {
// Ignore attempts to set a non-error.
if (err.ok()) return;
const std::lock_guard<std::mutex> lock(result_mutex_);
// Don't overwrite any previous error.
if (result_.ok()) result_ = err;
}
Indexer::Worker::Worker(Indexer::Impl* impl)
: impl_(impl),
cdc_cfg_(impl_->Cfg().min_chunk_size, impl_->Cfg().avg_chunk_size,
impl_->Cfg().max_chunk_size) {}
void Indexer::Worker::Run() {
IndexerJob job;
while (impl_->GetNextJob(&job)) {
absl::Status err = IndexFile(job.filepath);
if (!err.ok()) {
impl_->SetError(err);
return;
}
}
}
absl::Status Indexer::Worker::IndexFile(const std::string& filepath) {
std::FILE* fin = std::fopen(filepath.c_str(), "rb");
if (!fin) {
return ErrnoToCanonicalStatus(errno, "failed to open file '%s'", filepath);
}
path::FileCloser closer(fin);
std::fseek(fin, 0, SEEK_SET);
auto hdlr = absl::bind_front(&Indexer::Impl::AddChunk, impl_);
Indexer::Chunker chunker(cdc_cfg_, hdlr);
std::vector<uint8_t> buf(impl_->Cfg().read_block_size, 0);
int err = 0;
while (!std::feof(fin)) {
size_t cnt = std::fread(buf.data(), sizeof(uint8_t), buf.size(), fin);
err = std::ferror(fin);
if (err) {
return ErrnoToCanonicalStatus(err, "failed to read from file '%s'",
filepath);
}
if (cnt) {
chunker.Process(buf.data(), cnt);
}
}
chunker.Finalize();
impl_->AddFile();
return absl::OkStatus();
}
IndexerConfig::IndexerConfig()
: read_block_size(32 << 10),
min_chunk_size(0),
avg_chunk_size(0),
max_chunk_size(0),
max_chunk_size_step(0),
num_threads(0),
mask_s(0),
mask_l(0) {}
Indexer::Indexer() : impl_(nullptr) {}
Indexer::~Indexer() {
if (impl_) delete impl_;
}
absl::Status Indexer::Run(const IndexerConfig& cfg,
const std::vector<std::string>& inputs,
Indexer::ProgressFn fn) {
if (impl_) delete impl_;
impl_ = new Impl(cfg, inputs);
// Start the file creation workers.
std::vector<WorkerThread> workers(impl_->Config().num_threads);
for (auto it = workers.begin(); it != workers.end(); ++it) {
auto worker = new Worker(impl_);
it->worker = worker;
it->thrd = new std::thread(&Worker::Run, worker);
}
// Start the progress function worker.
std::thread prog(&Impl::TriggerProgress, impl_, fn);
// Wait for the workers to finish.
for (auto it = workers.begin(); it != workers.end(); ++it) {
it->thrd->join();
}
// Wait for the progress worker to finish.
impl_->SetDone(true);
prog.join();
return Error();
}
absl::Status Indexer::Error() const {
return impl_ ? impl_->Error() : absl::Status();
}
IndexerConfig Indexer::Config() const {
if (impl_) return impl_->Cfg();
return IndexerConfig();
}
Indexer::OpStats Indexer::Stats() const {
if (impl_) return impl_->Stats();
return Stats();
}
Indexer::ChunkSizeMap Indexer::ChunkSizes() const {
if (impl_) return impl_->ChunkSizes();
return Indexer::ChunkSizeMap();
}
inline Indexer::OpStats::OpStats()
: total_files(0),
total_chunks(0),
unique_chunks(0),
total_bytes(0),
unique_bytes(0) {}
}; // namespace cdc_ft