mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 12:25:35 +02:00
The tools allow efficient and fast synchronization of large directory trees from a Windows workstation to a Linux target machine. cdc_rsync* support efficient copy of files by using content-defined chunking (CDC) to identify chunks within files that can be reused. asset_stream_manager + cdc_fuse_fs support efficient streaming of a local directory to a remote virtual file system based on FUSE. It also employs CDC to identify and reuse unchanged data chunks.
817 lines
30 KiB
C++
817 lines
30 KiB
C++
// Copyright 2022 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "manifest/manifest_updater.h"
|
|
|
|
#include <future>
|
|
#include <thread>
|
|
|
|
#include "absl/strings/match.h"
|
|
#include "absl/strings/string_view.h"
|
|
#include "common/log.h"
|
|
#include "common/path.h"
|
|
#include "common/stopwatch.h"
|
|
#include "common/threadpool.h"
|
|
#include "common/util.h"
|
|
#include "data_store/data_store_writer.h"
|
|
#include "fastcdc/fastcdc.h"
|
|
#include "manifest/asset_builder.h"
|
|
#include "manifest/file_chunk_map.h"
|
|
#include "manifest/manifest_builder.h"
|
|
#include "manifest/manifest_iterator.h"
|
|
#include "manifest/manifest_proto_defs.h"
|
|
|
|
namespace cdc_ft {
|
|
namespace {
|
|
|
|
// Returns AssetInfos for all files and dirs in |src_dir| + |rel_path|. Does not
|
|
// recurse into sub-directories.
|
|
absl::Status GetAllSrcAssets(const std::string& src_dir,
|
|
const std::string& rel_path,
|
|
std::vector<AssetInfo>* src_assets) {
|
|
std::string full_src_dir = path::Join(src_dir, rel_path);
|
|
|
|
path::EnsureEndsWithPathSeparator(&full_src_dir);
|
|
auto handler = [src_assets, &src_dir = full_src_dir,
|
|
rel_path = path::ToUnix(rel_path)](
|
|
const std::string& dir, const std::string& filename,
|
|
int64_t mtime, uint64_t size, bool is_dir) {
|
|
AssetInfo ai;
|
|
ai.path = path::JoinUnix(rel_path, filename);
|
|
ai.type = is_dir ? AssetProto::DIRECTORY : AssetProto::FILE;
|
|
ai.mtime = mtime;
|
|
ai.size = is_dir ? 0 : size;
|
|
src_assets->push_back(std::move(ai));
|
|
return absl::OkStatus();
|
|
};
|
|
#if PLATFORM_WINDOWS
|
|
// Windows expects a globbing pattern to search a path.
|
|
std::string src_pattern = path::Join(full_src_dir, "*");
|
|
#else
|
|
std::string src_pattern = src_dir;
|
|
#endif
|
|
absl::Status status =
|
|
path::SearchFiles(src_pattern, /*recursive=*/false, handler);
|
|
std::sort(src_assets->begin(), src_assets->end());
|
|
return status;
|
|
}
|
|
|
|
// Creates a fastcdc::Config struct from a CdcParamsProto.
|
|
fastcdc::Config CdcConfigFromProto(const CdcParamsProto& cfg_pb) {
|
|
return fastcdc::Config(cfg_pb.min_chunk_size(), cfg_pb.avg_chunk_size(),
|
|
cfg_pb.max_chunk_size());
|
|
}
|
|
|
|
// Checks if a given CdcParamsProto is sane and can be used for FastCDC.
|
|
bool ValidateCdcParams(const CdcParamsProto& params) {
|
|
return params.min_chunk_size() <= params.avg_chunk_size() &&
|
|
params.avg_chunk_size() <= params.max_chunk_size() &&
|
|
params.max_chunk_size() > 0;
|
|
}
|
|
|
|
// Returns the max. number of tasks that should be enqueued in the given thread
|
|
// pool.
|
|
size_t MaxQueuedTasks(const Threadpool& pool) { return pool.NumThreads() << 1; }
|
|
|
|
} // namespace
|
|
|
|
void AssetInfo::AppendCopyChunks(const RepeatedChunkRefProto& list,
|
|
uint64_t list_offset) {
|
|
chunks.reserve(chunks.size() + list.size());
|
|
for (const ChunkRefProto& ch : list)
|
|
chunks.emplace_back(ch.chunk_id(), ch.offset() + list_offset);
|
|
}
|
|
|
|
// Appends every chunk reference in |list| to |chunks|, moving the chunk IDs
// out of |list| instead of copying them. The offset of each appended chunk is
// shifted by |list_offset|.
void AssetInfo::AppendMoveChunks(RepeatedChunkRefProto* list,
                                 uint64_t list_offset) {
  chunks.reserve(chunks.size() + list->size());
  for (auto it = list->begin(); it != list->end(); ++it) {
    chunks.emplace_back(std::move(*it->mutable_chunk_id()),
                        it->offset() + list_offset);
  }
}
|
|
|
|
// Common fields for tasks that fill in manifest data.
|
|
class ManifestTask : public Task {
|
|
public:
|
|
ManifestTask(std::string src_dir, std::string relative_unix_path,
|
|
std::string filename)
|
|
: src_dir_(std::move(src_dir)),
|
|
rel_unix_path_(std::move(relative_unix_path)),
|
|
filename_(std::move(filename)) {}
|
|
|
|
// Relative unix path of the directory containing the file or directory for
|
|
// this task.
|
|
const std::string& RelativeUnixPath() const { return rel_unix_path_; }
|
|
|
|
// Relative unix path of the file or directory for this task.
|
|
std::string RelativeUnixFilePath() const {
|
|
return path::JoinUnix(rel_unix_path_, filename_);
|
|
}
|
|
|
|
// Name of the file or directory to process with this task.
|
|
const std::string& Filename() const { return filename_; }
|
|
|
|
// Full path of the file or directory to process with this task.
|
|
std::string FilePath() const {
|
|
return path::Join(src_dir_, path::ToNative(rel_unix_path_), filename_);
|
|
}
|
|
|
|
// Returns the final status of the task.
|
|
// Should not be accessed before the task is finished.
|
|
const absl::Status& Status() const { return status_; }
|
|
|
|
protected:
|
|
const std::string src_dir_;
|
|
const std::string rel_unix_path_;
|
|
const std::string filename_;
|
|
|
|
absl::Status status_;
|
|
};
|
|
|
|
// ThreadPool task that runs the CDC chunker on a given file.
|
|
class FileChunkerTask : public ManifestTask {
|
|
public:
|
|
FileChunkerTask(std::string src_dir, std::string relative_path,
|
|
std::string filename, const fastcdc::Config* cfg,
|
|
Buffer buffer)
|
|
: ManifestTask(std::move(src_dir), std::move(relative_path),
|
|
std::move(filename)),
|
|
cfg_(cfg),
|
|
buffer_(std::move(buffer)) {
|
|
assert(cfg_->max_size > 0);
|
|
}
|
|
|
|
// Returns the number of bytes processed. Should match file size unless some
|
|
// error occurred.
|
|
// Should not be accessed before the task is finished.
|
|
uint64_t ProcessedBytes() const { return processed_bytes_; }
|
|
|
|
// True if the file looks like a Linux executable based on elf/shebang magic
|
|
// headers.
|
|
// Should not be accessed before the task is finished.
|
|
bool IsExecutable() const { return is_executable_; }
|
|
|
|
// Returns the chunk hashes and offsets.
|
|
// Should not be accessed before the task is finished.
|
|
google::protobuf::RepeatedPtrField<ChunkRefProto>* Chunks() {
|
|
return &chunks_;
|
|
}
|
|
|
|
// Releases the allocated buffer and returns it to the caller.
|
|
Buffer&& ReleaseBuffer() { return std::move(buffer_); }
|
|
|
|
// Task:
|
|
void ThreadRun(IsCancelledPredicate is_cancelled) override {
|
|
// TODO: Retry with backoff if this fails in practice, e.g. if the file is
|
|
// changed repeatedly.
|
|
std::string file_path = FilePath();
|
|
absl::StatusOr<FILE*> file = path::OpenFile(file_path, "rb");
|
|
if (!file.ok()) {
|
|
status_ =
|
|
WrapStatus(file.status(), "Failed to open file '%s'", file_path);
|
|
return;
|
|
}
|
|
path::FileCloser closer(*file);
|
|
|
|
auto chunk_handler = [chunks = &chunks_, offset = &processed_bytes_](
|
|
const void* data, size_t size) {
|
|
ChunkRefProto* chunk = chunks->Add();
|
|
*chunk->mutable_chunk_id() = ContentId::FromArray(data, size);
|
|
chunk->set_offset(*offset);
|
|
*offset += size;
|
|
};
|
|
fastcdc::Chunker chunker(*cfg_, chunk_handler);
|
|
|
|
bool first_chunk = true;
|
|
auto stream_handler = [&chunker, &is_cancelled, &first_chunk,
|
|
is_executable = &is_executable_,
|
|
&file_path](const void* data, size_t size) {
|
|
chunker.Process(static_cast<const uint8_t*>(data), size);
|
|
if (first_chunk) {
|
|
first_chunk = false;
|
|
*is_executable = Util::IsExecutable(data, size);
|
|
}
|
|
return is_cancelled() ? absl::CancelledError(absl::StrFormat(
|
|
"chunking file '%s' cancelled", file_path))
|
|
: absl::OkStatus();
|
|
};
|
|
|
|
status_ = path::StreamReadFileContents(*file, &buffer_, stream_handler);
|
|
chunker.Finalize();
|
|
}
|
|
|
|
private:
|
|
const fastcdc::Config* const cfg_;
|
|
|
|
google::protobuf::RepeatedPtrField<ChunkRefProto> chunks_;
|
|
uint64_t processed_bytes_ = 0;
|
|
bool is_executable_ = false;
|
|
Buffer buffer_;
|
|
};
|
|
|
|
// ThreadPool task that creates assets for the contents of a directory.
|
|
class DirScannerTask : public ManifestTask {
|
|
public:
|
|
DirScannerTask(std::string src_dir, std::string relative_path,
|
|
std::string filename, AssetBuilder dir,
|
|
DataStoreReader* data_store)
|
|
: ManifestTask(std::move(src_dir), std::move(relative_path),
|
|
std::move(filename)),
|
|
dir_(dir),
|
|
data_store_(data_store) {}
|
|
|
|
// Task:
|
|
void ThreadRun(IsCancelledPredicate is_cancelled) override {
|
|
std::vector<AssetInfo> src_assets, manifest_assets;
|
|
// Collect all files from the given directory.
|
|
status_ = GetAllSrcAssets(src_dir_, path::ToNative(RelativeUnixFilePath()),
|
|
&src_assets);
|
|
if (!status_.ok()) return;
|
|
// Collect all assets from the manifest.
|
|
status_ = GetAllAssetsFromDirAsset(&manifest_assets, is_cancelled);
|
|
if (!status_.ok()) return;
|
|
CompareAssets(src_assets, manifest_assets);
|
|
if (is_cancelled()) status_ = absl::CancelledError();
|
|
}
|
|
|
|
// Returns the IDs of indirect lists that were fetched when executing this
|
|
// task.
|
|
std::vector<ContentIdProto>* ManifestContentIds() {
|
|
return &manifest_content_ids_;
|
|
}
|
|
|
|
// Returns the AssetBuilder representing the directory this task is scanning.
|
|
AssetBuilder* Dir() { return &dir_; }
|
|
|
|
// Returns the list of assets that need to be added or updated in the
|
|
// directory that this task was scanning.
|
|
ManifestUpdater::OperationList* Operations() { return &operations_; }
|
|
|
|
private:
|
|
using Operator = ManifestUpdater::Operator;
|
|
|
|
// Stores AssetInfo structs for all assets found in |assets| in the
|
|
// target param |asset_infos|.
|
|
void GetAssetInfosFromList(const std::string& rel_path,
|
|
const RepeatedAssetProto& assets,
|
|
std::vector<AssetInfo>* asset_infos) {
|
|
asset_infos->reserve(asset_infos->size() + assets.size());
|
|
|
|
for (const AssetProto& asset : assets) {
|
|
AssetInfo ai;
|
|
ai.path = path::JoinUnix(rel_path, asset.name());
|
|
ai.type = asset.type();
|
|
ai.mtime = asset.mtime_seconds();
|
|
ai.size = asset.type() == AssetProto::DIRECTORY ? 0 : asset.file_size();
|
|
|
|
if (asset.type() == AssetProto::FILE) {
|
|
// Copy chunks from the direct chunk list.
|
|
ai.AppendCopyChunks(asset.file_chunks(), 0);
|
|
|
|
// Append all chunk IDs from indirect chunk lists.
|
|
for (const IndirectChunkListProto& icl : asset.file_indirect_chunks()) {
|
|
ChunkListProto chunk_list;
|
|
absl::Status status =
|
|
data_store_->GetProto(icl.chunk_list_id(), &chunk_list);
|
|
if (!status.ok()) {
|
|
// Pretend the file is empty.
|
|
ai.chunks.clear();
|
|
// Log a warning and continue so that the file is re-added and
|
|
// corrected.
|
|
LOG_WARNING(
|
|
"Can't read indirect chunk list for file '%s': %s. The "
|
|
"affected asset will be updated from disk.",
|
|
ai.path, status.ToString());
|
|
break;
|
|
}
|
|
ai.AppendMoveChunks(chunk_list.mutable_chunks(), icl.offset());
|
|
// Collect the content IDs of all indirect chunk lists.
|
|
manifest_content_ids_.push_back(icl.chunk_list_id());
|
|
}
|
|
}
|
|
|
|
asset_infos->emplace_back(std::move(ai));
|
|
}
|
|
}
|
|
|
|
// Collects all assets from the manifest directory at RelativeUnixFilePath()
|
|
// and adds corresponding AssetInfo structs to |asset_infos|.
|
|
absl::Status GetAllAssetsFromDirAsset(std::vector<AssetInfo>* asset_infos,
|
|
IsCancelledPredicate is_cancelled) {
|
|
// Collect all direct assets from the manifest.
|
|
std::string rel_path = dir_.RelativeFilePath();
|
|
GetAssetInfosFromList(rel_path, dir_.Proto()->dir_assets(), asset_infos);
|
|
// Load all indirect asset lists, if there are any.
|
|
if (dir_.Proto()->dir_indirect_assets_size() > 0) {
|
|
auto it = dir_.Proto()->mutable_dir_indirect_assets()->begin();
|
|
while (it != dir_.Proto()->mutable_dir_indirect_assets()->end()) {
|
|
if (is_cancelled()) return absl::CancelledError();
|
|
|
|
AssetListProto list;
|
|
absl::Status status = data_store_->GetProto(*it, &list);
|
|
if (status.ok()) {
|
|
GetAssetInfosFromList(rel_path, list.assets(), asset_infos);
|
|
// Collect the content IDs of all indirect asset lists.
|
|
manifest_content_ids_.push_back(*it);
|
|
++it;
|
|
} else {
|
|
// In case of an error, log a warning and continue.
|
|
LOG_WARNING(
|
|
"Can't read indirect asset list for directory '%s': %s. The "
|
|
"affected assets will be updated from disk.",
|
|
rel_path, status.ToString());
|
|
it = dir_.Proto()->mutable_dir_indirect_assets()->erase(it);
|
|
}
|
|
}
|
|
}
|
|
std::sort(asset_infos->begin(), asset_infos->end());
|
|
return is_cancelled() ? absl::CancelledError() : absl::OkStatus();
|
|
}
|
|
|
|
// Both |srcs_assets| and |manifest_assets| must be sorted.
|
|
void CompareAssets(const std::vector<AssetInfo>& src_assets,
|
|
const std::vector<AssetInfo>& manifest_assets) {
|
|
// Compare the arrays, sorting the assets into the right buckets.
|
|
auto src_iter = src_assets.begin();
|
|
auto manifest_iter = manifest_assets.begin();
|
|
|
|
while (src_iter != src_assets.end() ||
|
|
manifest_iter != manifest_assets.end()) {
|
|
const int order = src_iter == src_assets.end()
|
|
? 1 // Extraneous manifest asset.
|
|
: manifest_iter == manifest_assets.end()
|
|
? -1 // Missing/outdated manifest asset.
|
|
: src_iter->path.compare(manifest_iter->path);
|
|
|
|
if (order < 0) {
|
|
// Missing manifest file -> add to manifest.
|
|
operations_.emplace_back(Operator::kAdd, std::move(*src_iter));
|
|
++src_iter;
|
|
} else if (order > 0) {
|
|
// Extraneous manifest asset -> delete.
|
|
operations_.emplace_back(Operator::kDelete, std::move(*manifest_iter));
|
|
++manifest_iter;
|
|
} else if (src_iter->mtime == manifest_iter->mtime &&
|
|
src_iter->type == manifest_iter->type &&
|
|
// For files, compare the size.
|
|
(src_iter->type != AssetProto::FILE ||
|
|
src_iter->size == manifest_iter->size) &&
|
|
// Directories always need to be updated recursively.
|
|
src_iter->type != AssetProto::DIRECTORY) {
|
|
// Assets match, keep content IDs from the manifest asset for populating
|
|
// the FileChunkMap.
|
|
operations_.emplace_back(Operator::kKeep, std::move(*manifest_iter));
|
|
++src_iter;
|
|
++manifest_iter;
|
|
} else {
|
|
// Source asset changed -> update manifest asset.
|
|
operations_.emplace_back(Operator::kUpdate, std::move(*src_iter));
|
|
++src_iter;
|
|
++manifest_iter;
|
|
}
|
|
}
|
|
}
|
|
|
|
DataStoreReader* data_store_;
|
|
AssetBuilder dir_;
|
|
std::vector<ContentIdProto> manifest_content_ids_;
|
|
ManifestUpdater::OperationList operations_;
|
|
};
|
|
|
|
// static
|
|
ContentIdProto ManifestUpdater::GetManifestStoreId() {
|
|
ContentIdProto manifest_store_id;
|
|
ContentId::FromHexString("0000000000000000000000000000000000000000",
|
|
&manifest_store_id);
|
|
return manifest_store_id;
|
|
}
|
|
|
|
// static
|
|
absl::Status ManifestUpdater::IsValidDir(std::string dir) {
|
|
path::EnsureDoesNotEndWithPathSeparator(&dir);
|
|
|
|
if (!path::IsAbsolute(dir)) {
|
|
return absl::FailedPreconditionError(
|
|
absl::StrFormat("Directory '%s' must be an absolute path.", dir));
|
|
}
|
|
|
|
if (!path::Exists(dir)) {
|
|
return absl::NotFoundError(
|
|
absl::StrFormat("Failed to find directory '%s'.", dir));
|
|
}
|
|
|
|
if (!path::DirExists(dir)) {
|
|
return absl::FailedPreconditionError(
|
|
absl::StrFormat("Path '%s' should be a directory.", dir));
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
// Creates an updater that writes manifest data to |data_store| (not owned)
// using the settings in |cfg|.
ManifestUpdater::ManifestUpdater(DataStoreWriter* data_store, UpdaterConfig cfg)
    : data_store_(data_store), cfg_(std::move(cfg)) {
  // Normalize the source directory so that it always ends with a path
  // separator.
  path::EnsureEndsWithPathSeparator(&cfg_.src_dir);
}

ManifestUpdater::~ManifestUpdater() = default;
|
|
|
|
// Rebuilds the manifest for the whole source directory by recursively adding
// the root directory. If the update fails with an Unavailable error, the data
// store is wiped, |file_chunks| is cleared and the manifest is built again
// from scratch. Any other result of the first attempt is returned as is.
absl::Status ManifestUpdater::UpdateAll(
    FileChunkMap* file_chunks,
    PushIntermediateManifest push_intermediate_manifest) {
  RETURN_IF_ERROR(ManifestUpdater::IsValidDir(cfg_.src_dir));

  // Don't use the Windows localized time from path::GetStats.
  time_t root_mtime;
  RETURN_IF_ERROR(path::GetFileTime(cfg_.src_dir, &root_mtime));

  // The root directory is the single seed operation of the recursive search.
  AssetInfo root_info;
  root_info.type = AssetProto::DIRECTORY;
  root_info.mtime = root_mtime;

  std::vector<Operation> operations{{Operator::kAdd, std::move(root_info)}};

  absl::Status status =
      Update(&operations, file_chunks, push_intermediate_manifest,
             /*recursive=*/true);

  // Only an absl::UnavailableError triggers a rebuild. It means that not all
  // manifest chunks could be located.
  const bool needs_rebuild = !status.ok() && absl::IsUnavailable(status);
  if (!needs_rebuild) return status;

  LOG_WARNING("Failed to load manifest, building from scratch: %s",
              status.ToString());

  // Wipe all data and rebuild the manifest from scratch.
  RETURN_IF_ERROR(data_store_->Wipe());
  file_chunks->Clear();

  RETURN_IF_ERROR(Update(&operations, file_chunks, push_intermediate_manifest,
                         /*recursive=*/true),
                  "Failed to build manifest from scratch");

  return absl::OkStatus();
}
|
|
|
|
// Flushes an empty manifest, stores its ID in |manifest_id_| and in the data
// store, and returns it. The empty manifest uses the CDC parameters of the
// manifest currently referenced by the store if that manifest can be loaded
// and its parameters are valid; otherwise the parameters from |cfg_| are used.
// A failure to store the ID is logged but not reported to the caller.
ContentIdProto ManifestUpdater::DefaultManifestId() {
  // Start out with the configured CDC parameters.
  CdcParamsProto params;
  params.set_min_chunk_size(cfg_.min_chunk_size);
  params.set_avg_chunk_size(cfg_.avg_chunk_size);
  params.set_max_chunk_size(cfg_.max_chunk_size);
  ManifestBuilder manifest_builder(params, data_store_);

  // Load the manifest id from the store. It's necessary to extract the CDC
  // parameters used last time.
  ContentIdProto stored_id;
  const bool manifest_loaded =
      data_store_->GetProto(GetManifestStoreId(), &stored_id).ok() &&
      manifest_builder.LoadManifest(stored_id).ok();
  if (manifest_loaded && ValidateCdcParams(manifest_builder.CdcParameters())) {
    params = manifest_builder.CdcParameters();
  }

  // Create an empty manifest with correct CDC parameters.
  ManifestBuilder new_manifest_builder(params, data_store_);
  absl::StatusOr<ContentIdProto> result = new_manifest_builder.Flush();
  assert(result.ok());
  manifest_id_ = *result;

  // Remember the ID of the empty manifest in the data store.
  const std::string id_str = manifest_id_.SerializeAsString();
  absl::Status put_status =
      data_store_->Put(GetManifestStoreId(), id_str.data(), id_str.size());
  if (!put_status.ok()) {
    LOG_ERROR("Failed to store default manifest ID in data store: %s",
              put_status.ToString());
  }
  return manifest_id_;
}
|
|
|
|
// Moves pending assets from |queue_| into |pool| as tasks: a FileChunkerTask
// for each file and a DirScannerTask for each directory. Stops when the pool
// holds MaxQueuedTasks() queued tasks, when |queue_| is empty, or when no
// buffer is left in |buffers_|. Assets of an unexpected type and directories
// that cannot be located in the manifest are logged and dropped.
//
// Returns the number of tasks that were actually queued.
size_t ManifestUpdater::QueueTasks(Threadpool* pool,
                                   const fastcdc::Config* cdc_cfg,
                                   ManifestBuilder* manifest_builder) {
  const size_t queue_limit = MaxQueuedTasks(*pool);
  size_t scheduled = 0;

  for (;;) {
    if (pool->NumQueuedTasks() >= queue_limit) break;
    if (queue_.empty() || buffers_.empty()) break;

    PendingAsset asset = std::move(queue_.front());
    queue_.pop_front();

    if (asset.type == AssetProto::FILE) {
      // Each chunker task takes one buffer out of |buffers_|.
      pool->QueueTask(std::make_unique<FileChunkerTask>(
          cfg_.src_dir, std::move(asset.relative_path),
          std::move(asset.filename), cdc_cfg, std::move(buffers_.back())));
      buffers_.pop_back();
    } else if (asset.type == AssetProto::DIRECTORY) {
      absl::StatusOr<AssetBuilder> dir = manifest_builder->GetOrCreateAsset(
          path::JoinUnix(asset.relative_path, asset.filename),
          AssetProto::DIRECTORY, true);
      if (!dir.ok()) {
        LOG_ERROR(
            "Failed to locate directory '%s' in the manifest, skipping it: "
            "%s",
            asset.relative_path, dir.status().ToString());
        continue;
      }
      pool->QueueTask(std::make_unique<DirScannerTask>(
          cfg_.src_dir, std::move(asset.relative_path),
          std::move(asset.filename), std::move(dir.value()), data_store_));
    } else {
      LOG_ERROR("Unexpected type '%s' for asset '%s'",
                AssetProto::Type_Name(asset.type), asset.relative_path);
      continue;
    }

    ++scheduled;
  }
  return scheduled;
}
|
|
|
|
// Applies |operations| to the manifest managed by |manifest_builder| and to
// |file_chunks|, in two passes:
//  1. All deletions, so that the outcome does not depend on the order of the
//     operations.
//  2. Keeps, additions and updates. Kept assets only (re-)initialize their
//     entry in |file_chunks|. Added and updated files are marked as in
//     progress and queued in |queue_| for chunking; directories are queued
//     for scanning if |recursive| is set.
//
// If |parent| is not null, added assets are appended as direct children of
// |parent|; otherwise they are looked up or created by path.
// Returns the first error reported by the manifest builder, if any.
absl::Status ManifestUpdater::ApplyOperations(
    std::vector<Operation>* operations, FileChunkMap* file_chunks,
    ManifestBuilder* manifest_builder, AssetBuilder* parent, bool recursive) {
  assert(manifest_builder != nullptr);
  if (operations->empty()) return absl::OkStatus();

  // Pass 1: deletions (e.g., the same file may be added and deleted again).
  const std::string* prev_deleted_path = nullptr;
  for (const Operation& op : *operations) {
    if (op.op != Operator::kDelete) continue;
    const std::string& asset_path = op.info.path;

    ++stats_.total_assets_deleted;
    file_chunks->Remove(asset_path);

    // Optimization: an asset located below the directory that was deleted
    // last is already gone from the manifest, so it can be skipped.
    const bool inside_deleted_dir =
        prev_deleted_path != nullptr &&
        absl::StartsWith(asset_path, *prev_deleted_path) &&
        asset_path[prev_deleted_path->size()] == '/';
    if (inside_deleted_dir) continue;

    RETURN_IF_ERROR(manifest_builder->DeleteAsset(asset_path),
                    "Failed to delete asset '%s' from manifest", asset_path);
    prev_deleted_path = &asset_path;
  }

  // Pass 2: keeps, additions and updates.
  AssetBuilder asset_builder;
  for (Operation& op : *operations) {
    if (op.op == Operator::kDelete) continue;

    AssetInfo& ai = op.info;
    if (op.op == Operator::kKeep) {
      file_chunks->Init(ai.path, ai.size, &ai.chunks);
      continue;
    }

    bool created = true;
    if (op.op == Operator::kAdd && parent != nullptr) {
      // If a parent was given, assets are added as direct children of that
      // parent directory.
      asset_builder = parent->AppendAsset(path::BaseName(ai.path), ai.type);
    } else if (op.op == Operator::kAdd || op.op == Operator::kUpdate) {
      ASSIGN_OR_RETURN(asset_builder,
                       manifest_builder->GetOrCreateAsset(ai.path, ai.type,
                                                          true, &created),
                       "Failed to add '%s' to the manifest", ai.path);
    }

    if (created) ++stats_.total_assets_added_or_updated;
    asset_builder.SetMtimeSeconds(ai.mtime);

    const bool is_file = ai.type == AssetProto::FILE;
    const bool is_dir = ai.type == AssetProto::DIRECTORY;
    if (is_file) {
      // Assume everything is executable for the intermediate manifest.
      // The executable bit is derived from the file data, which is not
      // available at this point.
      asset_builder.SetPermissions(kExecutablePerms);
      asset_builder.TruncateChunks();
      asset_builder.SetFileSize(ai.size);
      // Files stay in progress until the chunker task has processed them.
      asset_builder.SetInProgress(true);
    } else if (recursive && is_dir) {
      // When recursing into all sub-directories, the child directory is
      // queued up for scanning.
      asset_builder.SetInProgress(true);
    }

    // If the asset is marked as in-progress, we need to queue it up.
    if (asset_builder.InProgress()) {
      queue_.emplace_back(ai.type, asset_builder.RelativePath(),
                          asset_builder.Name());
    }
  }
  return absl::OkStatus();
}
|
|
|
|
// Processes a finished FileChunkerTask: returns the task's buffer to
// |buffers_|, clears the in-progress flag of the file's manifest asset and
// stores the chunks found by the task in the asset and in |file_chunks|.
// If the task failed, the file is recorded as empty and the task's status is
// returned.
absl::Status ManifestUpdater::HandleFileChunkerResult(
    FileChunkerTask* task, FileChunkMap* file_chunks,
    ManifestBuilder* manifest_builder) {
  const std::string rel_file_path = task->RelativeUnixFilePath();
  // Take the buffer back first so it is available again even if an error is
  // returned below.
  buffers_.emplace_back(task->ReleaseBuffer());

  AssetBuilder asset_builder;
  ASSIGN_OR_RETURN(asset_builder, manifest_builder->GetOrCreateAsset(
                                      rel_file_path, AssetProto::FILE));
  asset_builder.SetInProgress(false);

  const absl::Status& task_status = task->Status();
  if (!task_status.ok()) {
    // In case of an error, pretend the file is empty.
    asset_builder.SetFileSize(0);
    file_chunks->Init(rel_file_path, 0);

    ++stats_.total_files_failed;
    return task_status;
  }

  // Update the stats.
  const uint64_t file_size = task->ProcessedBytes();
  auto* chunks = task->Chunks();
  stats_.total_chunks += chunks->size();
  stats_.total_processed_bytes += file_size;
  ++stats_.total_files_added_or_updated;

  // Update the asset.
  asset_builder.SwapChunks(chunks, file_size);
  if (task->IsExecutable()) {
    asset_builder.SetPermissions(kExecutablePerms);
  } else {
    asset_builder.SetPermissions(ManifestBuilder::kDefaultFilePerms);
  }

  // Mirror the chunks that are now stored in the asset into |file_chunks|.
  file_chunks->Init(rel_file_path, file_size);
  file_chunks->AppendCopy(rel_file_path, asset_builder.Proto()->file_chunks(),
                          0);

  return absl::OkStatus();
}
|
|
|
|
// Processes a finished DirScannerTask: applies the operations found by the
// task to the manifest (with the scanned directory as parent), clears the
// directory's in-progress flag and adds the manifest content IDs fetched by
// the task to |manifest_content_ids|. A failed task is counted in the stats,
// but its (partial) results are still applied. Returns an error from applying
// the operations, or else the task's own status.
absl::Status ManifestUpdater::HandleDirScannerResult(
    DirScannerTask* task, FileChunkMap* file_chunks,
    ManifestBuilder* manifest_builder,
    std::unordered_set<ContentIdProto>* manifest_content_ids) {
  const absl::Status& task_status = task->Status();
  if (!task_status.ok()) {
    ++stats_.total_dirs_failed;
  }

  // DirScannerTasks are inherently recursive.
  AssetBuilder* scanned_dir = task->Dir();
  RETURN_IF_ERROR(ApplyOperations(task->Operations(), file_chunks,
                                  manifest_builder, scanned_dir,
                                  /*recursive=*/true));
  scanned_dir->SetInProgress(false);

  // Union all manifest chunk content IDs.
  assert(manifest_content_ids != nullptr);
  std::vector<ContentIdProto>* fetched_ids = task->ManifestContentIds();
  manifest_content_ids->insert(fetched_ids->begin(), fetched_ids->end());

  return task_status;
}
|
|
|
|
// Applies |operations| to the manifest and processes all resulting work:
//  1. Loads the current manifest from the data store, if there is one.
//  2. Applies |operations|, which queues files for chunking and (if
//     |recursive|) directories for scanning.
//  3. If |push_intermediate_manifest| is set and work was queued, flushes an
//     intermediate manifest and reports its ID through the callback.
//  4. Runs the queued chunker and scanner tasks on a thread pool. Failures of
//     individual tasks are logged and do not abort the update.
//  5. Flushes the final manifest and stores its ID in the data store.
//  6. If |recursive|, prunes manifest chunks that are no longer referenced.
//
// Returns an Unavailable error if pruning fails, which signals to the caller
// that the manifest needs to be rebuilt from scratch.
absl::Status ManifestUpdater::Update(
    OperationList* operations, FileChunkMap* file_chunks,
    PushIntermediateManifest push_intermediate_manifest, bool recursive) {
  Stopwatch sw;
  LOG_INFO(
      "Updating manifest for '%s': applying %u changes, "
      "%srecursive",
      cfg_.src_dir, operations->size(), recursive ? "" : "non-");

  stats_ = UpdaterStats();

  CdcParamsProto cdc_params;
  cdc_params.set_min_chunk_size(cfg_.min_chunk_size);
  cdc_params.set_avg_chunk_size(cfg_.avg_chunk_size);
  cdc_params.set_max_chunk_size(cfg_.max_chunk_size);
  ManifestBuilder manifest_builder(cdc_params, data_store_);

  // Load the manifest id from the store.
  ContentIdProto manifest_id;
  absl::Status status =
      data_store_->GetProto(GetManifestStoreId(), &manifest_id);
  if (!status.ok()) {
    if (!absl::IsNotFound(status))
      return WrapStatus(status, "Failed to load manifest id");

    // A non-existing manifest is not an issue, just build it from scratch.
    LOG_INFO("No cached manifest found. Building from scratch.");
  } else {
    RETURN_IF_ERROR(manifest_builder.LoadManifest(manifest_id),
                    "Failed to load manifest with id '%s'",
                    ContentId::ToHexString(manifest_id));
    // The CDC params might have changed when loading the manifest.
    if (ValidateCdcParams(manifest_builder.Manifest()->cdc_params())) {
      cdc_params = manifest_builder.Manifest()->cdc_params();
    }
  }

  RETURN_IF_ERROR(ApplyOperations(operations, file_chunks, &manifest_builder,
                                  nullptr, recursive));

  // std::thread::hardware_concurrency() may return 0 if the value cannot be
  // determined. Fall back to a single thread in that case; a pool without
  // threads would never complete a task.
  auto num_threads = cfg_.num_threads > 0
                         ? cfg_.num_threads
                         : std::thread::hardware_concurrency();
  if (num_threads == 0) num_threads = 1;
  Threadpool pool(num_threads);

  // Pre-allocate one buffer per queueable task with 2 * max_chunk_size.
  const size_t max_queued_tasks = MaxQueuedTasks(pool);
  buffers_.reserve(max_queued_tasks);
  while (buffers_.size() < max_queued_tasks)
    buffers_.emplace_back(cfg_.max_chunk_size << 1);
  size_t num_tasks_queued = 0;

  // Collect the content IDs that make up the manifest when recursing. They are
  // used to prune the manifest cache directory in the end.
  std::unordered_set<ContentIdProto> manifest_content_ids;
  // Becomes false if the result of any directory scan could not be processed.
  // In that case |manifest_content_ids| is incomplete and must not be used
  // for pruning.
  bool all_dirs_processed = true;

  // Push intermediate manifest if there are queued chunker tasks.
  if (push_intermediate_manifest && !queue_.empty()) {
    file_chunks->FlushUpdates();
    ASSIGN_OR_RETURN(manifest_id_, manifest_builder.Flush(),
                     "Failed to flush intermediate manifest");
    // Add all content IDs that were just written back.
    manifest_content_ids.insert(manifest_builder.FlushedContentIds().begin(),
                                manifest_builder.FlushedContentIds().end());
    push_intermediate_manifest(manifest_id_);
  }

  fastcdc::Config cdc_cfg = CdcConfigFromProto(cdc_params);

  // Wait for the chunker tasks and update file assets.
  while (!queue_.empty() || num_tasks_queued > 0) {
    num_tasks_queued += QueueTasks(&pool, &cdc_cfg, &manifest_builder);
    // QueueTasks() drops queue entries it cannot turn into tasks. If nothing
    // is in flight afterwards, there is no task to wait for; re-check the
    // loop condition instead of waiting for a completion that never comes.
    if (num_tasks_queued == 0) continue;

    std::unique_ptr<Task> task = pool.GetCompletedTask();
    --num_tasks_queued;

    FileChunkerTask* chunker_task = dynamic_cast<FileChunkerTask*>(task.get());
    if (chunker_task) {
      status =
          HandleFileChunkerResult(chunker_task, file_chunks, &manifest_builder);

      if (!status.ok()) {
        LOG_ERROR("Failed to process file '%s': %s", chunker_task->FilePath(),
                  status.ToString());
      }
      continue;
    }

    DirScannerTask* scanner_task = dynamic_cast<DirScannerTask*>(task.get());
    if (scanner_task) {
      status = HandleDirScannerResult(scanner_task, file_chunks,
                                      &manifest_builder, &manifest_content_ids);
      if (!status.ok()) {
        all_dirs_processed = false;
        LOG_ERROR("Failed to process directory '%s': %s",
                  scanner_task->FilePath(), status.ToString());
      }
      continue;
    }
  }

  file_chunks->FlushUpdates();
  ASSIGN_OR_RETURN(manifest_id_, manifest_builder.Flush(),
                   "Failed to flush manifest");

  // Save the manifest id to the store.
  std::string id_str = manifest_id_.SerializeAsString();
  RETURN_IF_ERROR(
      data_store_->Put(GetManifestStoreId(), id_str.data(), id_str.size()),
      "Failed to store manifest id");

  // Remove manifest chunks that are no longer referenced when recursing through
  // all sub-directories. This also makes sure that all referenced manifest
  // chunks are present. |status| only holds the result of the last completed
  // task, so additionally require that every directory scan was processed.
  if (status.ok() && all_dirs_processed && recursive) {
    // Retain the chunk that stores the manifest ID.
    manifest_content_ids.insert(ManifestUpdater::GetManifestStoreId());
    // Add all content IDs that were just written back.
    manifest_content_ids.insert(manifest_builder.FlushedContentIds().begin(),
                                manifest_builder.FlushedContentIds().end());
    status = data_store_->Prune(std::move(manifest_content_ids));
    if (!status.ok()) {
      // Signal to the caller that the manifest needs to be rebuilt from
      // scratch.
      return absl::UnavailableError(status.ToString());
    }
  }

  LOG_INFO("Manifest for '%s' successfully updated in %0.3f seconds",
           cfg_.src_dir, sw.ElapsedSeconds());

  return absl::OkStatus();
}
|
|
|
|
} // namespace cdc_ft
|