netris-cdc-file-transfer/data_store/disk_data_store.cc

Commit 4326e972ac by Christian Schneider (2022-11-03 10:39:10 +01:00):
Releasing the former Stadia file transfer tools

The tools allow efficient and fast synchronization of large directory
trees from a Windows workstation to a Linux target machine.

cdc_rsync* supports efficient copying of files by using content-defined
chunking (CDC) to identify chunks within files that can be reused.

asset_stream_manager + cdc_fuse_fs support efficient streaming of a
local directory to a remote virtual file system based on FUSE. They
also employ CDC to identify and reuse unchanged data chunks.
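
For background, content-defined chunking picks chunk boundaries from the
bytes themselves, typically where a rolling hash over a small window matches
a fixed bit pattern, rather than at fixed offsets. A local edit therefore
only shifts the boundaries near the change, and all other chunks keep their
content ids and can be reused. The sketch below is a minimal gear-hash
chunker for illustration only; the exact rolling hash, window, and
chunk-size parameters used by these tools may differ, and real chunkers also
clamp chunk sizes to a min/max.

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

// Returns the end offsets of content-defined chunks in |data|. A boundary is
// declared where the low bits of a rolling "gear" hash are zero; because the
// hash depends only on the last ~64 input bytes, boundaries re-synchronize
// shortly after a local edit and all other chunks keep their content ids.
std::vector<size_t> FindChunkBoundaries(const uint8_t* data, size_t size) {
  // Pseudo-random per-byte values derived from a fixed seed (splitmix64).
  static const std::array<uint64_t, 256> kGear = [] {
    std::array<uint64_t, 256> table{};
    uint64_t x = 0;
    for (uint64_t& g : table) {
      x += 0x9e3779b97f4a7c15ull;
      uint64_t z = x;
      z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ull;
      z = (z ^ (z >> 27)) * 0x94d049bb133111ebull;
      g = z ^ (z >> 31);
    }
    return table;
  }();
  constexpr uint64_t kMask = (1ull << 13) - 1;  // ~8 KiB average chunk size.
  std::vector<size_t> boundaries;
  uint64_t hash = 0;
  for (size_t i = 0; i < size; ++i) {
    hash = (hash << 1) + kGear[data[i]];  // Bytes older than 64 shift out.
    if ((hash & kMask) == 0) {            // Content-defined cut point.
      boundaries.push_back(i + 1);
      hash = 0;
    }
  }
  if (boundaries.empty() || boundaries.back() != size)
    boundaries.push_back(size);  // The final (possibly short) chunk.
  return boundaries;
}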

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/disk_data_store.h"
#include <filesystem>
#include <memory>
#include "common/log.h"
#include "common/path.h"
#include "common/status.h"
#include "common/status_macros.h"
namespace cdc_ft {
namespace {
static constexpr char kDirNames[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

// Generates directory names of |length| symbols from kDirNames.
// If length = 2, the names are 00, 01, 02, etc.
std::vector<std::string> GenerateDirNames(size_t length) {
  size_t names_size = 1ull << (length * 4);
  std::vector<std::string> names(names_size, std::string(length, '0'));
  for (size_t idx = 0; idx < names_size; ++idx) {
    size_t symbol = idx;
    for (size_t jdx = 0; jdx < length; ++jdx) {
      names[idx][jdx] = kDirNames[symbol & 0xfu];
      symbol >>= 4;
    }
  }
  return names;
}

// Adds up to |count| path separators to |input| after every |distance|
// symbols, starting from the beginning. At least one symbol is left at the
// end for the file name.
// AddPathSeparators("abc", 1, 3) -> a\b\c
// AddPathSeparators("abc", 1, 0) -> abc
// AddPathSeparators("abc", 2, 100) -> ab\c
std::string AddPathSeparators(const std::string& input, size_t distance,
                              size_t count) {
  if (input.empty() || distance == 0 || count == 0) {
    return input;
  }
  count = std::min((input.size() - 1) / distance, count);
  std::string path;
  path.reserve(input.size() + count);
  std::string::const_iterator it_pos = input.begin();
  while (count > 0 && it_pos < input.end()) {
    path.append(it_pos, it_pos + distance);
    path.push_back(path::PathSeparator());
    it_pos += distance;
    --count;
  }
  if (it_pos < input.end()) {
    path.append(it_pos, input.end());
  }
  return path;
}

}  // namespace

DiskDataStore::DiskDataStore(unsigned int depth, std::string cache_root_dir,
                             bool create_dirs, SystemClock* clock)
    : depth_(depth),
      root_dir_(std::move(cache_root_dir)),
      create_dirs_(create_dirs),
      clock_(clock) {
  assert(!root_dir_.empty());
  path::EnsureEndsWithPathSeparator(&root_dir_);
}
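
// Creates a DiskDataStore rooted at |cache_root_dir|. If |create_dirs| is
// set, the full directory fan-out (16^kDirNameLength directories per level,
// |depth| levels deep) is pre-created up front; otherwise directories are
// created lazily in Put().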
absl::StatusOr<std::unique_ptr<DiskDataStore>> DiskDataStore::Create(
    unsigned int depth, std::string cache_root_dir, bool create_dirs,
    SystemClock* clock) {
  std::unique_ptr<DiskDataStore> store = absl::WrapUnique(
      new DiskDataStore(depth, std::move(cache_root_dir), create_dirs, clock));
  if (create_dirs) {
    RETURN_IF_ERROR(store->CreateDirHierarchy());
  }
  return store;
}

DiskDataStore::~DiskDataStore() {}
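
// Stores a chunk under its content-addressed path and updates the tracked
// cache size. The file's mtime is bumped so that Cleanup() treats it as
// recently used.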
absl::Status DiskDataStore::Put(const ContentIdProto& content_id,
                                const void* data, size_t size) {
  std::string path = GetCacheFilePath(content_id);
  if (!create_dirs_) {
    // The directory hierarchy was not pre-created, so create this chunk's
    // directory on demand.
    RETURN_IF_ERROR(path::CreateDirRec(path::DirName(path)));
  }
  RETURN_IF_ERROR(path::WriteFile(path, data, size));
  UpdateModificationTime(path);
  size_.fetch_add(size, std::memory_order_relaxed);
  return absl::OkStatus();
}
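
// Reads up to |size| bytes of the chunk starting at |offset| and returns the
// number of bytes actually read. Also bumps the file's mtime for LRU
// eviction.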
absl::StatusOr<size_t> DiskDataStore::Get(const ContentIdProto& content_id,
                                          void* data, size_t offset,
                                          size_t size) {
  if (!size) return 0;
  assert(data);
  std::string path = GetCacheFilePath(content_id);
  size_t read_size;
  ASSIGN_OR_RETURN(read_size, path::ReadFile(path, data, offset, size),
                   "Failed to read chunk %s of size %d at offset %d",
                   ContentId::ToHexString(content_id), size, offset);
  UpdateModificationTime(path);
  return read_size;
}
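
// Reads the whole chunk into |data|, resizing the buffer to the file size.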
absl::Status DiskDataStore::Get(const ContentIdProto& content_id,
                                Buffer* data) {
  assert(data);
  std::string path = GetCacheFilePath(content_id);
  size_t read_size = 0;
  size_t file_size = 0;
  RETURN_IF_ERROR(path::FileSize(path, &file_size),
                  "Failed to stat file size for '%s'", path);
  data->resize(file_size);
  ASSIGN_OR_RETURN(read_size, path::ReadFile(path, data->data(), 0, file_size),
                   "Failed to read %s of size %d",
                   ContentId::ToHexString(content_id), file_size);
  if (read_size != file_size) {
    return absl::DataLossError(
        absl::StrFormat("Only %u bytes out of %u were read for %s", read_size,
                        file_size, ContentId::ToHexString(content_id)));
  }
  UpdateModificationTime(path);
  return absl::OkStatus();
}

int64_t DiskDataStore::Capacity() const { return capacity_; }

double DiskDataStore::FillFactor() const { return fill_factor_; }

unsigned int DiskDataStore::Depth() const { return depth_; }

size_t DiskDataStore::Size() const { return size_; }

const std::string& DiskDataStore::RootDir() const { return root_dir_; }

void DiskDataStore::SetCapacity(int64_t capacity) { capacity_ = capacity; }

absl::Status DiskDataStore::SetFillFactor(double fill_factor) {
  if (fill_factor <= 0 || fill_factor > 1) {
    return absl::FailedPreconditionError(absl::StrFormat(
        "Invalid cache fill factor %f; must be in (0, 1]", fill_factor));
  }
  fill_factor_ = fill_factor;
  return Cleanup();
}
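
// Deletes the entire cache directory, then re-creates the directory
// hierarchy if this store owns it.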
absl::Status DiskDataStore::Wipe() {
  RETURN_IF_ERROR(path::RemoveDirRec(root_dir_),
                  "RemoveDirRec() for '%s' failed", root_dir_);
  size_ = 0;
  if (create_dirs_) {
    RETURN_IF_ERROR(CreateDirHierarchy());
  }
  return absl::OkStatus();
}
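
// Deletes every chunk that is not listed in |ids_to_keep| and verifies that
// all listed chunks are actually present in the cache.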
absl::Status DiskDataStore::Prune(
    std::unordered_set<ContentIdProto> ids_to_keep) {
  CacheFilesWithSize files_with_size;
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles(),
                   "Failed to collect cache files");
  // Delete the set of chunks not in |ids_to_keep|.
  for (const CacheFile& file : files_with_size.files) {
    // Don't touch files that don't match the chunk naming scheme
    // (e.g. user-added files).
    ContentIdProto id;
    if (!ParseCacheFilePath(file.path, &id)) continue;
    if (ids_to_keep.find(id) == ids_to_keep.end()) {
      RETURN_IF_ERROR(Remove(id));
      size_.fetch_sub(file.size, std::memory_order_relaxed);
    } else {
      ids_to_keep.erase(id);
    }
  }
  // Verify that all chunks in |ids_to_keep| are present in the cache.
  if (!ids_to_keep.empty()) {
    return absl::NotFoundError(absl::StrFormat(
        "%u chunks, e.g. '%s', not found in the store", ids_to_keep.size(),
        ContentId::ToHexString(*ids_to_keep.begin())));
  }
  return absl::OkStatus();
}

absl::Status DiskDataStore::Remove(const ContentIdProto& content_id) {
  std::string path = GetCacheFilePath(content_id);
  return path::RemoveFile(path);
}

bool DiskDataStore::Contains(const ContentIdProto& content_id) {
  return path::Exists(GetCacheFilePath(content_id));
}
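
// Evicts least-recently-used chunks (by file mtime) until the cache size
// drops to capacity_ * fill_factor_. A negative capacity disables eviction.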
absl::Status DiskDataStore::Cleanup() {
  if (capacity_ < 0) {
    return absl::OkStatus();
  }
  size_t size_threshold = static_cast<size_t>(capacity_) * fill_factor_;
  if (size_initialized_.load() && size_ <= size_threshold) {
    return absl::OkStatus();
  }
  CacheFilesWithSize files_with_size;
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles());
  LOG_DEBUG("Cache size before the cleanup: %u bytes", size_.load());
  std::vector<CacheFile>& files = files_with_size.files;
  // Sort in LRU order: the oldest files come first.
  std::sort(files.begin(), files.end(),
            [](const CacheFile& file1, const CacheFile& file2) {
              // Also sort by path for deterministic results in tests.
              if (file1.mtime == file2.mtime) return file1.path < file2.path;
              return file1.mtime < file2.mtime;
            });
  size_t file_index = 0;
  const size_t num_of_files = files.size();
  while (size_ > size_threshold && file_index < num_of_files) {
    std::string path = path::Join(root_dir_, files[file_index].path);
    RETURN_IF_ERROR(path::RemoveFile(path));
    size_.fetch_sub(files[file_index].size, std::memory_order_relaxed);
    ++file_index;
    if (interrupt_ && *interrupt_) {
      return absl::CancelledError("Cache cleanup has been cancelled");
    }
  }
  LOG_DEBUG("Cache size after the cleanup: %u bytes", size_.load());
  return absl::OkStatus();
}

absl::StatusOr<std::vector<ContentIdProto>> DiskDataStore::List() {
  CacheFilesWithSize files_with_size;
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles(true),
                   "Failed to collect cache files");
  std::vector<ContentIdProto> ids;
  ids.reserve(files_with_size.files.size());
  for (const CacheFile& file : files_with_size.files) {
    ContentIdProto id;
    if (ParseCacheFilePath(file.path, &id)) ids.push_back(std::move(id));
  }
  return ids;
}

absl::StatusOr<DiskDataStore::Statistics> DiskDataStore::CalculateStatistics()
    const {
  Statistics statistics;
  auto handler = [&](const std::string& dir, const std::string& filename,
                     int64_t /*modified_time*/, uint64_t size,
                     bool is_directory) -> absl::Status {
    if (!is_directory) {
      statistics.size += size;
      ++statistics.number_of_chunks;
    }
    return absl::OkStatus();
  };
  RETURN_IF_ERROR(path::SearchFiles(root_dir_, true, handler));
  return statistics;
}
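
// Recursively lists all files under the cache root together with their
// mtimes and sizes, and refreshes the tracked total cache size as a side
// effect.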
absl::StatusOr<DiskDataStore::CacheFilesWithSize>
DiskDataStore::CollectCacheFiles(bool continue_on_interrupt) {
  CacheFilesWithSize cache_files;
  if (!path::DirExists(root_dir_)) return cache_files;
  auto handler = [&](const std::string& dir, const std::string& filename,
                     int64_t modified_time, uint64_t size,
                     bool is_directory) -> absl::Status {
    if (!is_directory) {
      cache_files.files.emplace_back();
      cache_files.files.back().path =
          path::Join(dir.substr(root_dir_.size()), filename);
      cache_files.files.back().mtime = modified_time;
      cache_files.files.back().size = size;
      cache_files.size += size;
    }
    if (!continue_on_interrupt && interrupt_ && *interrupt_) {
      return absl::CancelledError("Cache file collection has been cancelled");
    }
    return absl::OkStatus();
  };
  RETURN_IF_ERROR(path::SearchFiles(root_dir_, true, handler));
  size_ = cache_files.size;
  size_initialized_ = true;
  return cache_files;
}
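
// Maps a content id to its path in the cache. For example, with
// kDirNameLength == 2 and depth_ == 2, a chunk with the (hypothetical)
// hex id "abcdef..." is stored at <root>/ab/cd/ef...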
std::string DiskDataStore::GetCacheFilePath(
    const ContentIdProto& content_id) const {
  std::string file_name = AddPathSeparators(ContentId::ToHexString(content_id),
                                            kDirNameLength, depth_);
  return path::Join(root_dir_, file_name);
}
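
// Inverse of GetCacheFilePath() for paths relative to the cache root:
// strips the path separators and parses the remaining hex string. Returns
// false for files that don't match the chunk naming scheme.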
bool DiskDataStore::ParseCacheFilePath(std::string path,
                                       ContentIdProto* content_id) const {
  // Remove path separators.
  if (depth_ > 0) {
    path.erase(std::remove_if(path.begin(), path.end(),
                              [](char c) {
                                return c == path::PathSeparator() ||
                                       c == path::OtherPathSeparator();
                              }),
               path.end());
  }
  return ContentId::FromHexString(path, content_id);
}

void DiskDataStore::UpdateModificationTime(const std::string& path) {
  // Don't fail if the time cannot be modified.
  // The time might be updated in parallel, so it is not critical.
  path::SetFileTime(path, std::chrono::system_clock::to_time_t(clock_->Now()))
      .IgnoreError();
}
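
// Pre-creates all 16^kDirNameLength directories per level, depth_ levels
// deep, so that Put() doesn't need to create directories on demand.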
absl::Status DiskDataStore::CreateDirHierarchy() {
  if (dirs_.empty() && depth_ > 0) {
    dirs_ = GenerateDirNames(kDirNameLength);
  }
  RETURN_IF_ERROR(path::CreateDirRec(root_dir_));
  return CreateDirLevelRec(root_dir_, depth_);
}

absl::Status DiskDataStore::CreateDirLevelRec(const std::string& parent,
                                              unsigned int depth) {
  if (depth == 0) {
    return absl::OkStatus();
  }
  for (const std::string& dir : dirs_) {
    std::string name = path::Join(parent, dir);
    RETURN_IF_ERROR(path::CreateDir(name));
    RETURN_IF_ERROR(CreateDirLevelRec(name, depth - 1),
                    "CreateDirLevelRec() for %s failed at level %d:", name,
                    depth - 1);
  }
  return absl::OkStatus();
}

}  // namespace cdc_ft