// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/file_chunk_map.h"
#include "absl/strings/str_format.h"
#include "manifest/stats_printer.h"
namespace cdc_ft {
FileChunkMap::FileChunkMap(bool enable_stats) {
if (enable_stats) stats_ = std::make_unique<StatsPrinter>();
}
FileChunkMap::~FileChunkMap() = default;
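
// Queues a pending update that (re)initializes the chunk list for |path| with
// |file_size| and, if given, takes ownership of |chunks|. The update is
// applied on the next FlushUpdates() call.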
void FileChunkMap::Init(std::string path, uint64_t file_size,
                        std::vector<FileChunk>* chunks) {
  FileUpdate update(FileUpdateType::kInit, std::move(path));
  update.file_size = file_size;
  if (chunks) update.chunks = std::move(*chunks);
  file_updates_.push_back(std::move(update));
}
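
// Queues an append update for |path|, copying the chunk refs from |list| and
// shifting each chunk offset by |list_offset|.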
void FileChunkMap::AppendCopy(std::string path,
                              const RepeatedChunkRefProto& list,
                              uint64_t list_offset) {
  FileUpdate update(FileUpdateType::kAppend, std::move(path));
  update.chunks.reserve(list.size());
  for (const ChunkRefProto& ch : list)
    update.chunks.emplace_back(ch.chunk_id(), ch.offset() + list_offset);
  file_updates_.push_back(std::move(update));
}
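
// Same as AppendCopy(), but moves the chunk IDs out of |list| instead of
// copying them.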
void FileChunkMap::AppendMove(std::string path, RepeatedChunkRefProto* list,
                              uint64_t list_offset) {
  FileUpdate update(FileUpdateType::kAppend, std::move(path));
  update.chunks.reserve(list->size());
  for (ChunkRefProto& ch : *list) {
    update.chunks.emplace_back(std::move(*ch.mutable_chunk_id()),
                               ch.offset() + list_offset);
  }
  file_updates_.push_back(std::move(update));
}
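
// Queues an update that removes |path| and its chunks from the map.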
void FileChunkMap::Remove(std::string path) {
  FileUpdate update(FileUpdateType::kRemove, std::move(path));
  file_updates_.push_back(std::move(update));
}
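
// Queues an update that clears all files and chunks.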
void FileChunkMap::Clear() {
  FileUpdate update(FileUpdateType::kClear, std::string());
  file_updates_.push_back(std::move(update));
}
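
// Applies all queued updates to |path_to_file_| under the lock, then rebuilds
// the chunk lookup map and, if enabled, the streaming stats.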
void FileChunkMap::FlushUpdates() {
  if (file_updates_.empty()) return;
  absl::MutexLock lock(&mutex_);
  for (FileUpdate& update : file_updates_) {
    switch (update.type) {
      case FileUpdateType::kInit: {
        File& file = path_to_file_[update.path];
        file.size = update.file_size;
        assert(total_chunks_ >= file.chunks.size());
        total_chunks_ -= file.chunks.size();
        total_chunks_ += update.chunks.size();
        file.chunks = std::move(update.chunks);
        break;
      }
      case FileUpdateType::kAppend: {
        File& file = path_to_file_[update.path];
        total_chunks_ += update.chunks.size();
        if (file.chunks.empty()) {
          file.chunks = std::move(update.chunks);
        } else {
          file.chunks.reserve(file.chunks.size() + update.chunks.size());
          std::move(std::begin(update.chunks), std::end(update.chunks),
                    std::back_inserter(file.chunks));
        }
        break;
      }
      case FileUpdateType::kRemove: {
        const auto iter = path_to_file_.find(update.path);
        if (iter == path_to_file_.end()) break;
        assert(total_chunks_ >= iter->second.chunks.size());
        total_chunks_ -= iter->second.chunks.size();
        path_to_file_.erase(iter);
        break;
      }
      case FileUpdateType::kClear: {
        path_to_file_.clear();
        total_chunks_ = 0;
        break;
      }
    }
  }
  file_updates_.clear();
  UpdateIdToChunkMap();
}
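
// Looks up the file path, offset and size of the chunk with the given
// |content_id|. Returns false if the chunk is not known.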
bool FileChunkMap::Lookup(const ContentIdProto& content_id, std::string* path,
                          uint64_t* offset, uint32_t* size) {
  assert(path && offset && size);
  absl::MutexLock lock(&mutex_);
  return FindChunk(content_id, path, offset, size, nullptr);
}
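
// Records that the chunk with |content_id| was streamed by the given thread.
// Only the first report per chunk is counted in the stats.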
void FileChunkMap::RecordStreamedChunk(const ContentIdProto& content_id,
                                       size_t thread_id) {
  absl::MutexLock lock(&mutex_);
  if (!stats_) return;
  if (streamed_chunks_to_thread_.find(content_id) !=
      streamed_chunks_to_thread_.end()) {
    return;
  }
  std::string path;
  uint32_t size;
  size_t index;
  if (FindChunk(content_id, &path, nullptr, &size, &index))
    stats_->RecordStreamedChunk(path, index, size, thread_id);
  streamed_chunks_to_thread_[content_id] = thread_id;
}
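
// Records that the chunk with |content_id| was served from the local cache.
// Chunks that were already recorded as streamed are ignored.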
void FileChunkMap::RecordCachedChunk(const ContentIdProto& content_id) {
  absl::MutexLock lock(&mutex_);
  if (!stats_) return;
  if (cached_chunks_.find(content_id) != cached_chunks_.end()) return;
  // Restarting FUSE might report cached chunks that have been originally
  // streamed. Ignore those.
  if (streamed_chunks_to_thread_.find(content_id) !=
      streamed_chunks_to_thread_.end()) {
    return;
  }
  std::string path;
  uint32_t size;
  size_t index;
  if (FindChunk(content_id, &path, nullptr, &size, &index))
    stats_->RecordCachedChunk(path, index, size);
  cached_chunks_.insert(content_id);
}

void FileChunkMap::PrintStats() {
  absl::MutexLock lock(&mutex_);
  if (!stats_) return;
  stats_->Print();
}

bool FileChunkMap::HasStats() const {
  absl::ReaderMutexLock lock(&mutex_);
  return stats_ != nullptr;
}
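
// Rebuilds |id_to_chunk_| from |path_to_file_| and re-initializes the
// per-file streaming stats. Must be called with |mutex_| held.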
void FileChunkMap::UpdateIdToChunkMap() {
  assert((mutex_.AssertHeld(), true));

  // Put all chunks into the map.
  id_to_chunk_.clear();
  id_to_chunk_.reserve(total_chunks_);
  for (const auto& [path, file] : path_to_file_) {
    for (uint32_t n = 0; n < static_cast<uint32_t>(file.chunks.size()); ++n)
      id_to_chunk_[ContentIdRef(file.chunks[n].content_id)] = {&path, n};
  }

  // Might be "<" if multiple files contain the same chunk.
  assert(id_to_chunk_.size() <= total_chunks_);

  // Rebuild stats if present.
  if (stats_) {
    stats_->Clear();
    for (const auto& [path, file] : path_to_file_)
      stats_->InitFile(path, file.chunks.size());

    // Fill in the streamed chunks.
    std::string path;
    uint32_t size;
    size_t index;
    for (const auto& [id, thread_id] : streamed_chunks_to_thread_) {
      if (FindChunk(id, &path, nullptr, &size, &index))
        stats_->RecordStreamedChunk(path, index, size, thread_id);
    }

    // Fill in the cached chunks.
    for (const ContentIdProto& id : cached_chunks_) {
      if (FindChunk(id, &path, nullptr, &size, &index))
        stats_->RecordCachedChunk(path, index, size);
    }

    // Make sure the above RecordStreamedChunk() calls don't count towards
    // bandwidth stats.
    stats_->ResetBandwidthStats();
  }
}
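
// Looks up the chunk with |content_id| in |id_to_chunk_| and returns its file
// path, offset, size and index within the file. Must be called with |mutex_|
// held. Returns false if the chunk is unknown.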
bool FileChunkMap::FindChunk(const ContentIdProto& content_id,
                             std::string* path, uint64_t* offset,
                             uint32_t* size, size_t* index) {
  assert((mutex_.AssertHeld(), true));

  // Find the |id_to_chunk_| entry by |content_id|. It might not exist if
  // changes to the manifest have not propagated to gamelets yet.
  IdToChunkMap::iterator i2c_iter = id_to_chunk_.find(ContentIdRef(content_id));
  if (i2c_iter == id_to_chunk_.end()) return false;

  // Find the chunk location by path. This lookup should not fail because
  // |path_to_file_| and |id_to_chunk_| should always be in sync here.
  const ChunkLocation& loc = i2c_iter->second;
  PathToFileMap::iterator p2f_iter = path_to_file_.find(*loc.path);
  assert(p2f_iter != path_to_file_.end());

  // Compute path, chunk offset and chunk size.
  const File& file = p2f_iter->second;
  assert(loc.index < file.chunks.size());
  uint64_t this_offset = file.chunks[loc.index].offset;
  uint64_t next_offset = loc.index + 1 == file.chunks.size()
                             ? file.size
                             : file.chunks[loc.index + 1].offset;
  if (path) *path = *loc.path;
  if (offset) *offset = this_offset;
  if (size) *size = static_cast<uint32_t>(next_offset - this_offset);
  if (index) *index = loc.index;
  return true;
}

} // namespace cdc_ft