Releasing the former Stadia file transfer tools

The tools allow efficient and fast synchronization of large directory
trees from a Windows workstation to a Linux target machine.

cdc_rsync* support efficient copy of files by using content-defined
chunking (CDC) to identify chunks within files that can be reused.

asset_stream_manager + cdc_fuse_fs support efficient streaming of a
local directory to a remote virtual file system based on FUSE. It also
employs CDC to identify and reuse unchanged data chunks.
This commit is contained in:
Christian Schneider
2022-10-07 10:47:04 +02:00
commit 4326e972ac
364 changed files with 49410 additions and 0 deletions

220
manifest/BUILD Normal file
View File

@@ -0,0 +1,220 @@
# By default, every target in this package is visible to the root package and
# all of its subpackages, i.e. to the whole workspace.
package(default_visibility = ["//:__subpackages__"])
# ContentId helpers (content_id.cc/.h). Depends on the BLAKE3 library for
# hashing and on absl/strings.
cc_library(
name = "content_id",
srcs = ["content_id.cc"],
hdrs = ["content_id.h"],
deps = [
":manifest_proto_defs",
"@com_github_blake3//:blake3",
"@com_google_absl//absl/strings",
],
)
# Unit test for :content_id, linked against the stock gtest main().
cc_test(
name = "content_id_test",
srcs = ["content_id_test.cc"],
deps = [
":content_id",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
)
# Header-only library (no srcs) that sits on top of the generated manifest
# proto code in //proto:manifest_cc_proto.
cc_library(
name = "manifest_proto_defs",
hdrs = ["manifest_proto_defs.h"],
deps = ["//proto:manifest_cc_proto"],
)
# In-memory manifest builder used for tests and prototyping.
# fake_manifest_builder.cc includes absl/strings/str_format.h, so the
# corresponding Abseil target is listed as a direct dependency instead of
# relying on it being pulled in transitively.
cc_library(
    name = "fake_manifest_builder",
    srcs = ["fake_manifest_builder.cc"],
    hdrs = ["fake_manifest_builder.h"],
    deps = [
        ":manifest_proto_defs",
        "//common:path",
        "//data_store:mem_data_store",
        "//fastcdc",
        "@com_google_absl//absl/strings:str_format",
    ],
)
# Unit test for :fake_manifest_builder.
# The test source includes data_store/mem_data_store.h directly, so
# //data_store:mem_data_store is a direct dependency rather than one that is
# only reachable through :fake_manifest_builder.
cc_test(
    name = "fake_manifest_builder_test",
    srcs = ["fake_manifest_builder_test.cc"],
    deps = [
        ":fake_manifest_builder",
        "//common:status_test_macros",
        "//data_store:mem_data_store",
        "@com_google_googletest//:gtest",
        "@com_google_googletest//:gtest_main",
    ],
)
# ManifestBuilder and AssetBuilder.
# asset_builder.cc includes absl/strings/str_cat.h, so absl/strings is listed
# as a direct dependency instead of being picked up through :content_id.
cc_library(
    name = "manifest_builder",
    srcs = [
        "asset_builder.cc",
        "manifest_builder.cc",
    ],
    hdrs = [
        "asset_builder.h",
        "manifest_builder.h",
    ],
    deps = [
        ":content_id",
        ":manifest_proto_defs",
        "//common:log",
        "//common:path",
        "//common:status",
        "//common:status_macros",
        "//common:util",
        "//data_store",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
    ],
)
# Unit test for :manifest_builder. Also links the manifest iterator and
# printer as well as the in-memory data store.
cc_test(
name = "manifest_builder_test",
srcs = ["manifest_builder_test.cc"],
deps = [
":manifest_builder",
":manifest_iterator",
":manifest_printer",
"//common:status_test_macros",
"//data_store:mem_data_store",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
)
# Manifest iterator library (manifest_iterator.cc/.h); depends on //data_store.
cc_library(
name = "manifest_iterator",
srcs = ["manifest_iterator.cc"],
hdrs = ["manifest_iterator.h"],
deps = [
"//common:log",
"//common:path",
"//data_store",
],
)
# Manifest printer library (manifest_printer.cc/.h); depends on protobuf and
# on :content_id.
cc_library(
name = "manifest_printer",
srcs = ["manifest_printer.cc"],
hdrs = ["manifest_printer.h"],
deps = [
":content_id",
":manifest_proto_defs",
"@com_google_protobuf//:protobuf",
],
)
# Manifest updater library (manifest_updater.cc/.h). Builds on the manifest
# builder and iterator, the file chunk map, the stats printer, fastcdc and the
# common thread pool.
cc_library(
name = "manifest_updater",
srcs = ["manifest_updater.cc"],
hdrs = ["manifest_updater.h"],
deps = [
":file_chunk_map",
":manifest_builder",
":manifest_iterator",
":manifest_proto_defs",
":stats_printer",
"//common:log",
"//common:path",
"//common:stopwatch",
"//common:threadpool",
"//common:util",
"//data_store",
"//fastcdc",
"@com_google_absl//absl/status",
],
)
# Stats printer library (stats_printer.cc/.h).
# On Windows, MSVC warning C4324 is disabled for this target because it is
# triggered by absl's flat_hash_map.
cc_library(
name = "stats_printer",
srcs = ["stats_printer.cc"],
hdrs = ["stats_printer.h"],
copts = select({
"//tools:windows": ["/wd4324"], # "structure was padded" from flat_hash_map
"//conditions:default": [],
}),
deps = [
"//common:path",
"//common:stopwatch",
"//fastcdc",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:statusor",
],
)
# File chunk map library (file_chunk_map.cc/.h).
# On Windows, MSVC warning C4324 is disabled for this target because it is
# triggered by absl's flat_hash_map.
cc_library(
    name = "file_chunk_map",
    srcs = ["file_chunk_map.cc"],
    hdrs = ["file_chunk_map.h"],
    copts = select({
        "//tools:windows": ["/wd4324"],  # "structure was padded" from flat_hash_map
        "//conditions:default": [],
    }),
    deps = [
        # Same-package targets are referenced by their relative label, like
        # everywhere else in this file.
        ":content_id",
        ":manifest_proto_defs",
        ":stats_printer",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/status",
    ],
)
# Unit test for :file_chunk_map. Uses the project's own //common:test_main
# instead of gtest_main.
cc_test(
name = "file_chunk_map_test",
srcs = ["file_chunk_map_test.cc"],
deps = [
":file_chunk_map",
"//common:test_main",
"@com_google_googletest//:gtest",
],
)
# Shared base code for manifest tests (manifest_test_base.cc/.h). Depends on
# gtest, so it is meant to be linked into test targets.
cc_library(
name = "manifest_test_base",
srcs = ["manifest_test_base.cc"],
hdrs = ["manifest_test_base.h"],
deps = [
":manifest_iterator",
":manifest_printer",
":manifest_updater",
"//common:path",
"//common:status_test_macros",
"//data_store:mem_data_store",
"@com_google_googletest//:gtest",
],
)
# Unit test for :manifest_updater. The files matched by :all_test_data
# (testdata/**) are made available to the test at runtime.
#
# This test only succeeds on Windows if the timezone is set to the local host's
# timezone, but Bazel by default sets the test timezone to UTC.
#
# Run this test as follows to preserve the host's timezone:
# bazel test --action_env=TZ=Local
cc_test(
name = "manifest_updater_test",
srcs = ["manifest_updater_test.cc"],
data = [":all_test_data"],
deps = [
":manifest_test_base",
":manifest_updater",
"//common:test_main",
"//data_store:mem_data_store",
"@com_google_googletest//:gtest",
],
)
# All test source files (*_test.cc) that live directly in this package.
filegroup(
name = "all_test_sources",
srcs = glob(["*_test.cc"]),
)
# Everything below testdata/, recursively; used as runtime data by tests.
filegroup(
name = "all_test_data",
srcs = glob(["testdata/**"]),
)

115
manifest/asset_builder.cc Normal file
View File

@@ -0,0 +1,115 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/asset_builder.h"
#include "absl/strings/str_cat.h"
#include "common/path.h"
namespace cdc_ft {
// Creates an AssetBuilder that does not reference any proto yet.
AssetBuilder::AssetBuilder() = default;
// Creates an AssetBuilder for |proto|. |rel_path| is passed through
// path::ToUnix() before it is stored. The proto is only referenced, not owned.
AssetBuilder::AssetBuilder(AssetProto* proto, const std::string& rel_path)
: proto_(proto), rel_path_(path::ToUnix(rel_path)) {}
// The referenced proto is not owned, so there is nothing to release.
AssetBuilder::~AssetBuilder() = default;
// Returns the containing directory path joined with this asset's name, or an
// empty string if no proto is set.
std::string AssetBuilder::RelativeFilePath() const {
  if (proto_ == nullptr) return {};
  return path::JoinUnix(rel_path_, proto_->name());
}
void AssetBuilder::AppendChunk(const ContentIdProto& content_id, size_t len) {
assert(proto_ != nullptr);
assert(proto_->type() == AssetProto::FILE);
// TODO: Handle indirect chunks.
assert(proto_->file_indirect_chunks_size() == 0);
ChunkRefProto* chunk_ref = proto_->add_file_chunks();
chunk_ref->set_offset(proto_->file_size());
chunk_ref->mutable_chunk_id()->CopyFrom(content_id);
proto_->set_file_size(proto_->file_size() + len);
}
// Removes all direct and indirect chunks from this FILE asset and resets its
// size to zero.
void AssetBuilder::TruncateChunks() {
  assert(proto_ != nullptr);
  assert(proto_->type() == AssetProto::FILE);

  proto_->clear_file_chunks();
  proto_->clear_file_indirect_chunks();
  proto_->set_file_size(0);
}
// Replaces this FILE asset's chunk list with a copy of |chunks|, drops all
// indirect chunk lists and sets the file size to |file_size|.
void AssetBuilder::SetChunks(const RepeatedChunkRefProto& chunks,
                             uint64_t file_size) {
  assert(proto_ != nullptr);
  assert(proto_->type() == AssetProto::FILE);

  auto* direct_chunks = proto_->mutable_file_chunks();
  direct_chunks->Clear();
  direct_chunks->CopyFrom(chunks);
  proto_->mutable_file_indirect_chunks()->Clear();
  proto_->set_file_size(file_size);
}
// Exchanges this FILE asset's chunk list with |chunks| (no copy is made),
// drops all indirect chunk lists and sets the file size to |file_size|.
void AssetBuilder::SwapChunks(RepeatedChunkRefProto* chunks,
                              uint64_t file_size) {
  assert(proto_ != nullptr);
  assert(proto_->type() == AssetProto::FILE);

  auto* direct_chunks = proto_->mutable_file_chunks();
  direct_chunks->Swap(chunks);
  proto_->mutable_file_indirect_chunks()->Clear();
  proto_->set_file_size(file_size);
}
// Overwrites the recorded size of this FILE asset; the chunk lists are left
// untouched.
void AssetBuilder::SetFileSize(uint64_t file_size) {
  assert(proto_ != nullptr);
  assert(proto_->type() == AssetProto::FILE);

  proto_->set_file_size(file_size);
}
// Adds a new child asset with the given |name| and |type| to this DIRECTORY
// asset and returns a builder for it. Existing children are not checked for a
// name clash.
AssetBuilder AssetBuilder::AppendAsset(const std::string& name,
                                       AssetProto::Type type) {
  assert(proto_ != nullptr);
  assert(proto_->type() == AssetProto::DIRECTORY);

  AssetProto* new_asset = proto_->add_dir_assets();
  new_asset->set_type(type);
  new_asset->set_name(name);
  // The child lives inside this directory, so the path leading to it is this
  // asset's own full relative path.
  return AssetBuilder(new_asset, RelativeFilePath());
}
// Returns the proto's in_progress flag, or false if no proto is set.
bool AssetBuilder::InProgress() const {
  return proto_ != nullptr && proto_->in_progress();
}
// Stores |in_progress| in the referenced proto. A proto must be set.
void AssetBuilder::SetInProgress(bool in_progress) {
  assert(proto_ != nullptr);

  proto_->set_in_progress(in_progress);
}
// Points this AssetBuilder at |proto| and replaces the stored relative path
// with the Unix form of |rel_path|. Ownership of |proto| stays with the
// caller.
//
// The new path is computed before any member is modified. The previous
// implementation cleared |rel_path_| first, so a call such as
// builder.SetProto(proto, builder.RelativePath()) - where |rel_path| aliases
// |rel_path_| - silently ended up with an empty path.
void AssetBuilder::SetProto(AssetProto* proto, const std::string& rel_path) {
  std::string unix_path = path::ToUnix(rel_path);
  proto_ = proto;
  rel_path_.swap(unix_path);
}
// Detaches this builder from its proto and forgets the relative path.
void AssetBuilder::Clear() {
  rel_path_.clear();
  proto_ = nullptr;
}
// Copies the referenced proto pointer and the relative path from |other|.
// The const member |empty_| is identical in every instance and is skipped.
AssetBuilder& AssetBuilder::operator=(const AssetBuilder& other) {
  rel_path_ = other.rel_path_;
  proto_ = other.proto_;
  return *this;
}
} // namespace cdc_ft

151
manifest/asset_builder.h Normal file
View File

@@ -0,0 +1,151 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_ASSET_BUILDER_H_
#define MANIFEST_ASSET_BUILDER_H_
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// Thin, non-owning wrapper around an AssetProto that also remembers the
// relative path of the directory containing the asset. Read accessors return
// a neutral value (0, empty string, UNKNOWN, false) and the inline setters do
// nothing while no proto is set; the chunk/asset mutators declared below
// assert that a proto of the expected type is set.
class AssetBuilder {
public:
// Creates an AssetBuilder that does not reference any proto.
AssetBuilder();
// Creates a new AssetBuilder referencing the given |proto| and relative path
// |rel_path|. Ownership of |proto| remains with the caller and must remain
// valid while the AssetBuilder is being used.
AssetBuilder(AssetProto* proto, const std::string& rel_path);
~AssetBuilder();
// The assignment operator ignores the constant member |empty_|.
AssetBuilder& operator=(const AssetBuilder& other);
// Returns the modification timestamp of this asset.
uint64_t MtimeSeconds() const { return proto_ ? proto_->mtime_seconds() : 0; }
// Sets the modification timestamp of this asset to |mtime|.
void SetMtimeSeconds(uint64_t mtime) {
if (proto_) proto_->set_mtime_seconds(mtime);
}
// Returns the permission bits of this asset (RWX for user, group, world, in
// that order).
uint32_t Permissions() const { return proto_ ? proto_->permissions() : 0; }
// Sets the permission bits of this asset to |perms|.
void SetPermissions(uint32_t perms) {
if (proto_) proto_->set_permissions(perms);
}
// Returns the file name of this asset.
const std::string& Name() const { return proto_ ? proto_->name() : empty_; }
// Returns the asset type.
AssetProto::Type Type() const {
return proto_ ? proto_->type() : AssetProto::UNKNOWN;
}
// Returns the Unix path of the directory containing this asset relative to
// the manifest root directory, as specified during construction or
// SetProto().
const std::string& RelativePath() const { return rel_path_; }
// Returns the path and file name of this asset relative to the manifest root
// directory.
std::string RelativeFilePath() const;
// Returns this asset's in_progress status.
bool InProgress() const;
// Sets the asset's in_progress status.
void SetInProgress(bool in_progress);
// For FILE assets, appends the chunk with the given |content_id| and |len| to
// the list of chunks. The chunk's offset will be auto-determined based on the
// current file size.
//
// Asserts that the asset is actually of type FILE and that the file does not
// have any associated indirect chunk lists.
void AppendChunk(const ContentIdProto& content_id, size_t len);
// For FILE assets, removes all chunks from this file and resets the file size
// to zero.
//
// Asserts that the asset is actually of type FILE.
void TruncateChunks();
// Sets this file's chunks from the ones given in the provided |chunks| list
// and the total size to |file_size|. Copies the proto contents, clears all
// indirect chunk lists.
//
// Asserts that the asset is actually of type FILE.
void SetChunks(const RepeatedChunkRefProto& chunks, uint64_t file_size);
// Swaps this file's chunks with the ones given in the provided |chunks| list
// and sets the total size to |file_size|. This avoids copying the data.
// Clears all indirect chunk lists.
//
// Asserts that the asset is actually of type FILE.
void SwapChunks(RepeatedChunkRefProto* chunks, uint64_t file_size);
// Sets this file's size.
//
// Asserts that the asset is actually of type FILE.
void SetFileSize(uint64_t file_size);
// For DIRECTORY assets, adds a new direct asset to the end of the list. Does
// *not* verify if an asset with that name already exists.
//
// Asserts that the asset is actually of type DIRECTORY.
AssetBuilder AppendAsset(const std::string& name, AssetProto::Type type);
// Returns the symlink target for symlinks.
const std::string& SymlinkTarget() const {
return proto_ ? proto_->symlink_target() : empty_;
}
// Sets the target for symlinks.
void SetSymlinkTarget(const std::string& target) {
if (proto_) proto_->set_symlink_target(target);
}
// Returns a pointer to the proto that this AssetBuilder references.
const AssetProto* Proto() const { return proto_; }
AssetProto* Proto() { return proto_; }
// Sets the |proto| and relative path |rel_path| this AssetBuilder is
// referring to. Ownership of |proto| remains with the caller and must remain
// valid while the AssetBuilder is being used.
void SetProto(AssetProto* proto, const std::string& rel_path);
private:
// Resets this AssetBuilder.
void Clear();
// Empty string to return as reference when no proto is set.
const std::string empty_;
// The proto this AssetBuilder refers to.
AssetProto* proto_ = nullptr;
// The path leading to this asset relative to the manifest root.
std::string rel_path_;
};
} // namespace cdc_ft
#endif // MANIFEST_ASSET_BUILDER_H_

99
manifest/content_id.cc Normal file
View File

@@ -0,0 +1,99 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/content_id.h"
#include "blake3.h"
namespace cdc_ft {
namespace {
// Maps a nibble |n| (0..15) to its lower-case hex digit.
// Any other value yields -1.
char IntToHex(uint8_t n) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  return n < 16 ? kHexDigits[n] : static_cast<char>(-1);
}
// Maps a lower-case hex digit |c| ('0'..'9', 'a'..'f') to its value 0..15.
// Any other character, including upper-case hex digits, yields -1.
int HexToInt(char c) {
  const bool is_decimal = '0' <= c && c <= '9';
  const bool is_letter = 'a' <= c && c <= 'f';
  if (!is_decimal && !is_letter) return -1;
  return is_decimal ? c - '0' : c - 'a' + 10;
}
} // namespace
// static
// Hashes the bytes of |data|; embedded NUL characters are included because
// the length is taken from data.size().
ContentIdProto ContentId::FromDataString(const std::string& data) {
  const size_t num_bytes = data.size();
  return FromArray(data.c_str(), num_bytes);
}
// static
// Hashes the bytes that the string_view |data| refers to.
ContentIdProto ContentId::FromDataString(absl::string_view data) {
  const size_t num_bytes = data.size();
  return FromArray(data.data(), num_bytes);
}
// static
// Runs BLAKE3 over the |len| bytes at |data| and stores kHashSize bytes of
// hash output in the returned content ID.
ContentIdProto ContentId::FromArray(const void* data, size_t len) {
  blake3_hasher hasher;
  blake3_hasher_init(&hasher);
  blake3_hasher_update(&hasher, data, len);

  uint8_t digest[kHashSize];
  blake3_hasher_finalize(&hasher, digest, kHashSize);

  ContentIdProto result;
  result.set_blake3_sum_160(digest, kHashSize);
  return result;
}
// static
std::string ContentId::ToHexString(const ContentIdProto& content_id) {
absl::string_view blake3_sum(content_id.blake3_sum_160());
std::string ret;
ret.reserve(blake3_sum.size() << 1);
for (size_t i = 0; i < blake3_sum.size(); ++i) {
ret.push_back(IntToHex(static_cast<uint8_t>(blake3_sum[i]) >> 4));
ret.push_back(IntToHex(static_cast<uint8_t>(blake3_sum[i]) & 0xf));
}
return ret;
}
// static
// Parses the lower-case hex string |str| into |content_id|.
//
// Returns false if |str| does not have exactly 2 * kHashSize characters; in
// that case |content_id| is not modified. Also returns false if |str| contains
// a character other than '0'..'9' or 'a'..'f'; in that case the hash stored in
// |content_id| is left empty. Returns true on success.
bool ContentId::FromHexString(const std::string& str,
                              ContentIdProto* content_id) {
  if (str.size() != kHashSize * 2) return false;

  std::string* hash = content_id->mutable_blake3_sum_160();
  hash->clear();
  hash->reserve(kHashSize);
  // size_t index: avoids comparing a signed counter against str.size().
  for (size_t n = 0; n < str.size(); n += 2) {
    const int high = HexToInt(str[n]);
    const int low = HexToInt(str[n + 1]);
    if (high == -1 || low == -1) {
      // Do not leave a partially decoded hash behind.
      hash->clear();
      return false;
    }
    // Both nibbles are in 0..15, so the combined value fits into one byte.
    hash->push_back(static_cast<char>((high << 4) | low));
  }
  return true;
}
// static
// Returns the hash byte at index |pos|, or 0 if |pos| is past the end of the
// stored hash (which is always the case for an unset hash).
uint8_t ContentId::GetByte(const ContentIdProto& content_id, size_t pos) {
  const std::string& digest = content_id.blake3_sum_160();
  return pos < digest.size() ? static_cast<uint8_t>(digest[pos]) : 0;
}
} // namespace cdc_ft

90
manifest/content_id.h Normal file
View File

@@ -0,0 +1,90 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_CONTENT_ID_H_
#define MANIFEST_CONTENT_ID_H_
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string>
#include "absl/strings/string_view.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// This helper class provides some utility functions to work with ContentIdProto
// messages. All members are static; the class carries no state.
class ContentId {
public:
// Hashes are 160 bit long.
static constexpr size_t kHashSize = 20;
// Returns content ID for the |data| passed in as a string.
static ContentIdProto FromDataString(const std::string& data);
// Returns the content ID for the |data| passed in as a string_view.
static ContentIdProto FromDataString(absl::string_view data);
// Returns the content ID for the |data| passed in as a pointer. |len| is the
// number of bytes to hash. The ID holds kHashSize bytes of BLAKE3 output.
static ContentIdProto FromArray(const void* data, size_t len);
// Converts the given content ID into a hex string. The string will consist of
// the hex digits of the hash ('0'...'9', 'a'...'f'), so a 160 bit hash
// results in a string of length kHashSize * 2.
static std::string ToHexString(const ContentIdProto& content_id);
// Converts the given hex string into a content ID. The string is assumed to
// consist of the hex digits of the hash ('0'...'9', 'a'...'f'), so a 160 bit
// hash would have length kHashSize * 2. Returns false if |str| is malformed.
// If |str| has the wrong length, |content_id| is left untouched; if it
// contains an invalid character, the hash in |content_id| is cleared.
static bool FromHexString(const std::string& str, ContentIdProto* content_id);
// Returns the |pos| byte of |content_id|.
// Returns 0 if |content_id| is not set or |pos| is invalid.
static uint8_t GetByte(const ContentIdProto& content_id, size_t pos);
};
namespace proto {

// Comparison operators for content IDs. They live in namespace proto so that
// they are found by argument-dependent lookup. Inside this namespace the name
// ContentId refers to the type providing blake3_sum_160(), not to the helper
// class cdc_ft::ContentId.

// Two content IDs are equal iff their hash bytes are equal.
inline bool operator==(const ContentId& a, const ContentId& b) {
  const auto& lhs = a.blake3_sum_160();
  const auto& rhs = b.blake3_sum_160();
  return lhs == rhs;
}

// Negation of operator==.
inline bool operator!=(const ContentId& a, const ContentId& b) {
  const bool equal = (a == b);
  return !equal;
}

// Orders content IDs by comparing their hash strings.
inline bool operator<(const ContentId& a, const ContentId& b) {
  const auto& lhs = a.blake3_sum_160();
  const auto& rhs = b.blake3_sum_160();
  return lhs < rhs;
}

}  // namespace proto
} // namespace cdc_ft
namespace std {

// Hash functor that allows cdc_ft::ContentIdProto to be used as key in
// unordered containers.
//
// The content ID already is a cryptographic hash, so its leading
// sizeof(size_t) bytes are used as hash value directly. IDs whose hash is
// shorter than that (e.g. unset IDs) all hash to 0.
template <>
struct hash<cdc_ft::ContentIdProto> {
  size_t operator()(const cdc_ft::ContentIdProto& id) const {
    const auto& sum = id.blake3_sum_160();
    if (sum.size() < sizeof(size_t)) {
      return 0;
    }
    // Copy the bytes instead of reading them through a size_t pointer: the
    // string buffer is not guaranteed to be suitably aligned for size_t, and
    // accessing char data through a size_t lvalue violates aliasing rules.
    size_t value = 0;
    memcpy(&value, sum.data(), sizeof(value));
    return value;
  }
};

}  // namespace std
#endif // MANIFEST_CONTENT_ID_H_

View File

@@ -0,0 +1,79 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/content_id.h"
#include "gtest/gtest.h"
namespace cdc_ft {
namespace {
// Test input. Its expected hash is pinned below in raw and in hex form.
// (The unused StringList alias was dropped; it relied on <vector> being
// included transitively.)
static constexpr char kData[] = "Hey Google, tell me a joke.";
// Expected hash size in bytes. Deliberately spelled out instead of using
// ContentId::kHashSize so that an accidental change of the latter is caught.
static constexpr size_t kHashSize = 20;
// Raw bytes of the expected hash of kData (plus the terminating NUL).
static constexpr char kHash[kHashSize + 1] =
    "\x12\xe8\x41\x41\x39\x93\x13\x82\x34\xd0\xfe\xcb\x4e\xcf\x6a\x4c\xfd\x74"
    "\x55\x27";
// The same hash as lower-case hex string.
static constexpr char kHashHex[] = "12e841413993138234d0fecb4ecf6a4cfd745527";
static_assert(sizeof(kHashHex) == 2 * kHashSize + 1,
              "kHashHex must hold two hex digits per hash byte");
// Hashing a std::string yields the pinned 20 byte hash.
TEST(ContentIdTest, StringToContentId) {
  const std::string input(kData);
  const ContentIdProto id = ContentId::FromDataString(input);

  EXPECT_EQ(id.blake3_sum_160().size(), kHashSize);
  EXPECT_EQ(id.blake3_sum_160(), absl::string_view(kHash, kHashSize));
}
// Hashing an absl::string_view yields the pinned 20 byte hash.
TEST(ContentIdTest, StringViewToContentId) {
  const absl::string_view input(kData);
  const ContentIdProto id = ContentId::FromDataString(input);

  EXPECT_EQ(id.blake3_sum_160().size(), kHashSize);
  EXPECT_EQ(id.blake3_sum_160(), absl::string_view(kHash, kHashSize));
}
// Hashing a raw pointer/length pair yields the pinned 20 byte hash.
TEST(ContentIdTest, PtrToContentId) {
  const absl::string_view input(kData);
  const ContentIdProto id = ContentId::FromArray(input.data(), input.size());

  EXPECT_EQ(id.blake3_sum_160().size(), kHashSize);
  EXPECT_EQ(id.blake3_sum_160(), absl::string_view(kHash, kHashSize));
}
// The hex form of the hash has two lower-case digits per byte.
TEST(ContentIdTest, ToHexString) {
  const absl::string_view input(kData);
  const ContentIdProto id = ContentId::FromDataString(input);

  const std::string hex = ContentId::ToHexString(id);
  EXPECT_EQ(hex.size(), 2 * kHashSize);
  EXPECT_EQ(hex, kHashHex);
}
// Parsing a well-formed hex string restores the raw hash; malformed strings
// are rejected.
TEST(ContentIdTest, FromHexString) {
  ContentIdProto content_id;
  EXPECT_TRUE(ContentId::FromHexString(kHashHex, &content_id));
  // Compare with an explicit length so that the check does not depend on the
  // hash being free of NUL bytes.
  EXPECT_EQ(content_id.blake3_sum_160(), absl::string_view(kHash, kHashSize));

  // Strings of the wrong length are rejected.
  EXPECT_FALSE(ContentId::FromHexString("", &content_id));
  EXPECT_FALSE(ContentId::FromHexString("12e8", &content_id));

  // A non-hex character is rejected and leaves an empty hash behind.
  std::string bad_hex(kHashHex);
  bad_hex[0] = 'g';
  EXPECT_FALSE(ContentId::FromHexString(bad_hex, &content_id));
  EXPECT_TRUE(content_id.blake3_sum_160().empty());

  // Upper-case hex digits are not accepted either.
  bad_hex = kHashHex;
  bad_hex[2] = 'E';
  EXPECT_FALSE(ContentId::FromHexString(bad_hex, &content_id));
  EXPECT_TRUE(content_id.blake3_sum_160().empty());
}
// GetByte() returns 0 for unset IDs and out-of-range positions, and the raw
// hash byte otherwise.
TEST(ContentIdTest, GetByte) {
  ContentIdProto id;

  // Unset ID: every position is out of range.
  EXPECT_EQ(ContentId::GetByte(id, 0), 0);
  EXPECT_EQ(ContentId::GetByte(id, 1000), 0);

  EXPECT_TRUE(ContentId::FromHexString(kHashHex, &id));
  const uint8_t first_byte = static_cast<uint8_t>(kHash[0]);
  const uint8_t second_byte = static_cast<uint8_t>(kHash[1]);
  EXPECT_EQ(ContentId::GetByte(id, 0), first_byte);
  EXPECT_EQ(ContentId::GetByte(id, 1), second_byte);
  // One past the last valid index.
  EXPECT_EQ(ContentId::GetByte(id, 20), 0);
}
} // namespace
} // namespace cdc_ft

View File

@@ -0,0 +1,182 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/fake_manifest_builder.h"
#include <vector>
#include "absl/strings/str_format.h"
#include "common/path.h"
#include "data_store/mem_data_store.h"
#include "fastcdc/fastcdc.h"
namespace cdc_ft {
namespace {
// Chunk size parameters handed to fastcdc::Config when chunking fake file
// data: 256 Ki on average, half of that as minimum and four times that as
// maximum. NOTE(review): presumably these are byte counts - confirm against
// fastcdc::Config.
constexpr size_t kAvgChunkSize = 1024 * 256;
constexpr size_t kMinChunkSize = kAvgChunkSize / 2;
constexpr size_t kMaxChunkSize = kAvgChunkSize * 4;
// Creates the content of a fake text file with |num_lines| lines, each of the
// form
//   <line number>-<60 random lower-case letters>\n
// Line numbers start at 0. The letters are drawn from rand().
std::vector<char> BuildLargeFileData(int num_lines) {
  constexpr size_t kLettersPerLine = 60;

  std::vector<char> blob;
  for (int line = 0; line < num_lines; ++line) {
    const std::string prefix = std::to_string(line);
    blob.insert(blob.end(), prefix.begin(), prefix.end());
    blob.push_back('-');
    for (size_t i = 0; i < kLettersPerLine; ++i) {
      blob.push_back(static_cast<char>((rand() % 26) + 'a'));
    }
    blob.push_back('\n');
  }
  return blob;
}
// Splits |data| into chunks with fastcdc, adds every chunk to |store| and
// appends a matching chunk reference (content ID and offset) to |asset|.
// Existing chunk references of |asset| are kept, and the asset's file size is
// not touched; both are up to the caller.
void UpdateFileContent(AssetProto* asset, MemDataStore* const store,
                       const std::vector<char>& data) {
  // Offset of the next chunk within the file.
  uint64_t next_offset = 0;
  auto on_chunk = [asset, store, &next_offset](const void* chunk,
                                               size_t chunk_size) {
    const char* first = reinterpret_cast<const char*>(chunk);
    std::vector<char> chunk_bytes;
    chunk_bytes.insert(chunk_bytes.end(), first, first + chunk_size);

    ChunkRefProto* ref = asset->add_file_chunks();
    *ref->mutable_chunk_id() = store->AddData(chunk_bytes);
    ref->set_offset(next_offset);
    next_offset += chunk_size;
  };

  fastcdc::Config cdc_config(kMinChunkSize, kAvgChunkSize, kMaxChunkSize);
  fastcdc::Chunker chunker(cdc_config, on_chunk);
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(data.data());
  chunker.Process(bytes, data.size());
  // Emits whatever data is left after the last chunk boundary.
  chunker.Finalize();
}
// Returns the first direct child of the directory |dir_asset| whose name
// equals |name|, or nullptr if there is none. Does not descend into
// subdirectories.
AssetProto* FindAsset(AssetProto* dir_asset, const char* name) {
  assert(dir_asset);
  assert(dir_asset->type() == AssetProto::DIRECTORY);

  const int num_children = dir_asset->dir_assets_size();
  for (int i = 0; i < num_children; ++i) {
    AssetProto* child = dir_asset->mutable_dir_assets(i);
    if (child->name() == name) return child;
  }
  return nullptr;
}
} // namespace
// Remembers |store| (not owned) and initializes the manifest's root as a
// directory with permissions kRootDirPerms.
FakeManifestBuilder::FakeManifestBuilder(MemDataStore* store) : store_(store) {
  AssetProto* root = manifest_.mutable_root_dir();
  root->set_type(AssetProto::DIRECTORY);
  root->set_permissions(kRootDirPerms);
}

FakeManifestBuilder::~FakeManifestBuilder() = default;
// Appends a FILE asset called |name| to |dir_asset|, fills in its metadata and
// chunks |data| into the data store. Does not check for an existing asset of
// the same name.
void FakeManifestBuilder::AddFile(AssetProto* dir_asset, const char* name,
                                  int64_t mtime_sec, uint32_t permissions,
                                  const std::vector<char>& data) {
  assert(dir_asset);

  AssetProto* file = dir_asset->add_dir_assets();
  file->set_type(AssetProto::FILE);
  file->set_name(name);
  file->set_permissions(permissions);
  file->set_mtime_seconds(mtime_sec);
  file->set_file_size(data.size());
  UpdateFileContent(file, store_, data);
}
// Appends an empty DIRECTORY asset called |name| to |dir_asset| and returns
// it, so that files and subdirectories can be added to it.
AssetProto* FakeManifestBuilder::AddDirectory(AssetProto* dir_asset,
                                              const char* name,
                                              int64_t mtime_sec,
                                              uint32_t permissions) {
  assert(dir_asset);

  AssetProto* subdir = dir_asset->add_dir_assets();
  subdir->set_type(AssetProto::DIRECTORY);
  subdir->set_name(name);
  subdir->set_permissions(permissions);
  subdir->set_mtime_seconds(mtime_sec);
  return subdir;
}
// Populates the manifest with a fixed directory tree (see below), stores the
// file contents as chunks in the data store, and finally adds the manifest
// proto itself to the store. Returns the content ID that AddProto() reports
// for the manifest.
ContentIdProto FakeManifestBuilder::BuildTestData() {
// Files: rw for the user, read-only for group and others.
const uint32_t kFileMode =
path::MODE_IRUSR | path::MODE_IWUSR | path::MODE_IRGRP | path::MODE_IROTH;
// Directories: rwx for the user, r-x for group and others.
const uint32_t kDirMode = path::MODE_IRGRP | path::MODE_IXGRP |
path::MODE_IROTH | path::MODE_IXOTH |
path::MODE_IRWXU;
// All assets share the same modification time.
const int64_t kModTime = 1614843754;
// Resulting tree, in the order in which the assets are added:
// root
// |- fio_test
// |- large_file1.txt
// |- ...
// |- large_file8.txt
// |- file1.txt
// |- a
// |- file2.txt
// |- b
// |- file3.txt
AssetProto* fio_test_dir =
AddDirectory(Root(), "fio_test", kModTime, kDirMode);
// 500k lines generate a ~33 MB file.
std::vector<char> data = BuildLargeFileData(500000);
// Note that the loop creates eight files (1..8), all with identical content.
for (int n = 1; n < 9; ++n) {
std::string filename = absl::StrFormat("large_file%i.txt", n);
AddFile(fio_test_dir, filename.c_str(), kModTime, kFileMode, data);
}
AddFile(Root(), "file1.txt", kModTime, kFileMode, {'1', '3', '3', '7', '\n'});
AssetProto* a_dir = AddDirectory(Root(), "a", kModTime, kDirMode);
AddFile(a_dir, "file2.txt", kModTime, kFileMode,
{'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '!', '\n'});
AssetProto* b_dir = AddDirectory(a_dir, "b", kModTime, kDirMode);
// The content of file3.txt is given as octal character codes; the last one
// (012) is a newline.
AddFile(
b_dir, "file3.txt", kModTime, kFileMode,
{0127, 0105, 0122, 0040, 0104, 0101, 0123, 0040, 0114, 0111, 0105, 0123,
0124, 0040, 0111, 0123, 0124, 0040, 0104, 0117, 0117, 0106, 0012});
return store_->AddProto(manifest_);
}
// Returns a read-only pointer to the manifest owned by this builder. The
// pointer is never null.
const ManifestProto* FakeManifestBuilder::Manifest() const {
return &manifest_;
}
// Returns a mutable pointer to the manifest's root directory asset.
AssetProto* FakeManifestBuilder::Root() { return manifest_.mutable_root_dir(); }
// Replaces metadata and content of the existing FILE asset |name| that is a
// direct child of |dir_asset|. Asserts that such a file exists.
void FakeManifestBuilder::ModifyFile(AssetProto* dir_asset, const char* name,
                                     int64_t mtime_sec, uint32_t permissions,
                                     const std::vector<char>& data) {
  assert(dir_asset);
  AssetProto* file = FindAsset(dir_asset, name);
  assert(file && file->type() == AssetProto::FILE);

  // Drop the old chunk lists, then refresh the metadata and re-chunk |data|.
  file->clear_file_chunks();
  file->clear_file_indirect_chunks();
  file->set_permissions(permissions);
  file->set_mtime_seconds(mtime_sec);
  file->set_file_size(data.size());
  UpdateFileContent(file, store_, data);
}
} // namespace cdc_ft

View File

@@ -0,0 +1,73 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_FAKE_MANIFEST_BUILDER_H_
#define MANIFEST_FAKE_MANIFEST_BUILDER_H_
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
class MemDataStore;
// In-memory manifest builder. Useful for testing.
// The builder owns the manifest proto, but only keeps a raw pointer to the
// data store passed to the constructor and never deletes it.
class FakeManifestBuilder {
public:
// Permissions assigned to the root directory.
static constexpr uint32_t kRootDirPerms = 0755u;
// Creates a builder whose manifest consists of an empty root directory.
// Chunk data and the manifest are written to |store|.
explicit FakeManifestBuilder(MemDataStore* store);
~FakeManifestBuilder();
// Adds a new file with the given |name| to the directory |dir_asset| and sets
// the modified time to |mtime_sec| and permissions to |permissions|. Also
// generates data chunks from |data| using fastcdc (hardcoded chunk sizes).
// Use builder.AddFile(builder.Root(), ...) to add a file to the root
// directory.
void AddFile(AssetProto* dir_asset, const char* name, int64_t mtime_sec,
uint32_t permissions, const std::vector<char>& data);
// Adds a new directory of the given |name| to the directory |dir_asset| and
// sets the modified time to |mtime_sec| and permissions to |permissions|.
// Returns a pointer to the new directory that can be used to further add
// files or subdirectories.
// Use builder.AddDirectory(builder.Root(), ...) to add a directory to the
// root directory.
AssetProto* AddDirectory(AssetProto* dir_asset, const char* name,
int64_t mtime_sec, uint32_t permissions);
// Builds a fake directory structure with files and subdirectories suitable
// for prototyping/testing.
ContentIdProto BuildTestData();
// Returns the built manifest.
const ManifestProto* Manifest() const;
// Returns a mutable pointer to the root directory asset of the manifest.
AssetProto* Root();
// Updates the file |name| in the directory |dir_asset| with new
// |permissions|, |mtime_sec|, and |data|. The file must already exist.
void ModifyFile(AssetProto* dir_asset, const char* name, int64_t mtime_sec,
uint32_t permissions, const std::vector<char>& data);
private:
// Data store receiving chunks and the manifest. Not owned.
MemDataStore* const store_;
// The manifest being built.
ManifestProto manifest_;
};
} // namespace cdc_ft
#endif // MANIFEST_FAKE_MANIFEST_BUILDER_H_

View File

@@ -0,0 +1,120 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/fake_manifest_builder.h"
#include "common/status_test_macros.h"
#include "data_store/mem_data_store.h"
#include "gtest/gtest.h"
namespace cdc_ft {
namespace {
// A fresh builder exposes an empty, unnamed root directory.
TEST(FakeManifestBuilderTest, RootDir) {
  MemDataStore store;
  FakeManifestBuilder builder(&store);

  const AssetProto& root_dir = builder.Manifest()->root_dir();
  // Root() must point at the very same proto that Manifest() exposes.
  EXPECT_EQ(&root_dir, builder.Root());
  EXPECT_EQ(root_dir.type(), AssetProto::DIRECTORY);
  EXPECT_TRUE(root_dir.name().empty());
  ASSERT_EQ(root_dir.dir_assets().size(), 0);
}
// AddFile() creates a FILE asset with the given metadata and stores its
// content as a single chunk that can be read back from the data store.
TEST(FakeManifestBuilderTest, AddFile) {
  MemDataStore store;
  FakeManifestBuilder builder(&store);
  std::vector<char> expected_data = {1, 3, 3, 7};
  builder.AddFile(builder.Root(), "file", 12345, 0750, expected_data);

  const AssetProto& root = builder.Manifest()->root_dir();
  ASSERT_EQ(root.dir_assets().size(), 1);
  const AssetProto& file = root.dir_assets(0);
  EXPECT_EQ(file.name(), "file");
  EXPECT_EQ(file.type(), AssetProto::FILE);
  EXPECT_EQ(file.mtime_seconds(), 12345);
  EXPECT_EQ(file.permissions(), 0750);

  // The data is far smaller than the minimum chunk size: exactly one chunk.
  ASSERT_EQ(file.file_chunks_size(), 1);
  const ChunkRefProto& chunk = file.file_chunks(0);
  EXPECT_EQ(chunk.offset(), 0);

  // Try to read a byte more to see if it's properly clamped.
  std::vector<char> data;
  data.resize(expected_data.size() + 1);
  absl::StatusOr<uint64_t> bytes_read =
      store.Get(chunk.chunk_id(), data.data(), 0, data.size());
  ASSERT_OK(bytes_read);
  EXPECT_EQ(*bytes_read, expected_data.size());

  // Verify the chunk content, not just its size. This comparison was missing;
  // the buffer was trimmed to the expected size but never checked.
  data.resize(expected_data.size());
  EXPECT_EQ(data, expected_data);
}
// Adds a directory with one file in it and checks the resulting asset tree.
TEST(FakeManifestBuilderTest, AddDirectory) {
  MemDataStore data_store;
  FakeManifestBuilder fake_builder(&data_store);

  AssetProto* const sub_dir =
      fake_builder.AddDirectory(fake_builder.Root(), "dir", 12345, 0750);
  fake_builder.AddFile(sub_dir, "file", 23456, 0321, {});

  // The root directory must contain the new directory, and nothing else.
  const AssetProto& root_dir = fake_builder.Manifest()->root_dir();
  ASSERT_EQ(root_dir.dir_assets().size(), 1);
  EXPECT_EQ(&root_dir.dir_assets(0), sub_dir);

  EXPECT_EQ(sub_dir->name(), "dir");
  EXPECT_EQ(sub_dir->type(), AssetProto::DIRECTORY);
  EXPECT_EQ(sub_dir->mtime_seconds(), 12345);
  EXPECT_EQ(sub_dir->permissions(), 0750);

  // The file must have ended up in the new directory.
  ASSERT_EQ(sub_dir->dir_assets_size(), 1);
  const AssetProto& nested_file = sub_dir->dir_assets(0);
  EXPECT_EQ(nested_file.name(), "file");
}
// Checks that ModifyFile() replaces the metadata and the contents of a file
// that was added before, and that the new contents can be read back from the
// data store.
TEST(FakeManifestBuilderTest, ModifyFile) {
  MemDataStore store;
  FakeManifestBuilder builder(&store);

  std::vector<char> expected_data = {1, 3, 3, 7, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1};
  builder.AddFile(builder.Root(), "file", 12345, 0750, expected_data);

  // Replace the file with shorter data, a new mtime and new permissions.
  expected_data = {2, 4, 4, 3};
  builder.ModifyFile(builder.Root(), "file", 14843, 0666, expected_data);

  const AssetProto& root = builder.Manifest()->root_dir();
  ASSERT_EQ(root.dir_assets().size(), 1);

  const AssetProto& file = root.dir_assets(0);
  EXPECT_EQ(file.name(), "file");
  EXPECT_EQ(file.type(), AssetProto::FILE);
  EXPECT_EQ(file.mtime_seconds(), 14843);
  EXPECT_EQ(file.permissions(), 0666);
  ASSERT_EQ(file.file_chunks_size(), 1);

  const ChunkRefProto& chunk = file.file_chunks(0);
  EXPECT_EQ(chunk.offset(), 0);

  // Try to read a byte more to see if it's properly clamped.
  std::vector<char> data;
  data.resize(expected_data.size() + 1);
  absl::StatusOr<uint64_t> bytes_read =
      store.Get(chunk.chunk_id(), data.data(), 0, data.size());
  ASSERT_OK(bytes_read);
  EXPECT_EQ(*bytes_read, expected_data.size());

  // Drop the surplus byte and verify that the stored chunk holds exactly the
  // data that was passed to ModifyFile().
  data.resize(expected_data.size());
  EXPECT_EQ(data, expected_data);
}
} // namespace
} // namespace cdc_ft

253
manifest/file_chunk_map.cc Normal file
View File

@@ -0,0 +1,253 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/file_chunk_map.h"
#include "absl/strings/str_format.h"
#include "manifest/stats_printer.h"
namespace cdc_ft {
// Creates an empty map. The stats printer is only allocated if |enable_stats|
// is set; otherwise |stats_| stays null.
FileChunkMap::FileChunkMap(bool enable_stats) {
  if (!enable_stats) return;
  stats_ = std::make_unique<StatsPrinter>();
}

// Defined here rather than in the header, which only forward-declares
// StatsPrinter.
FileChunkMap::~FileChunkMap() = default;
void FileChunkMap::Init(std::string path, uint64_t file_size,
std::vector<FileChunk>* chunks) {
FileUpdate update(FileUpdateType::kInit, std::move(path));
update.file_size = file_size;
if (chunks) update.chunks = std::move(*chunks);
file_updates_.push_back(std::move(update));
}
void FileChunkMap::AppendCopy(std::string path,
const RepeatedChunkRefProto& list,
uint64_t list_offset) {
FileUpdate update(FileUpdateType::kAppend, std::move(path));
update.chunks.reserve(list.size());
for (const ChunkRefProto& ch : list)
update.chunks.emplace_back(ch.chunk_id(), ch.offset() + list_offset);
file_updates_.push_back(std::move(update));
}
void FileChunkMap::AppendMove(std::string path, RepeatedChunkRefProto* list,
uint64_t list_offset) {
FileUpdate update(FileUpdateType::kAppend, std::move(path));
update.chunks.reserve(list->size());
for (ChunkRefProto& ch : *list) {
update.chunks.emplace_back(std::move(*ch.mutable_chunk_id()),
ch.offset() + list_offset);
}
file_updates_.push_back(std::move(update));
}
void FileChunkMap::Remove(std::string path) {
FileUpdate update(FileUpdateType::kRemove, std::move(path));
file_updates_.push_back(std::move(update));
}
void FileChunkMap::Clear() {
FileUpdate update(FileUpdateType::kClear, std::string());
file_updates_.push_back(std::move(update));
}
void FileChunkMap::FlushUpdates() {
if (file_updates_.empty()) return;
absl::MutexLock lock(&mutex_);
for (FileUpdate& update : file_updates_) {
switch (update.type) {
case FileUpdateType::kInit: {
File& file = path_to_file_[update.path];
file.size = update.file_size;
assert(total_chunks_ >= file.chunks.size());
total_chunks_ -= file.chunks.size();
total_chunks_ += update.chunks.size();
file.chunks = std::move(update.chunks);
break;
}
case FileUpdateType::kAppend: {
File& file = path_to_file_[update.path];
total_chunks_ += update.chunks.size();
if (file.chunks.empty()) {
file.chunks = std::move(update.chunks);
} else {
file.chunks.reserve(file.chunks.size() + update.chunks.size());
std::move(std::begin(update.chunks), std::end(update.chunks),
std::back_inserter(file.chunks));
}
break;
}
case FileUpdateType::kRemove: {
const auto iter = path_to_file_.find(update.path);
if (iter == path_to_file_.end()) break;
assert(total_chunks_ >= iter->second.chunks.size());
total_chunks_ -= iter->second.chunks.size();
path_to_file_.erase(iter);
break;
}
case FileUpdateType::kClear: {
path_to_file_.clear();
total_chunks_ = 0;
break;
}
}
}
file_updates_.clear();
UpdateIdToChunkMap();
}
// Thread-safe front end of FindChunk(): takes |mutex_| and fills in |path|,
// |offset| and |size| for the chunk with the given |content_id|. All output
// pointers are required. Returns false if the chunk is unknown.
bool FileChunkMap::Lookup(const ContentIdProto& content_id, std::string* path,
                          uint64_t* offset, uint32_t* size) {
  assert(path && offset && size);

  absl::MutexLock lock(&mutex_);
  // The chunk index is of no interest to callers of Lookup().
  size_t* const no_index = nullptr;
  return FindChunk(content_id, path, offset, size, no_index);
}
// Records at most once per |content_id| that the chunk was streamed on behalf
// of |thread_id|. Does nothing if stats are disabled.
void FileChunkMap::RecordStreamedChunk(const ContentIdProto& content_id,
                                       size_t thread_id) {
  absl::MutexLock lock(&mutex_);

  // Bail out if stats are disabled or the chunk has been recorded before.
  if (!stats_ || streamed_chunks_to_thread_.contains(content_id)) return;

  std::string chunk_path;
  uint32_t chunk_size = 0;
  size_t chunk_index = 0;
  const bool found =
      FindChunk(content_id, &chunk_path, nullptr, &chunk_size, &chunk_index);
  if (found) {
    stats_->RecordStreamedChunk(chunk_path, chunk_index, chunk_size, thread_id);
  }

  // The id is remembered even if the chunk is currently unknown;
  // UpdateIdToChunkMap() replays |streamed_chunks_to_thread_| when it rebuilds
  // the stats.
  streamed_chunks_to_thread_.emplace(content_id, thread_id);
}
// Records at most once per |content_id| that the chunk is cached. Does nothing
// if stats are disabled.
void FileChunkMap::RecordCachedChunk(const ContentIdProto& content_id) {
  absl::MutexLock lock(&mutex_);
  if (!stats_) return;

  // Already recorded as cached.
  if (cached_chunks_.contains(content_id)) return;

  // Restarting FUSE might report cached chunks that have been originally
  // streamed. Ignore those.
  if (streamed_chunks_to_thread_.contains(content_id)) return;

  std::string chunk_path;
  uint32_t chunk_size = 0;
  size_t chunk_index = 0;
  const bool found =
      FindChunk(content_id, &chunk_path, nullptr, &chunk_size, &chunk_index);
  if (found) {
    stats_->RecordCachedChunk(chunk_path, chunk_index, chunk_size);
  }

  // The id is remembered even if the chunk is currently unknown;
  // UpdateIdToChunkMap() replays |cached_chunks_| when it rebuilds the stats.
  cached_chunks_.insert(content_id);
}
// Prints the collected statistics while holding |mutex_|. Does nothing if
// stats are disabled.
void FileChunkMap::PrintStats() {
  absl::MutexLock lock(&mutex_);
  if (stats_) {
    stats_->Print();
  }
}
// Returns true if a stats printer exists, i.e. stats were enabled in the
// constructor. Only needs a reader lock as nothing is modified.
bool FileChunkMap::HasStats() const {
  absl::ReaderMutexLock lock(&mutex_);
  return static_cast<bool>(stats_);
}
void FileChunkMap::UpdateIdToChunkMap() {
assert((mutex_.AssertHeld(), true));
// Put all chunks into the map.
id_to_chunk_.clear();
id_to_chunk_.reserve(total_chunks_);
for (const auto& [path, file] : path_to_file_) {
for (uint32_t n = 0; n < static_cast<uint32_t>(file.chunks.size()); ++n)
id_to_chunk_[ContentIdRef(file.chunks[n].content_id)] = {&path, n};
}
// Might be "<" if multiple files contain the same chunk.
assert(id_to_chunk_.size() <= total_chunks_);
// Rebuild stats if present.
if (stats_) {
stats_->Clear();
for (const auto& [path, file] : path_to_file_)
stats_->InitFile(path, file.chunks.size());
// Fill in the streamed chunks.
std::string path;
uint32_t size;
size_t index;
for (const auto& [id, thread_id] : streamed_chunks_to_thread_) {
if (FindChunk(id, &path, nullptr, &size, &index))
stats_->RecordStreamedChunk(path, index, size, thread_id);
}
// Fill in the cached chunks.
for (const ContentIdProto& id : cached_chunks_) {
if (FindChunk(id, &path, nullptr, &size, &index))
stats_->RecordCachedChunk(path, index, size);
}
// Make sure the above RecordStreamedChunk() calls don't count towards
// bandwidth stats.
stats_->ResetBandwidthStats();
}
}
bool FileChunkMap::FindChunk(const ContentIdProto& content_id,
std::string* path, uint64_t* offset,
uint32_t* size, size_t* index) {
assert((mutex_.AssertHeld(), true));
// Find the |id_to_chunk_| entry by |content_id|. It might not exist if
// changes to the manifest have not propagated to gamelets yet.
IdToChunkMap::iterator i2c_iter = id_to_chunk_.find(ContentIdRef(content_id));
if (i2c_iter == id_to_chunk_.end()) return false;
// Find the chunk location by path. This lookup should not fail because
// |path_to_file_| and |id_to_chunk_| should always be in sync here.
const ChunkLocation& loc = i2c_iter->second;
PathToFileMap::iterator p2f_iter = path_to_file_.find(*loc.path);
assert(p2f_iter != path_to_file_.end());
// Compute path, chunk offset and chunk size.
const File& file = p2f_iter->second;
assert(loc.index < file.chunks.size());
uint64_t this_offset = file.chunks[loc.index].offset;
uint64_t next_offset = loc.index + 1 == file.chunks.size()
? file.size
: file.chunks[loc.index + 1].offset;
if (path) *path = *loc.path;
if (offset) *offset = this_offset;
if (size) *size = static_cast<uint32_t>(next_offset - this_offset);
if (index) *index = loc.index;
return true;
}
} // namespace cdc_ft

206
manifest/file_chunk_map.h Normal file
View File

@@ -0,0 +1,206 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_FILE_CHUNK_MAP_H_
#define MANIFEST_FILE_CHUNK_MAP_H_
#include <memory>
#include <string>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "manifest/content_id.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
class StatsPrinter;
// A file chunk, used by the FileChunkMap.
struct FileChunk {
// Id of the chunk.
ContentIdProto content_id;
// Absolute offset of the chunk in the file.
uint64_t offset = 0;
FileChunk(ContentIdProto content_id, uint64_t offset)
: content_id(std::move(content_id)), offset(offset) {}
};
// Manages chunk lookups by content id. The class can be populated by passing it
// to ManifestUpdater and then used to look up chunks by calling Lookup().
//
// Modifications made through Init(), AppendCopy(), AppendMove(), Remove() and
// Clear() are only queued. They become visible to Lookup() once FlushUpdates()
// has been called.
class FileChunkMap {
 public:
  // If |enable_stats| is true, keeps detailed statistics on chunk access
  // patterns.
  explicit FileChunkMap(bool enable_stats);
  ~FileChunkMap();

  // Not copyable.
  FileChunkMap(const FileChunkMap&) = delete;
  FileChunkMap& operator=(const FileChunkMap&) = delete;

  // Initializes a new entry for |path| or clears the existing one and sets the
  // |file_size|. If |chunks| is not null, moves the contents of |chunks| to
  // this file's chunk list.
  // The operation is queued and gets applied by calling FlushUpdates().
  void Init(std::string path, uint64_t file_size,
            std::vector<FileChunk>* chunks = nullptr);

  // Appends the chunks in |list| to the entry for |path|. |list_offset| is
  // added to all chunk offsets in |list|. Copies ContentIdProtos from the list.
  // The operation is queued and gets applied by calling FlushUpdates().
  void AppendCopy(std::string path, const RepeatedChunkRefProto& list,
                  uint64_t list_offset);

  // Same as above, but modifies |list| by moving ContentIdProtos off the list.
  // The operation is queued and gets applied by calling FlushUpdates().
  void AppendMove(std::string path, RepeatedChunkRefProto* list,
                  uint64_t list_offset);

  // Removes the entry for |path|.
  // The operation is queued and gets applied by calling FlushUpdates().
  void Remove(std::string path);

  // Clears all entries.
  // The operation is queued and gets applied by calling FlushUpdates().
  void Clear();

  // Flushes all updates made by the above functions.
  void FlushUpdates() ABSL_LOCKS_EXCLUDED(mutex_);

  // Looks up the file |path|, the chunk |offset| and chunk |size| by the given
  // |content_id|. Returns false if the entry does not exist.
  bool Lookup(const ContentIdProto& content_id, std::string* path,
              uint64_t* offset, uint32_t* size) ABSL_LOCKS_EXCLUDED(mutex_);

  // Records that a chunk with the given |content_id| was streamed from the
  // workstation.
  // |thread_id| is the id of the thread that requested the chunk on the
  // gamelet, usually the hash of the std::thread::id.
  // No-op if |enable_stats| was false in the constructor.
  void RecordStreamedChunk(const ContentIdProto& content_id, size_t thread_id)
      ABSL_LOCKS_EXCLUDED(mutex_);

  // Records that a chunk with the given |content_id| is cached on the gamelet.
  // No-op if |enable_stats| was false in the constructor.
  void RecordCachedChunk(const ContentIdProto& content_id)
      ABSL_LOCKS_EXCLUDED(mutex_);

  // Prints detailed chunk statistics.
  // No-op if |enable_stats| was false in the constructor.
  void PrintStats() ABSL_LOCKS_EXCLUDED(mutex_);

  // Returns true if |enable_stats| was true in the constructor.
  bool HasStats() const ABSL_LOCKS_EXCLUDED(mutex_);

 private:
  struct File {
    // All chunks in the file.
    std::vector<FileChunk> chunks;

    // Total file size.
    uint64_t size = 0;
  };

  enum class FileUpdateType { kInit, kAppend, kRemove, kClear };

  // A queued modification. Which fields are used depends on |type|.
  struct FileUpdate {
    FileUpdateType type = FileUpdateType::kInit;
    std::string path;
    uint64_t file_size = 0;
    std::vector<FileChunk> chunks;

    FileUpdate(FileUpdateType type, std::string path)
        : type(type), path(std::move(path)) {}
  };

  struct ChunkLocation {
    // Asset path, also key into |path_to_file_| map.
    const std::string* path = nullptr;

    // Index into |path_to_file_[*path].chunks|.
    uint32_t index = 0;
  };

  // Keeps a pointer to a content id proto and compares by value.
  struct ContentIdRef {
    const ContentIdProto* content_id;

    explicit ContentIdRef(const ContentIdProto& content_id)
        : content_id(&content_id) {}

    bool operator==(const ContentIdRef& other) const {
      return *content_id == *other.content_id;
    }
    bool operator!=(const ContentIdRef& other) const {
      return !(*this == other);
    }
  };

  // Hashes a ContentIdRef by the value of the content id it points to.
  struct ContentIdRefHash {
    std::size_t operator()(const ContentIdRef& ref) const noexcept {
      return hash(*ref.content_id);
    }
    std::hash<ContentIdProto> hash;
  };

  // Updates |id_to_chunk_|. Also rebuilds |stats_| if present.
  void UpdateIdToChunkMap() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Finds a chunk by its |content_id|.
  // |path| returns the relative Unix path of a file that contains the chunk.
  // |offset| returns the offset of the chunk in the file.
  // |size| returns the size of the chunk.
  // |index| returns the index of the chunk in the File struct.
  // All output variables are optional.
  // Returns true if the chunk was found.
  bool FindChunk(const ContentIdProto& content_id, std::string* path,
                 uint64_t* offset, uint32_t* size, size_t* index)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Queued updates. Not guarded by |mutex_|.
  std::vector<FileUpdate> file_updates_;

  // Maps the relative Unix path of assets to its file size and chunks.
  using PathToFileMap = absl::flat_hash_map<std::string, File>;
  PathToFileMap path_to_file_ ABSL_GUARDED_BY(mutex_);

  // Maps content id to path and chunk index.
  using IdToChunkMap =
      absl::flat_hash_map<ContentIdRef, ChunkLocation, ContentIdRefHash>;
  IdToChunkMap id_to_chunk_ ABSL_GUARDED_BY(mutex_);

  // Sum of the number of chunks of all files in |path_to_file_|.
  size_t total_chunks_ ABSL_GUARDED_BY(mutex_) = 0;

  // Keeps detailed chunk access statistics.
  // Only used if |enable_stats| was set to true in the constructor.
  std::unique_ptr<StatsPrinter> stats_ ABSL_GUARDED_BY(mutex_);

  // All chunks streamed from/cached on the gamelet.
  // The data is used to rebuild stats in case of a manifest update.
  // Only used if |enable_stats| was set to true in the constructor.
  absl::flat_hash_map<ContentIdProto, size_t> streamed_chunks_to_thread_
      ABSL_GUARDED_BY(mutex_);
  absl::flat_hash_set<ContentIdProto> cached_chunks_ ABSL_GUARDED_BY(mutex_);

  mutable absl::Mutex mutex_;
};
}; // namespace cdc_ft
#endif // MANIFEST_FILE_CHUNK_MAP_H_

View File

@@ -0,0 +1,252 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/file_chunk_map.h"
#include "gtest/gtest.h"
namespace cdc_ft {
namespace {
// Paths of the files used by the tests.
constexpr char kFile1[] = "file1";
constexpr char kFile2[] = "file2";

// Test fixture that owns a FileChunkMap without stats plus the output
// variables for Lookup() calls.
class FileChunkMapTest : public ::testing::Test {
 protected:
  // Builds a ChunkRef proto list with one entry per string in |chunk_data|.
  // The chunk ids are derived from the data, and the offsets are laid out back
  // to back starting at 0.
  RepeatedChunkRefProto MakeChunks(
      std::initializer_list<std::string> chunk_data) {
    RepeatedChunkRefProto refs;
    uint64_t next_offset = 0;
    for (const std::string& payload : chunk_data) {
      ChunkRefProto* ref = refs.Add();
      *ref->mutable_chunk_id() = ContentId::FromDataString(payload);
      ref->set_offset(next_offset);
      next_offset += payload.size();
    }
    return refs;
  }

  // Returns the ContentId proto for string |data|.
  ContentIdProto Id(const std::string& data) {
    return ContentId::FromDataString(data);
  }

  FileChunkMap file_chunks_{/*enable_stats=*/false};

  // Results of the most recent Lookup() call.
  std::string path_;
  uint64_t offset_ = 0;
  uint32_t size_ = 0;
};
// A file that consists of a single chunk: the chunk covers the whole file.
TEST_F(FileChunkMapTest, LookupOneChunk) {
  const std::string data = "0123456789";

  file_chunks_.Init(kFile1, 10);
  file_chunks_.AppendCopy(kFile1, MakeChunks({data}), 0);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(data), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 0);
  EXPECT_EQ(size_, 10);
}
// Queued updates must not affect lookups before FlushUpdates() is called.
TEST_F(FileChunkMapTest, LookupWithoutFlush) {
  const std::string data = "0123456789";

  file_chunks_.Init(kFile1, 10);
  file_chunks_.AppendCopy(kFile1, MakeChunks({data}), 0);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(data), &path_, &offset_, &size_));

  // No FlushUpdates() call after Clear(), so the chunk must still be there.
  file_chunks_.Clear();
  EXPECT_TRUE(file_chunks_.Lookup(Id(data), &path_, &offset_, &size_));
}
// Two chunks in one file: each lookup reports its own offset and size.
TEST_F(FileChunkMapTest, LookupTwoChunks) {
  const std::string first = "0123";
  const std::string second = "456789";

  file_chunks_.Init(kFile1, 10);
  file_chunks_.AppendCopy(kFile1, MakeChunks({first, second}), 0);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(first), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 0);
  EXPECT_EQ(size_, 4);

  EXPECT_TRUE(file_chunks_.Lookup(Id(second), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 4);
  EXPECT_EQ(size_, 6);
}
// Two files with one distinct chunk each: lookups report the matching file.
TEST_F(FileChunkMapTest, LookupTwoFiles) {
  const std::string data1 = "0123";
  const std::string data2 = "012345";

  file_chunks_.Init(kFile1, 4);
  file_chunks_.AppendCopy(kFile1, MakeChunks({data1}), 0);
  file_chunks_.Init(kFile2, 6);
  file_chunks_.AppendCopy(kFile2, MakeChunks({data2}), 0);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(data1), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 0);
  EXPECT_EQ(size_, 4);

  EXPECT_TRUE(file_chunks_.Lookup(Id(data2), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile2);
  EXPECT_EQ(offset_, 0);
  EXPECT_EQ(size_, 6);
}
// Init() with a chunk list takes over the chunks and leaves the list empty.
TEST_F(FileChunkMapTest, InitWithChunks) {
  const std::string first = "0123";
  const std::string second = "456789";

  std::vector<FileChunk> chunks;
  chunks.emplace_back(Id(first), 0);
  chunks.emplace_back(Id(second), 4);

  file_chunks_.Init(kFile1, 10, &chunks);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(first), &path_, &offset_, &size_));
  EXPECT_TRUE(file_chunks_.Lookup(Id(second), &path_, &offset_, &size_));
  EXPECT_TRUE(chunks.empty());
}
// Chunks passed to Init() and chunks appended afterwards end up in the same
// file entry.
TEST_F(FileChunkMapTest, InitWithChunksAndAppend) {
  const std::string first = "0123";
  const std::string second = "456789";

  std::vector<FileChunk> chunks;
  chunks.emplace_back(Id(first), 0);

  file_chunks_.Init(kFile1, 10, &chunks);
  file_chunks_.AppendCopy(kFile1, MakeChunks({second}), 4);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(first), &path_, &offset_, &size_));
  EXPECT_TRUE(file_chunks_.Lookup(Id(second), &path_, &offset_, &size_));
  EXPECT_TRUE(chunks.empty());
}
// Calling Init() for a known path replaces the previous size and chunks.
TEST_F(FileChunkMapTest, InitClearsExistingEntry) {
  const std::string old_data = "012345";
  const std::string new_data = "0123";

  // First version of the file.
  file_chunks_.Init(kFile1, 6);
  file_chunks_.AppendCopy(kFile1, MakeChunks({old_data}), 0);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(old_data), &path_, &offset_, &size_));
  EXPECT_EQ(size_, 6);

  // Second version of the same file.
  file_chunks_.Init(kFile1, 4);
  file_chunks_.AppendCopy(kFile1, MakeChunks({new_data}), 0);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(new_data), &path_, &offset_, &size_));
  EXPECT_EQ(size_, 4);
}
// The list offset passed to AppendCopy() is added to the offsets of the
// appended chunks.
TEST_F(FileChunkMapTest, AppendAddsOffset) {
  file_chunks_.Init(kFile1, 10);
  file_chunks_.AppendCopy(kFile1, MakeChunks({"01", "23", "45"}), 0);
  file_chunks_.AppendCopy(kFile1, MakeChunks({"67", "89"}), 6);
  file_chunks_.FlushUpdates();

  // Last chunk of the first list.
  EXPECT_TRUE(file_chunks_.Lookup(Id("45"), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 4);
  EXPECT_EQ(size_, 2);

  // First chunk of the second list.
  EXPECT_TRUE(file_chunks_.Lookup(Id("67"), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 6);
  EXPECT_EQ(size_, 2);

  // Last chunk of the file.
  EXPECT_TRUE(file_chunks_.Lookup(Id("89"), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_EQ(offset_, 8);
  EXPECT_EQ(size_, 2);
}
// Removing a file only removes the chunks of that file.
TEST_F(FileChunkMapTest, Remove_DifferentChunks) {
  const std::string data1 = "0";
  const std::string data2 = "1";

  file_chunks_.Init(kFile1, 1);
  file_chunks_.AppendCopy(kFile1, MakeChunks({data1}), 0);
  file_chunks_.Init(kFile2, 1);
  file_chunks_.AppendCopy(kFile2, MakeChunks({data2}), 0);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(data1), &path_, &offset_, &size_));
  EXPECT_TRUE(file_chunks_.Lookup(Id(data2), &path_, &offset_, &size_));

  file_chunks_.Remove(kFile2);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(data1), &path_, &offset_, &size_));
  EXPECT_FALSE(file_chunks_.Lookup(Id(data2), &path_, &offset_, &size_));
}
// If two files share a chunk, the chunk can still be found in the remaining
// file after the other file was removed.
TEST_F(FileChunkMapTest, Remove_SameChunks) {
  const std::string shared_data = "0";

  file_chunks_.Init(kFile1, 1);
  file_chunks_.AppendCopy(kFile1, MakeChunks({shared_data}), 0);
  file_chunks_.Init(kFile2, 1);
  file_chunks_.AppendCopy(kFile2, MakeChunks({shared_data}), 0);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(shared_data), &path_, &offset_, &size_));
  // |path_| is not deterministic as an absl::flat_hash_map is used internally.
  EXPECT_TRUE(path_ == kFile1 || path_ == kFile2) << path_;

  file_chunks_.Remove(kFile2);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id(shared_data), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
}
// A flushed Clear() removes all chunks.
TEST_F(FileChunkMapTest, Clear) {
  const std::string data = "0";

  file_chunks_.Init(kFile1, 1);
  file_chunks_.AppendCopy(kFile1, MakeChunks({data}), 0);
  file_chunks_.FlushUpdates();
  EXPECT_TRUE(file_chunks_.Lookup(Id(data), &path_, &offset_, &size_));

  file_chunks_.Clear();
  file_chunks_.FlushUpdates();
  EXPECT_FALSE(file_chunks_.Lookup(Id(data), &path_, &offset_, &size_));
}
// AppendCopy() leaves the source list untouched, whereas AppendMove() moves
// the chunk ids out of it.
TEST_F(FileChunkMapTest, AppendCopyMove) {
  RepeatedChunkRefProto copied_chunks = MakeChunks({"01"});
  RepeatedChunkRefProto moved_chunks = MakeChunks({"23"});

  file_chunks_.Init(kFile1, 2);
  file_chunks_.Init(kFile2, 2);
  file_chunks_.AppendCopy(kFile1, copied_chunks, 0);
  file_chunks_.AppendMove(kFile2, &moved_chunks, 0);
  file_chunks_.FlushUpdates();

  EXPECT_TRUE(file_chunks_.Lookup(Id("01"), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile1);
  EXPECT_TRUE(file_chunks_.Lookup(Id("23"), &path_, &offset_, &size_));
  EXPECT_EQ(path_, kFile2);

  // AppendMove() should have moved the second chunk off the list.
  EXPECT_EQ(copied_chunks[0].chunk_id(), Id("01"));
  EXPECT_EQ(moved_chunks[0].chunk_id(), ContentIdProto());
}
} // namespace
} // namespace cdc_ft

View File

@@ -0,0 +1,740 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_builder.h"
#include <cassert>
#include <deque>
#include "absl/strings/str_format.h"
#include "absl/time/time.h"
#include "common/log.h"
#include "common/path.h"
#include "common/status.h"
#include "common/status_macros.h"
#include "common/util.h"
#include "manifest/asset_builder.h"
#include "manifest/content_id.h"
namespace cdc_ft {
namespace {
// Breaks the Unix path |path| apart at the path separators. The returned views
// point into |path|, so |path| must outlive them.
inline std::vector<absl::string_view> SplitUnixPath(const std::string& path) {
  constexpr char kUnixPathSeparator = '/';
  return SplitString(path, kUnixPathSeparator, false);
}
// Concatenates all components in |path| with the Unix path separator in
// between. The components are expected not to end with a path separator.
inline std::string JoinUnixPath(const std::vector<absl::string_view>& path) {
  constexpr char kUnixPathSeparator = '/';
  return JoinStrings(path, 0, path.size(), kUnixPathSeparator);
}
} // namespace
ManifestBuilder::ManifestBuilder(CdcParamsProto cdc_params,
DataStoreWriter* chunk_store)
: data_store_(chunk_store), cdc_params_(std::move(cdc_params)) {
Reset();
}
ManifestBuilder::~ManifestBuilder() = default;
// Parses |manifest_hex_id| into a content id and loads the manifest with that
// id. Returns an InvalidArgument error if the string cannot be parsed.
absl::Status ManifestBuilder::LoadManifest(const std::string& manifest_hex_id) {
  ContentIdProto parsed_id;
  if (ContentId::FromHexString(manifest_hex_id, &parsed_id)) {
    return LoadManifest(parsed_id);
  }
  return absl::InvalidArgumentError(
      absl::StrFormat("Invalid manifest ID: '%s'", manifest_hex_id));
}
// Discards the current state and reads the manifest with the given
// |manifest_id| from the data store. |manifest_id_| is only set if reading
// succeeded.
absl::Status ManifestBuilder::LoadManifest(const ContentIdProto& manifest_id) {
  Reset();

  RETURN_IF_ERROR(data_store_->GetProto(manifest_id, manifest_));

  manifest_id_ = manifest_id;
  return absl::OkStatus();
}
// Drops all state and sets up a new manifest that only carries the CDC
// parameters.
void ManifestBuilder::Reset() {
  // Forget everything about the previous manifest.
  asset_lists_.clear();
  manifest_id_.Clear();
  manifest_bytes_written_ = 0;
  manifest_chunks_written_ = 0;
  arena_.Reset();

  // Start over with a new manifest proto.
  manifest_ = MakeProto<ManifestProto>();
  manifest_->mutable_cdc_params()->CopyFrom(cdc_params_);
}
// Returns a builder for the asset of the given |type| at |path|. Missing
// parent directories and the asset itself are created as needed. If an asset
// of a different type exists at |path|, it is replaced if |force_create| is
// set; otherwise an AlreadyExists error is returned. |created|, if not null,
// reports whether the asset itself was newly created.
absl::StatusOr<AssetBuilder> ManifestBuilder::GetOrCreateAsset(
    const std::string& path, AssetProto::Type type, bool force_create,
    bool* created) {
  if (created != nullptr) *created = false;

  // |parts| and |name| are views into |unix_path|, so |unix_path| must stay
  // alive for as long as they are in use.
  std::string unix_path = path::ToUnix(path);
  std::vector<absl::string_view> parts = SplitUnixPath(unix_path);

  // Split off the asset name; what remains in |parts| is the parent directory.
  absl::string_view name;
  if (!parts.empty()) {
    name = parts.back();
    parts.pop_back();
  }

  DirCreateMode create_mode = DirCreateMode::kCreate;
  if (force_create) create_mode = DirCreateMode::kForceCreate;

  AssetProto* dir;
  ASSIGN_OR_RETURN(dir, FindOrCreateDirPath(parts, create_mode),
                   "Failed to create directory '%s'", JoinUnixPath(parts));

  if (name.empty()) {
    if (type != AssetProto::DIRECTORY) {
      return absl::InvalidArgumentError("Empty path given");
    }
    // Special case: return the root directory for a DIRECTORY with empty name.
    return AssetBuilder(dir, std::string());
  }

  // Check if the asset already exists.
  AssetProto* asset = nullptr;
  absl::StatusOr<AssetProto*> lookup = FindAssetInDir(name, dir);
  if (!lookup.ok()) {
    // Not finding the asset is fine, everything else is an unexpected error.
    if (!absl::IsNotFound(lookup.status())) return lookup.status();
  } else {
    asset = lookup.value();
    // Verify that both assets are of the same type.
    if (asset->type() != type) {
      if (!force_create) {
        return absl::AlreadyExistsError(absl::StrFormat(
            "Asset '%s' already exists in '%s' as %s.", path,
            JoinUnixPath(parts), AssetProto::Type_Name(asset->type())));
      }
      // Get rid of the existing asset, it is re-created below.
      RETURN_IF_ERROR(DeleteAsset(path));
      asset = nullptr;
    }
  }

  // Create the asset if it was not found or it was deleted.
  if (asset == nullptr) {
    asset = dir->add_dir_assets();
    InitNewAsset(name, type, asset);
    if (created != nullptr) *created = true;
  }

  return AssetBuilder(asset, path::ToUnix(path::DirName(path)));
}
// Deletes the asset at |path|. Returns OK if the asset or one of its parent
// directories does not exist, and an InvalidArgument error if |path| has no
// components.
absl::Status ManifestBuilder::DeleteAsset(const std::string& path) {
  // |parts| and |name| are views into |unix_path|, so |unix_path| must stay
  // alive for as long as they are in use.
  std::string unix_path = path::ToUnix(path);
  std::vector<absl::string_view> parts = SplitUnixPath(unix_path);
  if (parts.empty()) {
    return absl::InvalidArgumentError("Empty path given");
  }

  // Split off the asset name; what remains in |parts| is the parent directory.
  absl::string_view name = parts.back();
  parts.pop_back();

  absl::StatusOr<AssetProto*> dir =
      FindOrCreateDirPath(parts, DirCreateMode::kNoCreate);
  if (dir.ok()) {
    // The parent directory exists, delete the asset from it if it exists.
    return DeleteAssetFromDir(name, *dir);
  }

  // We can get an absl::InvalidArgumentError here if one of the path
  // components is not a directory, which means the asset to be deleted does
  // not exist.
  const bool asset_missing = absl::IsNotFound(dir.status()) ||
                             absl::IsInvalidArgument(dir.status());
  if (asset_missing) return absl::OkStatus();

  // Return any unexpected error.
  return WrapStatus(dir.status(), "Failed to look up path '%s'",
                    JoinUnixPath(parts));
}
// Resolves the directory given by the components in |path|, starting at the
// manifest's root directory. |create_dirs| controls what happens to missing
// path components.
absl::StatusOr<AssetProto*> ManifestBuilder::FindOrCreateDirPath(
    const std::vector<absl::string_view>& path, DirCreateMode create_dirs) {
  // The root directory is always created if it is missing, independent of
  // |create_dirs|.
  const bool root_missing = !manifest_->has_root_dir();
  if (root_missing) {
    InitNewAsset(absl::string_view(), AssetProto::DIRECTORY,
                 manifest_->mutable_root_dir());
  }

  AssetProto* const root = manifest_->mutable_root_dir();
  return FindOrCreateDirPathRec(path, 0, root, create_dirs);
}
// Recursive worker of FindOrCreateDirPath(). Resolves |path[path_idx]| inside
// |dir| and then continues with the next path component. Returns |dir| once
// all components have been consumed.
absl::StatusOr<AssetProto*> ManifestBuilder::FindOrCreateDirPathRec(
    const std::vector<absl::string_view>& path, size_t path_idx,
    AssetProto* dir, DirCreateMode create_dirs) {
  // End of recursion: all path components have been resolved.
  if (path_idx >= path.size()) return dir;

  const absl::string_view name = path[path_idx];
  const size_t next_idx = path_idx + 1;
  const bool overwrite = create_dirs == DirCreateMode::kForceCreate;

  // Try to find the name in the direct assets.
  absl::StatusOr<AssetProto*> result = FindMutableAssetInList(
      name, AssetProto::DIRECTORY, overwrite, dir->mutable_dir_assets());
  if (result.ok()) {
    // Recurse into the sub-directory.
    return FindOrCreateDirPathRec(path, next_idx, result.value(), create_dirs);
  }
  if (!absl::IsNotFound(result.status())) {
    // Return any unexpected error.
    return result;
  }

  // Try to find the name in the list of indirect assets.
  for (const ContentIdProto& asset_list_id : dir->dir_indirect_assets()) {
    AssetListProto* asset_list;
    ASSIGN_OR_RETURN(asset_list, GetAssetList(asset_list_id));
    // In theory it can happen that the loaded asset_list is empty, in which
    // case it is null.
    if (asset_list == nullptr) continue;

    result = FindMutableAssetInList(name, AssetProto::DIRECTORY, overwrite,
                                    asset_list->mutable_assets());
    if (result.ok()) {
      // Recurse into the sub-directory.
      return FindOrCreateDirPathRec(path, next_idx, result.value(),
                                    create_dirs);
    }
    if (!absl::IsNotFound(result.status())) {
      // Return any unexpected error.
      return WrapStatus(result.status(),
                        "Failed to look up directory '%s' in AssetListProto %s",
                        name, ContentId::ToHexString(asset_list_id));
    }
  }

  // The directory does not exist. Fail if we're not supposed to create it.
  if (create_dirs == DirCreateMode::kNoCreate) {
    return absl::NotFoundError(absl::string_view());
  }

  // Create the missing directory and continue in there.
  AssetProto* const new_dir = dir->add_dir_assets();
  InitNewAsset(name, AssetProto::DIRECTORY, new_dir);
  return FindOrCreateDirPathRec(path, next_idx, new_dir, create_dirs);
}
// Looks for the asset |name| in the directory |dir|, first in the direct
// assets and then in the indirect asset lists. Returns
// - the asset if it was found,
// - a NotFound error if there is no asset with that name,
// - the error from WrongAssetTypeError() if |dir| is not a DIRECTORY, or
// - the error that occurred while loading an indirect asset list.
absl::StatusOr<AssetProto*> ManifestBuilder::FindAssetInDir(
    absl::string_view name, AssetProto* dir) {
  if (dir->type() != AssetProto::DIRECTORY) {
    return WrongAssetTypeError(dir->name(), dir->type(), AssetProto::DIRECTORY);
  }

  // Try to find the name in the direct assets.
  absl::StatusOr<AssetProto*> result =
      FindMutableAssetInList(name, dir->mutable_dir_assets());
  if (result.ok()) {
    return result.value();
  }
  if (!absl::IsNotFound(result.status())) {
    // Return any unexpected error.
    return result;
  }

  // Try to find the name in the list of indirect assets.
  for (const ContentIdProto& asset_list_id : dir->dir_indirect_assets()) {
    AssetListProto* asset_list;
    ASSIGN_OR_RETURN(asset_list, GetAssetList(asset_list_id),
                     "Failed to look up asset '%s' in directory '%s'", name,
                     dir->name());
    // In theory it can happen that the loaded asset_list is empty, in which
    // case it is null. There is nothing to search in that case.
    if (!asset_list) continue;

    result = FindMutableAssetInList(name, asset_list->mutable_assets());
    if (result.ok()) {
      return result.value();
    }
    if (!absl::IsNotFound(result.status())) {
      // Return any unexpected error.
      return result;
    }
  }

  return absl::NotFoundError(absl::string_view());
}
// Returns a mutable pointer to the first asset in |assets| whose name equals
// |name|, or absl::NotFoundError if there is none.
absl::StatusOr<AssetProto*> ManifestBuilder::FindMutableAssetInList(
    absl::string_view name, RepeatedAssetProto* assets) const {
  const int count = assets->size();
  for (int i = 0; i < count; ++i) {
    AssetProto* candidate = assets->Mutable(i);
    if (candidate->name() == name) return candidate;
  }
  return absl::NotFoundError(absl::string_view());
}
// Like the two-argument overload, but additionally checks that the asset found
// is of the given |type|. On a type mismatch the asset is either re-initialized
// as an empty asset of |type| (if |overwrite| is set) or a wrong-type error is
// returned.
absl::StatusOr<AssetProto*> ManifestBuilder::FindMutableAssetInList(
    absl::string_view name, AssetProto::Type type, bool overwrite,
    RepeatedAssetProto* assets) const {
  AssetProto* found = nullptr;
  ASSIGN_OR_RETURN(found, FindMutableAssetInList(name, assets));

  // The common case: the asset already has the requested type.
  if (found->type() == type) return found;

  // Type mismatch, and the caller does not want the asset to be replaced.
  if (!overwrite) {
    return WrongAssetTypeError(found->name(), found->type(), type);
  }

  // Replace the asset in place. The name is copied first because
  // InitNewAsset() clears the proto that currently owns the name string.
  InitNewAsset(std::string(found->name()), type, found);
  return found;
}
// Removes the child called |name| from the DIRECTORY asset |dir|. The direct
// assets are checked first, then each indirect asset list in order; the search
// stops at the first match. A missing asset is not an error. Fails if |dir| is
// not a DIRECTORY or if an indirect asset list cannot be loaded.
absl::Status ManifestBuilder::DeleteAssetFromDir(absl::string_view name,
                                                 AssetProto* dir) {
  if (dir->type() != AssetProto::DIRECTORY) {
    return WrongAssetTypeError(dir->name(), dir->type(), AssetProto::DIRECTORY);
  }

  // Direct assets first.
  bool removed = DeleteAssetFromList(name, dir->mutable_dir_assets());

  // Only walk (and load) the indirect lists while nothing has been removed.
  const int num_lists = dir->dir_indirect_assets_size();
  for (int i = 0; !removed && i < num_lists; ++i) {
    AssetListProto* list = nullptr;
    ASSIGN_OR_RETURN(list, GetAssetList(dir->dir_indirect_assets(i)),
                     "Failed to look up asset '%s' in directory '%s'", name,
                     dir->name());
    removed = DeleteAssetFromList(name, list->mutable_assets());
  }
  return absl::OkStatus();
}
// Removes the first asset called |name| from |assets|. Returns true if an asset
// was removed and false if no asset has that name. The relative order of the
// remaining assets is not preserved.
bool ManifestBuilder::DeleteAssetFromList(absl::string_view name,
                                          RepeatedAssetProto* assets) const {
  const int count = assets->size();
  for (int idx = 0; idx < count; ++idx) {
    if (assets->at(idx).name() != name) continue;

    // Swap the match into the last slot and drop it from there, so that none
    // of the other elements has to be shifted.
    const int last = count - 1;
    if (idx != last) assets->SwapElements(idx, last);
    assets->RemoveLast();
    return true;
  }
  return false;
}
// Resets |asset| and fills it in as a fresh asset with the given |name| and
// |type|. The modification time is set to the current time and the permissions
// to the default for directories or files, depending on |type|.
//
// |name| must not point into |asset| itself, since the proto is cleared before
// the name is copied.
void ManifestBuilder::InitNewAsset(absl::string_view name,
                                   AssetProto::Type type,
                                   AssetProto* asset) const {
  asset->Clear();
  asset->set_name(name.data(), name.size());
  asset->set_type(type);
  asset->set_mtime_seconds(absl::ToUnixSeconds(absl::Now()));
  if (type == AssetProto::DIRECTORY) {
    asset->set_permissions(kDefaultDirPerms);
  } else {
    asset->set_permissions(kDefaultFilePerms);
  }
}
// Returns the AssetListProto stored under the content |id|. A list that was
// requested before is served from |asset_lists_|; otherwise it is read from the
// data store into a new arena-allocated proto and remembered for later calls.
absl::StatusOr<AssetListProto*> ManifestBuilder::GetAssetList(
    const ContentIdProto& id) {
  // Serve from the map if this list is already known.
  const auto cached = asset_lists_.find(id);
  if (cached != asset_lists_.end()) return cached->second;

  // Otherwise read it from storage. The list is only registered in the map
  // once it has been read successfully.
  AssetListProto* loaded = MakeProto<AssetListProto>();
  RETURN_IF_ERROR(data_store_->GetProto(id, loaded),
                  "Failed to read the AssetListProto with ID %s from storage",
                  ContentId::ToHexString(id));
  asset_lists_[id] = loaded;
  return loaded;
}
// Fetches the AssetListProto for |id| via GetAssetList() and then removes its
// entry from |asset_lists_|. The returned proto itself stays valid.
absl::StatusOr<AssetListProto*> ManifestBuilder::TakeOutAssetList(
    const ContentIdProto& id) {
  AssetListProto* taken = nullptr;
  ASSIGN_OR_RETURN(taken, GetAssetList(id));
  // Drop the map entry only; the proto is not deleted here.
  asset_lists_.erase(id);
  return taken;
}
// Builds the absl::InvalidArgumentError that reports that the asset |name| has
// the type |found| where the type |expected| was required.
absl::Status ManifestBuilder::WrongAssetTypeError(
    absl::string_view name, AssetProto::Type found,
    AssetProto::Type expected) const {
  const std::string message = absl::StrFormat(
      "Asset '%s' is of type %s, expected %s.", name,
      AssetProto::Type_Name(found), AssetProto::Type_Name(expected));
  return absl::InvalidArgumentError(message);
}
// Number of bytes that WriteProto() has accounted for since the counter was
// last reset.
size_t ManifestBuilder::ManifestBytesWritten() const {
  return manifest_bytes_written_;
}

// Number of protos that WriteProto() has written since the counter was last
// reset.
size_t ManifestBuilder::ManifestsChunksWritten() const {
  return manifest_chunks_written_;
}

// Content ID stored for the manifest proto.
const ContentIdProto& ManifestBuilder::ManifestId() const {
  return manifest_id_;
}

// Read-only access to the manifest proto held by this builder.
const ManifestProto* ManifestBuilder::Manifest() const {
  return manifest_;
}

// Content IDs recorded by WriteProto() since the list was last cleared.
const std::vector<ContentIdProto>& ManifestBuilder::FlushedContentIds() const {
  return flushed_content_ids_;
}
// Flushes the DIRECTORY asset |dir| and everything below it.
//
// The steps are:
//  1. Recursively flush the sub-directories among the direct assets.
//  2. For every indirect asset list of |dir| that is present in
//     |asset_lists_|: flush its sub-directories, trim it to the size limit,
//     and either drop it (if it ended up empty) or write it to storage and
//     update its content ID in |dir|.
//  3. Trim |dir| itself to the size limit.
//  4. Append all assets that were trimmed in steps 2 and 3 to the indirect
//     asset lists of |dir|.
//
// Indirect asset lists that are not in |asset_lists_| are left untouched.
// Returns the first error encountered, if any.
absl::Status ManifestBuilder::FlushDir(AssetProto* dir) {
// Flush all direct assets.
RETURN_IF_ERROR(FlushAssetList(dir->mutable_dir_assets()),
"Failed to flush directs assets of directory '%s'",
dir->name());
// Collects all assets that are removed from a list below because the list
// exceeds the size limit.
RepeatedAssetProto overflow;
RepeatedContentIdProto* indirect_assets = dir->mutable_dir_indirect_assets();
// Flush all indirect asset lists that were previously loaded.
RepeatedContentIdProto::iterator it = indirect_assets->begin();
while (it != indirect_assets->end()) {
// Mutable reference into |dir|: WriteProto() below overwrites the ID in
// place.
ContentIdProto& asset_list_id = *it;
// Skip any list that was never loaded.
AssetListMap::iterator asset_list_it = asset_lists_.find(asset_list_id);
if (asset_list_it == asset_lists_.end()) {
++it;
continue;
}
AssetListProto* asset_list = asset_list_it->second;
// Flush the list and enforce the chunk size limit.
RETURN_IF_ERROR(FlushAssetList(asset_list->mutable_assets()),
"Failed to flush indirect asset list %s in directory '%s'",
ContentId::ToHexString(asset_list_id), dir->name());
EnforceAssetListProtoSize(asset_list, &overflow);
// If the asset list is empty, just delete it from the indirect asset list.
// erase() returns the iterator to the following element, so |it| must not
// be advanced again.
if (asset_list->assets_size() <= 0) {
it = indirect_assets->erase(it);
continue;
}
// Write the list to the chunk store and update the content ID.
RETURN_IF_ERROR(WriteProto(*asset_list, &asset_list_id),
"Failed to write indirect asset list proto for directory "
"'%s' to storage",
dir->name());
// If the content ID changed, we need to update the list's key in the map.
// The map key still holds the old ID, whereas |asset_list_id| now holds
// the new one.
if (asset_list_it->first != asset_list_id) {
AssetListProto* list = asset_list_it->second;
asset_lists_.erase(asset_list_it);
asset_lists_[asset_list_id] = list;
}
++it;
}
// Enforce size limit for this DIRECTORY asset.
RETURN_IF_ERROR(EnforceDirProtoSize(dir, &overflow));
// Add the overflown assets to the indirect assets list.
return AppendAllocatedIndirectAssets(dir, &overflow);
}
// Calls FlushDir() on every DIRECTORY asset in |assets|, in list order, and
// stops at the first error. Assets of any other type are skipped.
absl::Status ManifestBuilder::FlushAssetList(RepeatedAssetProto* assets) {
  for (int i = 0; i < assets->size(); ++i) {
    AssetProto* entry = assets->Mutable(i);
    if (entry->type() != AssetProto::DIRECTORY) continue;
    RETURN_IF_ERROR(FlushDir(entry));
  }
  return absl::OkStatus();
}
// Sorts |assets| in place by serialized proto size, largest first.
inline void SortByProtoSizeDesc(RepeatedAssetProto* assets) {
  // "Greater than" as the ordering predicate yields descending order.
  const auto larger_first = [](const AssetProto& lhs,
                               const AssetProto& rhs) -> bool {
    return lhs.ByteSizeLong() > rhs.ByteSizeLong();
  };
  std::sort(assets->begin(), assets->end(), larger_first);
}
// Tries to shrink the DIRECTORY asset |dir| so that its serialized size does
// not exceed the manifest's average chunk size.
//
// This happens in two phases. First, large direct FILE assets are shrunk via
// EnforceFileProtoSize(). If that is not sufficient, direct assets are removed
// from the end of the (size-sorted) list and appended to |overflow| until the
// limit is met or no direct assets are left.
//
// Does nothing if the chunk size is zero (no limit), if |dir| is not a
// DIRECTORY, or if |dir| already fits. As a side effect, the direct assets of
// |dir| are re-ordered by descending proto size whenever |dir| is too large.
//
// Returns an error only if EnforceFileProtoSize() fails. Exceeding the limit
// after all direct assets were moved is only logged as a warning.
absl::Status ManifestBuilder::EnforceDirProtoSize(
AssetProto* dir, RepeatedAssetProto* overflow) {
// A max. size of zero means no limit.
const size_t max_size = manifest_->cdc_params().avg_chunk_size();
if (!max_size) return absl::OkStatus();
// We cannot change the size of non-directory assets.
if (dir->type() != AssetProto::DIRECTORY) return absl::OkStatus();
// Calculate the full proto size only once.
size_t proto_size = dir->ByteSizeLong();
if (proto_size <= max_size) return absl::OkStatus();
// Sort asset list by size so that we start with the largest assets.
SortByProtoSizeDesc(dir->mutable_dir_assets());
// Enforce the size limit of large FILE assets, where "large" is defined as
// 1/16th of the target chunk size.
const size_t max_asset_proto_size = max_size >> 4;
if (max_asset_proto_size) {
for (AssetProto& asset : *dir->mutable_dir_assets()) {
size_t asset_proto_size = asset.ByteSizeLong();
// Stop if the remaining assets are no longer large.
// This relies on the list being sorted by descending size.
if (proto_size <= max_size || asset_proto_size <= max_asset_proto_size) {
break;
}
if (asset.type() != AssetProto::FILE) continue;
RETURN_IF_ERROR(EnforceFileProtoSize(&asset, max_asset_proto_size));
// Adjust the directory proto size.
// Replaces the asset's old size with its new size in the running total.
proto_size = proto_size + asset.ByteSizeLong() - asset_proto_size;
}
}
// Move assets to the overflow list until the limit is respected.
// Since the list is sorted by descending size, the assets at the end are the
// smallest ones (apart from FILE assets that were shrunk above).
while (dir->dir_assets_size() && proto_size > max_size) {
// Use the UnsafeArena* function to avoid a heap copy of the message.
AssetProto* asset = dir->mutable_dir_assets()->UnsafeArenaReleaseLast();
proto_size -= asset->ByteSizeLong() + kRepeatedProtoFieldOverhead;
// When the estimates get us below the limit, calculate the accurate size.
if (proto_size <= max_size) proto_size = dir->ByteSizeLong();
overflow->UnsafeArenaAddAllocated(asset);
}
// At this point, we might still be over the size limit for a combination of
// a very small chunk size and very large directories. There's nothing we
// can do about it with the current structure of the manifest proto.
if (proto_size > max_size) {
LOG_WARNING(
"Manifest for directory '%s' is over the configured chunk size limit "
"(%d > %d). Consider increasing the chunk size.",
dir->name(), proto_size, max_size);
}
return absl::OkStatus();
}
// Tries to shrink the FILE asset |file| to at most |max_asset_proto_size|
// bytes by moving direct chunk references from the end of the file into newly
// written indirect chunk lists.
//
// Does nothing if |max_asset_proto_size| is zero or if |file| has at most one
// direct chunk. Returns an error if |file| already has indirect chunk lists or
// if writing a chunk list to storage fails.
//
// Each indirect chunk list is filled up to roughly the manifest's average
// chunk size. The offsets of the chunks in a list are stored relative to the
// absolute offset of the first chunk in that list, which is recorded in the
// corresponding IndirectChunkListProto.
absl::Status ManifestBuilder::EnforceFileProtoSize(
AssetProto* file, size_t max_asset_proto_size) {
if (!max_asset_proto_size) return absl::OkStatus();
assert(file->type() == AssetProto::FILE);
// If there is only a single direct chunk, we cannot reduce the proto size.
if (file->file_chunks_size() <= 1) return absl::OkStatus();
// We expect no indirect chunk lists at this point. If we ever decide to
// "rebalance" existing manifests with a smaller chunk size, we need to push
// the indirect chunks before the existing ones.
if (file->file_indirect_chunks_size() > 0) {
return MakeStatus(
"Given asset '%s' already has %d indirect chunk lists which is not "
"supported",
file->name(), file->file_indirect_chunks_size());
}
// Chunks are released from the end of the file, so |overflow| holds them in
// reverse file order: the back of the deque is the chunk that comes first in
// the file.
std::deque<ChunkRefProto*> overflow;
size_t proto_size = file->ByteSizeLong();
// Remove chunks until the size limit is respected.
while (file->file_chunks_size() && proto_size > max_asset_proto_size) {
// Use the UnsafeArena* function to avoid a heap copy of the message.
ChunkRefProto* ref = file->mutable_file_chunks()->UnsafeArenaReleaseLast();
proto_size -= ref->ByteSizeLong() + kRepeatedProtoFieldOverhead;
// When the estimates get us below the limit, calculate the accurate size.
if (proto_size <= max_asset_proto_size) proto_size = file->ByteSizeLong();
overflow.push_back(ref);
}
if (overflow.empty()) return absl::OkStatus();
// Move chunks to indirect chunk lists. All proto memory is owned by the
// |arena_|, we don't need to worry about leaking memory here.
ChunkListProto* chunk_list = MakeProto<ChunkListProto>();
size_t chunk_list_size = 0;
// The first overflown chunk in file order defines the offset of the first
// chunk list.
uint64_t chunk_list_offset = overflow.back()->offset();
const size_t max_size = manifest_->cdc_params().avg_chunk_size();
// Popping from the back restores the original file order of the chunks.
while (!overflow.empty()) {
ChunkRefProto* chunk_ref = overflow.back();
overflow.pop_back();
// Convert the chunk's absolute offset to a relative one.
uint64_t chunk_absolute_offset = chunk_ref->offset();
chunk_ref->set_offset(chunk_absolute_offset - chunk_list_offset);
size_t chunkref_proto_size =
chunk_ref->ByteSizeLong() + kRepeatedProtoFieldOverhead;
// Write back a full chunk list and set offset and content ID accordingly.
// The check for a non-empty list guarantees that every list that is written
// contains at least one chunk.
if (chunk_list_size > 0 &&
chunk_list_size + chunkref_proto_size > max_size) {
RETURN_IF_ERROR(WriteBackChunkList(chunk_list_offset, *chunk_list,
file->add_file_indirect_chunks()));
chunk_list->Clear();
chunk_list_size = 0;
// The first chunk in the list defines the chunk list's offset.
chunk_list_offset = chunk_absolute_offset;
chunk_ref->set_offset(0);
// The offset changed, so the size of the chunk reference must be
// calculated again.
chunkref_proto_size =
chunk_ref->ByteSizeLong() + kRepeatedProtoFieldOverhead;
}
// Move chunk reference to the indirect list. Use the UnsafeArena* function
// again to pass ownership without copying the data.
chunk_list->mutable_chunks()->UnsafeArenaAddAllocated(chunk_ref);
chunk_list_size += chunkref_proto_size;
// When the estimates get us above the limit, calculate the accurate size.
if (chunk_list_size > max_size)
chunk_list_size = chunk_list->ByteSizeLong();
}
// Write back final chunk list.
return WriteBackChunkList(chunk_list_offset, *chunk_list,
file->add_file_indirect_chunks());
}
// Shrinks |asset_list| until its serialized size no longer exceeds the
// manifest's average chunk size, or until the list is empty. Assets are
// removed from the end of the list and appended to |overflow|.
//
// Returns true if at least one asset was moved to |overflow|, false otherwise
// (including the case that the chunk size is zero, which means "no limit").
bool ManifestBuilder::EnforceAssetListProtoSize(
    AssetListProto* asset_list, RepeatedAssetProto* overflow) const {
  // A max. size of zero means no limit.
  const size_t max_size = manifest_->cdc_params().avg_chunk_size();
  if (!max_size) return false;

  size_t proto_size = asset_list->ByteSizeLong();
  bool changed = false;
  // The running size is only an estimate: kRepeatedProtoFieldOverhead is a
  // lower bound for the per-entry overhead, so the estimate can remain above
  // the limit even after the last asset was removed. Releasing an element from
  // an empty repeated field is not allowed, hence the explicit check that
  // there is still something to remove (same as in EnforceDirProtoSize() and
  // EnforceFileProtoSize()).
  while (asset_list->assets_size() > 0 && proto_size > max_size) {
    // Use the UnsafeArena* function to avoid a heap copy of the message.
    AssetProto* asset = asset_list->mutable_assets()->UnsafeArenaReleaseLast();
    proto_size -= asset->ByteSizeLong() + kRepeatedProtoFieldOverhead;
    // When the estimates get us below the limit, calculate the accurate size.
    if (proto_size <= max_size) proto_size = asset_list->ByteSizeLong();
    overflow->UnsafeArenaAddAllocated(asset);
    changed = true;
  }
  return changed;
}
// Writes |asset_list| to storage, stores the resulting content ID in
// |asset_list_id|, and registers the list under that ID in |asset_lists_|.
// Nothing is registered if writing fails.
absl::Status ManifestBuilder::WriteBackAssetList(
    AssetListProto* asset_list, ContentIdProto* asset_list_id) {
  RETURN_IF_ERROR(WriteProto(*asset_list, asset_list_id),
                  "Failed to write back AssetListProto");
  // |asset_list_id| now holds the ID of the list that was just written.
  const ContentIdProto& written_id = *asset_list_id;
  asset_lists_[written_id] = asset_list;
  return absl::OkStatus();
}
// Writes the non-empty |chunk_list| to storage and fills in
// |indirect_chunk_list| with the resulting content ID and the given
// |chunk_list_offset|. The offset is only set if writing succeeded.
absl::Status ManifestBuilder::WriteBackChunkList(
    uint64_t chunk_list_offset, const ChunkListProto& chunk_list,
    IndirectChunkListProto* indirect_chunk_list) {
  assert(chunk_list.chunks_size() > 0);
  // WriteProto() stores the content ID directly in the indirect list entry.
  ContentIdProto* list_id = indirect_chunk_list->mutable_chunk_list_id();
  RETURN_IF_ERROR(WriteProto(chunk_list, list_id));
  indirect_chunk_list->set_offset(chunk_list_offset);
  return absl::OkStatus();
}
// Stores |proto| in the data store and receives its content ID in
// |content_id|. On success, the ID is recorded in |flushed_content_ids_| and
// the write statistics are updated. On failure, nothing is recorded.
absl::Status ManifestBuilder::WriteProto(
    const google::protobuf::MessageLite& proto, ContentIdProto* content_id) {
  size_t bytes_written = 0;
  RETURN_IF_ERROR(data_store_->PutProto(proto, content_id, &bytes_written));

  // Bookkeeping for FlushedContentIds() and the statistics accessors.
  ++manifest_chunks_written_;
  manifest_bytes_written_ += bytes_written;
  flushed_content_ids_.push_back(*content_id);
  return absl::OkStatus();
}
// Moves all |assets| into the indirect asset lists of the DIRECTORY asset
// |dir| and writes every list that was touched to storage.
//
// Appending starts at the last existing indirect asset list of |dir|, or at a
// newly created one if |dir| has none. Whenever the list in use would exceed
// the manifest's average chunk size, it is written back and a new list is
// started. A chunk size of zero disables the limit, so that all assets end up
// in a single list.
//
// |assets| is consumed from the back, so the assets are appended in reverse
// order. It is empty when the function returns successfully. Does nothing if
// |assets| is empty to begin with.
absl::Status ManifestBuilder::AppendAllocatedIndirectAssets(
AssetProto* dir, RepeatedAssetProto* assets) {
if (assets->empty()) return absl::OkStatus();
// The max. manifest chunk size that we try to stay under.
const size_t max_size = manifest_->cdc_params().avg_chunk_size();
// Use asset_list to track the last allocated list, if any.
AssetListProto* asset_list = nullptr;
// Index to the indirect asset list within |dir| currently in use. Defaults to
// zero, which means that if |dir| does not have any indirect asset lists, the
// code below will create the first one and store it at index zero.
int asset_list_index = 0;
// Approximate byte size of the asset list proto currently in use. This size
// is updated with the byte size of any asset proto that is appended to the
// list, but ignores any overhead from the embedding proto format (which
// should be negligible).
size_t proto_size = 0;
// Find or create the AssetListProto where we can append the assets.
if (dir->dir_indirect_assets_size() > 0) {
// Load the last indirect asset list and see if we can append to it.
asset_list_index = dir->dir_indirect_assets_size() - 1;
const ContentIdProto& asset_list_id =
dir->dir_indirect_assets(asset_list_index);
// Take out the asset list from its original location since the content ID
// will be updated anyway once we append more assets to it.
ASSIGN_OR_RETURN(asset_list, TakeOutAssetList(asset_list_id));
proto_size = asset_list->ByteSizeLong();
} else {
// Add the first indirect asset to |dir|, asset_list_index is already
// initialized to zero.
dir->add_dir_indirect_assets();
asset_list = MakeProto<AssetListProto>();
}
while (!assets->empty()) {
// Use the UnsafeArena* function to avoid a heap copy of the message. Even
// though it is released from the proto, the memory is still owned by the
// |arena_| and shares its lifetime.
AssetProto* asset = assets->UnsafeArenaReleaseLast();
size_t asset_proto_size =
asset->ByteSizeLong() + kRepeatedProtoFieldOverhead;
// See if we need to create a new AssetListProto.
// An empty list (proto_size == 0) always accepts the asset, even if the
// asset alone is larger than the limit.
if (max_size > 0 && proto_size > 0 &&
proto_size + asset_proto_size > max_size) {
// Write back the full list to the data store.
RETURN_IF_ERROR(
WriteBackAssetList(
asset_list, dir->mutable_dir_indirect_assets(asset_list_index)),
"Failed to write back asset list for directory '%s'", dir->name());
// Create a new list.
asset_list = MakeProto<AssetListProto>();
proto_size = 0;
// The new list gets the next free slot in |dir|'s indirect asset lists.
asset_list_index = dir->dir_indirect_assets_size();
dir->add_dir_indirect_assets();
}
// Append the allocated asset to the current list.
asset_list->mutable_assets()->UnsafeArenaAddAllocated(asset);
proto_size += asset_proto_size;
}
// Write back the final asset list.
RETURN_IF_ERROR(
WriteBackAssetList(asset_list,
dir->mutable_dir_indirect_assets(asset_list_index)),
"Failed to write back final asset list for directory '%s'", dir->name());
return absl::OkStatus();
}
// Writes the whole manifest to storage and returns its content ID.
//
// Resets the write statistics and the list of flushed content IDs, creates an
// empty root directory if the manifest has none, flushes the root directory
// recursively, and finally writes the manifest proto itself.
absl::StatusOr<ContentIdProto> ManifestBuilder::Flush() {
  // Start with a clean slate for the statistics of this flush.
  flushed_content_ids_.clear();
  manifest_chunks_written_ = 0;
  manifest_bytes_written_ = 0;

  // The presence check has to happen before mutable_root_dir() is called,
  // since that call creates the field if it is missing.
  const bool root_missing = !manifest_->has_root_dir();
  AssetProto* root = manifest_->mutable_root_dir();
  if (root_missing) {
    InitNewAsset("", AssetProto::DIRECTORY, root);
  }

  RETURN_IF_ERROR(FlushDir(root));
  RETURN_IF_ERROR(WriteProto(*manifest_, &manifest_id_));
  return manifest_id_;
}
// Builds a map from relative Unix file paths to the FILE asset protos that are
// reachable from the root directory. See CreateFileLookupRec() for details.
ManifestBuilder::FileLookupMap ManifestBuilder::CreateFileLookup() {
  FileLookupMap files;
  // The root directory is not contained in any directory, hence the empty
  // relative path.
  const std::string root_rel_path;
  CreateFileLookupRec(root_rel_path, manifest_->mutable_root_dir(), files);
  return files;
}
void ManifestBuilder::CreateFileLookupRec(const std::string& rel_path,
AssetProto* asset,
FileLookupMap& lookup) {
std::string rel_file_path = path::JoinUnix(rel_path, asset->name());
if (asset->type() == AssetProto::FILE) {
lookup[rel_file_path] = asset;
return;
}
// Handle all direct assets.
for (AssetProto& child : *asset->mutable_dir_assets())
CreateFileLookupRec(rel_file_path, &child, lookup);
// Add all (loaded!) indirect assets as well.
for (const ContentIdProto& id : asset->dir_indirect_assets()) {
const auto iter = asset_lists_.find(id);
if (iter == asset_lists_.end()) continue;
AssetListProto* asset_list = iter->second;
assert(asset_list);
for (AssetProto& child : *asset_list->mutable_assets())
CreateFileLookupRec(rel_file_path, &child, lookup);
}
}
// Returns a copy of the CDC parameters stored in the manifest proto.
CdcParamsProto ManifestBuilder::CdcParameters() const {
  return manifest_->cdc_params();
}
// Creates a new message of type T on the builder's |arena_| and returns a
// pointer to it.
template <typename T>
T* ManifestBuilder::MakeProto() {
  T* message = google::protobuf::Arena::CreateMessage<T>(&arena_);
  return message;
}
} // namespace cdc_ft

296
manifest/manifest_builder.h Normal file
View File

@@ -0,0 +1,296 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_BUILDER_H_
#define MANIFEST_MANIFEST_BUILDER_H_
#include <cstddef>
#include <list>
#include "absl/status/statusor.h"
#include "data_store/data_store_writer.h"
#include "google/protobuf/arena.h"
#include "manifest/asset_builder.h"
#include "manifest/content_id.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// The ManifestBuilder class is used to create a manifest proto for the assets
// (DIRECTORY, FILE, and SYMLINK) that are added incrementally. The proto is
// finalized with a call to Flush(). When the CdcParamsProto given during
// construction specifies an average chunk size, then the manifest will be split
// into balanced chunks of at most this size.
//
// See (internal).
class ManifestBuilder {
public:
// Default permission bits for new directories and files, respectively.
static constexpr uint32_t kDefaultDirPerms = 0755u;
static constexpr uint32_t kDefaultFilePerms = 0644u;
// Maps relative Unix file paths to the corresponding file asset proto.
using FileLookupMap = std::unordered_map<std::string, AssetProto*>;
// Creates a new builder which reads from/writes to the given |data_store|.
// The |cdc_params| are included in the resulting manifest proto and influence
// the size of the manifest chunks which are written back to the
// |data_store|.
ManifestBuilder(CdcParamsProto cdc_params, DataStoreWriter* data_store);
~ManifestBuilder();
// Loads the manifest identified by |manifest_id| from the data store. Returns
// an absl::NotFoundError if the manifest ID does not exist or other errors if
// it is not a valid manifest proto.
absl::Status LoadManifest(const ContentIdProto& manifest_id);
// Loads the manifest identified by the hexadecimal representation
// |manifest_hex_id| from the data store. Returns an error if the string
// representation is invalid or if the manifest ID does not exist or is not a
// valid manifest proto.
absl::Status LoadManifest(const std::string& manifest_hex_id);
// Returns the asset identified by the given Windows or Unix |path| or creates
// a new one of type |type| if it does not exist yet. The |path| is relative
// to the manifest's root directory. If the asset is created, any missing
// directories in |path| that lead up to the asset are automatically
// created as DIRECTORY assets with default permissions. Use a DIRECTORY
// |type| with an empty |path| to retrieve the root directory asset.
//
// If an asset at |path| exists but is of different |type|, the outcome
// depends on |force_create|. If this is set to false (the default), an
// absl::AlreadyExistsError is returned. If it is set to true, the existing
// asset is removed (recursively for directories) and a new asset with the
// same name is created instead.
// NOTE(review): the lookup helpers of this class report a type mismatch as
// absl::InvalidArgumentError (see WrongAssetTypeError()). Confirm which error
// code is actually returned here.
//
// When |created| is given, then it will be set to true if that asset was
// actually added, otherwise it will be set to false.
absl::StatusOr<AssetBuilder> GetOrCreateAsset(const std::string& path,
AssetProto::Type type,
bool force_create = false,
bool* created = nullptr);
// Deletes the asset with the given |path|. If the asset is of type DIRECTORY,
// the entire directory is deleted recursively. If no asset with this path
// exists, the function returns success.
absl::Status DeleteAsset(const std::string& path);
// Updates the manifest to reflect all changes that were done. Splits the
// manifest into chunks of sizes as specified by the CdcParamsProto given
// during construction.
//
// Calling this function might invalidate pointers to wrapped protos that were
// returned by GetOrCreateAsset() or AssetBuilder methods.
absl::StatusOr<ContentIdProto> Flush();
// Creates a lookup of relative Unix file paths to protos of all loaded
// FILE assets. The lookup does not contain unloaded indirect dir assets.
FileLookupMap CreateFileLookup();
// Returns the content ID of the manifest which was valid after the last call
// to Flush().
const ContentIdProto& ManifestId() const;
// Gets the manifest proto which was valid after the last call to Flush().
const ManifestProto* Manifest() const;
// Returns a list of the content IDs of all manifest chunks that have been
// written back to the data store during the last call of Flush().
const std::vector<ContentIdProto>& FlushedContentIds() const;
// Access statistics after Flush() about the manifest that was built.
size_t ManifestBytesWritten() const;
size_t ManifestsChunksWritten() const;
// Returns the CDC parameters stored in the manifest.
CdcParamsProto CdcParameters() const;
private:
// Map for storing loaded AssetListProtos by content ID. The protos are
// allocated on the arena which owns the memory.
using AssetListMap = std::unordered_map<ContentIdProto, AssetListProto*>;
// Clears all loaded and/or changed data and resets the statistics.
void Reset();
// Decides if and how directories are created.
enum class DirCreateMode {
// No directories are created and absl::NotFoundError might be returned.
kNoCreate,
// Missing directories are created, but absl::InvalidArgumentError might be
// returned in case a non-directory asset with the same name exists.
kCreate,
// Missing directories are created, any asset of a different type will be
// replaced with a DIRECTORY asset.
kForceCreate
};
// Follows the given |path| components along DIRECTORY assets and returns the
// final DIRECTORY on success.
//
// |create_dirs| determines if and when any missing DIRECTORY assets along the
// way are created and what errors can be expected.
absl::StatusOr<AssetProto*> FindOrCreateDirPath(
const std::vector<absl::string_view>& path, DirCreateMode create_dirs);
absl::StatusOr<AssetProto*> FindOrCreateDirPathRec(
const std::vector<absl::string_view>& path, size_t path_idx,
AssetProto* dir, DirCreateMode create_dirs);
// Searches for an asset with the given |name| in the given DIRECTORY asset.
// Does not recurse into sub-directories. If no such asset is found, an
// absl::NotFoundError is returned.
absl::StatusOr<AssetProto*> FindAssetInDir(absl::string_view name,
AssetProto* dir);
// Searches for an asset by its |name| in the given list of |assets|. If no
// such asset is found, an absl::NotFoundError is returned.
absl::StatusOr<AssetProto*> FindMutableAssetInList(
absl::string_view name, RepeatedAssetProto* assets) const;
// Searches for an asset by its |name| and |type| in the given list of assets.
// If no such asset is found, an absl::NotFoundError is returned.
//
// If an asset with that name exists of a different type, the outcome is
// conditional on |overwrite|. If |overwrite| is true, then the existing
// asset is re-initialized as a new asset of the given type and returned. If
// |overwrite| is false, an absl::InvalidArgumentError is returned.
absl::StatusOr<AssetProto*> FindMutableAssetInList(
absl::string_view name, AssetProto::Type type, bool overwrite,
RepeatedAssetProto* assets) const;
// Deletes an asset with the given |name| in the given DIRECTORY asset. Does
// not recurse into sub-directories. If no such asset is found, success is
// returned.
absl::Status DeleteAssetFromDir(absl::string_view name, AssetProto* dir);
// Deletes an asset by its |name| in the given list of |assets|. Returns true
// if the asset was found and deleted, false otherwise.
bool DeleteAssetFromList(absl::string_view name,
RepeatedAssetProto* assets) const;
// Clears the given |asset| and initializes it as an asset of the given
// |type| with the given |name|, the current time as modification time, and
// the default permissions for that |type|. All other fields are reset by
// clearing the proto.
void InitNewAsset(absl::string_view name, AssetProto::Type type,
AssetProto* asset) const;
// Retrieves the AssetListProto referenced by the given content |id|. If the
// proto has been previously loaded, the stored (and potentially modified)
// proto is returned. Otherwise, the proto is read from the data store.
absl::StatusOr<AssetListProto*> GetAssetList(const ContentIdProto& id);
// Like GetAssetList(), but removes the AssetListProto from the |asset_lists_|
// mapping.
absl::StatusOr<AssetListProto*> TakeOutAssetList(const ContentIdProto& id);
// Convenience wrapper function for returning an error that the asset with the
// given |name| did not match the |expected| asset type.
absl::Status WrongAssetTypeError(absl::string_view name,
AssetProto::Type found,
AssetProto::Type expected) const;
// Flushes all pending information for |dir| and all sub-directories, enforces
// the chunk size limit, updates the content IDs, and writes the chunks to the
// data store.
absl::Status FlushDir(AssetProto* dir);
// Flushes all DIRECTORY assets in the given list recursively.
absl::Status FlushAssetList(RepeatedAssetProto* assets);
// Enforces the chunk size limit for the given DIRECTORY asset |dir|. Any
// direct asset that does not fit is moved to the |overflow| list. Returns an
// error if enforcing the size limit of a FILE asset in |dir| fails.
absl::Status EnforceDirProtoSize(AssetProto* dir,
RepeatedAssetProto* overflow);
// Enforces the chunk size limit for the given FILE asset |file| to be at most
// |max_size|. Any chunk that does not fit is moved to the file's indirect
// chunk list.
absl::Status EnforceFileProtoSize(AssetProto* file, size_t max_size);
// Enforces the chunk size limit for the given |asset_list|. Any asset that
// no longer fits is moved to the |overflow| list. Returns true if at
// least one asset was moved, otherwise returns false.
bool EnforceAssetListProtoSize(AssetListProto* asset_list,
RepeatedAssetProto* overflow) const;
// Appends the given list of allocated |assets| to the DIRECTORY asset |dir|.
// Ownership of the items in |assets| is passed on to |dir|.
absl::Status AppendAllocatedIndirectAssets(AssetProto* dir,
RepeatedAssetProto* assets);
// Writes the given AssetListProto to storage and updates |asset_list_id| with
// the list's content ID. If the call succeeds, the |asset_lists_| map is
// updated such that the resulting |asset_list_id| is referencing the
// |asset_list|.
absl::Status WriteBackAssetList(AssetListProto* asset_list,
ContentIdProto* asset_list_id);
// Writes the given ChunkListProto |chunk_list| to storage and updates
// |indirect_chunk_list| with the given |chunk_list_offset| and the resulting
// content ID.
absl::Status WriteBackChunkList(uint64_t chunk_list_offset,
const ChunkListProto& chunk_list,
IndirectChunkListProto* indirect_chunk_list);
// Wrapper around the data store's PutProto() which keeps track of the chunks
// and bytes written as well as of the resulting content IDs.
absl::Status WriteProto(const google::protobuf::MessageLite& proto,
ContentIdProto* content_id);
// Recursively iterates assets, adding all loaded file protos into |lookup|.
// |rel_path| is the relative Unix directory path containing the |asset|.
void CreateFileLookupRec(const std::string& rel_path, AssetProto* asset,
FileLookupMap& lookup);
// Convenient wrapper to allocate a proto message on the arena.
template <typename T>
T* MakeProto();
// Constant overhead in bytes per repeated proto field. This is used to
// estimate proto sizes when entries are added to or removed from a list.
static constexpr size_t kRepeatedProtoFieldOverhead = 2;
// Data store to read and write manifest chunks.
DataStoreWriter* data_store_;
// Content ID of the resulting manifest, updated in Flush().
ContentIdProto manifest_id_;
// Content IDs of all manifest chunks that were written back to the data store
// during the last call of Flush().
std::vector<ContentIdProto> flushed_content_ids_;
// Holds the manifest proto under construction.
ManifestProto* manifest_ = nullptr;
// CDC params used for the manifest.
CdcParamsProto cdc_params_;
// Map of the AssetListProtos loaded from or written to data_store_, keyed by
// content ID.
AssetListMap asset_lists_;
// Useful stats.
size_t manifest_bytes_written_ = 0;
size_t manifest_chunks_written_ = 0;
// Arena for protos allocated by this builder.
google::protobuf::Arena arena_;
};
} // namespace cdc_ft
#endif // MANIFEST_MANIFEST_BUILDER_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,163 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_iterator.h"
#include <google/protobuf/text_format.h>
#include <cassert>
#include <fstream>
#include "absl/strings/str_format.h"
#include "common/errno_mapping.h"
#include "common/log.h"
#include "common/path.h"
#include "common/status_macros.h"
#include "manifest/content_id.h"
namespace cdc_ft {
// Iteration state of one DIRECTORY asset that is currently being traversed.
struct ManifestIterator::OpenedDirectory {
  OpenedDirectory(AssetProto* dir_asset) : dir(dir_asset) {}
  ~OpenedDirectory() = default;

  // DIRECTORY proto whose children are being enumerated. Not owned by this
  // struct; the object is owned by the parent OpenedDirectory struct.
  AssetProto* dir;

  // The indirect asset list that is currently loaded.
  std::unique_ptr<AssetListProto> asset_list;

  // Position of the next direct asset to return. All direct assets have been
  // returned once this reaches dir->dir_assets_size().
  int next_asset{0};

  // Position of the next indirect asset list to read. All indirect asset lists
  // have been read once this reaches dir->dir_indirect_assets_size().
  int next_asset_list{0};

  // Position of the next asset to return from |asset_list|. The loaded list is
  // exhausted once this reaches asset_list->assets_size().
  int next_asset_list_asset{0};
};
// Creates an iterator that reads protos from |data_store|, which must not be
// nullptr.
ManifestIterator::ManifestIterator(DataStoreReader* data_store)
    : last_opened_dir_(nullptr), data_store_(data_store) {
  assert(data_store_);
}

// Defined in this file, after the definition of OpenedDirectory.
ManifestIterator::~ManifestIterator() = default;
// Loads the manifest identified by |manifest_id| from the data store and, on
// success, positions the iterator at the manifest's root directory. The
// outcome is stored in status_ and returned.
absl::Status ManifestIterator::Open(const ContentIdProto& manifest_id) {
  Reset();
  status_ = data_store_->GetProto(manifest_id, &manifest_);
  if (!status_.ok()) {
    return status_;
  }
  dirs_.emplace_back(manifest_.mutable_root_dir());
  return status_;
}
// Loads a serialized Manifest proto from the file |manifest_file| and, on
// success, positions the iterator at the manifest's root directory.
//
// Returns an error status (also available via Status()) if the file cannot be
// opened or its contents cannot be parsed as a Manifest proto.
absl::Status ManifestIterator::Open(const std::string& manifest_file) {
  Reset();

  // Open input file.
  errno = 0;
  std::ifstream fin(manifest_file, std::ios_base::in | std::ios_base::binary);
  // Capture errno immediately: formatting the error message below may run
  // library code that overwrites it.
  const int open_errno = errno;
  if (!fin) {
    std::string msg =
        absl::StrFormat("failed to open file '%s' for reading", manifest_file);
    if (open_errno) {
      status_ = ErrnoToCanonicalStatus(open_errno, msg);
    } else {
      // The stream did not report a reason through errno.
      status_ = absl::UnknownError(msg);
    }
    return status_;
  }

  // Parse proto.
  if (!manifest_.ParseFromIstream(&fin)) {
    status_ = absl::InternalError(absl::StrFormat(
        "failed to parse Manifest proto from file '%s'", manifest_file));
    return status_;
  }

  dirs_.emplace_back(manifest_.mutable_root_dir());
  return absl::OkStatus();
}
// True while no error has been recorded and at least one directory is still
// open for iteration.
bool ManifestIterator::Valid() const {
  return status_.ok() && !dirs_.empty();
}
// Fetches the asset at |index| from |assets|. A DIRECTORY asset is additionally
// pushed onto the stack of opened directories so that its contents are visited
// next.
AssetProto* ManifestIterator::MutableAsset(RepeatedAssetProto* assets,
                                           int index) {
  AssetProto* const asset = assets->Mutable(index);
  const bool is_directory = asset->type() == AssetProto::DIRECTORY;
  if (is_directory) {
    // Recurse into sub-directories.
    dirs_.emplace_back(asset);
  }
  return asset;
}
// Rebuilds rel_path_ from the names of all opened directories, unless |od| is
// the directory the path was last computed for.
void ManifestIterator::UpdateRelPath(const OpenedDirectory* od) {
  if (od != last_opened_dir_) {
    rel_path_.clear();
    for (auto it = dirs_.cbegin(); it != dirs_.cend(); ++it) {
      path::AppendUnix(&rel_path_, it->dir->name());
    }
    last_opened_dir_ = od;
  }
}
// Returns the next asset of the opened manifest, or nullptr if there are no
// more assets or an error occurred (in which case status_ holds the error).
//
// Traversal is depth-first: when a DIRECTORY asset is returned, MutableAsset()
// pushes it onto dirs_, so the following calls yield that directory's contents
// before continuing with its siblings. Within a directory, the direct assets
// are returned first, followed by the assets of each indirect asset list in
// order.
const AssetProto* ManifestIterator::NextEntry() {
while (!dirs_.empty() && status_.ok()) {
// The directory on top of the stack is the one currently being iterated.
OpenedDirectory* od = &dirs_.back();
// Make sure rel_path_ reflects the current stack of opened directories.
UpdateRelPath(od);
// First, iterate over the direct assets.
if (od->next_asset >= 0 && od->next_asset < od->dir->dir_assets_size()) {
// The index is advanced before returning, so the next call continues with
// the following asset.
return MutableAsset(od->dir->mutable_dir_assets(), od->next_asset++);
}
// Next, iterate over the currently loaded indirect asset list.
assert(od->next_asset_list_asset >= 0);
if (od->asset_list &&
od->next_asset_list_asset < od->asset_list->assets_size()) {
return MutableAsset(od->asset_list->mutable_assets(),
od->next_asset_list_asset++);
}
// Finally, load the next AssetListProto from the indirect assets.
assert(od->next_asset_list >= 0);
if (od->next_asset_list < od->dir->dir_indirect_assets_size()) {
// Create the proto, if needed.
if (!od->asset_list) od->asset_list = std::make_unique<AssetListProto>();
// Read the AssetListProto from the chunk store.
const ContentIdProto& asset_list_id =
od->dir->dir_indirect_assets(od->next_asset_list++);
// Start reading the newly loaded list from its first asset.
od->next_asset_list_asset = 0;
status_ = data_store_->GetProto(asset_list_id, od->asset_list.get());
// On failure, the error stays in status_ and ends the iteration.
if (!status_.ok()) return nullptr;
// Restart the loop to read the first asset from the list.
continue;
}
// Nothing more to visit, we are done with this node.
dirs_.pop_back();
}
return nullptr;
}
// Puts the iterator back into its initial state: no opened directories, an
// empty relative path and an OK status.
void ManifestIterator::Reset() {
  status_ = absl::OkStatus();
  rel_path_.clear();
  last_opened_dir_ = nullptr;
  dirs_.clear();
}
} // namespace cdc_ft

View File

@@ -0,0 +1,94 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_ITERATOR_H_
#define MANIFEST_MANIFEST_ITERATOR_H_
#include <list>
#include "data_store/data_store_reader.h"
namespace cdc_ft {
// Iterates over all assets of a manifest. Assets that are stored in indirect
// asset lists are loaded from the data store as the iteration reaches them.
class ManifestIterator {
public:
// Constructs a new manifest iterator that can read a manifest proto from the
// given |data_store|. |data_store| must not be nullptr.
explicit ManifestIterator(DataStoreReader* data_store);
~ManifestIterator();
// Opens the manifest identified by |manifest_id| from the data store. If
// this method returns an Ok() status, an AssetProto may be fetched by
// calling NextEntry(). In case of an error, the value of Status() is
// returned.
absl::Status Open(const ContentIdProto& manifest_id);
// Opens the manifest stored in the file path given as |manifest_file|.
// Further chunks will be read from the data store, if needed. If this method
// returns an Ok() status, an AssetProto may be fetched by calling
// NextEntry(). In case of an error, the value of Status() is returned.
absl::Status Open(const std::string& manifest_file);
// Returns any error that might have occurred so far.
absl::Status Status() const { return status_; }
// Returns true as long as a manifest has been opened, no error has occurred,
// and a call to NextEntry() has a chance to succeed.
bool Valid() const;
// Yields the next asset from the opened manifest. Returns nullptr in case of
// an error or if no more assets are available. Check Status() to distinguish
// between those two cases.
//
// Calling NextEntry() invalidates any references to objects returned by
// previous calls to this function.
const AssetProto* NextEntry();
// Returns the current relative path. This corresponds to the directory path
// in which the asset returned from the last call to NextEntry() is located,
// relative to the manifest root.
const std::string& RelativePath() const { return rel_path_; }
// Returns a reference to the loaded manifest proto. Only valid after a
// successful call to Open().
const ManifestProto& Manifest() const { return manifest_; }
private:
// Iteration state of one opened directory; defined in the .cc file.
struct OpenedDirectory;
// Resets the iterator for a new Open() call.
void Reset();
// Returns the AssetProto at |index| from the given list |assets|. If the
// AssetProto is of type DIRECTORY, it is pushed on top of the stack of open
// directories. Does not check if |index| is out-of-bounds.
AssetProto* MutableAsset(RepeatedAssetProto* assets, int index);
// Updates the relative path according to the current stack of opened
// directories.
void UpdateRelPath(const OpenedDirectory* od);
// The manifest loaded by the last call to Open().
ManifestProto manifest_;
// Stack of opened directories; the back element is the directory that is
// currently being iterated.
std::list<OpenedDirectory> dirs_;
// The directory for which rel_path_ was last computed.
const OpenedDirectory* last_opened_dir_;
// Relative path of the directory that is currently being iterated.
std::string rel_path_;
// First error encountered, or Ok.
absl::Status status_;
// Source for manifest and asset list protos. Not owned.
DataStoreReader* data_store_;
};
} // namespace cdc_ft
#endif // MANIFEST_MANIFEST_ITERATOR_H_

View File

@@ -0,0 +1,59 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_printer.h"
#include "manifest/content_id.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// Text proto printer for ContentId protos. Instead of the default
// octal-escaped string value, the content ID is printed in its hexadecimal
// representation.
class ContentIdPrinter : public google::protobuf::TextFormat::MessagePrinter {
 public:
  ContentIdPrinter() = default;
  virtual ~ContentIdPrinter() = default;

  // Prints |message| to |generator|. A trailing newline is added unless
  // |single_line_mode| is set.
  void Print(const google::protobuf::Message& message, bool single_line_mode,
             google::protobuf::TextFormat::BaseTextGenerator* generator)
      const override {
    if (const auto* id = dynamic_cast<const ContentIdProto*>(&message)) {
      generator->PrintLiteral("blake3_sum_160: \"");
      generator->PrintString(ContentId::ToHexString(*id));
      generator->PrintLiteral("\"");
    } else {
      // Strictly speaking, the inherited Print() should handle this case, but
      // calling it causes a linker error for unknown reasons. As this printer
      // is only ever registered for ContentId protos, a placeholder is good
      // enough.
      generator->PrintLiteral("(given message is no ContentId proto)");
    }
    if (single_line_mode) return;
    generator->PrintLiteral("\n");
  }
};
// Installs a ContentIdPrinter so that all ContentId protos are printed in
// hexadecimal form.
ManifestPrinter::ManifestPrinter() {
  ContentIdPrinter* content_id_printer = new ContentIdPrinter;
  const auto* descriptor = ContentIdProto::default_instance().descriptor();
  // On success, the callee takes ownership of the registered printer.
  const bool registered =
      RegisterMessagePrinter(descriptor, content_id_printer);
  if (registered) return;
  // Registration failed, so the object is still ours to delete.
  delete content_id_printer;
}
} // namespace cdc_ft

View File

@@ -0,0 +1,42 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_PRINTER_H_
#define MANIFEST_MANIFEST_PRINTER_H_
#include <google/protobuf/text_format.h>
namespace cdc_ft {
// This class prints manifest protos as text, but uses a hexadecimal
// representation for all ContentId protos to make them human-readable.
//
// Usage:
// AssetListProto pb;
// // ...
// ManifestPrinter printer;
// std::string s;
// printer.PrintToString(pb, &s);
// std::cout << s << std::endl;
class ManifestPrinter : public google::protobuf::TextFormat::Printer {
public:
// Registers the custom printer for ContentId protos.
ManifestPrinter();
virtual ~ManifestPrinter() = default;
};
} // namespace cdc_ft
#endif // MANIFEST_MANIFEST_PRINTER_H_

View File

@@ -0,0 +1,52 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_PROTO_DEFS_H_
#define MANIFEST_MANIFEST_PROTO_DEFS_H_
#include "proto/manifest.pb.h"
namespace cdc_ft {
// Convenience type aliases to make the protos more easily accessible.
//
// Aliases for the message types declared in proto/manifest.pb.h.
using AssetListProto = proto::AssetList;
using AssetProto = proto::Asset;
using CdcParamsProto = proto::CdcParameters;
using ChunkListProto = proto::ChunkList;
using ChunkRefProto = proto::ChunkRef;
using ContentIdProto = proto::ContentId;
using IndirectChunkListProto = proto::IndirectChunkList;
using ManifestProto = proto::Manifest;
// Aliases for the repeated-field containers of the types above.
using RepeatedAssetProto = google::protobuf::RepeatedPtrField<AssetProto>;
using RepeatedChunkRefProto = google::protobuf::RepeatedPtrField<ChunkRefProto>;
using RepeatedContentIdProto =
google::protobuf::RepeatedPtrField<ContentIdProto>;
using RepeatedIndirectChunkListProto =
google::protobuf::RepeatedPtrField<IndirectChunkListProto>;
using RepeatedStringProto = google::protobuf::RepeatedPtrField<std::string>;
namespace proto {

// Two Asset protos compare unequal iff their serialized forms differ.
inline bool operator!=(const Asset& a, const Asset& b) {
  return a.SerializeAsString() != b.SerializeAsString();
}

// Two Asset protos compare equal iff their serialized forms are identical.
inline bool operator==(const Asset& a, const Asset& b) { return !(a != b); }

}  // namespace proto
} // namespace cdc_ft
#endif // MANIFEST_MANIFEST_PROTO_DEFS_H_

View File

@@ -0,0 +1,239 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_test_base.h"
#include "common/path.h"
#include "common/status_test_macros.h"
#include "fastcdc/fastcdc.h"
#include "manifest/manifest_iterator.h"
#include "manifest/manifest_printer.h"
namespace cdc_ft {
namespace {
// Helper function that tries to parse data as any of the protos written to the
// store and returns its text proto representation.
//
// In order to disambiguate the proto auto-detection logic, you can temporarily
// assign globally unique field numbers to all fields in manifest.proto.
std::string ToTextProto(const ContentIdProto& content_id, const void* data,
size_t size) {
std::string text_proto;
std::string proto_name = "(unknown proto format)";
ManifestProto manifest_pb;
AssetListProto asset_list_pb;
ChunkListProto chunk_list_pb;
int isize = static_cast<int>(size);
ManifestPrinter printer;
if (size > 0) {
if (manifest_pb.ParseFromArray(data, isize) &&
!manifest_pb.GetReflection()
->GetUnknownFields(manifest_pb)
.field_count()) {
printer.PrintToString(manifest_pb, &text_proto);
proto_name = manifest_pb.GetTypeName();
} else if (asset_list_pb.ParseFromArray(data, isize) &&
!asset_list_pb.GetReflection()
->GetUnknownFields(asset_list_pb)
.field_count()) {
printer.PrintToString(asset_list_pb, &text_proto);
proto_name = asset_list_pb.GetTypeName();
} else if (chunk_list_pb.ParseFromArray(data, isize) &&
!chunk_list_pb.GetReflection()
->GetUnknownFields(chunk_list_pb)
.field_count()) {
printer.PrintToString(chunk_list_pb, &text_proto);
proto_name = chunk_list_pb.GetTypeName();
}
}
return absl::StrFormat("# %s => %s (size: %d)\n%s",
ContentId::ToHexString(content_id), proto_name, isize,
text_proto);
}
} // namespace
// Streams a human-readable representation of |ai| to |os|.
std::ostream& operator<<(std::ostream& os,
                         const ManifestTestBase::AssetInfoForTest& ai) {
  const char* const in_progress = ai.in_progress ? "true" : "false";
  os << "{.path = \"" << ai.info.path;
  os << "\", .type = " << AssetProto::Type_Name(ai.info.type);
  os << ", .mtime = " << ai.info.mtime;
  os << ", .size = " << ai.info.size;
  os << ", .in_progress = " << in_progress << "}";
  return os;
}
// Stores |base_dir| for use by the test fixture. The by-value parameter is
// moved into the member to avoid a second string copy.
ManifestTestBase::ManifestTestBase(std::string base_dir)
    : ::testing::Test(), base_dir_(std::move(base_dir)) {}
// Reads the manifest ID stored in |data_store_|, expects it to match
// |actual_manifest_id|, and returns an AssetInfoForTest for every asset the
// manifest iterator yields for that manifest.
std::vector<ManifestTestBase::AssetInfoForTest>
ManifestTestBase::GetAllManifestAssets(ContentIdProto actual_manifest_id) {
  ContentIdProto stored_id;
  EXPECT_OK(data_store_.GetProto(manifest_store_id_, &stored_id));
  EXPECT_EQ(stored_id, actual_manifest_id);

  ManifestIterator iter(&data_store_);
  EXPECT_OK(iter.Open(stored_id));

  std::vector<AssetInfoForTest> result;
  for (const AssetProto* asset = iter.NextEntry(); asset != nullptr;
       asset = iter.NextEntry()) {
    AssetInfoForTest ai;
    ai.info.path = path::JoinUnix(iter.RelativePath(), asset->name());
    ai.info.type = asset->type();
    ai.info.mtime = asset->mtime_seconds();
    ai.info.size = asset->file_size();
    ai.in_progress = asset->in_progress();
    result.push_back(std::move(ai));
  }

  // A nullptr entry may also signal an error, so verify the final status.
  EXPECT_OK(iter.Status());
  return result;
}
// Builds an AssetInfoForTest from the real file or directory at |rel_path|,
// which is relative to |cfg_.src_dir|. Directories are reported with size 0.
//
// Failures to query the file are reported as non-fatal test failures, so the
// queried values are zero-initialized to keep the result well-defined in that
// case.
ManifestTestBase::AssetInfoForTest ManifestTestBase::MakeAssetInfo(
    const std::string& rel_path) {
  std::string full_path = path::Join(cfg_.src_dir, rel_path);

  // Value-initialize so that a failing GetStats() does not leave the fields
  // read below indeterminate.
  path::Stats stats{};
  EXPECT_OK(path::GetStats(full_path, &stats));

  // Don't use the stats.modified_time as this returns timestamps in the
  // machine's local time, whereas GetFileTime() returns UTC time.
  time_t mtime = 0;
  EXPECT_OK(path::GetFileTime(full_path, &mtime));

  AssetInfoForTest ai;
  ai.info.path = rel_path;
  ai.info.type =
      stats.mode & path::MODE_IFDIR ? AssetProto::DIRECTORY : AssetProto::FILE;
  ai.info.mtime = static_cast<int64_t>(mtime);
  ai.info.size = ai.info.type == AssetProto::DIRECTORY ? 0 : stats.size;
  return ai;
}
std::vector<ManifestTestBase::AssetInfoForTest>
ManifestTestBase::MakeAssetInfos(std::initializer_list<std::string> rel_paths) {
std::vector<AssetInfoForTest> ais;
for (const std::string& rel_path : rel_paths) {
ais.push_back(MakeAssetInfo(rel_path));
}
return ais;
}
// Replaces the contents of |ops_| with one |op| operation per path in
// |rel_paths| and returns a pointer to |ops_|. The returned list is only valid
// until the next call that modifies |ops_|.
ManifestUpdater::OperationList* ManifestTestBase::MakeOps(
    Operator op, std::initializer_list<std::string> rel_paths) {
  ops_.clear();
  ops_.reserve(rel_paths.size());
  for (auto it = rel_paths.begin(); it != rel_paths.end(); ++it) {
    AssetInfoForTest ai = MakeAssetInfo(*it);
    ops_.emplace_back(op, std::move(ai.info));
  }
  return &ops_;
}
// Shorthand for MakeOps() with the kDelete operator.
ManifestUpdater::OperationList* ManifestTestBase::MakeDeleteOps(
    std::initializer_list<std::string> rel_paths) {
  const Operator op = Operator::kDelete;
  return MakeOps(op, rel_paths);
}
// Shorthand for MakeOps() with the kUpdate operator.
ManifestUpdater::OperationList* ManifestTestBase::MakeUpdateOps(
    std::initializer_list<std::string> rel_paths) {
  const Operator op = Operator::kUpdate;
  return MakeOps(op, rel_paths);
}
// Sorts both lists and then expects them to be equal (if |equal| is true) or
// different (if |equal| is false). The lists are taken by value, so the
// callers' vectors are left untouched.
void ManifestTestBase::ExpectAssetInfosEqual(std::vector<AssetInfoForTest> a,
                                             std::vector<AssetInfoForTest> b,
                                             bool equal) {
  std::sort(a.begin(), a.end());
  std::sort(b.begin(), b.end());
  if (!equal) {
    EXPECT_NE(a, b);
    return;
  }
  EXPECT_EQ(a, b);
}
// Expects that the assets of the stored manifest match the real files at
// |rel_paths|, independently of order.
void ManifestTestBase::ExpectManifestEquals(
    std::initializer_list<std::string> rel_paths,
    const ContentIdProto& actual_manifest_id) {
  std::vector<AssetInfoForTest> actual =
      GetAllManifestAssets(actual_manifest_id);
  std::vector<AssetInfoForTest> expected = MakeAssetInfos(rel_paths);
  ExpectAssetInfosEqual(std::move(actual), std::move(expected),
                        /*equal=*/true);
}
bool ManifestTestBase::InProgress(const ContentIdProto& manifest_id,
const char* path) {
// Special case: the root directory is not returned by the manifest iterator.
if (absl::string_view(path) == "") {
ManifestProto manifest;
EXPECT_OK(data_store_.GetProto(manifest_id, &manifest));
return manifest.root_dir().in_progress();
}
ManifestIterator manifest_iter(&data_store_);
EXPECT_OK(manifest_iter.Open(manifest_id));
if (!manifest_iter.Status().ok()) return false;
const AssetProto* entry;
while ((entry = manifest_iter.NextEntry()) != nullptr) {
if (path == path::JoinUnix(manifest_iter.RelativePath(), entry->name()))
return entry->in_progress();
}
EXPECT_TRUE(false) << "'" << path << "' not found in manifest";
return false;
}
void ManifestTestBase::ValidateChunkLookup(const std::string& rel_path,
bool expect_contained) {
uint64_t offset = 0;
auto handler = [&offset, &rel_path, file_chunks = &file_chunks_,
expect_contained](const void* data, size_t size) {
ContentIdProto id = ContentId::FromArray(data, size);
std::string lookup_path;
uint64_t lookup_offset = 0;
uint32_t lookup_size = 0;
EXPECT_EQ(
file_chunks->Lookup(id, &lookup_path, &lookup_offset, &lookup_size),
expect_contained);
if (expect_contained) {
EXPECT_EQ(lookup_path, rel_path);
EXPECT_EQ(lookup_offset, offset);
EXPECT_EQ(lookup_size, size);
}
offset += size;
};
fastcdc::Config cdc_cfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
cfg_.max_chunk_size);
fastcdc::Chunker chunker(cdc_cfg, handler);
Buffer b;
EXPECT_OK(path::ReadFile(path::Join(cfg_.src_dir, rel_path), &b));
chunker.Process(reinterpret_cast<uint8_t*>(b.data()), b.size());
chunker.Finalize();
}
// Concatenates the text proto dumps of all chunks held by |data_store_|.
std::string ManifestTestBase::DumpDataStoreProtos() const {
  std::string dump;
  for (const auto& [id, bytes] : data_store_.Chunks()) {
    dump.append(ToTextProto(id, bytes.data(), bytes.size()));
  }
  return dump;
}
} // namespace cdc_ft

View File

@@ -0,0 +1,155 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_TEST_BASE_H_
#define MANIFEST_MANIFEST_TEST_BASE_H_
#include <initializer_list>
#include "data_store/mem_data_store.h"
#include "gtest/gtest.h"
#include "manifest/file_chunk_map.h"
#include "manifest/manifest_updater.h"
namespace cdc_ft {
// Test helper class to compare expected and actual manifests.
class ManifestTestBase : public ::testing::Test {
public:
// An AssetInfo together with the asset's in_progress flag.
struct AssetInfoForTest {
AssetInfo info;
bool in_progress = false;
bool operator==(const AssetInfoForTest& other) const {
return info == other.info && in_progress == other.in_progress;
}
bool operator!=(const AssetInfoForTest& other) const {
return !(*this == other);
}
// Compares by file path.
bool operator<(const AssetInfoForTest& other) const {
return info.path < other.info.path;
}
};
// Creates the fixture and stores |base_dir| in |base_dir_|.
explicit ManifestTestBase(std::string base_dir);
~ManifestTestBase() = default;
protected:
using Operation = ManifestUpdater::Operation;
using Operator = ManifestUpdater::Operator;
// Returns the list of assets in the manifest stored in |data_store_|.
// Expects the stored manifest ID to be equal to |actual_manifest_id|.
std::vector<AssetInfoForTest> GetAllManifestAssets(
ContentIdProto actual_manifest_id);
// Creates AssetInfo from the real files at |rel_path|.
// The path is relative to |cfg_.src_dir|.
AssetInfoForTest MakeAssetInfo(const std::string& rel_path);
// Creates AssetInfos from the real files at |rel_paths|.
// The paths are relative to |cfg_.src_dir|.
std::vector<AssetInfoForTest> MakeAssetInfos(
std::initializer_list<std::string> rel_paths);
// Creates |op| operations for the given list of file paths.
// The paths are relative to |cfg_.src_dir|.
// The returned pointer refers to |ops_|, which is overwritten by each call.
ManifestUpdater::OperationList* MakeOps(
Operator op, std::initializer_list<std::string> rel_paths);
// Creates kDelete operations for the given list of file paths.
// The paths are relative to |cfg_.src_dir|.
ManifestUpdater::OperationList* MakeDeleteOps(
std::initializer_list<std::string> rel_paths);
// Creates kUpdate operations from the real files at |rel_paths|.
// The paths are relative to |cfg_.src_dir|.
ManifestUpdater::OperationList* MakeUpdateOps(
std::initializer_list<std::string> rel_paths);
// Expects that |a| and |b| are (not) equal, independently of order.
void ExpectAssetInfosEqual(std::vector<AssetInfoForTest> a,
std::vector<AssetInfoForTest> b,
bool equal = true);
// Compares the contents of the manifest to the real files at |rel_paths|.
// The paths are relative to |cfg_.src_dir|.
void ExpectManifestEquals(std::initializer_list<std::string> rel_paths,
const ContentIdProto& actual_manifest_id);
// Returns the in_progress flag of the asset at Unix |path| in the manifest
// referenced by |manifest_id|. An empty |path| refers to the root directory.
// Expects the asset to be present.
bool InProgress(const ContentIdProto& manifest_id, const char* path);
// Validates that all file chunks in the file at |rel_path| are present in
// |file_chunks_| if |expect_contained| is true. Otherwise, validates that
// none of the chunks are present.
void ValidateChunkLookup(const std::string& rel_path, bool expect_contained);
// Tries to parse all stored data chunks as manifest protos and formats them
// as text protos. In order to disambiguate the proto auto-detection logic,
// you can temporarily assign globally unique field numbers to all fields in
// manifest.proto.
//
// Sample output:
//
// # aa8bef577a9af66e9330140c394e5fce557bd677 =>
// cdc_ft.proto.Manifest (size: 48)
// root_dir {
// type: DIRECTORY
// mtime_seconds: 1663935163
// permissions: 493
// dir_indirect_assets {
// blake3_sum_160: "27b0cd2923714d143f32ec5394a02421fc89f5bc"
// }
// }
// cdc_params {
// min_chunk_size: 8
// avg_chunk_size: 16
// max_chunk_size: 32
// }
// # 27b0cd2923714d143f32ec5394a02421fc89f5bc =>
// cdc_ft.proto.AssetList (size: 52)
// assets {
// name: "a.txt"
// type: FILE
// mtime_seconds: 1653999616
// permissions: 420
// file_size: 8
// file_chunks {
// chunk_id {
// blake3_sum_160: "b1e57baceafdc3b03ab5189cb245757799874fbf"
// }
// }
// in_progress: true
// }
std::string DumpDataStoreProtos() const;
// Directory passed to the constructor.
std::string base_dir_;
// In-memory store holding the manifest chunks under test.
MemDataStore data_store_;
// Updater configuration; |cfg_.src_dir| is the root for all relative paths.
UpdaterConfig cfg_;
FileChunkMap file_chunks_{/*enable_stats=*/false};
// Backing storage for the lists returned by MakeOps() and its wrappers.
ManifestUpdater::OperationList ops_;
// ID under which the current manifest ID is stored in |data_store_|.
ContentIdProto manifest_store_id_ = ManifestUpdater::GetManifestStoreId();
};
} // namespace cdc_ft
#endif // MANIFEST_MANIFEST_TEST_BASE_H_

View File

@@ -0,0 +1,816 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_updater.h"
#include <future>
#include <thread>
#include "absl/strings/match.h"
#include "absl/strings/string_view.h"
#include "common/log.h"
#include "common/path.h"
#include "common/stopwatch.h"
#include "common/threadpool.h"
#include "common/util.h"
#include "data_store/data_store_writer.h"
#include "fastcdc/fastcdc.h"
#include "manifest/asset_builder.h"
#include "manifest/file_chunk_map.h"
#include "manifest/manifest_builder.h"
#include "manifest/manifest_iterator.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
namespace {
// Returns AssetInfos for all files and dirs in |src_dir| + |rel_path|. Does not
// recurse into sub-directories.
//
// The paths of the returned assets are Unix paths relative to |src_dir|. The
// results are appended to |src_assets|, which is sorted before returning, even
// if the search fails. Returns the status of the directory search.
absl::Status GetAllSrcAssets(const std::string& src_dir,
                             const std::string& rel_path,
                             std::vector<AssetInfo>* src_assets) {
  std::string full_src_dir = path::Join(src_dir, rel_path);
  path::EnsureEndsWithPathSeparator(&full_src_dir);

  // Invoked for each directory entry found by the search.
  auto handler = [src_assets, rel_path = path::ToUnix(rel_path)](
                     const std::string& /*dir*/, const std::string& filename,
                     int64_t mtime, uint64_t size, bool is_dir) {
    AssetInfo ai;
    ai.path = path::JoinUnix(rel_path, filename);
    ai.type = is_dir ? AssetProto::DIRECTORY : AssetProto::FILE;
    ai.mtime = mtime;
    // Directories are always recorded with size 0.
    ai.size = is_dir ? 0 : size;
    src_assets->push_back(std::move(ai));
    return absl::OkStatus();
  };

#if PLATFORM_WINDOWS
  // Windows expects a globbing pattern to search a path.
  std::string src_pattern = path::Join(full_src_dir, "*");
#else
  // Search the requested sub-directory. Using |src_dir| here would list the
  // source root no matter which |rel_path| was given.
  std::string src_pattern = full_src_dir;
#endif

  absl::Status status =
      path::SearchFiles(src_pattern, /*recursive=*/false, handler);
  std::sort(src_assets->begin(), src_assets->end());
  return status;
}
// Converts the chunk size settings of |cfg_pb| into a fastcdc::Config.
fastcdc::Config CdcConfigFromProto(const CdcParamsProto& cfg_pb) {
  const auto min_size = cfg_pb.min_chunk_size();
  const auto avg_size = cfg_pb.avg_chunk_size();
  const auto max_size = cfg_pb.max_chunk_size();
  return fastcdc::Config(min_size, avg_size, max_size);
}
// Returns true if |params| describes usable FastCDC chunk sizes, i.e. if
// min <= avg <= max holds and the maximum chunk size is positive.
bool ValidateCdcParams(const CdcParamsProto& params) {
  if (params.min_chunk_size() > params.avg_chunk_size()) return false;
  if (params.avg_chunk_size() > params.max_chunk_size()) return false;
  return params.max_chunk_size() > 0;
}
// Upper limit for the number of tasks that should be queued in |pool| at any
// time: twice the number of its threads.
size_t MaxQueuedTasks(const Threadpool& pool) {
  return pool.NumThreads() * 2;
}
} // namespace
void AssetInfo::AppendCopyChunks(const RepeatedChunkRefProto& list,
uint64_t list_offset) {
chunks.reserve(chunks.size() + list.size());
for (const ChunkRefProto& ch : list)
chunks.emplace_back(ch.chunk_id(), ch.offset() + list_offset);
}
// Appends every chunk in |list| to |chunks|, moving the chunk IDs out of
// |list| instead of copying them. Each chunk's offset is shifted by
// |list_offset|.
void AssetInfo::AppendMoveChunks(RepeatedChunkRefProto* list,
                                 uint64_t list_offset) {
  chunks.reserve(chunks.size() + list->size());
  for (int i = 0; i < list->size(); ++i) {
    ChunkRefProto* ref = list->Mutable(i);
    chunks.emplace_back(std::move(*ref->mutable_chunk_id()),
                        ref->offset() + list_offset);
  }
}
// Base class for tasks that fill in manifest data. It holds the location of
// the file or directory to process as well as the task's final status.
class ManifestTask : public Task {
 public:
  ManifestTask(std::string src_dir, std::string relative_unix_path,
               std::string filename)
      : src_dir_(std::move(src_dir)),
        rel_unix_path_(std::move(relative_unix_path)),
        filename_(std::move(filename)) {}

  // Unix path of the directory that contains this task's file or directory,
  // relative to the source directory.
  const std::string& RelativeUnixPath() const { return rel_unix_path_; }

  // Unix path of this task's file or directory, relative to the source
  // directory.
  std::string RelativeUnixFilePath() const {
    return path::JoinUnix(rel_unix_path_, filename_);
  }

  // Name of this task's file or directory.
  const std::string& Filename() const { return filename_; }

  // Full native path of this task's file or directory.
  std::string FilePath() const {
    const std::string native_rel_dir = path::ToNative(rel_unix_path_);
    return path::Join(src_dir_, native_rel_dir, filename_);
  }

  // Final status of the task.
  // Should not be accessed before the task is finished.
  const absl::Status& Status() const { return status_; }

 protected:
  const std::string src_dir_;
  const std::string rel_unix_path_;
  const std::string filename_;
  absl::Status status_;
};
// ThreadPool task that runs the CDC chunker on a given file.
//
// After the task has finished, the chunk list, the number of processed bytes,
// the executable flag and the final status can be queried.
class FileChunkerTask : public ManifestTask {
public:
// |cfg| holds the chunking parameters; it is stored as a pointer and not
// copied. |buffer| is the scratch buffer used for reading the file; it can be
// handed back to the caller via ReleaseBuffer().
FileChunkerTask(std::string src_dir, std::string relative_path,
std::string filename, const fastcdc::Config* cfg,
Buffer buffer)
: ManifestTask(std::move(src_dir), std::move(relative_path),
std::move(filename)),
cfg_(cfg),
buffer_(std::move(buffer)) {
assert(cfg_->max_size > 0);
}
// Returns the number of bytes processed. Should match file size unless some
// error occurred.
// Should not be accessed before the task is finished.
uint64_t ProcessedBytes() const { return processed_bytes_; }
// True if the file looks like a Linux executable based on elf/shebang magic
// headers.
// Should not be accessed before the task is finished.
bool IsExecutable() const { return is_executable_; }
// Returns the chunk hashes and offsets.
// Should not be accessed before the task is finished.
google::protobuf::RepeatedPtrField<ChunkRefProto>* Chunks() {
return &chunks_;
}
// Releases the allocated buffer and returns it to the caller.
Buffer&& ReleaseBuffer() { return std::move(buffer_); }
// Task:
// Reads the file, feeds its contents to the chunker and records the ID and
// offset of each chunk. The outcome is stored in status_.
void ThreadRun(IsCancelledPredicate is_cancelled) override {
// TODO: Retry with backoff if this fails in practice, e.g. if the file is
// changed repeatedly.
std::string file_path = FilePath();
absl::StatusOr<FILE*> file = path::OpenFile(file_path, "rb");
if (!file.ok()) {
status_ =
WrapStatus(file.status(), "Failed to open file '%s'", file_path);
return;
}
path::FileCloser closer(*file);
// Called by the chunker for each chunk: records the chunk's content ID
// together with its offset in the file and advances the running offset.
auto chunk_handler = [chunks = &chunks_, offset = &processed_bytes_](
const void* data, size_t size) {
ChunkRefProto* chunk = chunks->Add();
*chunk->mutable_chunk_id() = ContentId::FromArray(data, size);
chunk->set_offset(*offset);
*offset += size;
};
fastcdc::Chunker chunker(*cfg_, chunk_handler);
bool first_chunk = true;
// Called for each block of data read from the file: forwards the data to the
// chunker and returns a cancelled error if the task was cancelled.
auto stream_handler = [&chunker, &is_cancelled, &first_chunk,
is_executable = &is_executable_,
&file_path](const void* data, size_t size) {
chunker.Process(static_cast<const uint8_t*>(data), size);
if (first_chunk) {
// Only the first block of data is inspected for executable headers.
first_chunk = false;
*is_executable = Util::IsExecutable(data, size);
}
return is_cancelled() ? absl::CancelledError(absl::StrFormat(
"chunking file '%s' cancelled", file_path))
: absl::OkStatus();
};
status_ = path::StreamReadFileContents(*file, &buffer_, stream_handler);
// Finalize() is called regardless of the read status.
chunker.Finalize();
}
private:
// Chunking parameters. Not owned.
const fastcdc::Config* const cfg_;
// Chunks found so far, in file order.
google::protobuf::RepeatedPtrField<ChunkRefProto> chunks_;
// Total size of all chunks recorded so far.
uint64_t processed_bytes_ = 0;
bool is_executable_ = false;
// Scratch buffer used for reading the file.
Buffer buffer_;
};
// ThreadPool task that creates assets for the contents of a directory.
class DirScannerTask : public ManifestTask {
public:
DirScannerTask(std::string src_dir, std::string relative_path,
std::string filename, AssetBuilder dir,
DataStoreReader* data_store)
: ManifestTask(std::move(src_dir), std::move(relative_path),
std::move(filename)),
dir_(dir),
data_store_(data_store) {}
// Task:
void ThreadRun(IsCancelledPredicate is_cancelled) override {
std::vector<AssetInfo> src_assets, manifest_assets;
// Collect all files from the given directory.
status_ = GetAllSrcAssets(src_dir_, path::ToNative(RelativeUnixFilePath()),
&src_assets);
if (!status_.ok()) return;
// Collect all assets from the manifest.
status_ = GetAllAssetsFromDirAsset(&manifest_assets, is_cancelled);
if (!status_.ok()) return;
CompareAssets(src_assets, manifest_assets);
if (is_cancelled()) status_ = absl::CancelledError();
}
// Returns the IDs of indirect lists that were fetched when executing this
// task.
std::vector<ContentIdProto>* ManifestContentIds() {
return &manifest_content_ids_;
}
// Returns the AssetBuilder representing the directory this task is scanning.
AssetBuilder* Dir() { return &dir_; }
// Returns the list of assets that need to be added or updated in the
// directory that this task was scanning.
ManifestUpdater::OperationList* Operations() { return &operations_; }
private:
using Operator = ManifestUpdater::Operator;
// Stores AssetInfo structs for all assets found in |assets| in the
// target param |asset_infos|.
void GetAssetInfosFromList(const std::string& rel_path,
const RepeatedAssetProto& assets,
std::vector<AssetInfo>* asset_infos) {
asset_infos->reserve(asset_infos->size() + assets.size());
for (const AssetProto& asset : assets) {
AssetInfo ai;
ai.path = path::JoinUnix(rel_path, asset.name());
ai.type = asset.type();
ai.mtime = asset.mtime_seconds();
ai.size = asset.type() == AssetProto::DIRECTORY ? 0 : asset.file_size();
if (asset.type() == AssetProto::FILE) {
// Copy chunks from the direct chunk list.
ai.AppendCopyChunks(asset.file_chunks(), 0);
// Append all chunk IDs from indirect chunk lists.
for (const IndirectChunkListProto& icl : asset.file_indirect_chunks()) {
ChunkListProto chunk_list;
absl::Status status =
data_store_->GetProto(icl.chunk_list_id(), &chunk_list);
if (!status.ok()) {
// Pretend the file is empty.
ai.chunks.clear();
// Log a warning and continue so that the file is re-added and
// corrected.
LOG_WARNING(
"Can't read indirect chunk list for file '%s': %s. The "
"affected asset will be updated from disk.",
ai.path, status.ToString());
break;
}
ai.AppendMoveChunks(chunk_list.mutable_chunks(), icl.offset());
// Collect the content IDs of all indirect chunk lists.
manifest_content_ids_.push_back(icl.chunk_list_id());
}
}
asset_infos->emplace_back(std::move(ai));
}
}
// Collects all assets from the manifest directory at RelativeUnixFilePath()
// and adds corresponding AssetInfo structs to |asset_infos|.
absl::Status GetAllAssetsFromDirAsset(std::vector<AssetInfo>* asset_infos,
IsCancelledPredicate is_cancelled) {
// Collect all direct assets from the manifest.
std::string rel_path = dir_.RelativeFilePath();
GetAssetInfosFromList(rel_path, dir_.Proto()->dir_assets(), asset_infos);
// Load all indirect asset lists, if there are any.
if (dir_.Proto()->dir_indirect_assets_size() > 0) {
auto it = dir_.Proto()->mutable_dir_indirect_assets()->begin();
while (it != dir_.Proto()->mutable_dir_indirect_assets()->end()) {
if (is_cancelled()) return absl::CancelledError();
AssetListProto list;
absl::Status status = data_store_->GetProto(*it, &list);
if (status.ok()) {
GetAssetInfosFromList(rel_path, list.assets(), asset_infos);
// Collect the content IDs of all indirect asset lists.
manifest_content_ids_.push_back(*it);
++it;
} else {
// In case of an error, log a warning and continue.
LOG_WARNING(
"Can't read indirect asset list for directory '%s': %s. The "
"affected assets will be updated from disk.",
rel_path, status.ToString());
it = dir_.Proto()->mutable_dir_indirect_assets()->erase(it);
}
}
}
std::sort(asset_infos->begin(), asset_infos->end());
return is_cancelled() ? absl::CancelledError() : absl::OkStatus();
}
// Both |srcs_assets| and |manifest_assets| must be sorted.
void CompareAssets(const std::vector<AssetInfo>& src_assets,
const std::vector<AssetInfo>& manifest_assets) {
// Compare the arrays, sorting the assets into the right buckets.
auto src_iter = src_assets.begin();
auto manifest_iter = manifest_assets.begin();
while (src_iter != src_assets.end() ||
manifest_iter != manifest_assets.end()) {
const int order = src_iter == src_assets.end()
? 1 // Extraneous manifest asset.
: manifest_iter == manifest_assets.end()
? -1 // Missing/outdated manifest asset.
: src_iter->path.compare(manifest_iter->path);
if (order < 0) {
// Missing manifest file -> add to manifest.
operations_.emplace_back(Operator::kAdd, std::move(*src_iter));
++src_iter;
} else if (order > 0) {
// Extraneous manifest asset -> delete.
operations_.emplace_back(Operator::kDelete, std::move(*manifest_iter));
++manifest_iter;
} else if (src_iter->mtime == manifest_iter->mtime &&
src_iter->type == manifest_iter->type &&
// For files, compare the size.
(src_iter->type != AssetProto::FILE ||
src_iter->size == manifest_iter->size) &&
// Directories always need to be updated recursively.
src_iter->type != AssetProto::DIRECTORY) {
// Assets match, keep content IDs from the manifest asset for populating
// the FileChunkMap.
operations_.emplace_back(Operator::kKeep, std::move(*manifest_iter));
++src_iter;
++manifest_iter;
} else {
// Source asset changed -> update manifest asset.
operations_.emplace_back(Operator::kUpdate, std::move(*src_iter));
++src_iter;
++manifest_iter;
}
}
}
DataStoreReader* data_store_;
AssetBuilder dir_;
std::vector<ContentIdProto> manifest_content_ids_;
ManifestUpdater::OperationList operations_;
};
// static
// Returns the fixed, all-zero content ID under which the ID of the current
// manifest is stored in the data store.
ContentIdProto ManifestUpdater::GetManifestStoreId() {
  static constexpr char kStoreIdHex[] =
      "0000000000000000000000000000000000000000";
  ContentIdProto store_id;
  ContentId::FromHexString(kStoreIdHex, &store_id);
  return store_id;
}
// static
// Checks that |dir| (ignoring a trailing path separator) is an absolute path
// to an existing directory. Returns FailedPrecondition if the path is not
// absolute or not a directory, and NotFound if it does not exist.
absl::Status ManifestUpdater::IsValidDir(std::string dir) {
  path::EnsureDoesNotEndWithPathSeparator(&dir);

  absl::Status result = absl::OkStatus();
  if (!path::IsAbsolute(dir)) {
    result = absl::FailedPreconditionError(
        absl::StrFormat("Directory '%s' must be an absolute path.", dir));
  } else if (!path::Exists(dir)) {
    result = absl::NotFoundError(
        absl::StrFormat("Failed to find directory '%s'.", dir));
  } else if (!path::DirExists(dir)) {
    result = absl::FailedPreconditionError(
        absl::StrFormat("Path '%s' should be a directory.", dir));
  }
  return result;
}
// Stores the |data_store| pointer and moves |cfg| into |cfg_|. The configured
// source directory is passed through path::EnsureEndsWithPathSeparator(), so
// |cfg_.src_dir| may differ from the string the caller provided.
ManifestUpdater::ManifestUpdater(DataStoreWriter* data_store, UpdaterConfig cfg)
: data_store_(data_store), cfg_(std::move(cfg)) {
path::EnsureEndsWithPathSeparator(&cfg_.src_dir);
}
// Defaulted destructor, defined out of line in this translation unit.
ManifestUpdater::~ManifestUpdater() = default;
// Rebuilds the manifest for the whole source directory by running a recursive
// Update() that starts at the root directory. If that update reports
// absl::UnavailableError, the data store is wiped, |file_chunks| is cleared
// and the manifest is built once more from scratch.
absl::Status ManifestUpdater::UpdateAll(
    FileChunkMap* file_chunks,
    PushIntermediateManifest push_intermediate_manifest) {
  RETURN_IF_ERROR(ManifestUpdater::IsValidDir(cfg_.src_dir));

  // Don't use the Windows localized time from path::GetStats.
  time_t root_mtime;
  RETURN_IF_ERROR(path::GetFileTime(cfg_.src_dir, &root_mtime));

  // A single "add" operation for the root directory starts the recursive
  // search.
  AssetInfo root;
  root.type = AssetProto::DIRECTORY;
  root.mtime = root_mtime;
  std::vector<Operation> root_ops;
  root_ops.emplace_back(Operator::kAdd, std::move(root));

  absl::Status first_attempt =
      Update(&root_ops, file_chunks, push_intermediate_manifest,
             /*recursive=*/true);
  // Success and all errors other than "unavailable" are final.
  if (!absl::IsUnavailable(first_attempt)) return first_attempt;

  // In case we receive an absl::UnavailableError, it means that not all
  // manifest chunks could be located. In that case, we wipe all data and
  // rebuild the manifest from scratch.
  LOG_WARNING("Failed to load manifest, building from scratch: %s",
              first_attempt.ToString());
  RETURN_IF_ERROR(data_store_->Wipe());
  file_chunks->Clear();
  RETURN_IF_ERROR(Update(&root_ops, file_chunks, push_intermediate_manifest,
                         /*recursive=*/true),
                  "Failed to build manifest from scratch");
  return absl::OkStatus();
}
// Creates an empty manifest, stores its ID in the data store and returns it.
// The CDC parameters of the manifest that is currently stored are reused if
// that manifest can be loaded and its parameters pass ValidateCdcParams();
// otherwise the parameters from |cfg_| are used.
ContentIdProto ManifestUpdater::DefaultManifestId() {
  CdcParamsProto cdc_params;
  cdc_params.set_min_chunk_size(cfg_.min_chunk_size);
  cdc_params.set_avg_chunk_size(cfg_.avg_chunk_size);
  cdc_params.set_max_chunk_size(cfg_.max_chunk_size);
  ManifestBuilder stored_manifest(cdc_params, data_store_);

  // Load the manifest id from the store. It's necessary to extract the CDC
  // parameters used last time.
  ContentIdProto stored_id;
  if (data_store_->GetProto(GetManifestStoreId(), &stored_id).ok()) {
    if (stored_manifest.LoadManifest(stored_id).ok()) {
      if (ValidateCdcParams(stored_manifest.CdcParameters())) {
        cdc_params = stored_manifest.CdcParameters();
      }
    }
  }

  // Create an empty manifest with correct CDC parameters.
  ManifestBuilder empty_manifest(cdc_params, data_store_);
  absl::StatusOr<ContentIdProto> flushed_id = empty_manifest.Flush();
  assert(flushed_id.ok());
  manifest_id_ = *flushed_id;

  // Remember the new manifest ID. A failure is only logged.
  const std::string serialized_id = manifest_id_.SerializeAsString();
  absl::Status put_status = data_store_->Put(
      GetManifestStoreId(), serialized_id.data(), serialized_id.size());
  if (!put_status.ok()) {
    LOG_ERROR("Failed to store default manifest ID in data store: %s",
              put_status.ToString());
  }
  return manifest_id_;
}
// Moves pending assets from |queue_| into |pool| as tasks for as long as the
// pool accepts more tasks, the queue is not empty and a spare buffer is
// available. Files become FileChunkerTasks (each takes one buffer from
// |buffers_|), directories become DirScannerTasks. Assets that cannot be
// turned into a task are logged and dropped from the queue. Returns the number
// of tasks that were actually queued.
size_t ManifestUpdater::QueueTasks(Threadpool* pool,
                                   const fastcdc::Config* cdc_cfg,
                                   ManifestBuilder* manifest_builder) {
  const size_t max_tasks_queued = MaxQueuedTasks(*pool);
  size_t num_tasks_queued = 0;
  while (pool->NumQueuedTasks() < max_tasks_queued && !queue_.empty() &&
         !buffers_.empty()) {
    PendingAsset asset = std::move(queue_.front());
    queue_.pop_front();

    switch (asset.type) {
      case AssetProto::FILE:
        pool->QueueTask(std::make_unique<FileChunkerTask>(
            cfg_.src_dir, std::move(asset.relative_path),
            std::move(asset.filename), cdc_cfg, std::move(buffers_.back())));
        buffers_.pop_back();
        break;

      case AssetProto::DIRECTORY: {
        // Full path of the directory, also used to report errors for the
        // directory itself rather than for its parent.
        const std::string dir_path =
            path::JoinUnix(asset.relative_path, asset.filename);
        absl::StatusOr<AssetBuilder> dir = manifest_builder->GetOrCreateAsset(
            dir_path, AssetProto::DIRECTORY, true);
        if (!dir.ok()) {
          LOG_ERROR(
              "Failed to locate directory '%s' in the manifest, skipping it: "
              "%s",
              dir_path, dir.status().ToString());
          continue;
        }
        pool->QueueTask(std::make_unique<DirScannerTask>(
            cfg_.src_dir, std::move(asset.relative_path),
            std::move(asset.filename), std::move(dir.value()), data_store_));
        break;
      }

      default:
        LOG_ERROR("Unexpected type '%s' for asset '%s'",
                  AssetProto::Type_Name(asset.type),
                  path::JoinUnix(asset.relative_path, asset.filename));
        continue;
    }
    ++num_tasks_queued;
  }
  return num_tasks_queued;
}
// Applies |operations| to the manifest owned by |manifest_builder| and keeps
// |file_chunks| and |stats_| in sync.
//
// Deletions are applied in a first pass, all other operations in a second
// pass. kKeep operations only initialize the entry in |file_chunks| from the
// chunks stored in the operation. kAdd operations append the asset to |parent|
// if one is given; otherwise they are handled like kUpdate, which looks up or
// creates the asset by its path. Files are always marked as in-progress and
// queued for chunking, directories only if |recursive| is true.
//
// Returns the first error encountered while deleting or creating an asset.
absl::Status ManifestUpdater::ApplyOperations(
std::vector<Operation>* operations, FileChunkMap* file_chunks,
ManifestBuilder* manifest_builder, AssetBuilder* parent, bool recursive) {
assert(manifest_builder != nullptr);
if (operations->empty()) return absl::OkStatus();
// First, handle all deletions to make the outcome independent of the order of
// operations (e.g., when the same file is added and deleted again).
// |last_deleted| points into |operations|, which is not modified in this loop.
const std::string* last_deleted = nullptr;
for (const Operation& op : *operations) {
if (op.op != Operator::kDelete) continue;
const AssetInfo& ai = op.info;
// Every delete operation is counted, including those skipped below.
++stats_.total_assets_deleted;
file_chunks->Remove(ai.path);
if (last_deleted && absl::StartsWith(ai.path, *last_deleted) &&
ai.path[last_deleted->size()] == '/') {
// Optimization: |path| is part of a deleted dir, so it can be
// skipped.
continue;
}
RETURN_IF_ERROR(manifest_builder->DeleteAsset(ai.path),
"Failed to delete asset '%s' from manifest", ai.path);
last_deleted = &ai.path;
}
// Second, handle additions and updates.
AssetBuilder asset_builder;
for (Operation& op : *operations) {
AssetInfo& ai = op.info;
// Stays true for assets appended to |parent|; otherwise it is set by
// GetOrCreateAsset() below.
bool created = true;
switch (op.op) {
case Operator::kDelete:
continue;
case Operator::kKeep:
file_chunks->Init(ai.path, ai.size, &ai.chunks);
continue;
case Operator::kAdd:
// If a parent was given, assets are added as direct children of that
// parent directory.
if (parent) {
asset_builder = parent->AppendAsset(path::BaseName(ai.path), ai.type);
break;
}
[[fallthrough]];
case Operator::kUpdate:
ASSIGN_OR_RETURN(asset_builder,
manifest_builder->GetOrCreateAsset(ai.path, ai.type,
true, &created),
"Failed to add '%s' to the manifest", ai.path);
break;
}
if (created) ++stats_.total_assets_added_or_updated;
asset_builder.SetMtimeSeconds(ai.mtime);
if (ai.type == AssetProto::FILE) {
// Assume everything is executable for the intermediate manifest.
// The executable bit is derived from the file data, which is not
// available at this point.
asset_builder.SetPermissions(kExecutablePerms);
asset_builder.TruncateChunks();
asset_builder.SetFileSize(ai.size);
// Queue chunker tasks for files.
asset_builder.SetInProgress(true);
} else if (recursive && ai.type == AssetProto::DIRECTORY) {
// We are recursing into all sub-directories, so we queue up the
// child directory for scanning.
asset_builder.SetInProgress(true);
}
// If the asset is marked as in-progress, we need to queue it up.
if (asset_builder.InProgress()) {
queue_.emplace_back(ai.type, asset_builder.RelativePath(),
asset_builder.Name());
}
}
return absl::OkStatus();
}
// Processes a finished FileChunkerTask: recycles the task's buffer, clears the
// in-progress flag of the file asset and, if chunking succeeded, stores the
// chunks, size and permissions in the asset and in |file_chunks|. If chunking
// failed, the file is recorded as empty and the task's error is returned.
absl::Status ManifestUpdater::HandleFileChunkerResult(
    FileChunkerTask* task, FileChunkMap* file_chunks,
    ManifestBuilder* manifest_builder) {
  // Recycle the buffer first so that it is returned to the pool even if the
  // asset lookup below fails.
  buffers_.emplace_back(task->ReleaseBuffer());
  const std::string asset_path = task->RelativeUnixFilePath();

  AssetBuilder file_asset;
  ASSIGN_OR_RETURN(file_asset, manifest_builder->GetOrCreateAsset(
                                   asset_path, AssetProto::FILE));
  file_asset.SetInProgress(false);

  if (task->Status().ok()) {
    // Update the stats and the asset.
    const uint64_t num_bytes = task->ProcessedBytes();
    ++stats_.total_files_added_or_updated;
    stats_.total_processed_bytes += num_bytes;
    stats_.total_chunks += task->Chunks()->size();

    file_asset.SwapChunks(task->Chunks(), num_bytes);
    file_asset.SetPermissions(task->IsExecutable()
                                  ? kExecutablePerms
                                  : ManifestBuilder::kDefaultFilePerms);

    file_chunks->Init(asset_path, num_bytes);
    file_chunks->AppendCopy(asset_path, file_asset.Proto()->file_chunks(), 0);
    return absl::OkStatus();
  }

  // In case of an error, pretend the file is empty.
  file_asset.SetFileSize(0);
  file_chunks->Init(asset_path, 0);
  ++stats_.total_files_failed;
  return task->Status();
}
// Processes a finished DirScannerTask: applies the operations found by the
// task to the scanned directory, clears the directory's in-progress flag and
// merges the content IDs collected by the task into |manifest_content_ids|.
// Returns the task's own status unless applying the operations fails.
absl::Status ManifestUpdater::HandleDirScannerResult(
    DirScannerTask* task, FileChunkMap* file_chunks,
    ManifestBuilder* manifest_builder,
    std::unordered_set<ContentIdProto>* manifest_content_ids) {
  // Include the error in the stats, but we can still try to process the
  // (partial) results.
  const bool scan_failed = !task->Status().ok();
  if (scan_failed) ++stats_.total_dirs_failed;

  // DirScannerTasks are inherently recursive.
  AssetBuilder* scanned_dir = task->Dir();
  RETURN_IF_ERROR(ApplyOperations(task->Operations(), file_chunks,
                                  manifest_builder, scanned_dir,
                                  /*recursive=*/true));
  scanned_dir->SetInProgress(false);

  // Union all manifest chunk content IDs.
  assert(manifest_content_ids != nullptr);
  for (const ContentIdProto& id : *task->ManifestContentIds()) {
    manifest_content_ids->insert(id);
  }
  return task->Status();
}
// Applies |operations| to the stored manifest (or to a new one if none is
// stored yet), runs all resulting chunker and directory scanner tasks on a
// thread pool, flushes the manifest and stores its ID in the data store.
//
// If |push_intermediate_manifest| is set and there is pending work after the
// operations were applied, a manifest with in-progress assets is flushed and
// passed to the callback before the tasks are executed.
//
// If |recursive| is true and the last handled task succeeded, the data store
// is pruned down to the content IDs that make up the manifest. A pruning
// failure is reported as absl::UnavailableError to signal that the manifest
// needs to be rebuilt from scratch. Failures of individual tasks are only
// logged.
absl::Status ManifestUpdater::Update(
    OperationList* operations, FileChunkMap* file_chunks,
    PushIntermediateManifest push_intermediate_manifest, bool recursive) {
  Stopwatch sw;
  LOG_INFO(
      "Updating manifest for '%s': applying %u changes, "
      "%srecursive",
      cfg_.src_dir, operations->size(), recursive ? "" : "non-");

  stats_ = UpdaterStats();

  CdcParamsProto cdc_params;
  cdc_params.set_min_chunk_size(cfg_.min_chunk_size);
  cdc_params.set_avg_chunk_size(cfg_.avg_chunk_size);
  cdc_params.set_max_chunk_size(cfg_.max_chunk_size);
  ManifestBuilder manifest_builder(cdc_params, data_store_);

  // Load the manifest id from the store.
  ContentIdProto manifest_id;
  absl::Status status =
      data_store_->GetProto(GetManifestStoreId(), &manifest_id);
  if (!status.ok()) {
    if (!absl::IsNotFound(status))
      return WrapStatus(status, "Failed to load manifest id");
    // A non-existing manifest is not an issue, just build it from scratch.
    LOG_INFO("No cached manifest found. Building from scratch.");
  } else {
    RETURN_IF_ERROR(manifest_builder.LoadManifest(manifest_id),
                    "Failed to load manifest with id '%s'",
                    ContentId::ToHexString(manifest_id));
    // The CDC params might have changed when loading the manifest.
    if (ValidateCdcParams(manifest_builder.Manifest()->cdc_params())) {
      cdc_params = manifest_builder.Manifest()->cdc_params();
    }
  }

  RETURN_IF_ERROR(ApplyOperations(operations, file_chunks, &manifest_builder,
                                  nullptr, recursive));

  Threadpool pool(cfg_.num_threads > 0 ? cfg_.num_threads
                                       : std::thread::hardware_concurrency());
  // Pre-allocate one buffer per queueable task with 2 * max_chunk_size.
  const size_t max_queued_tasks = MaxQueuedTasks(pool);
  buffers_.reserve(max_queued_tasks);
  while (buffers_.size() < max_queued_tasks)
    buffers_.emplace_back(cfg_.max_chunk_size << 1);
  size_t num_tasks_queued = 0;

  // Collect the content IDs that make up the manifest when recursing. They are
  // used to prune the manifest cache directory in the end.
  std::unordered_set<ContentIdProto> manifest_content_ids;

  // Push intermediate manifest if there are queued chunker tasks.
  if (push_intermediate_manifest && !queue_.empty()) {
    file_chunks->FlushUpdates();
    ASSIGN_OR_RETURN(manifest_id_, manifest_builder.Flush(),
                     "Failed to flush intermediate manifest");
    // Add all content IDs that were just written back.
    manifest_content_ids.insert(manifest_builder.FlushedContentIds().begin(),
                                manifest_builder.FlushedContentIds().end());
    push_intermediate_manifest(manifest_id_);
  }

  fastcdc::Config cdc_cfg = CdcConfigFromProto(cdc_params);

  // Wait for the chunker tasks and update file assets.
  while (!queue_.empty() || num_tasks_queued > 0) {
    num_tasks_queued += QueueTasks(&pool, &cdc_cfg, &manifest_builder);
    if (num_tasks_queued == 0) {
      // QueueTasks() drops pending assets that it cannot turn into tasks, so
      // it is possible that nothing is in flight at this point. Waiting for a
      // completed task would never return in that case, and decrementing the
      // counter below would underflow.
      break;
    }
    std::unique_ptr<Task> task = pool.GetCompletedTask();
    --num_tasks_queued;

    FileChunkerTask* chunker_task = dynamic_cast<FileChunkerTask*>(task.get());
    if (chunker_task) {
      status =
          HandleFileChunkerResult(chunker_task, file_chunks, &manifest_builder);
      if (!status.ok()) {
        LOG_ERROR("Failed to process file '%s': %s", chunker_task->FilePath(),
                  status.ToString());
      }
      continue;
    }

    DirScannerTask* scanner_task = dynamic_cast<DirScannerTask*>(task.get());
    if (scanner_task) {
      status = HandleDirScannerResult(scanner_task, file_chunks,
                                      &manifest_builder, &manifest_content_ids);
      if (!status.ok()) {
        LOG_ERROR("Failed to process directory '%s': %s",
                  scanner_task->FilePath(), status.ToString());
      }
      continue;
    }
  }

  file_chunks->FlushUpdates();
  ASSIGN_OR_RETURN(manifest_id_, manifest_builder.Flush(),
                   "Failed to flush manifest");

  // Save the manifest id to the store.
  std::string id_str = manifest_id_.SerializeAsString();
  RETURN_IF_ERROR(
      data_store_->Put(GetManifestStoreId(), id_str.data(), id_str.size()),
      "Failed to store manifest id");

  // Remove manifest chunks that are no longer referenced when recursing through
  // all sub-directories. This also makes sure that all referenced manifest
  // chunks are present.
  // Note that |status| holds the result of the most recently handled task (or
  // of loading the manifest id if no task was handled).
  if (status.ok() && recursive) {
    // Retain the chunk that stores the manifest ID.
    manifest_content_ids.insert(ManifestUpdater::GetManifestStoreId());
    // Add all content IDs that were just written back.
    manifest_content_ids.insert(manifest_builder.FlushedContentIds().begin(),
                                manifest_builder.FlushedContentIds().end());
    status = data_store_->Prune(std::move(manifest_content_ids));
    if (!status.ok()) {
      // Signal to the caller that the manifest needs to be rebuilt from
      // scratch.
      return absl::UnavailableError(status.ToString());
    }
  }

  LOG_INFO("Manifest for '%s' successfully updated in %0.3f seconds",
           cfg_.src_dir, sw.ElapsedSeconds());
  return absl::OkStatus();
}
} // namespace cdc_ft

268
manifest/manifest_updater.h Normal file
View File

@@ -0,0 +1,268 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_MANIFEST_UPDATER_H_
#define MANIFEST_MANIFEST_UPDATER_H_
#include <cstddef>
#include <cstdint>
#include <functional>
#include <list>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "absl/status/statusor.h"
#include "common/buffer.h"
#include "manifest/asset_builder.h"
#include "manifest/file_chunk_map.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
namespace fastcdc {
struct Config;
}
class AssetBuilder;
class DataStoreWriter;
class DirScannerTask;
class FileChunkerTask;
class ManifestBuilder;
class Threadpool;
// Configuration of a ManifestUpdater: the source directory and the parameters
// used for chunking files.
struct UpdaterConfig {
// Source directory to build the manifest from, recursively.
std::string src_dir;
// Minimum allowed chunk size (default: 128 << 10).
size_t min_chunk_size = 128 << 10;
// Target average chunk size (default: 256 << 10).
size_t avg_chunk_size = 256 << 10;
// Maximum allowed chunk size (default: 1024 << 10).
size_t max_chunk_size = 1024 << 10;
// Size of the chunker thread pool. Defaults to the number of available CPUs,
// which is selected by the value 0.
uint32_t num_threads = 0;
};
// Counters describing the work done by a manifest update. All counters start
// at zero.
struct UpdaterStats {
// Total no. of assets that were added or updated.
size_t total_assets_added_or_updated = 0;
// Total no. of assets of type FILE that were added or updated.
size_t total_files_added_or_updated = 0;
// Total no. of files where processing failed.
size_t total_files_failed = 0;
// Total no. of directories where processing failed.
size_t total_dirs_failed = 0;
// Total no. of assets that were deleted (not counting subdirectory files).
size_t total_assets_deleted = 0;
// Total no. of chunks created.
size_t total_chunks = 0;
// Total no. of bytes processed from the files added or updated.
size_t total_processed_bytes = 0;
};
// Describes a single asset (file, directory, etc.), either as found in the
// source directory or as recorded in the manifest.
struct AssetInfo {
  // Unix path to the asset relative to the source directory.
  std::string path;

  // Type (file, dir, etc.)
  AssetProto::Type type = AssetProto::FILE;

  // Modification time in seconds since Epoch.
  int64_t mtime = 0;

  // File size (0 for directories).
  uint64_t size = 0;

  // File chunks (empty for directories). This list is ignored when comparing
  // one AssetInfo to another.
  std::vector<FileChunk> chunks;

  // Appends the chunks from |list| to |chunks|.
  void AppendCopyChunks(const RepeatedChunkRefProto& list,
                        uint64_t list_offset);

  // Appends the chunks from |list| to |chunks|, but moves the data out of
  // |list| instead of copying, wherever possible.
  void AppendMoveChunks(RepeatedChunkRefProto* list, uint64_t list_offset);

  // Two infos are equal if path, type, mtime and size match; |chunks| is not
  // taken into account.
  bool operator==(const AssetInfo& other) const {
    if (type != other.type) return false;
    if (mtime != other.mtime) return false;
    if (size != other.size) return false;
    return path == other.path;
  }

  bool operator!=(const AssetInfo& other) const { return !operator==(other); }

  // Compares by file path.
  bool operator<(const AssetInfo& other) const {
    return path.compare(other.path) < 0;
  }
};
// Incrementally updates a manifest
class ManifestUpdater {
 public:
  // Selects the update operation to be performed.
  enum class Operator { kAdd, kUpdate, kDelete, kKeep };

  // Represents an update operation that shall be applied to the owned manifest.
  struct Operation {
    Operation() = default;
    Operation(Operator op, AssetInfo info) : op(op), info(std::move(info)) {}

    // Initialized explicitly so that a default-constructed Operation never
    // carries an indeterminate operator.
    Operator op = Operator::kAdd;
    AssetInfo info;
  };

  using OperationList = std::vector<Operation>;

  // Permissions for executable files.
  static constexpr uint32_t kExecutablePerms = 0755u;

  // Id of the chunk that stores the manifest id.
  static ContentIdProto GetManifestStoreId();

  // Returns an error if |dir| does not exist or it is not a directory.
  static absl::Status IsValidDir(std::string dir);

  // Callback that receives the ID of an intermediate manifest.
  using PushIntermediateManifest =
      std::function<void(const ContentIdProto& manifest_id)>;

  // |data_store| is used to store manifest chunks. File data chunks are not
  // stored explicitly as they can be read from the original files.
  // |cfg| determines the source directory to update the manifest from as well
  // as configuration details about chunking.
  ManifestUpdater(DataStoreWriter* data_store, UpdaterConfig cfg);
  ~ManifestUpdater();

  ManifestUpdater(const ManifestUpdater&) = delete;
  ManifestUpdater& operator=(const ManifestUpdater&) = delete;

  // Reads the full source directory and syncs the manifest to it. Prunes old,
  // unreferenced manifest chunks. Updates and flushes |file_chunks|.
  //
  // If a valid |push_intermediate_manifest| is passed, then a manifest is
  // flushed after the root directory has been added, but before all files and
  // directories have been processed. That means the manifest does not yet
  // contain all assets; all incomplete assets are set to in-progress.
  absl::Status UpdateAll(FileChunkMap* file_chunks,
                         PushIntermediateManifest push_intermediate_manifest =
                             PushIntermediateManifest());

  // Updates the manifest by applying the |operations| list. Deletions are
  // handled first to make the outcome independent of the order in the list.
  // Also updates and flushes |file_chunks| with the changes made. See
  // UpdateAll() for a description of |push_intermediate_manifest|.
  //
  // All paths should be Unix paths. If |recursive| is true, then a directory
  // scanner task is enqueued for each directory that is added to the manifest.
  // This is only needed during UpdateAll(). When the manifest is updated in
  // response to file watcher changes, then |recursive| should be set to false.
  absl::Status Update(OperationList* operations, FileChunkMap* file_chunks,
                      PushIntermediateManifest push_intermediate_manifest =
                          PushIntermediateManifest(),
                      bool recursive = false);

  // Content id of the current manifest.
  const ContentIdProto& ManifestId() const { return manifest_id_; }

  // Returns stats created from the last call to UpdateAll() or Update().
  const UpdaterStats& Stats() const { return stats_; }

  // Returns the manifest updater configuration.
  const UpdaterConfig& Config() const { return cfg_; }

  // Returns an empty manifest.
  ContentIdProto DefaultManifestId();

 private:
  // Adds enough pending assets from |queue_| as tasks to the |pool| to keep all
  // worker threads busy. Returns the number of tasks that were added.
  size_t QueueTasks(Threadpool* pool, const fastcdc::Config* cdc_cfg,
                    ManifestBuilder* manifest_builder);

  // Applies the |operations| list to the manifest owned by the
  // |manifest_builder|. First, all deletions are handled and the corresponding
  // files are removed from the |file_chunks| map, then all added or updated
  // assets are processed. This guarantees that the outcome is independent of
  // the order in the list.
  //
  // If |parent| is non-null, then it must be of type DIRECTORY and all added
  // assets are made direct children of |parent|. The function does *not* verify
  // that all children have |parent| as directory path.
  //
  // Enqueues tasks to chunk the given files for files that were added or
  // updated. If |recursive| is true, then it will also enqueue directory
  // scanner tasks for all given directories.
  absl::Status ApplyOperations(std::vector<Operation>* operations,
                               FileChunkMap* file_chunks,
                               ManifestBuilder* manifest_builder,
                               AssetBuilder* parent, bool recursive);

  // Handles the results of a completed FileChunkerTask.
  absl::Status HandleFileChunkerResult(FileChunkerTask* task,
                                       FileChunkMap* file_chunks,
                                       ManifestBuilder* manifest_builder);

  // Handles the results of a completed DirScannerTask.
  absl::Status HandleDirScannerResult(
      DirScannerTask* task, FileChunkMap* file_chunks,
      ManifestBuilder* manifest_builder,
      std::unordered_set<ContentIdProto>* manifest_content_ids);

  // Represents an asset that has not been fully processed yet.
  struct PendingAsset {
    PendingAsset() {}
    PendingAsset(AssetProto::Type type, std::string relative_path,
                 std::string filename)
        : type(type),
          relative_path(std::move(relative_path)),
          filename(std::move(filename)) {}

    // The asset type (either FILE or DIRECTORY).
    AssetProto::Type type = AssetProto::UNKNOWN;

    // Relative unix path of the directory containing this asset.
    std::string relative_path;

    // File name of the asset that still needs processing.
    std::string filename;
  };

  // Queue of pending assets waiting for completion.
  std::list<PendingAsset> queue_;

  // Pool of pre-allocated buffers
  std::vector<Buffer> buffers_;

  // Store for manifest chunks and the manifest id itself.
  DataStoreWriter* const data_store_;

  // Source directory to build the manifest from and configuration details.
  UpdaterConfig cfg_;

  // ID of the manifest chunk.
  ContentIdProto manifest_id_;

  // Stats for the last Update*() operation.
  UpdaterStats stats_;
};
}; // namespace cdc_ft
#endif // MANIFEST_MANIFEST_UPDATER_H_

View File

@@ -0,0 +1,655 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/manifest_updater.h"
#include "absl/strings/match.h"
#include "common/path.h"
#include "common/status_test_macros.h"
#include "common/test_main.h"
#include "data_store/mem_data_store.h"
#include "fastcdc/fastcdc.h"
#include "gtest/gtest.h"
#include "manifest/file_chunk_map.h"
#include "manifest/manifest_builder.h"
#include "manifest/manifest_iterator.h"
#include "manifest/manifest_test_base.h"
namespace cdc_ft {
// Writes a one-line description of |ai| (path, type, mtime and size) to |o|.
void PrintTo(const AssetInfo& ai, std::ostream* o) {
  std::ostream& out = *o;
  out << "path=" << ai.path;
  out << ", type=" << ai.type;
  out << ", mtime=" << ai.mtime;
  out << ", size=" << ai.size;
}
namespace {
// Sizes of the test files used by the tests below; the trailing comment on
// each line names the file the constant belongs to. The tests compare sums of
// these sizes against UpdaterStats::total_processed_bytes.
constexpr uint64_t kFileSizeA = 8; // a.txt
constexpr uint64_t kFileSizeB = 32; // subdir/b.txt
constexpr uint64_t kFileSizeC = 1; // subdir/c.txt
constexpr uint64_t kFileSizeD = 1; // subdir/d.txt
// Combined size of all four test files.
constexpr uint64_t kTotalFileSize =
kFileSizeA + kFileSizeB + kFileSizeC + kFileSizeD;
// Test fixture for ManifestUpdater. It uses the "manifest_updater" test data
// directory and additionally manages an empty directory inside the temp
// directory, which is created before and removed after each test.
class ManifestUpdaterTest : public ManifestTestBase {
public:
ManifestUpdaterTest()
: ManifestTestBase(GetTestDataDir("manifest_updater")) {}
// Creates the empty directory (errors are ignored) and restricts the updater
// to a single thread.
void SetUp() override {
path::CreateDirRec(empty_dir_).IgnoreError();
cfg_.num_threads = 1;
}
// Removes the empty directory again (errors are ignored).
void TearDown() override { path::RemoveDirRec(empty_dir_).IgnoreError(); }
protected:
// Path of the directory "empty" inside the temp directory.
std::string empty_dir_ = path::Join(path::GetTempDir(), "empty");
};
// Runs UpdateAll() on an empty dir. The resulting manifest must be empty and
// all stats must be zero.
TEST_F(ManifestUpdaterTest, UpdateAll_EmptySrcDirectory) {
  cfg_.src_dir = empty_dir_;
  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.UpdateAll(&file_chunks_));

  // Bind by const reference like the other tests do; no copy is needed.
  const UpdaterStats& stats = updater.Stats();
  EXPECT_EQ(stats.total_assets_added_or_updated, 0);
  EXPECT_EQ(stats.total_files_added_or_updated, 0);
  EXPECT_EQ(stats.total_files_failed, 0);
  EXPECT_EQ(stats.total_assets_deleted, 0);
  EXPECT_EQ(stats.total_chunks, 0);
  EXPECT_EQ(stats.total_processed_bytes, 0);

  // Store should contain a chunk for the manifest id and one for the manifest.
  EXPECT_EQ(data_store_.Chunks().size(), 2);

  ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals({}, updater.ManifestId()));
}
// Runs UpdateAll() on a non-empty dir. All four files and the sub-directory
// are expected to be added, with one chunk per file.
TEST_F(ManifestUpdaterTest, UpdateAll_NonEmptySrcDirectory) {
// Contains a.txt, subdir/b.txt, subdir/c.txt and subdir/d.txt.
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
const UpdaterStats& stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 5);
EXPECT_EQ(stats.total_files_added_or_updated, 4);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 4);
EXPECT_EQ(stats.total_processed_bytes, kTotalFileSize);
// Store should contain a chunk for the manifest id and one for the manifest.
EXPECT_EQ(data_store_.Chunks().size(), 2);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"},
updater.ManifestId()));
}
// Runs UpdateAll() with an existing manifest that misses files.
TEST_F(ManifestUpdaterTest, UpdateAll_AddFileIncremental) {
// Create a manifest with all files in "subdir" missing: build the full
// manifest first, then delete the three files from it again.
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
EXPECT_OK(updater.Update(
MakeDeleteOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
&file_chunks_));
ASSERT_NO_FATAL_FAILURE(
ExpectManifestEquals({"a.txt", "subdir"}, updater.ManifestId()));
// UpdateAll() should compute the proper diff from {"a.txt", "subdir"} to
// {"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"} and
// only add the three missing files.
EXPECT_OK(updater.UpdateAll(&file_chunks_));
const UpdaterStats& stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 3);
EXPECT_EQ(stats.total_files_added_or_updated, 3);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 3);
EXPECT_EQ(stats.total_processed_bytes, kFileSizeB + kFileSizeC + kFileSizeD);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"},
updater.ManifestId()));
}
// Runs UpdateAll() with an existing manifest that has an excess file.
TEST_F(ManifestUpdaterTest, UpdateAll_DeleteFileIncremental) {
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// Smuggle c.txt into the manifest.
CdcParamsProto params;
params.set_min_chunk_size(cfg_.min_chunk_size);
params.set_avg_chunk_size(cfg_.avg_chunk_size);
params.set_max_chunk_size(cfg_.max_chunk_size);
ManifestBuilder mb(params, &data_store_);
EXPECT_OK(mb.LoadManifest(updater.ManifestId()));
EXPECT_OK(mb.GetOrCreateAsset("c.txt", AssetProto::FILE));
EXPECT_OK(mb.Flush());
// Store the id of the modified manifest under the manifest store id.
std::string id_str = mb.ManifestId().SerializeAsString();
EXPECT_OK(data_store_.Put(manifest_store_id_, id_str.data(), id_str.size()));
// UpdateAll() should compute the proper diff from
// {"a.txt", "c.txt", "subdir", "subdir/b.txt", "subdir/c.txt",
// "subdir/d.txt"} to
// {"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"} and
// only delete one file.
EXPECT_OK(updater.UpdateAll(&file_chunks_));
const UpdaterStats& stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 0);
EXPECT_EQ(stats.total_files_added_or_updated, 0);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 1);
EXPECT_EQ(stats.total_chunks, 0);
EXPECT_EQ(stats.total_processed_bytes, 0);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"},
updater.ManifestId()));
}
// UpdateAll() removes unreferenced manifest chunks.
TEST_F(ManifestUpdaterTest, UpdateAll_PrunesUnreferencedChunks) {
// Reduce chunk sizes to produce a bunch of indirect lists.
cfg_.min_chunk_size = 8;
cfg_.avg_chunk_size = 16;
cfg_.max_chunk_size = 32;
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.Update(MakeUpdateOps({"a.txt"}), &file_chunks_));
// 1 for manifest id, 1 for manifest, 1 indirect assets.
EXPECT_EQ(data_store_.Chunks().size(), 3);
EXPECT_OK(updater.Update(
MakeUpdateOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
&file_chunks_));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// 2 additional chunks from the first Update() that are now unreferenced.
// -1, because the indirect asset for "a.txt" is deduplicated
// On failure, the manifest id and a dump of the store are appended to ease
// debugging.
EXPECT_EQ(data_store_.Chunks().size(), 8)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
// NOTE(review): UpdateAll() is invoked twice back to back here; presumably
// the second call verifies that pruning is stable - confirm this is
// intentional and not a duplicated line.
EXPECT_OK(updater.UpdateAll(&file_chunks_));
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// Pruning has removed the 2 unreferenced ones.
EXPECT_EQ(data_store_.Chunks().size(), 7)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
}
// UpdateAll() recovers if there are missing referenced manifest chunks.
TEST_F(ManifestUpdaterTest, UpdateAll_RecoversFromMissingChunks) {
// Reduce chunk sizes to produce a bunch of indirect lists.
cfg_.min_chunk_size = 8;
cfg_.avg_chunk_size = 16;
cfg_.max_chunk_size = 32;
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.Update(MakeUpdateOps({"a.txt"}), &file_chunks_));
// 1 for manifest id, 1 for manifest, 1 indirect assets.
EXPECT_EQ(data_store_.Chunks().size(), 3)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
// Remove one of the indirect chunk lists, i.e. the first chunk that is
// neither the manifest store id nor the manifest itself.
// Erasing inside the range-for is only OK because the loop is left right
// afterwards, so the invalidated iterator is never used again.
for (const auto& [id, _] : data_store_.Chunks()) {
if (id != ManifestUpdater::GetManifestStoreId() &&
id != updater.ManifestId()) {
data_store_.Chunks().erase(id);
break;
}
}
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// 1 for manifest id, 1 for manifest, 5 indirect assets.
// There would also be 7 chunks without the removal above, see
// UpdateAll_PrunesUnreferencedChunks.
EXPECT_EQ(data_store_.Chunks().size(), 7)
<< "Manifest: " << ContentId::ToHexString(updater.ManifestId())
<< std::endl
<< DumpDataStoreProtos();
}
// Checks that a first UpdateAll() fills |file_chunks_| with the chunks of
// every file in the source directory.
TEST_F(ManifestUpdaterTest, UpdateAll_FileChunkMapFromScratch) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty");
  // Tiny chunk sizes, so that a bunch of indirect lists gets produced.
  cfg_.max_chunk_size = 32;
  cfg_.avg_chunk_size = 16;
  cfg_.min_chunk_size = 8;

  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.UpdateAll(&file_chunks_));

  // Every file must be found in the chunk map.
  for (const char* rel_path :
       {"a.txt", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}) {
    ValidateChunkLookup(rel_path, true);
  }
}
// Checks that a repeated UpdateAll() re-populates a cleared |file_chunks_|
// with the chunks of every file in the source directory.
TEST_F(ManifestUpdaterTest, UpdateAll_FileChunkMapAfterUpdate) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty");
  // Tiny chunk sizes, so that a bunch of indirect lists gets produced.
  cfg_.max_chunk_size = 32;
  cfg_.avg_chunk_size = 16;
  cfg_.min_chunk_size = 8;

  ManifestUpdater updater(&data_store_, cfg_);
  ASSERT_OK(updater.UpdateAll(&file_chunks_));

  // Drop all lookups; the second UpdateAll() has to restore them.
  file_chunks_.Clear();
  EXPECT_OK(updater.UpdateAll(&file_chunks_));

  for (const char* rel_path :
       {"a.txt", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}) {
    ValidateChunkLookup(rel_path, true);
  }
}
// Verifies that the intermediate manifest contains the expected files.
TEST_F(ManifestUpdaterTest, UpdateAll_PushIntermediateManifest) {
// The callback only records the id of the intermediate manifest it is given.
ContentIdProto intermediate_id;
auto push_intermediate_manifest =
[&intermediate_id](const ContentIdProto& manifest_id) {
intermediate_id = manifest_id;
};
// Contains a.txt, subdir/b.txt, subdir/c.txt and subdir/d.txt.
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_, push_intermediate_manifest));
// Double check that the files in the final manifest are no longer in
// progress.
EXPECT_FALSE(InProgress(updater.ManifestId(), "a.txt"));
EXPECT_FALSE(InProgress(updater.ManifestId(), "subdir/b.txt"));
EXPECT_FALSE(InProgress(updater.ManifestId(), "subdir/c.txt"));
EXPECT_FALSE(InProgress(updater.ManifestId(), "subdir/d.txt"));
// Verify that the intermediate manifest is there, but it is empty.
// The intermediate id is written to the manifest store id first.
std::string ser_id = intermediate_id.SerializeAsString();
EXPECT_OK(data_store_.Put(manifest_store_id_, ser_id.data(), ser_id.size()));
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals({}, intermediate_id));
// The root directory of the intermediate manifest is in progress.
EXPECT_TRUE(InProgress(intermediate_id, ""));
}
// Adds a single file through Update() and checks stats and manifest.
TEST_F(ManifestUpdaterTest, Update_AddFile) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty");
  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.Update(MakeUpdateOps({"a.txt"}), &file_chunks_));

  // Exactly one file with one chunk was added, nothing failed or was deleted.
  const UpdaterStats& update_stats = updater.Stats();
  EXPECT_EQ(update_stats.total_assets_added_or_updated, 1);
  EXPECT_EQ(update_stats.total_files_added_or_updated, 1);
  EXPECT_EQ(update_stats.total_chunks, 1);
  EXPECT_EQ(update_stats.total_processed_bytes, kFileSizeA);
  EXPECT_EQ(update_stats.total_files_failed, 0);
  EXPECT_EQ(update_stats.total_assets_deleted, 0);

  // The manifest lists only the added file.
  ASSERT_NO_FATAL_FAILURE(
      ExpectManifestEquals({"a.txt"}, updater.ManifestId()));
}
// Runs Update() with a single file to be added. The file is in a dir that is
// not contained in the manifest yet, so the dir will get auto-created.
TEST_F(ManifestUpdaterTest, Update_AddFileAutoCreateSubdir) {
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.Update(MakeUpdateOps({"subdir/b.txt"}), &file_chunks_));
const UpdaterStats& stats = updater.Stats();
// Only the file is counted; the auto-created "subdir" is not.
EXPECT_EQ(stats.total_assets_added_or_updated, 1);
EXPECT_EQ(stats.total_files_added_or_updated, 1);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 1);
EXPECT_EQ(stats.total_processed_bytes, kFileSizeB);
// Note: The manifest does NOT contain the proper "subdir" asset now. Since it
// was auto-created because of "subdir/b.txt", it does not have the
// proper file time.
std::vector<AssetInfoForTest> manifest_ais =
GetAllManifestAssets(updater.ManifestId());
std::vector<AssetInfoForTest> expected_ais =
MakeAssetInfos({"subdir", "subdir/b.txt"});
// First comparison with the last argument set to false, then patch the mtime
// of "subdir" (entry 0) and compare again with it set to true.
// NOTE(review): the flag presumably toggles the mtime comparison - confirm
// against ExpectAssetInfosEqual().
ExpectAssetInfosEqual(manifest_ais, expected_ais, false);
manifest_ais[0].info.mtime = expected_ais[0].info.mtime;
ExpectAssetInfosEqual(manifest_ais, expected_ais, true);
}
// Calls Update() with a single file to be deleted, first in the root dir and
// then in a subdirectory.
TEST_F(ManifestUpdaterTest, Update_DeleteFiles) {
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
EXPECT_OK(updater.Update(MakeDeleteOps({"a.txt"}), &file_chunks_));
// The stats are expected to only cover the delete operation above.
const UpdaterStats& stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 0);
EXPECT_EQ(stats.total_files_added_or_updated, 0);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 1);
EXPECT_EQ(stats.total_chunks, 0);
EXPECT_EQ(stats.total_processed_bytes, 0);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"},
updater.ManifestId()));
// Delete another one in a subdirectory.
EXPECT_OK(updater.Update(MakeDeleteOps({"subdir/b.txt"}), &file_chunks_));
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"subdir", "subdir/c.txt", "subdir/d.txt"}, updater.ManifestId()));
}
// Deletes a whole directory through Update() and checks stats and manifest.
TEST_F(ManifestUpdaterTest, Update_DeleteDir) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty");
  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.UpdateAll(&file_chunks_));
  EXPECT_OK(updater.Update(MakeDeleteOps({"subdir"}), &file_chunks_));

  // The directory counts as a single deleted asset; nothing else changes.
  const UpdaterStats& delete_stats = updater.Stats();
  EXPECT_EQ(delete_stats.total_assets_deleted, 1);
  EXPECT_EQ(delete_stats.total_assets_added_or_updated, 0);
  EXPECT_EQ(delete_stats.total_files_added_or_updated, 0);
  EXPECT_EQ(delete_stats.total_files_failed, 0);
  EXPECT_EQ(delete_stats.total_chunks, 0);
  EXPECT_EQ(delete_stats.total_processed_bytes, 0);

  // Only the file in the root directory is left.
  ASSERT_NO_FATAL_FAILURE(
      ExpectManifestEquals({"a.txt"}, updater.ManifestId()));
}
// Calls Update() with a non-existing asset to be deleted. The update is
// expected to succeed and to still count the asset as deleted.
TEST_F(ManifestUpdaterTest, Update_DeleteNonExistingAsset) {
cfg_.src_dir = empty_dir_;
ManifestUpdater updater(&data_store_, cfg_);
// We need to craft AssetInfos for non-existing assets manually.
AssetInfo ai{"non_existing", AssetProto::DIRECTORY};
ManifestUpdater::OperationList ops{{Operator::kDelete, ai}};
EXPECT_OK(updater.Update(&ops, &file_chunks_));
const UpdaterStats& stats = updater.Stats();
EXPECT_EQ(stats.total_assets_deleted, 1);
}
// Calls Update() with a non-existing file to be added.
TEST_F(ManifestUpdaterTest, Update_AddNonExistingFile) {
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
// Note that Update() succeeds even though the "non_existing" file failed.
AssetInfo ai;
ai.path = "non_existing";
// One add for the missing file and one for the existing a.txt.
ManifestUpdater::OperationList ops{
{Operator::kAdd, ai}, {Operator::kAdd, MakeAssetInfo("a.txt").info}};
EXPECT_OK(updater.Update(&ops, &file_chunks_));
const UpdaterStats& stats = updater.Stats();
// Both assets are counted as added, but only a.txt as a successful file.
EXPECT_EQ(stats.total_assets_added_or_updated, 2);
EXPECT_EQ(stats.total_files_added_or_updated, 1);
EXPECT_EQ(stats.total_files_failed, 1);
// "non_existing" and "a.txt" were still added, but the former is empty.
std::vector<AssetInfoForTest> manifest_ais =
GetAllManifestAssets(updater.ManifestId());
std::vector<AssetInfoForTest> expected_ais = {AssetInfoForTest{ai},
MakeAssetInfo("a.txt")};
ExpectAssetInfosEqual(manifest_ais, expected_ais);
}
// Verifies that the intermediate manifest contains the expected files.
TEST_F(ManifestUpdaterTest, Update_PushIntermediateManifest) {
// Create a manifest without the files in "subdir": build the full manifest
// first, then delete subdir/b.txt, subdir/c.txt and subdir/d.txt.
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
EXPECT_OK(updater.Update(
MakeDeleteOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
&file_chunks_));
// Add the "subdir" files back and check the intermediate manifest. The
// callback only records the id it is given.
ContentIdProto intermediate_id;
auto push_intermediate_manifest =
[&intermediate_id](const ContentIdProto& manifest_id) {
intermediate_id = manifest_id;
};
EXPECT_OK(updater.Update(
MakeUpdateOps({"subdir/b.txt", "subdir/c.txt", "subdir/d.txt"}),
&file_chunks_, push_intermediate_manifest));
// A non-empty hash shows that the callback was actually invoked.
EXPECT_GT(intermediate_id.blake3_sum_160().size(), 0);
// Only file a.txt is done in the intermediate manifest, all others are in
// progress.
EXPECT_FALSE(InProgress(intermediate_id, "a.txt"));
EXPECT_TRUE(InProgress(intermediate_id, "subdir/b.txt"));
EXPECT_TRUE(InProgress(intermediate_id, "subdir/c.txt"));
EXPECT_TRUE(InProgress(intermediate_id, "subdir/d.txt"));
}
// Verifies that |file_chunks_| contains the expected chunks after Update().
// The second argument of ValidateChunkLookup() states whether the file is
// expected to be found in |file_chunks_|.
TEST_F(ManifestUpdaterTest, Update_FileChunkMap) {
// Reduce chunk sizes to produce a bunch of indirect lists.
cfg_.min_chunk_size = 8;
cfg_.avg_chunk_size = 16;
cfg_.max_chunk_size = 32;
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
// Add a.txt.
EXPECT_OK(updater.Update(MakeUpdateOps({"a.txt"}), &file_chunks_));
ValidateChunkLookup("a.txt", true);
ValidateChunkLookup("subdir/b.txt", false);
// Add subdir/b.txt.
EXPECT_OK(updater.Update(MakeUpdateOps({"subdir/b.txt"}), &file_chunks_));
ValidateChunkLookup("a.txt", true);
ValidateChunkLookup("subdir/b.txt", true);
// Remove a.txt.
EXPECT_OK(updater.Update(MakeDeleteOps({"a.txt"}), &file_chunks_));
ValidateChunkLookup("a.txt", false);
ValidateChunkLookup("subdir/b.txt", true);
}
// Verifies that |file_chunks_| contains the expected chunks during an
// intermediate update (and does not deadlock!).
TEST_F(ManifestUpdaterTest, Update_IntermediateFileChunkMap) {
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
// Add a.txt.
EXPECT_OK(updater.Update(MakeUpdateOps({"a.txt"}), &file_chunks_));
// Add subdir/b.txt and check intermediate lookups. The lookups run inside
// the callback, i.e. while Update() is still in progress.
auto push_intermediate_manifest = [this](const ContentIdProto&) {
ValidateChunkLookup("a.txt", true);
ValidateChunkLookup("subdir/b.txt", false); // Not in yet.
};
EXPECT_OK(updater.Update(MakeUpdateOps({"subdir/b.txt"}), &file_chunks_,
push_intermediate_manifest));
ValidateChunkLookup("a.txt", true);
ValidateChunkLookup("subdir/b.txt", true); // Now it's in!
}
// ManifestId() must match the id stored under |manifest_store_id_| in the
// data store after UpdateAll().
TEST_F(ManifestUpdaterTest, ManifestId) {
  cfg_.src_dir = empty_dir_;
  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.UpdateAll(&file_chunks_));

  // Read the id back from the store and compare it to the updater's view.
  ContentIdProto stored_id;
  EXPECT_OK(data_store_.GetProto(manifest_store_id_, &stored_id));
  EXPECT_EQ(updater.ManifestId(), stored_id);
}
// Checks that executable files get |kExecutablePerms| in the manifest, while
// other files keep the default file permissions.
TEST_F(ManifestUpdaterTest, DetectExecutables) {
  cfg_.src_dir = path::Join(base_dir_, "executables");
  ManifestUpdater updater(&data_store_, cfg_);
  EXPECT_OK(updater.UpdateAll(&file_chunks_));

  // Look up the manifest id that is stored in the data store.
  ContentIdProto stored_id;
  EXPECT_OK(data_store_.GetProto(manifest_store_id_, &stored_id));

  // Collect the permissions of all manifest entries, keyed by entry name.
  ManifestIterator iter(&data_store_);
  EXPECT_OK(iter.Open(stored_id));
  std::unordered_map<std::string, uint32_t> perms_by_name;
  for (const AssetProto* asset = iter.NextEntry(); asset != nullptr;
       asset = iter.NextEntry()) {
    perms_by_name[asset->name()] = asset->permissions();
  }
  EXPECT_OK(iter.Status());

  EXPECT_EQ(perms_by_name["game.elf"], ManifestUpdater::kExecutablePerms);
  EXPECT_EQ(perms_by_name["win.exe"], ManifestUpdater::kExecutablePerms);
  EXPECT_EQ(perms_by_name["script.sh"], ManifestUpdater::kExecutablePerms);
  EXPECT_EQ(perms_by_name["normal.txt"], ManifestBuilder::kDefaultFilePerms);
}
// Regression test: UpdateAll() with an intermediate manifest push must work
// when assets are moved to indirect lists.
TEST_F(ManifestUpdaterTest, UpdateAll_LargeIntermediateIndirectDirAssets) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty");
  // Tiny chunk sizes, so that a bunch of indirect lists gets produced.
  cfg_.max_chunk_size = 32;
  cfg_.avg_chunk_size = 16;
  cfg_.min_chunk_size = 8;
  ManifestUpdater updater(&data_store_, cfg_);

  // (internal): Run UpdateAll() with intermediate manifest push. The push
  // causes a Flush() call to the manifest builder, which pushes some assets to
  // indirect lists. This used to invalidate pointers and cause asserts to
  // trigger.
  auto ignore_intermediate_manifest = [](const ContentIdProto&) {};
  EXPECT_OK(updater.UpdateAll(&file_chunks_, ignore_intermediate_manifest));
}
// Runs UpdateAll() on an empty dir, then loads the resulting manifest into a
// ManifestBuilder and creates and deletes a directory asset in it.
// NOTE(review): despite the test name, no second (incremental) UpdateAll()
// call is made here - confirm whether one is missing.
TEST_F(ManifestUpdaterTest, UpdateAll_EmptySrcDirectory_Incremental) {
cfg_.src_dir = empty_dir_;
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// Use the same chunking parameters as the updater.
CdcParamsProto params;
params.set_min_chunk_size(cfg_.min_chunk_size);
params.set_avg_chunk_size(cfg_.avg_chunk_size);
params.set_max_chunk_size(cfg_.max_chunk_size);
ManifestBuilder mb(params, &data_store_);
EXPECT_OK(mb.LoadManifest(updater.ManifestId()));
EXPECT_OK(mb.GetOrCreateAsset("folder1", AssetProto::DIRECTORY));
EXPECT_OK(mb.DeleteAsset("folder1"));
}
// UpdateAll() must fail with FailedPrecondition if the source dir is a file.
TEST_F(ManifestUpdaterTest, UpdateAll_FileAsRootFails) {
  cfg_.src_dir = path::Join(base_dir_, "non_empty", "a.txt");
  ManifestUpdater updater(&data_store_, cfg_);

  auto result = updater.UpdateAll(&file_chunks_);
  EXPECT_NOT_OK(result);
  EXPECT_TRUE(absl::IsFailedPrecondition(result)) << result.ToString();
}
// UpdateAll() must fail with NotFound if the source dir does not exist.
TEST_F(ManifestUpdaterTest, UpdateAll_RootNotExistFails) {
  cfg_.src_dir = path::Join(base_dir_, "non-existing");
  ManifestUpdater updater(&data_store_, cfg_);

  auto result = updater.UpdateAll(&file_chunks_);
  EXPECT_NOT_OK(result);
  EXPECT_TRUE(absl::IsNotFound(result)) << result.ToString();
}
// Runs UpdateAll() multiple times on an empty dir with no changes.
TEST_F(ManifestUpdaterTest, UpdateAll_EmptySrcDirectoryMultiTimesNoChange) {
cfg_.src_dir = empty_dir_;
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// |stats| is a copy, so it is re-read after the second run below.
UpdaterStats stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 0);
EXPECT_EQ(stats.total_files_added_or_updated, 0);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 0);
EXPECT_EQ(stats.total_processed_bytes, 0);
// Store should contain a chunk for the manifest id and one for the manifest.
EXPECT_EQ(data_store_.Chunks().size(), 2);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals({}, updater.ManifestId()));
// No new changes should be done.
EXPECT_OK(updater.UpdateAll(&file_chunks_));
stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 0);
EXPECT_EQ(stats.total_files_added_or_updated, 0);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 0);
EXPECT_EQ(stats.total_processed_bytes, 0);
}
// Runs UpdateAll() multiple times on a non-empty dir with no changes.
TEST_F(ManifestUpdaterTest, UpdateAll_NonEmptySrcDirectoryMultiTimesNoChange) {
// Contains a.txt, subdir/b.txt, subdir/c.txt and subdir/d.txt.
cfg_.src_dir = path::Join(base_dir_, "non_empty");
ManifestUpdater updater(&data_store_, cfg_);
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// |stats| is a copy, so it is re-read after the second run below.
UpdaterStats stats = updater.Stats();
// 5 assets = the 4 files plus the "subdir" directory.
EXPECT_EQ(stats.total_assets_added_or_updated, 5);
EXPECT_EQ(stats.total_files_added_or_updated, 4);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 4);
EXPECT_EQ(stats.total_processed_bytes, kTotalFileSize);
// Store should contain a chunk for the manifest id and one for the manifest.
EXPECT_EQ(data_store_.Chunks().size(), 2);
ASSERT_NO_FATAL_FAILURE(ExpectManifestEquals(
{"a.txt", "subdir", "subdir/b.txt", "subdir/c.txt", "subdir/d.txt"},
updater.ManifestId()));
EXPECT_OK(updater.UpdateAll(&file_chunks_));
// No new changes should be done.
stats = updater.Stats();
EXPECT_EQ(stats.total_assets_added_or_updated, 0);
EXPECT_EQ(stats.total_files_added_or_updated, 0);
EXPECT_EQ(stats.total_files_failed, 0);
EXPECT_EQ(stats.total_assets_deleted, 0);
EXPECT_EQ(stats.total_chunks, 0);
EXPECT_EQ(stats.total_processed_bytes, 0);
}
// Checks the status returned by IsValidDir() for an existing directory, a
// missing path, a regular file and a relative path.
TEST_F(ManifestUpdaterTest, IsValidDir) {
  // Existing directory.
  const auto existing_dir = path::Join(base_dir_, "non_empty");
  EXPECT_OK(ManifestUpdater::IsValidDir(existing_dir));

  // Path that does not exist.
  const auto missing_dir = path::Join(base_dir_, "non-existing");
  EXPECT_TRUE(absl::IsNotFound(ManifestUpdater::IsValidDir(missing_dir)));

  // Path that points to a file instead of a directory.
  const auto file_path = path::Join(base_dir_, "non_empty", "a.txt");
  EXPECT_TRUE(
      absl::IsFailedPrecondition(ManifestUpdater::IsValidDir(file_path)));

  // Relative path.
  EXPECT_TRUE(
      absl::IsFailedPrecondition(ManifestUpdater::IsValidDir("relative_dir")));
}
} // namespace
} // namespace cdc_ft

278
manifest/stats_printer.cc Normal file
View File

@@ -0,0 +1,278 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "manifest/stats_printer.h"
#include "absl/strings/str_format.h"
#include "common/path.h"
#include "common/util.h"
namespace cdc_ft {
namespace {
// ANSI escape codes used for colored console output, see
// https://ss64.com/nt/syntax-ansi.html.
// The numeric value of each enumerator is the index of the matching escape
// sequence in |kAnsiCodeStr|, so both lists must be kept in the same order.
enum class AnsiCode {
  // Foreground colors (0-15).
  kBlackFg = 0,
  kDarkRedFg,
  kDarkGreenFg,
  kDarkYellowFg,
  kDarkBlueFg,
  kDarkMagentaFg,
  kDarkCyanFg,
  kLightGrayFg,
  kDarkGrayFg,
  kLightRedFg,
  kLightGreenFg,
  kLightYellowFg,
  kLightBlueFg,
  kLightMagentaFg,
  kLightCyanFg,
  kWhiteFg,

  // Background colors (16-31).
  kBlackBg = 16,
  kDarkRedBg,
  kDarkGreenBg,
  kDarkYellowBg,
  kDarkBlueBg,
  kDarkMagentaBg,
  kDarkCyanBg,
  kLightGrayBg,
  kDarkGrayBg,
  kLightRedBg,
  kLightGreenBg,
  kLightYellowBg,
  kLightBlueBg,
  kLightMagentaBg,
  kLightCyanBg,
  kWhiteBg,

  // Misc (32-37).
  kBold = 32,
  kUnderline,
  kNoUnderline,
  kReverseText,
  kNoReverseText,
  kDefault,
};

// Escape sequences indexed by AnsiCode. The longest sequence ("\033[100m")
// has 6 characters, so 7 bytes per entry include the terminating null.
constexpr char kAnsiCodeStr[][7]{
    // Foreground colors: kBlackFg .. kLightGrayFg.
    "\033[30m", "\033[31m", "\033[32m", "\033[33m",
    "\033[34m", "\033[35m", "\033[36m", "\033[37m",
    // Foreground colors: kDarkGrayFg .. kWhiteFg.
    "\033[90m", "\033[91m", "\033[92m", "\033[93m",
    "\033[94m", "\033[95m", "\033[96m", "\033[97m",
    // Background colors: kBlackBg .. kLightGrayBg.
    "\033[40m", "\033[41m", "\033[42m", "\033[43m",
    "\033[44m", "\033[45m", "\033[46m", "\033[47m",
    // Background colors: kDarkGrayBg .. kWhiteBg.
    "\033[100m", "\033[101m", "\033[102m", "\033[103m",
    "\033[104m", "\033[105m", "\033[106m", "\033[107m",
    // Misc: kBold, kUnderline, kNoUnderline, kReverseText, kNoReverseText,
    // kDefault.
    "\033[1m", "\033[4m", "\033[24m", "\033[7m", "\033[27m", "\033[0m",
};
constexpr int kBgColors[] = {
static_cast<int>(AnsiCode::kLightRedBg),
static_cast<int>(AnsiCode::kLightGreenBg),
static_cast<int>(AnsiCode::kLightBlueBg),
static_cast<int>(AnsiCode::kLightYellowBg),
static_cast<int>(AnsiCode::kLightMagentaBg),
static_cast<int>(AnsiCode::kLightCyanBg),
static_cast<int>(AnsiCode::kDarkRedBg),
static_cast<int>(AnsiCode::kDarkGreenBg),
static_cast<int>(AnsiCode::kDarkBlueBg),
static_cast<int>(AnsiCode::kDarkYellowBg),
static_cast<int>(AnsiCode::kDarkMagentaBg),
static_cast<int>(AnsiCode::kDarkCyanBg),
};
constexpr int kNumBgColors = static_cast<int>(std::size(kBgColors));
constexpr int kFgColors[] = {
static_cast<int>(AnsiCode::kBlackFg),
static_cast<int>(AnsiCode::kDarkGrayFg),
static_cast<int>(AnsiCode::kLightGrayFg),
};
constexpr int kNumFgColors = static_cast<int>(std::size(kFgColors));
// Max length of filenames to print.
constexpr size_t kMaxFilenameSize = 32;
// Number of most recent files to print.
constexpr size_t kMaxNumRecentFiles = 32;
// Prints |line| plus a newline after forcing it to exactly |padded_size|
// characters: shorter lines are filled up with spaces, longer lines are cut
// off.
void PrintPadded(std::string line, size_t padded_size) {
  if (line.size() < padded_size) {
    line.append(padded_size - line.size(), ' ');
  } else {
    line.erase(padded_size);
  }
  printf("%s\n", line.c_str());
}
// Returns the base name of |path|, shortened to at most |kMaxFilenameSize|
// characters. Names that are too long are cut off and end with "..".
// |path| is taken by const reference to avoid copying it on every call.
std::string GetShortFilename(const std::string& path) {
  std::string filename = path::BaseName(path);
  if (filename.size() > kMaxFilenameSize) {
    // Reserve the last two characters for the ".." marker.
    filename.resize(kMaxFilenameSize - 2);
    filename += "..";
  }
  return filename;
}
} // namespace
// Constructor and destructor are defaulted out-of-line; all members are set
// up and torn down by their own constructors/destructors.
StatsPrinter::StatsPrinter() = default;
StatsPrinter::~StatsPrinter() = default;
void StatsPrinter::InitFile(const std::string& path, size_t num_chunks) {
path_to_file_[path].chunks.resize(num_chunks);
}
// Forgets all files, thread colors and byte totals.
void StatsPrinter::Clear() {
  // Byte totals. max_bandwidth_ is deliberately kept: unlike the other values
  // it can't be recalculated.
  total_cached_bytes_ = 0;
  total_streamed_bytes_ = 0;

  // Thread-to-color assignment.
  num_threads_ = 0;
  thread_id_to_color_.clear();

  // Per-file chunk data and the list of recent files.
  path_to_file_.clear();
  recent_files_.clear();
}
// Starts a new bandwidth measurement: zeroes the current bandwidth and the
// bytes streamed so far in this interval, and restarts the timer.
void StatsPrinter::ResetBandwidthStats() {
  curr_streamed_bytes_ = 0;
  curr_bandwidth_ = 0;
  bandwidth_timer_.Reset();
}
void StatsPrinter::RecordStreamedChunk(const std::string& path, size_t index,
uint32_t size, size_t thread_id) {
AddToRecentFiles(path);
assert(path_to_file_.find(path) != path_to_file_.end());
assert(index < path_to_file_[path].chunks.size());
path_to_file_[path].chunks[index] =
FileChunk(ChunkState::kStreamed, thread_id);
curr_streamed_bytes_ += size;
total_streamed_bytes_ += size;
// Update thread-to-color map.
if (thread_id_to_color_.find(thread_id) == thread_id_to_color_.end())
thread_id_to_color_[thread_id] = num_threads_++;
}
// Marks chunk |index| of the file at |path| as cached and adds |size| to the
// total number of cached bytes. Cached chunks do not belong to a thread, so
// the thread id is set to 0.
// Asserts that the file was registered with InitFile() and that |index| is in
// range, the same way RecordStreamedChunk() does. Without these checks, an
// unregistered path would silently create an empty file entry and the chunk
// access below would be out of bounds.
void StatsPrinter::RecordCachedChunk(const std::string& path, size_t index,
                                     uint32_t size) {
  AddToRecentFiles(path);
  assert(path_to_file_.find(path) != path_to_file_.end());
  assert(index < path_to_file_[path].chunks.size());
  path_to_file_[path].chunks[index] = FileChunk(ChunkState::kCached, 0);
  total_cached_bytes_ += size;
}
// Prints one line per recent file (shortened file name followed by a colored
// visualization of its chunk states), then a legend, the bandwidth and the
// total byte counts. Afterwards the cursor is moved back up, so that the next
// call overwrites the previous output.
// Does nothing if the console is narrower than kMaxFilenameSize + 4 columns.
void StatsPrinter::Print() {
  const int console_width = Util::GetConsoleWidth();
  if (console_width < static_cast<int>(kMaxFilenameSize) + 4) return;

  printf("\r");

  // Align all chunk visualizations to the longest (shortened) file name.
  size_t max_filename_size = 0;
  for (const std::string& path : recent_files_) {
    max_filename_size =
        std::max(max_filename_size, GetShortFilename(path).size());
  }

  std::string line;
  for (const std::string& path : recent_files_) {
    const File& file = path_to_file_[path];
    line = GetShortFilename(path);
    line.resize(max_filename_size + 1, ' ');

    // Fill the rest of the line with a visualization of the chunk states.
    const size_t num_chunks = file.chunks.size();
    const size_t print_width =
        std::min(num_chunks, static_cast<size_t>(console_width) - line.size());
    // Number of visible characters; |line| itself grows faster because of the
    // invisible escape sequences appended below.
    const size_t num_chars = line.size() + print_width;
    for (size_t n = 0; n < print_width; ++n) {
      // There can be multiple chunks per output char. Pick the most recent one.
      const size_t begin_idx = n * num_chunks / print_width;
      const size_t end_idx = (n + 1) * num_chunks / print_width;
      absl::Time last_modified_time = file.chunks[begin_idx].modified_time;
      size_t last_modified_idx = begin_idx;
      for (size_t k = begin_idx + 1; k < end_idx; ++k) {
        if (last_modified_time < file.chunks[k].modified_time) {
          last_modified_time = file.chunks[k].modified_time;
          last_modified_idx = k;
        }
      }

      // Print character depending on the chunk type:
      //   - for chunks that have not been loaded.
      //   X for chunks that have been streamed.
      //   C for chunks that were cached.
      const FileChunk& chunk = file.chunks[last_modified_idx];
      if (chunk.state == ChunkState::kNotLoaded) {
        line += kAnsiCodeStr[static_cast<int>(AnsiCode::kDefault)];
        line.push_back('-');
      } else if (chunk.state == ChunkState::kCached) {
        line += kAnsiCodeStr[static_cast<int>(AnsiCode::kBlackFg)];
        line += kAnsiCodeStr[static_cast<int>(AnsiCode::kLightGrayBg)];
        line.push_back('C');
      } else {
        // Streamed chunks are colored by the thread that requested them.
        const int col = thread_id_to_color_[chunk.thread_id];
        line += kAnsiCodeStr[kBgColors[col % kNumBgColors]];
        line += kAnsiCodeStr[kFgColors[(col / kNumBgColors) % kNumFgColors]];
        line.push_back('X');
      }

      // Return to default coloring.
      line += kAnsiCodeStr[static_cast<int>(AnsiCode::kDefault)];
    }

    // Fill with spaces and print. The padded size must be computed BEFORE
    // |line| is moved: function arguments are indeterminately sequenced, so
    // reading line.size() in the same call expression as std::move(line) could
    // observe the moved-from (empty) string.
    const size_t padded_size = line.size() + console_width - num_chars + 1;
    PrintPadded(std::move(line), padded_size);
  }

  // Print bandwidth and other stats.
  UpdateBandwidthStats();

  line = "Legend: (-) not loaded, (C) cached, (X) streamed (color=FUSE thread)";
  PrintPadded(std::move(line), console_width);

  constexpr double MBd = 1024.0 * 1024.0;
  line = absl::StrFormat("Bandwidth %7.2f MB/sec (curr) %7.2f MB/sec (max)",
                         curr_bandwidth_ / MBd, max_bandwidth_ / MBd);
  PrintPadded(std::move(line), console_width);

  constexpr int MBi = 1024 * 1024;
  line =
      absl::StrFormat("Total data %6i MB (streamed) %7i MB (cached)",
                      total_streamed_bytes_ / MBi, total_cached_bytes_ / MBi);
  PrintPadded(std::move(line), console_width);

  // Move cursor up, so that printing again overwrites the old content. The
  // 3 extra lines are the legend, bandwidth and total data lines.
  for (size_t n = 0; n < recent_files_.size() + 3; ++n) printf("\033[F");
}
// Appends |path| to |recent_files_| unless it is already listed. If the list
// then holds more than |kMaxNumRecentFiles| entries, the oldest one is
// dropped. Paths that are already listed keep their position.
void StatsPrinter::AddToRecentFiles(const std::string& path) {
  const auto pos = std::find(recent_files_.begin(), recent_files_.end(), path);
  if (pos == recent_files_.end()) {
    recent_files_.push_back(path);
    if (recent_files_.size() > kMaxNumRecentFiles) recent_files_.pop_front();
  }
}
// Recomputes the current bandwidth from the bytes streamed since the last
// measurement and raises the max bandwidth if needed. Does nothing if less
// than a second has passed since the timer was last reset.
void StatsPrinter::UpdateBandwidthStats() {
  const double elapsed_sec = bandwidth_timer_.ElapsedSeconds();
  if (elapsed_sec < 1.0f) return;

  curr_bandwidth_ = curr_streamed_bytes_ / elapsed_sec;
  max_bandwidth_ = std::max(max_bandwidth_, curr_bandwidth_);

  // Start the next measurement interval.
  bandwidth_timer_.Reset();
  curr_streamed_bytes_ = 0;
}
} // namespace cdc_ft

135
manifest/stats_printer.h Normal file
View File

@@ -0,0 +1,135 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANIFEST_STATS_PRINTER_H_
#define MANIFEST_STATS_PRINTER_H_
#include <deque>
#include <string>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/time/time.h"
#include "common/stopwatch.h"
namespace cdc_ft {
// Collects and prints statistics about chunks streamed and cached per file.
// Also prints general bandwidth and total bytes statistics.
// Sample output (X's are colored by FUSE thread id, not shown):
//   gamedata.pak CCCXXXXXXXXXX------
//   lib.so       XXX------
//   Legend: (-) not loaded, (C) cached, (X) streamed (color=FUSE thread)
//   Bandwidth    0.00 MB/sec (curr)    2.39 MB/sec (max)
//   Total data      3 MB (streamed)       1 MB (cached)
// Each X/C/- represents a chunk unless the file is large and the chunks don't
// fit into a single line. In that case, the X/C/- represents the most recently
// accessed chunk in a group of several chunks.
class StatsPrinter {
 public:
  StatsPrinter();
  ~StatsPrinter();

  // Registers a file for the given relative Unix |path| that has |num_chunks|
  // chunks.
  void InitFile(const std::string& path, size_t num_chunks);

  // Clears all data except max bandwidth.
  void Clear();

  // Resets measurement of current bandwidth.
  void ResetBandwidthStats();

  // Records a chunk that was streamed from the workstation.
  // |path| is the relative Unix path of a file that contains the chunk.
  // |index| is the index of the chunk.
  // |size| is the size of the chunk in bytes.
  // |thread_id| is the id of the thread that requested the chunk on the
  // gamelet, usually the hash of the std::thread::id.
  // Asserts that the file was registered with InitFile() and that |index| is
  // smaller than |num_chunks| passed to InitFile().
  void RecordStreamedChunk(const std::string& path, size_t index, uint32_t size,
                           size_t thread_id);

  // Records a chunk that is cached on the gamelet.
  // |path| is the relative Unix path of a file that contains the chunk.
  // |index| is the index of the chunk.
  // |size| is the size of the chunk in bytes.
  // The file must have been registered with InitFile() and |index| must be
  // smaller than |num_chunks| passed to InitFile().
  void RecordCachedChunk(const std::string& path, size_t index, uint32_t size);

  // Prints all statistics.
  void Print();

 private:
  // Adds |path| to |recent_files_| if it's not already there and removes the
  // first entry if the list gets too large.
  void AddToRecentFiles(const std::string& path);

  // Updates the current and max bandwidth stats.
  void UpdateBandwidthStats();

  enum class ChunkState : uint8_t {
    kNotLoaded = 0,  // Chunk is neither cached nor streamed.
    kStreamed = 1,   // Chunk was streamed from the workstation.
    kCached = 2,     // Chunk was cached on the gamelet.
  };

  struct FileChunk {
    // Thread on gamelet that requested a streamed chunk.
    // Unused for cached chunks and chunks that are not loaded.
    size_t thread_id = 0;

    // Time when this data was modified.
    absl::Time modified_time;

    // Whether the chunk is cached, was streamed or is not loaded.
    ChunkState state = ChunkState::kNotLoaded;

    FileChunk() {}

    // Creates a chunk in the given |state| and stamps it with the current
    // time.
    explicit FileChunk(ChunkState state, size_t thread_id)
        : thread_id(thread_id), modified_time(absl::Now()), state(state) {}
  };

  struct File {
    // All chunks in the file.
    std::vector<FileChunk> chunks;
  };

  // Recently accessed files in the order they were first added. Files that
  // are accessed again keep their position; the oldest entry is dropped when
  // the list gets too large.
  std::deque<std::string> recent_files_;

  // Map from relative Unix file path to all chunks in that file.
  using PathToFileMap = absl::flat_hash_map<std::string, File>;
  PathToFileMap path_to_file_;

  // Assigns each thread a fixed color index. Uses absl::flat_hash_map like
  // |path_to_file_|, since this header includes that container but not
  // <unordered_map>.
  absl::flat_hash_map<size_t, int> thread_id_to_color_;

  // Number of distinct threads seen so far, i.e. the next free color index.
  int num_threads_ = 0;

  // Measures the time since the current bandwidth was last updated.
  Stopwatch bandwidth_timer_;

  // Most recently measured bandwidth in bytes per second.
  double curr_bandwidth_ = 0;

  // Bytes streamed since the current bandwidth was last updated.
  uint64_t curr_streamed_bytes_ = 0;

  // Largest bandwidth measured so far, in bytes per second.
  double max_bandwidth_ = 0;

  // Total bytes of all streamed and all cached chunks recorded so far.
  uint64_t total_streamed_bytes_ = 0;
  uint64_t total_cached_bytes_ = 0;
};
} // namespace cdc_ft
#endif // MANIFEST_STATS_PRINTER_H_

Binary file not shown.

View File

@@ -0,0 +1 @@
*waves hand* This is not an executable!

View File

@@ -0,0 +1,17 @@
#!/bin/sh
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo Hi!

Binary file not shown.

View File

@@ -0,0 +1 @@
aaaaaaaa

View File

@@ -0,0 +1 @@
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb

View File

@@ -0,0 +1 @@
c

View File

@@ -0,0 +1 @@
d

0
manifest/testdata/root.txt vendored Normal file
View File