Releasing the former Stadia file transfer tools

The tools allow efficient and fast synchronization of large directory
trees from a Windows workstation to a Linux target machine.

cdc_rsync* support efficient copy of files by using content-defined
chunking (CDC) to identify chunks within files that can be reused.

asset_stream_manager + cdc_fuse_fs support efficient streaming of a
local directory to a remote virtual file system based on FUSE. They also
employ CDC to identify and reuse unchanged data chunks.
This commit is contained in:
Christian Schneider
2022-10-07 10:47:04 +02:00
commit 4326e972ac
364 changed files with 49410 additions and 0 deletions

128
data_store/BUILD Normal file
View File

@@ -0,0 +1,128 @@
# Build rules for the data_store package: chunk storage backends (disk and
# in-memory), a gRPC-based remote reader, and the DataProvider facade that
# layers a writer cache over a list of readers.
package(default_visibility = ["//:__subpackages__"])

# Abstract reader/writer interfaces for content-addressed chunk stores.
cc_library(
    name = "data_store",
    srcs = [
        "data_store_reader.cc",
        "data_store_writer.cc",
    ],
    hdrs = [
        "data_store_reader.h",
        "data_store_writer.h",
    ],
    deps = [
        "//common:buffer",
        "//common:status",
        "//common:status_macros",
        "//manifest:content_id",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings:str_format",
    ],
)

# Composite reader that checks a writer cache first, falls back to readers,
# and runs a background cache-cleanup thread.
cc_library(
    name = "data_provider",
    srcs = ["data_provider.cc"],
    hdrs = ["data_provider.h"],
    deps = [
        ":data_store",
        "//common:clock",
        "//common:log",
        "//common:status",
        "//common:stopwatch",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/synchronization",
    ],
)

# Disk-backed chunk store implementation.
cc_library(
    name = "disk_data_store",
    srcs = ["disk_data_store.cc"],
    hdrs = ["disk_data_store.h"],
    deps = [
        ":data_store",
        "//common:clock",
        "//common:log",
        "//common:path",
        "//common:platform",
        "//common:status_macros",
        "//manifest:content_id",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
    ],
)

cc_test(
    name = "disk_data_store_test",
    srcs = ["disk_data_store_test.cc"],
    deps = [
        ":disk_data_store",
        "//common:status_test_macros",
        "//common:testing_clock",
        "//manifest:content_id",
        "@com_google_googletest//:gtest",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_test(
    name = "data_provider_test",
    srcs = ["data_provider_test.cc"],
    deps = [
        ":data_provider",
        ":disk_data_store",
        ":mem_data_store",
        "//common:status_test_macros",
        "//common:testing_clock",
        "//common:util",
        "//manifest:content_id",
        "@com_google_googletest//:gtest",
        "@com_google_googletest//:gtest_main",
    ],
)

# Reader that fetches chunks from a remote asset stream service via gRPC.
cc_library(
    name = "grpc_reader",
    srcs = ["grpc_reader.cc"],
    hdrs = ["grpc_reader.h"],
    deps = [
        ":data_store",
        "//cdc_fuse_fs:asset_stream_client",
        "//common:buffer",
        "//common:status",
        "//common:status_macros",
        "//manifest:content_id",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
    ],
)

# Simple in-memory chunk store, mainly used in tests.
cc_library(
    name = "mem_data_store",
    srcs = ["mem_data_store.cc"],
    hdrs = ["mem_data_store.h"],
    deps = [
        ":data_store",
        "//common:status",
    ],
)

cc_test(
    name = "mem_data_store_test",
    srcs = ["mem_data_store_test.cc"],
    deps = [
        ":mem_data_store",
        "//common:status_test_macros",
        "//manifest:content_id",
        "@com_google_googletest//:gtest",
        "@com_google_googletest//:gtest_main",
    ],
)

# All test sources of this package, e.g. for bundled remote test execution.
filegroup(
    name = "all_test_sources",
    srcs = glob(["*_test.cc"]),
)

364
data_store/data_provider.cc Normal file
View File

@@ -0,0 +1,364 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/data_provider.h"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstring>
#include <memory>
#include <thread>
#include <unordered_set>
#include <utility>
#include <vector>

#include "absl/strings/str_format.h"
#include "common/log.h"
#include "common/status.h"
#include "common/stopwatch.h"
#include "manifest/content_id.h"
namespace cdc_ft {
namespace {
// FUSE limits the maximum read request size to 128 KiB. Larger requests will
// be split up into smaller requests of at most this size. A request of
// exactly this size therefore very likely belongs to a larger sequential
// read; PrefetchSize() uses this constant to detect such requests.
constexpr uint64_t kMaxFuseRequestSize = 1 << 17;
}  // namespace
// Creates a provider that serves chunks from |readers|, optionally caching
// them in |writer|. If a writer is given, a background thread is spawned that
// periodically cleans up the cache (see CleanupThreadMain()).
// |prefetch_size| is the number of extra bytes suggested for maximum-sized
// FUSE read requests (see PrefetchSize()).
DataProvider::DataProvider(
    std::unique_ptr<DataStoreWriter> writer,
    std::vector<std::unique_ptr<DataStoreReader>> readers, size_t prefetch_size,
    uint32_t cleanup_timeout_sec, uint32_t access_idle_timeout_sec)
    : prefetch_size_(prefetch_size),
      writer_(std::move(writer)),
      readers_(std::move(readers)),
      // Start out "dirty" so the first idle period triggers a cleanup pass.
      chunks_updated_(true),
      cleanup_timeout_sec_(cleanup_timeout_sec),
      access_idle_timeout_sec_(access_idle_timeout_sec) {
  if (writer_) {
    assert(!async_cleaner_);
    async_cleaner_ =
        std::make_unique<std::thread>([this]() { CleanupThreadMain(); });
  }
}
// Joins the background cleanup thread before members are destroyed.
DataProvider::~DataProvider() { Shutdown(); }
void DataProvider::Shutdown() {
{
absl::MutexLock lock(&shutdown_mutex_);
shutdown_ = true;
}
if (async_cleaner_) {
if (async_cleaner_->joinable()) async_cleaner_->join();
async_cleaner_.reset();
}
}
// Returns how many bytes should actually be read for a request of
// |read_size| bytes. A request of exactly the FUSE maximum (128 KiB) very
// likely belongs to a larger sequential read, so the configured prefetch
// size is added on top; all other sizes are returned unchanged.
size_t DataProvider::PrefetchSize(size_t read_size) const {
  const bool is_max_fuse_request = read_size == kMaxFuseRequestSize;
  return is_max_fuse_request ? read_size + prefetch_size_ : read_size;
}
// Reads up to |size| bytes of chunk |content_id| at |offset| into |data|.
// Lookup order: writer cache (if any), then each reader in order; chunks
// fetched from a reader are cached in the writer (best-effort). Returns the
// number of bytes copied (0 if |offset| is past the end of the chunk), or
// NotFound if no store knows the chunk.
absl::StatusOr<size_t> DataProvider::Get(const ContentIdProto& content_id,
                                         void* data, size_t offset,
                                         size_t size) {
  // Record the access time for the cleanup thread's idle detection.
  last_access_ts_ = steady_clock_->Now();
  absl::Mutex* content_mutex = GetContentMutex(content_id);
  absl::StatusOr<size_t> read_bytes;
  if (writer_) {
    // Fast path: shared lock only; most requests are expected cache hits.
    {
      absl::ReaderMutexLock read_lock(content_mutex);
      read_bytes = writer_->Get(content_id, data, offset, size);
    }
    if (read_bytes.ok()) {
      return read_bytes;
    }
    LogWriterWarning(read_bytes.status(), content_id);
  }
  // To prevent reading the same chunk from multiple threads, make read/write
  // atomic.
  absl::WriterMutexLock write_lock(content_mutex);
  // Read from the writer_ again, in case the cache has been populated by
  // another thread while we were waiting for the write lock.
  if (writer_ && absl::IsNotFound(read_bytes.status())) {
    read_bytes = writer_->Get(content_id, data, offset, size);
    if (read_bytes.ok()) {
      return read_bytes;
    }
    LogWriterWarning(read_bytes.status(), content_id);
  }
  for (auto& reader : readers_) {
    // Readers return the whole chunk; the requested window is copied below.
    Buffer buffer;
    absl::Status status = reader->Get(content_id, &buffer);
    if (!status.ok()) {
      // Try next reader if this one doesn't contain the chunk.
      if (absl::IsNotFound(status)) continue;
      // TODO: Add reader identification for debugging.
      return WrapStatus(status, "Failed to get '%s'.",
                        ContentId::ToHexString(content_id));
    }
    if (writer_) {
      // Caching is best-effort: a failed Put is logged but does not fail the
      // read, since the data is already in |buffer|.
      status = writer_->Put(content_id, buffer.data(), buffer.size());
      chunks_updated_ = true;
      if (!status.ok()) {
        LOG_ERROR("Failed to write chunk '%s': %s.",
                  ContentId::ToHexString(content_id), status.ToString());
      }
    }
    if (buffer.size() <= offset) return 0;
    size_t return_bytes = std::min(buffer.size() - offset, size);
    memcpy(data, buffer.data() + offset, return_bytes);
    return return_bytes;
  }
  return absl::NotFoundError(absl::StrFormat(
      "Failed to find %s.", ContentId::ToHexString(content_id)));
}
// Fulfills a batch of chunk transfer tasks. Chunks are first served from the
// writer cache, then from the readers; complete chunks fetched from a reader
// are cached back into the writer. Missing chunks (including prefetch-only
// entries) are left not-done without making the call fail.
absl::Status DataProvider::Get(ChunkTransferList* chunks) {
  // Record the access time for the cleanup thread's idle detection.
  last_access_ts_ = steady_clock_->Now();
  // Try to fetch chunks from the cache first.
  RETURN_IF_ERROR(GetFromWriter(chunks, /*lock_required=*/true));
  if (chunks->ReadDone()) return absl::OkStatus();
  // Get list of all missing chunk IDs.
  std::vector<const ContentIdProto*> chunk_ids;
  for (const ChunkTransferTask& chunk : *chunks) {
    if (!chunk.done) chunk_ids.push_back(&chunk.id);
  }
  // Acquire writer locks for all missing chunks.
  WriterMutexLockList locks;
  WriteLockAll(std::move(chunk_ids), &locks);
  // Read from the |writer_| again, in case the cache has been populated by
  // another thread. We hold all chunk locks already.
  RETURN_IF_ERROR(GetFromWriter(chunks, /*lock_required=*/false));
  if (chunks->ReadDone()) return absl::OkStatus();
  // Try to read from all readers.
  for (auto& reader : readers_) {
    absl::Status status = reader->Get(chunks);
    if (!status.ok()) {
      // TODO: Add reader identification for debugging.
      return WrapStatus(status, "Failed to get chunks [%s] from list [%s]",
                        chunks->UndoneToHexString(), chunks->ToHexString());
    }
    // Stop early once all chunks, including prefetches, are available.
    if (chunks->PrefetchDone()) break;
  }
  // Cache complete chunks in the writer.
  if (writer_) {
    for (ChunkTransferTask& chunk : *chunks) {
      // Only chunks whose full data was fetched can be cached.
      if (!chunk.done || chunk.chunk_data.empty()) continue;
      absl::Status status = writer_->Put(chunk.id, chunk.chunk_data.data(),
                                         chunk.chunk_data.size());
      chunks_updated_ = true;
      if (!status.ok()) {
        // Best-effort: the data was already delivered to the caller.
        LOG_WARNING("Failed to put '%s' to writer: %s.",
                    ContentId::ToHexString(chunk.id), status.message());
      }
    }
  }
  return absl::OkStatus();
}
// Reads the complete chunk |content_id| into |data|. Lookup order: writer
// cache (if any), then each reader in order; chunks fetched from a reader
// are cached in the writer (best-effort). Returns NotFound if no store knows
// the chunk.
absl::Status DataProvider::Get(const ContentIdProto& content_id, Buffer* data) {
  // Record the access time for the cleanup thread's idle detection.
  last_access_ts_ = steady_clock_->Now();
  absl::Mutex* content_mutex = GetContentMutex(content_id);
  absl::Status status = absl::OkStatus();
  if (writer_) {
    // Fast path: shared lock only; most requests are expected cache hits.
    {
      absl::ReaderMutexLock read_lock(content_mutex);
      status = writer_->Get(content_id, data);
    }
    if (status.ok()) {
      return absl::OkStatus();
    }
    LogWriterWarning(status, content_id);
  }
  // To prevent reading the same chunk from multiple threads, make read/write
  // atomic.
  absl::WriterMutexLock write_lock(content_mutex);
  // Read from the writer_ again, in case the cache has been populated by
  // another thread while we were waiting for the write lock.
  if (writer_ && absl::IsNotFound(status)) {
    status = writer_->Get(content_id, data);
    if (status.ok()) {
      return absl::OkStatus();
    }
    LogWriterWarning(status, content_id);
  }
  for (auto& reader : readers_) {
    status = reader->Get(content_id, data);
    if (!status.ok()) {
      // Try next reader if this one doesn't contain the chunk.
      if (absl::IsNotFound(status)) continue;
      // TODO: Add reader identification for debugging.
      return WrapStatus(status, "Failed to get '%s'.",
                        ContentId::ToHexString(content_id));
    }
    if (writer_) {
      // Caching is best-effort; the chunk was already read successfully.
      writer_->Put(content_id, data->data(), data->size()).IgnoreError();
      chunks_updated_ = true;
    }
    return absl::OkStatus();
  }
  return absl::NotFoundError(absl::StrFormat(
      "Failed to find '%s'.", ContentId::ToHexString(content_id)));
}
// Logs a warning for an unexpected writer failure for |content_id|.
// NotFound is the normal cache-miss case and is deliberately ignored.
void DataProvider::LogWriterWarning(const absl::Status& status,
                                    const ContentIdProto& content_id) {
  if (absl::IsNotFound(status)) return;
  LOG_WARNING("Failed to get '%s' from writer: %s.",
              ContentId::ToHexString(content_id), status.message());
}
// Maps |content_id| to one of the striped mutexes, keyed by the first byte
// of the ID. Also raises |interrupt_| so that a concurrently running cache
// cleanup can yield to the incoming request.
absl::Mutex* DataProvider::GetContentMutex(const ContentIdProto& content_id) {
  interrupt_ = true;
  const uint8_t bucket = ContentId::GetByte(content_id, 0);
  return &content_mutexes_[bucket];
}
// Write-locks the stripe mutex of every ID in |chunk_ids|, appending the
// lock guards to |locks|. Each mutex is locked at most once even if several
// IDs map to the same stripe.
void DataProvider::WriteLockAll(std::vector<const ContentIdProto*> chunk_ids,
                                WriterMutexLockList* locks) {
  // Locking in a globally consistent (sorted) order prevents lock-order
  // cycles between threads that need overlapping mutex sets, and thus
  // prevents deadlocks.
  std::sort(chunk_ids.begin(), chunk_ids.end(),
            [](const ContentIdProto* lhs, const ContentIdProto* rhs) {
              return *lhs < *rhs;
            });
  std::unordered_set<absl::Mutex*> already_locked;
  for (const ContentIdProto* chunk_id : chunk_ids) {
    absl::Mutex* mutex = GetContentMutex(*chunk_id);
    // Skip stripes that an earlier ID already locked.
    if (!already_locked.insert(mutex).second) continue;
    locks->push_back(std::make_unique<absl::WriterMutexLock>(mutex));
  }
}
// Serves as many tasks in |chunks| as possible from the writer cache.
// Prefetch-only tasks (size == 0) are marked done if the chunk is already
// cached. A short read indicates a corrupted/truncated cache entry; such an
// entry is removed so it can be re-fetched, but only when the caller already
// holds the write locks (|lock_required| == false).
absl::Status DataProvider::GetFromWriter(ChunkTransferList* chunks,
                                         bool lock_required) {
  if (!writer_ || chunks->ReadDone()) return absl::OkStatus();
  // Try to read all remaining chunks from the cache.
  absl::StatusOr<size_t> read_bytes;
  for (ChunkTransferTask& chunk : *chunks) {
    if (chunk.done) continue;
    {
      // Hold a shared per-chunk lock unless the caller locked already.
      std::unique_ptr<absl::ReaderMutexLock> lock;
      if (lock_required) {
        lock =
            std::make_unique<absl::ReaderMutexLock>(GetContentMutex(chunk.id));
      }
      if (!chunk.size) {
        // Check if the prefetch chunk is already present, no further
        // processing needed.
        chunk.done = writer_->Contains(chunk.id);
        continue;
      }
      // Read the requested data.
      read_bytes = writer_->Get(chunk.id, chunk.data, chunk.offset, chunk.size);
    }
    if (!read_bytes.ok()) {
      LogWriterWarning(read_bytes.status(), chunk.id);
    } else if (*read_bytes == chunk.size) {
      chunk.done = true;
      // Early out once every requested (non-prefetch) chunk was read.
      if (chunks->ReadDone()) return absl::OkStatus();
    } else {
      LogWriterWarning(
          MakeStatus("Expected %u bytes, got %u", chunk.size, *read_bytes),
          chunk.id);
      // Remove the corrupted chunk from the cache, but only if the chunk was
      // write-locked by the caller.
      if (!lock_required) {
        absl::Status status = writer_->Remove(chunk.id);
        if (!status.ok()) {
          LOG_WARNING("Failed to remove chunk '%s' from the cache: %s",
                      ContentId::ToHexString(chunk.id), status.ToString());
        }
      }
    }
  }
  return absl::OkStatus();
}
// Write-locks every stripe mutex, appending the guards to |locks|. Used by
// the cleanup thread to block all read/write requests during cache cleanup.
void DataProvider::LockAllMutexes(WriterMutexLockList* locks) {
  for (size_t idx = 0; idx < kNumberOfMutexes; ++idx) {
    locks->push_back(
        std::make_unique<absl::WriterMutexLock>(&content_mutexes_[idx]));
  }
}
// Main loop of the background cleanup thread. Sleeps (interruptibly via
// |shutdown_mutex_|) until shutdown or until the next cleanup is due, then
// runs writer_->Cleanup() — but only if new chunks were cached since the
// last cleanup and no read/write access happened for
// |access_idle_timeout_sec_|. All content mutexes are held during cleanup,
// so no request can race with it.
void DataProvider::CleanupThreadMain() {
  assert(writer_);
  // Let the writer observe |interrupt_| so that a running Cleanup() can be
  // cancelled when a new request arrives (see GetContentMutex()).
  writer_->RegisterInterrupt(&interrupt_);
  absl::MutexLock lock(&shutdown_mutex_);
  SteadyClock::Timestamp next_cleanup_time =
      steady_clock_->Now() + std::chrono::seconds(cleanup_timeout_sec_);
  while (!shutdown_) {
    auto cond = [this]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(shutdown_mutex_) {
      return shutdown_;
    };
    // Wait until shutdown, but at most until the next cleanup is due (and at
    // least one idle-timeout period).
    shutdown_mutex_.AwaitWithTimeout(
        absl::Condition(&cond),
        std::max(absl::Seconds(access_idle_timeout_sec_),
                 absl::Seconds(std::chrono::duration_cast<std::chrono::seconds>(
                                   next_cleanup_time - steady_clock_->Now())
                                   .count())));
    int64_t time_sec_since_last_access =
        std::chrono::duration_cast<std::chrono::seconds>(steady_clock_->Now() -
                                                         last_access_ts_.load())
            .count();
    if (chunks_updated_ &&
        time_sec_since_last_access > access_idle_timeout_sec_) {
      // Block all readers/writers while the cache is being cleaned up.
      WriterMutexLockList locks;
      LockAllMutexes(&locks);
      chunks_updated_ = false;
      LOG_DEBUG("Starting cache cleanup");
      Stopwatch sw;
      absl::Status status = writer_->Cleanup();
      LOG_DEBUG("Finished cache cleanup in %0.3f seconds", sw.ElapsedSeconds());
      next_cleanup_time =
          steady_clock_->Now() + std::chrono::seconds(cleanup_timeout_sec_);
      absl::MutexLock cleaned_lock(&cleaned_mutex_);
      if (!status.ok()) {
        LOG_WARNING("Failed to cleanup the cache: %s", status.message());
        // Mark the cache dirty again so the next round retries the cleanup.
        chunks_updated_ = true;
        is_cleaned_ = false;
      } else {
        is_cleaned_ = true;
      }
    }
    // Clear any interrupt raised by requests that arrived during this round.
    interrupt_ = false;
  }
}
// Blocks until the cleanup thread reports a finished cleanup or |timeout|
// expires. Returns whether a cleanup happened and resets the flag so the
// next call waits for a fresh cleanup.
bool DataProvider::WaitForCleanupAndResetForTesting(absl::Duration timeout) {
  absl::MutexLock lock(&cleaned_mutex_);
  auto cleaned = [this]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(cleaned_mutex_) {
    return is_cleaned_;
  };
  cleaned_mutex_.AwaitWithTimeout(absl::Condition(&cleaned), timeout);
  const bool result = is_cleaned_;
  is_cleaned_ = false;
  return result;
}
} // namespace cdc_ft

157
data_store/data_provider.h Normal file
View File

@@ -0,0 +1,157 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_DATA_PROVIDER_H_
#define DATA_STORE_DATA_PROVIDER_H_
#include <atomic>
#include <chrono>
#include <cstdint>
#include <memory>
#include <thread>
#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/status/statusor.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/time.h"
#include "common/clock.h"
#include "data_store/data_store_reader.h"
#include "data_store/data_store_writer.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// DataProvider is a composite of several data-store readers used for the file
// transfer. Thread-safe.
// DataProvider is a composite of several data-store readers used for the file
// transfer. Thread-safe.
class DataProvider : public DataStoreReader {
 public:
  // Default cleanup interval in seconds.
  static constexpr unsigned int kCleanupTimeoutSec = 300;
  // Default access-idling time in seconds.
  static constexpr int64_t kAccessIdleSec = 5;

  // |writer| is an optional cache for chunks fetched from |readers|; if set,
  // a background thread is started that periodically cleans it up.
  // |prefetch_size| is the number of additional bytes suggested for
  // maximum-sized FUSE read requests (see PrefetchSize()).
  DataProvider(std::unique_ptr<DataStoreWriter> writer,
               std::vector<std::unique_ptr<DataStoreReader>> readers,
               size_t prefetch_size,
               uint32_t cleanup_timeout_sec = kCleanupTimeoutSec,
               uint32_t access_idle_timeout_sec = kAccessIdleSec);
  DataProvider() = delete;
  DataProvider(const DataProvider&) = delete;
  DataProvider& operator=(const DataProvider&) = delete;
  virtual ~DataProvider() ABSL_LOCKS_EXCLUDED(shutdown_mutex_);

  // Shuts down the background cleanup thread.
  void Shutdown();

  // DataStoreReader:
  size_t PrefetchSize(size_t read_size) const override;
  absl::StatusOr<size_t> Get(const ContentIdProto& content_id, void* data,
                             size_t offset, size_t size)
      ABSL_LOCKS_EXCLUDED(*content_mutexes_) override;
  absl::Status Get(ChunkTransferList* chunks)
      ABSL_LOCKS_EXCLUDED(*content_mutexes_) override;
  absl::Status Get(const ContentIdProto& content_id, Buffer* data)
      ABSL_LOCKS_EXCLUDED(*content_mutexes_) override;

 private:
  friend class DataProviderTest;

  // Returns whether the writer was cleaned up and resets |is_cleaned_|.
  bool WaitForCleanupAndResetForTesting(absl::Duration timeout)
      ABSL_LOCKS_EXCLUDED(cleaned_mutex_);

  // Vector of WriterMutexLock pointers to lock multiple mutexes together.
  using WriterMutexLockList =
      std::vector<std::unique_ptr<absl::WriterMutexLock>>;

  // Logs a warning if unexpectedly could not get data from the writer.
  void LogWriterWarning(const absl::Status& status,
                        const ContentIdProto& content_id);

  // Returns the mutex for |content_id| from |content_mutexes_|.
  absl::Mutex* GetContentMutex(const ContentIdProto& content_id);

  // Acquires write locks on the corresponding mutexes for all content IDs in
  // |chunk_ids|. The locks are placed in the |locks| list. Detects if two
  // chunk IDs are guarded by the same mutex and locks it only once.
  //
  // The list of mutexes is sorted in a deterministic way before they are
  // locked. This prevents cycles when calling this function from multiple
  // threads and thus avoids deadlocks.
  void WriteLockAll(std::vector<const ContentIdProto*> chunk_ids,
                    WriterMutexLockList* locks);

  // Tries to fulfill as many of the chunk transfer tasks in |chunks| as
  // possible. Tasks that are completed are marked as `done`. If
  // |lock_required| is true, a read lock is acquired for each chunk as it is
  // read. Otherwise the caller is responsible for acquiring all required
  // locks beforehand.
  absl::Status GetFromWriter(ChunkTransferList* chunks, bool lock_required);

  // Collects locks for all mutexes.
  void LockAllMutexes(WriterMutexLockList* locks)
      ABSL_LOCKS_EXCLUDED(*content_mutexes_);

  // Periodically cleans up data in |writer_|.
  void CleanupThreadMain() ABSL_LOCKS_EXCLUDED(shutdown_mutex_, cleaned_mutex_);

  static constexpr unsigned int kNumberOfMutexes = 256;

  // How much additional data to prefetch when a max. FUSE read is encountered.
  size_t prefetch_size_;
  std::unique_ptr<DataStoreWriter> writer_;
  std::vector<std::unique_ptr<DataStoreReader>> readers_;
  // Array of mutexes to protect read/write operations.
  absl::Mutex content_mutexes_[kNumberOfMutexes];
  // Runs periodical cleanup of the data writer.
  std::unique_ptr<std::thread> async_cleaner_;
  absl::Mutex shutdown_mutex_;
  // Indicates whether the shutdown was triggered.
  bool shutdown_ ABSL_GUARDED_BY(shutdown_mutex_) = false;
  // The last access time. Explicitly initialized: a default-constructed
  // std::atomic holds an indeterminate value prior to C++20, and the cleanup
  // thread loads this before the first Get() may have stored to it.
  std::atomic<std::chrono::time_point<std::chrono::steady_clock>>
      last_access_ts_{std::chrono::time_point<std::chrono::steady_clock>{}};
  // Identifies if new data was added to the cache since the last cleanup.
  // The constructor overrides this default with true.
  std::atomic<bool> chunks_updated_{false};
  // Clock to track the last access time.
  SteadyClock* steady_clock_ = DefaultSteadyClock::GetInstance();
  // Cleanup interval.
  uint32_t cleanup_timeout_sec_ = kCleanupTimeoutSec;
  // The number of seconds that needs to pass since the last write or read
  // operation to mark the data provider as access-idling.
  uint32_t access_idle_timeout_sec_ = kAccessIdleSec;
  absl::Mutex cleaned_mutex_;
  // Whether the writer was cleaned up since the last time
  // WaitForCleanupAndResetForTesting() was executed or since beginning.
  bool is_cleaned_ ABSL_GUARDED_BY(cleaned_mutex_) = false;
  // Shows whether any read/write request arrived during Cleanup().
  // The writer only reads it and cancels Cleanup() if it is true. It is set
  // in GetContentMutex() and reset at the end of each loop iteration in
  // CleanupThreadMain(). Explicitly initialized to false: a
  // default-constructed std::atomic<bool> is indeterminate prior to C++20,
  // and it is read before any request may have set it.
  std::atomic<bool> interrupt_{false};
};  // class DataProvider
}; // namespace cdc_ft
#endif // DATA_STORE_DATA_PROVIDER_H_

View File

@@ -0,0 +1,370 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/data_provider.h"
#include <chrono>
#include <numeric>
#include <thread>
#include "common/path.h"
#include "common/status_test_macros.h"
#include "common/testing_clock.h"
#include "common/util.h"
#include "data_store/disk_data_store.h"
#include "data_store/mem_data_store.h"
#include "gtest/gtest.h"
#include "manifest/content_id.h"
namespace cdc_ft {
namespace {
constexpr uint8_t kFirstData[] = {10, 20, 30, 40, 50, 60, 70, 80, 90};
constexpr size_t kFirstDataSize = sizeof(kFirstData);
constexpr char kTestCacheDirName[] = ".cdc_ft_cache";
} // namespace
// Test fixture providing helpers to build disk/memory chunk stores and
// shared assertions for reading an existing chunk through a DataProvider.
class DataProviderTest : public ::testing::Test {
 public:
  void SetUp() override {
    // Use a fresh cache directory under the system temp dir for each test.
    cache_dir_path_ = path::Join(path::GetTempDir(), kTestCacheDirName);
    EXPECT_OK(path::RemoveDirRec(cache_dir_path_));
  }
  void TearDown() override { EXPECT_OK(path::RemoveDirRec(cache_dir_path_)); }

  DataProviderTest() {
    first_content_id_ = ContentId::FromArray(kFirstData, kFirstDataSize);
  }

  // Returns the content ID of the given chunk |data|.
  ContentIdProto Id(const std::string& data) {
    return ContentId::FromDataString(data);
  }

  // Creates a disk-backed store pre-populated with the given |chunks|.
  std::unique_ptr<DiskDataStore> CreateDiskCache(
      const std::vector<std::string>& chunks) {
    absl::StatusOr<std::unique_ptr<DiskDataStore>> cache =
        DiskDataStore::Create(0, cache_dir_path_, false, &clock_);
    EXPECT_OK(cache);
    for (const std::string& s : chunks) {
      EXPECT_OK((*cache)->Put(Id(s), s.data(), s.size()));
    }
    return std::move(*cache);
  }

  // Creates a reader list holding one in-memory store with |chunks|.
  std::vector<std::unique_ptr<DataStoreReader>> CreateMemCache(
      const std::vector<std::string>& chunks) {
    auto cache = std::make_unique<MemDataStore>();
    for (const std::string& chunk : chunks) {
      cache->AddData({chunk.begin(), chunk.end()});
    }
    std::vector<std::unique_ptr<DataStoreReader>> readers;
    readers.emplace_back(std::move(cache));
    return readers;
  }

  // Creates a disk-backed store containing only |kFirstData|.
  std::unique_ptr<DiskDataStore> CreateCacheWithFirstData() {
    absl::StatusOr<std::unique_ptr<DiskDataStore>> cache =
        DiskDataStore::Create(0, cache_dir_path_, false, &clock_);
    EXPECT_OK(cache);
    EXPECT_OK((*cache)->Put(first_content_id_, &kFirstData[0], kFirstDataSize));
    return std::move(*cache);
  }

  // Creates an in-memory store containing only |kFirstData|.
  std::unique_ptr<MemDataStore> CreateMemCacheWithFirstData() {
    auto cache = std::make_unique<MemDataStore>();
    cache->AddData({kFirstData, kFirstData + kFirstDataSize});
    return cache;
  }

  // Returns the on-disk path of the cache file for |content_id|, so tests
  // can tamper with the stored data directly.
  std::string GetDiskCacheFilePath(DiskDataStore* dds,
                                   const ContentIdProto& content_id) const {
    return dds->GetCacheFilePath(content_id);
  }

  // Forwards to the provider's private test hook (fixture is a friend).
  bool WaitForProviderCleanupAndResetForTesting(DataProvider* dp,
                                                absl::Duration timeout) {
    return dp->WaitForCleanupAndResetForTesting(timeout);
  }

  // Reads |kFirstData| at an in-bounds offset and verifies the tail bytes.
  void TestGetExistingChunkInBounds(DataProvider& data_provider) {
    uint8_t ret_data[kFirstDataSize];
    size_t offset = 3;
    absl::StatusOr<uint64_t> bytes_read =
        data_provider.Get(first_content_id_, &ret_data, offset, kFirstDataSize);
    ASSERT_OK(bytes_read);
    ASSERT_EQ(kFirstDataSize - offset, *bytes_read);
    EXPECT_TRUE(std::equal(std::begin(kFirstData) + offset,
                           std::end(kFirstData), std::begin(ret_data)));
  }

  // Reads past the end of the chunk; expects a successful zero-byte read.
  void TestGetExistingChunkOutOfBounds(DataProvider& data_provider) {
    uint8_t ret_data[kFirstDataSize];
    size_t offset = 15;
    absl::StatusOr<uint64_t> bytes_read =
        data_provider.Get(first_content_id_, &ret_data, offset, kFirstDataSize);
    ASSERT_OK(bytes_read);
    EXPECT_EQ(0u, *bytes_read);
  }

  // Reads the whole chunk via the Buffer overload and compares contents.
  void TestGetExistingChunkComplete(DataProvider& data_provider) {
    Buffer buffer;
    EXPECT_OK(data_provider.Get(first_content_id_, &buffer));
    Buffer exp_buffer({10, 20, 30, 40, 50, 60, 70, 80, 90});
    EXPECT_EQ(exp_buffer, buffer);
  }

  // Runs all three read scenarios against |data_provider|.
  void TestGetExistingChunk(DataProvider& data_provider) {
    TestGetExistingChunkInBounds(data_provider);
    TestGetExistingChunkOutOfBounds(data_provider);
    TestGetExistingChunkComplete(data_provider);
  }

 protected:
  // Content ID of |kFirstData|.
  ContentIdProto first_content_id_;
  // Controllable clock passed to the disk stores.
  TestingSystemClock clock_;
  // Temp directory backing the disk caches; recreated for each test.
  std::string cache_dir_path_;
};
namespace {
// TODO: Add test with several readers and a writer, which has no data at the
// beginning. Request the chunk several times (the first time it should be
// received from reader, the second time - from the writer).
// A provider without writer or readers cannot serve any chunk: both Get()
// overloads must report NotFound.
TEST_F(DataProviderTest, DataProvider) {
  DataProvider empty_provider(nullptr, {}, 0);

  uint8_t out[kFirstDataSize];
  const absl::StatusOr<uint64_t> read_result =
      empty_provider.Get(first_content_id_, &out, 0, kFirstDataSize);
  EXPECT_TRUE(absl::IsNotFound(read_result.status()));

  Buffer chunk;
  const absl::Status buffer_status =
      empty_provider.Get(first_content_id_, &chunk);
  EXPECT_TRUE(absl::IsNotFound(buffer_status));
}
// A disk cache used as the only reader serves an existing chunk.
TEST_F(DataProviderTest, CacheAsReader) {
  std::vector<std::unique_ptr<DataStoreReader>> readers;
  readers.emplace_back(CreateCacheWithFirstData());
  DataProvider data_provider(nullptr, std::move(readers), 0);
  TestGetExistingChunk(data_provider);
}

// A disk cache used as the writer serves an existing chunk directly.
TEST_F(DataProviderTest, CacheAsWriter) {
  DataProvider data_provider(CreateCacheWithFirstData(), {}, 0);
  TestGetExistingChunk(data_provider);
}

// An in-memory store used as the only reader serves an existing chunk.
TEST_F(DataProviderTest, MemCacheAsReader) {
  std::vector<std::unique_ptr<DataStoreReader>> readers;
  readers.emplace_back(CreateMemCacheWithFirstData());
  DataProvider data_provider(nullptr, std::move(readers), 0);
  TestGetExistingChunk(data_provider);
}

// An in-memory store used as the writer serves an existing chunk directly.
TEST_F(DataProviderTest, MemCacheAsWriter) {
  DataProvider data_provider(CreateMemCacheWithFirstData(), {}, 0);
  TestGetExistingChunk(data_provider);
}
// An empty disk cache as writer backed by an in-memory reader: reads must be
// served through the reader and still behave like direct access.
TEST_F(DataProviderTest, CacheAsWriterMemCacheAsReader) {
  std::vector<std::unique_ptr<DataStoreReader>> readers;
  readers.emplace_back(CreateMemCacheWithFirstData());

  absl::StatusOr<std::unique_ptr<DiskDataStore>> disk_cache =
      DiskDataStore::Create(0, cache_dir_path_, false, &clock_);
  ASSERT_OK(disk_cache);

  DataProvider provider(std::move(*disk_cache), std::move(readers), 0);
  TestGetExistingChunk(provider);
}
// All requested chunks exist in the writer cache: everything is served from
// there and the output buffer is filled back-to-back.
TEST_F(DataProviderTest, GetMultiChunksFromWriterSuccess) {
  DataProvider data_provider(CreateDiskCache({"aaa", "bbb", "ccc"}), {}, 0);
  char buf[10];
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  chunks.emplace_back(Id("bbb"), 0, buf + 3, 3);
  chunks.emplace_back(Id("ccc"), 0, buf + 6, 3);
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_TRUE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_TRUE(chunks[1].done);
  EXPECT_TRUE(chunks[2].done);
  EXPECT_EQ(absl::string_view(buf, 9), "aaabbbccc");
}

// One chunk is unknown: Get() still succeeds, but only the known chunks are
// marked done and copied.
TEST_F(DataProviderTest, GetMultiChunksFromWriterPartialFail) {
  DataProvider data_provider(CreateDiskCache({"aaa", "bbb", "ccc"}), {}, 0);
  char buf[10];
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  chunks.emplace_back(Id("does not exist"), 0, buf + 3, sizeof(buf));
  chunks.emplace_back(Id("ccc"), 0, buf + 6, 3);
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_FALSE(chunks.ReadDone());
  EXPECT_FALSE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_FALSE(chunks[1].done);
  EXPECT_TRUE(chunks[2].done);
  EXPECT_EQ(absl::string_view(buf, 3), "aaa");
  EXPECT_EQ(absl::string_view(buf + 6, 3), "ccc");
}

// No requested chunk is known: Get() succeeds with nothing marked done.
TEST_F(DataProviderTest, GetMultiChunksFromWriterAllFail) {
  DataProvider data_provider(CreateDiskCache({"aaa", "bbb", "ccc"}), {}, 0);
  char buf[10];
  ChunkTransferList chunks;
  chunks.emplace_back(Id("does not exist"), 0, buf, sizeof(buf));
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_FALSE(chunks.ReadDone());
  EXPECT_FALSE(chunks.PrefetchDone());
  EXPECT_FALSE(chunks[0].done);
}
// Chunks served from the reader must be cached in the writer so that
// subsequent requests hit the cache.
TEST_F(DataProviderTest, GetMultiChunksFromReaderCachedInWriter) {
  auto readers = CreateMemCache({"aaa", "bbb", "ccc"});
  auto disk_cache = CreateDiskCache({});
  DiskDataStore* disk_cache_ptr = disk_cache.get();
  DataProvider data_provider(std::move(disk_cache), std::move(readers), 0);
  char buf[10];
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_TRUE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_EQ(absl::string_view(buf, 3), "aaa");
  // Verify data has been cached in the writer.
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("aaa")));
  EXPECT_EQ(disk_cache_ptr->List()->size(), 1);
  chunks.clear();
  chunks.emplace_back(Id("bbb"), 0, buf + 3, 3);
  chunks.emplace_back(Id("ccc"), 0, buf + 6, 3);
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_TRUE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_TRUE(chunks[1].done);
  EXPECT_EQ(absl::string_view(buf, 9), "aaabbbccc");
  // Verify data has been cached in the writer.
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("aaa")));
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("bbb")));
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("ccc")));
  EXPECT_EQ(disk_cache_ptr->List()->size(), 3);
}

// If all requested (non-prefetch) chunks are already cached, prefetch-only
// entries are not fetched from the reader.
TEST_F(DataProviderTest, GetMultiChunksFromReaderAndWriterSkipPrefetch) {
  auto readers = CreateMemCache({"bbb", "ccc"});
  auto disk_cache = CreateDiskCache({"aaa"});
  DiskDataStore* disk_cache_ptr = disk_cache.get();
  DataProvider data_provider(std::move(disk_cache), std::move(readers), 0);
  char buf[10];
  // This request can be fulfilled with cached data, so "bbb" and "ccc" are not
  // fetched from the reader.
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  chunks.emplace_back(Id("bbb"), 0, nullptr, 0);  // prefetch
  chunks.emplace_back(Id("ccc"), 0, nullptr, 0);  // prefetch
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_FALSE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_FALSE(chunks[1].done);
  EXPECT_FALSE(chunks[2].done);
  EXPECT_EQ(absl::string_view(buf, 3), "aaa");
  // No additional chunks should have been cached in the writer.
  EXPECT_EQ(disk_cache_ptr->List()->size(), 1);
}

// If at least one chunk must be fetched from the reader, prefetch-only
// entries are fetched and cached as well.
TEST_F(DataProviderTest, GetMultiChunksFromReaderAndWriterWithPrefetch) {
  auto readers = CreateMemCache({"bbb", "ccc"});
  auto disk_cache = CreateDiskCache({"aaa"});
  DiskDataStore* disk_cache_ptr = disk_cache.get();
  DataProvider data_provider(std::move(disk_cache), std::move(readers), 0);
  char buf[10];
  // This request includes one chunk that has to be fetched, so the third
  // chunk should be prefetched as well.
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  chunks.emplace_back(Id("bbb"), 0, buf + 3, 3);
  chunks.emplace_back(Id("ccc"), 0, nullptr, 0);  // prefetch
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_TRUE(chunks.PrefetchDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_TRUE(chunks[1].done);
  EXPECT_TRUE(chunks[2].done);
  EXPECT_EQ(absl::string_view(buf, 6), "aaabbb");
  // Verify data has been cached in the writer.
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("aaa")));
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("bbb")));
  EXPECT_TRUE(disk_cache_ptr->Contains(Id("ccc")));
  EXPECT_EQ(disk_cache_ptr->List()->size(), 3);
}
// A truncated (corrupted) chunk file in the disk cache must not poison reads:
// the provider should deliver the full data and repair the cached file.
TEST_F(DataProviderTest, RecoverFromTruncatedChunkInCache) {
  auto readers = CreateMemCache({"aaa"});
  auto disk_cache = CreateDiskCache({"aaa"});
  DiskDataStore* disk_cache_ptr = disk_cache.get();
  DataProvider data_provider(std::move(disk_cache), std::move(readers), 0);
  char buf[3];
  // Truncate the chunk stored in the disk cache.
  std::string path = GetDiskCacheFilePath(disk_cache_ptr, Id("aaa"));
  size_t size;
  EXPECT_OK(path::WriteFile(path, "a", 1));
  EXPECT_OK(path::FileSize(path, &size));
  EXPECT_EQ(size, 1);
  // Reading must still return the full 3 bytes (served from the readers).
  ChunkTransferList chunks;
  chunks.emplace_back(Id("aaa"), 0, buf, 3);
  EXPECT_OK(data_provider.Get(&chunks));
  EXPECT_TRUE(chunks.ReadDone());
  EXPECT_TRUE(chunks[0].done);
  EXPECT_EQ(absl::string_view(buf, 3), "aaa");
  // Verify that the chunk has been recovered.
  EXPECT_OK(path::FileSize(path, &size));
  EXPECT_EQ(size, 3);
}
// With a capacity (5 bytes) below the total chunk size (9 bytes), the
// background cleanup must evict all but the most recently used chunk.
TEST_F(DataProviderTest, CleanupNotAllChunksRead) {
  auto cache = CreateDiskCache({"aaa", "bbb", "ccc"});
  cache->SetCapacity(5);
  // Check that chunks are available in the cache first.
  char buf[10];
  EXPECT_EQ(cache->Get(Id("aaa"), buf, 0, 3).value(), 3u);
  EXPECT_EQ(cache->Get(Id("bbb"), buf + 3, 0, 3).value(), 3u);
  EXPECT_EQ(cache->Get(Id("ccc"), buf + 6, 0, 3).value(), 3u);
  EXPECT_EQ(absl::string_view(buf, 9), "aaabbbccc");
  // Zero timeouts make the provider run its cleanup immediately.
  DataProvider data_provider(std::move(cache), {}, 0, 0 /*cleanup timeout*/,
                             0 /*idling timeout*/);
  memset(buf, 0, 10);
  // Reading "ccc" refreshes it, so it should survive the LRU cleanup.
  EXPECT_EQ(data_provider.Get(Id("ccc"), buf, 0, 3).value(), 3u);
  // The data provider should contain only 1 chunk as the cleanup was already
  // executed.
  EXPECT_TRUE(WaitForProviderCleanupAndResetForTesting(
      &data_provider, absl::Seconds(5) /*timeout*/));
  memset(buf, 0, 10);
  EXPECT_NOT_OK(data_provider.Get(Id("aaa"), buf, 0, 3));
  EXPECT_NOT_OK(data_provider.Get(Id("bbb"), buf, 0, 3));
  EXPECT_EQ(data_provider.Get(Id("ccc"), buf, 0, 3).value(), 3u);
}
} // namespace
} // namespace cdc_ft

View File

@@ -0,0 +1,98 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/data_store_reader.h"
#include "absl/strings/str_format.h"
#include "common/status_macros.h"
#include "manifest/content_id.h"
namespace cdc_ft {
bool ChunkTransferList::ReadDone() const {
for (auto it = begin(); it != end(); ++it) {
if (it->size && !it->done) return false;
}
return true;
}
bool ChunkTransferList::PrefetchDone() const {
for (auto it = begin(); it != end(); ++it) {
if (!it->done) return false;
}
return true;
}
std::string ChunkTransferList::ToHexString(
std::function<bool(const ChunkTransferTask&)> filter) const {
std::string ids;
for (auto it = begin(); it != end(); ++it) {
if (filter && !filter(*it)) continue;
if (!ids.empty()) ids += ", ";
ids += ContentId::ToHexString(it->id);
}
return ids;
}
std::string ChunkTransferList::UndoneToHexString() const {
return ToHexString(
[](const ChunkTransferTask& chunk) { return !chunk.done; });
}
// Default prefetch policy: do not prefetch beyond the requested size.
// Subclasses override this to implement a real prefetching strategy.
size_t DataStoreReader::PrefetchSize(size_t read_size) const {
  return read_size;
}
// Default batch implementation: fetches each undone read task individually
// via the single-chunk Get() overload. Prefetch-only tasks (size == 0) are
// skipped. NotFound errors are swallowed; callers check ReadDone() instead.
absl::Status DataStoreReader::Get(ChunkTransferList* chunks) {
  for (ChunkTransferTask& chunk : *chunks) {
    // This default implementation skips prefetching tasks (chunk.size == 0).
    if (chunk.done || chunk.size == 0) continue;
    absl::StatusOr<uint64_t> bytes_read =
        Get(chunk.id, chunk.data, chunk.offset, chunk.size);
    if (!bytes_read.ok()) {
      // Missing chunks are expected here; anything else is a real error.
      if (absl::IsNotFound(bytes_read.status())) continue;
      return bytes_read.status();
    }
    // A short read means the stored chunk does not match its content ID.
    if (*bytes_read != chunk.size) {
      return MakeStatus(
          "Corrupted chunk %s detected, expected to read %u bytes, got %u",
          ContentId::ToHexString(chunk.id), chunk.size, *bytes_read);
    }
    chunk.done = true;
  }
  return absl::OkStatus();
}
// Reads and parses a proto chunk using a temporary scratch buffer; see the
// three-argument overload for details.
absl::Status DataStoreReader::GetProto(const ContentIdProto& content_id,
                                       google::protobuf::Message* proto) {
  Buffer scratch;
  return GetProto(content_id, &scratch, proto);
}
// Loads the full chunk |content_id| into |buf| and deserializes it into
// |proto|. Returns InternalError if the chunk is not a valid serialization.
absl::Status DataStoreReader::GetProto(const ContentIdProto& content_id,
                                       Buffer* buf,
                                       google::protobuf::Message* proto) {
  // Fetch the referenced chunk.
  RETURN_IF_ERROR(Get(content_id, buf));
  // Parse the manifest proto from the chunk.
  const bool parsed =
      proto->ParseFromArray(buf->data(), static_cast<int>(buf->size()));
  if (parsed) return absl::OkStatus();
  return absl::InternalError(absl::StrFormat(
      "Failed to parse %s from chunk '%s'", proto->GetTypeName(),
      ContentId::ToHexString(content_id)));
}
} // namespace cdc_ft

View File

@@ -0,0 +1,129 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_DATA_STORE_READER_H_
#define DATA_STORE_DATA_STORE_READER_H_
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

#include "absl/status/statusor.h"
#include "common/buffer.h"
#include "common/status.h"
#include "common/status_macros.h"
#include "manifest/content_id.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// Describes which part of a chunk needs to be copied into a given buffer.
// A task with |size| == 0 (and typically |data| == nullptr) is a
// prefetch-only task: the chunk should be fetched/cached but not copied.
struct ChunkTransferTask {
  ChunkTransferTask() {}
  ChunkTransferTask(ContentIdProto id, uint64_t offset, void* data,
                    uint64_t size)
      : id(std::move(id)), offset(offset), data(data), size(size) {}

  // Identifies the chunk.
  ContentIdProto id;
  // Relative offset into the chunk from where data should be copied.
  uint64_t offset = 0;
  // Data buffer into which the chunk is written. May be null for prefetching.
  void* data = nullptr;
  // Size of the |data| buffer. May be zero for prefetching.
  uint64_t size = 0;
  // If the storage layer fetches the complete chunk data, it can be moved into
  // this string so that the data provider layer can cache the chunk.
  std::string chunk_data;
  // Indicates if the chunk was successfully copied into |data| or prefetched.
  bool done = false;
};
// A std::vector of ChunkTransferTask elements with convenience queries over
// the completion state of the tasks.
class ChunkTransferList : public std::vector<ChunkTransferTask> {
 public:
  // Returns true if all tasks with a non-zero size have |done| set to true.
  bool ReadDone() const;

  // Returns true if all tasks have |done| set to true, including those only
  // meant for prefetching (|size| == 0).
  bool PrefetchDone() const;

  // Returns a comma separated string of hex IDs of all chunks in this list. If
  // the optional function |filter| is given, only those chunks are included for
  // which |filter| returns true.
  std::string ToHexString(
      std::function<bool(const ChunkTransferTask&)> filter = nullptr) const;

  // Same as ToHexString, but only includes tasks having |done| set to false.
  std::string UndoneToHexString() const;
};
// DataStoreReader is an abstract interface to read from all data stores used
// for the file transfer, for example: a local cache, a data store, which
// receives data via a gRPC channel, etc.
class DataStoreReader {
 public:
  DataStoreReader() = default;
  virtual ~DataStoreReader() = default;

  // Non-copyable.
  DataStoreReader(const DataStoreReader&) = delete;
  DataStoreReader& operator=(const DataStoreReader&) = delete;

  // Suggests a data prefetch size based on the given |read_size|. The default
  // implementation just returns |read_size|. Override this function to
  // implement a prefetching strategy.
  virtual size_t PrefetchSize(size_t read_size) const;

  // Reads |size| bytes from the chunk specified by |content_id|, starting
  // at the given |offset|, and writes the result into |data|.
  // The return value is the number of read bytes.
  // If the chunk is not found in the data store, returns NotFoundError.
  virtual absl::StatusOr<size_t> Get(const ContentIdProto& content_id,
                                     void* data, size_t offset,
                                     size_t size) = 0;

  // Reads all chunks from the given task list |chunks| that are not done yet,
  // copies the data into the associated buffer, and marks the chunk as done. If
  // the reader fetches the full chunk, the raw data may be moved to the task as
  // well for caching.
  //
  // Returns success even if no chunk was found. Check |chunks->ReadDone()| or
  // |chunks->PrefetchDone()| to verify all chunks were fetched. Returns any
  // error other than absl::NotFoundError from the underlying implementation.
  //
  // The default implementation calls the single item `Get()` method for each
  // task in |chunks|. Override this method in a sub-class for optimized batch
  // processing.
  virtual absl::Status Get(ChunkTransferList* chunks);

  // Reads the complete data chunk specified by |content_id| and writes the
  // result into |data|.
  // If the chunk is not found in the data store, returns NotFoundError.
  virtual absl::Status Get(const ContentIdProto& content_id, Buffer* data) = 0;

  // Reads the complete chunk identified by |content_id| and parses it as the
  // given protocol buffer.
  absl::Status GetProto(const ContentIdProto& content_id,
                        google::protobuf::Message* proto);

  // Reads the complete chunk identified by |content_id| and parses it as the
  // given protocol buffer. Uses the given Buffer |buf| as intermediate
  // storage.
  absl::Status GetProto(const ContentIdProto& content_id, Buffer* buf,
                        google::protobuf::Message* proto);
};  // class DataStoreReader
} // namespace cdc_ft
#endif // DATA_STORE_DATA_STORE_READER_H_

View File

@@ -0,0 +1,44 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/data_store_writer.h"
#include "absl/strings/str_format.h"
#include "common/status.h"
#include "manifest/content_id.h"
namespace cdc_ft {
bool DataStoreWriter::Contains(const ContentIdProto& content_id) {
Buffer buffer;
return Get(content_id, &buffer).ok();
}
// Serializes |proto|, derives its content ID from the serialized bytes, and
// stores the result as a chunk. Optionally reports the serialized size.
absl::Status DataStoreWriter::PutProto(
    const google::protobuf::MessageLite& proto, ContentIdProto* content_id,
    size_t* proto_size) {
  std::string serialized;
  if (!proto.SerializeToString(&serialized)) {
    return absl::InternalError(
        absl::StrFormat("Failed to serialize %s.", proto.GetTypeName()));
  }
  // The chunk's content ID is the digest of the serialized proto.
  *content_id = ContentId::FromDataString(serialized);
  if (proto_size != nullptr) *proto_size = serialized.size();
  // Write manifest chunk to storage.
  return Put(*content_id, serialized.data(), serialized.size());
}
} // namespace cdc_ft

View File

@@ -0,0 +1,83 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_DATA_STORE_WRITER_H_
#define DATA_STORE_DATA_STORE_WRITER_H_
#include <atomic>
#include <unordered_set>

#include "absl/status/statusor.h"
#include "common/buffer.h"
#include "data_store/data_store_reader.h"
#include "manifest/manifest_proto_defs.h"
namespace cdc_ft {
// DataStoreWriter is an abstract interface for read/write operations for a data
// store, for example: a disk-based or in-memory cache.
class DataStoreWriter : public DataStoreReader {
 public:
  DataStoreWriter() = default;
  DataStoreWriter(const DataStoreWriter&) = delete;
  DataStoreWriter& operator=(const DataStoreWriter&) = delete;
  virtual ~DataStoreWriter() = default;

  // Returns true if the chunk with the given |content_id| is available
  // in the data store. Otherwise, returns false. The default implementation
  // uses Get() to retrieve the chunk and should be overridden.
  virtual bool Contains(const ContentIdProto& content_id);

  // Stores a data chunk |data| of |size| and |content_id| into the data store.
  virtual absl::Status Put(const ContentIdProto& content_id, const void* data,
                           size_t size) = 0;

  // Stores the given protocol buffer |proto| as a unique chunk and updates
  // |content_id| with the corresponding digest. If the optional parameter
  // |proto_size| is given, it will be set to the byte size of the serialized
  // proto.
  absl::Status PutProto(const google::protobuf::MessageLite& proto,
                        ContentIdProto* content_id,
                        size_t* proto_size = nullptr);

  // Removes the data chunk with |content_id| from the writer. Returns success
  // if the chunk does not exist or was removed.
  virtual absl::Status Remove(const ContentIdProto& content_id) = 0;

  // Wipes the data. All statistics and data chunks are removed from the data
  // store.
  virtual absl::Status Wipe() = 0;

  // Removes all chunks except for |ids_to_keep|. Also checks whether all chunks
  // in |ids_to_keep| are present. If not, returns a NotFound error.
  virtual absl::Status Prune(
      std::unordered_set<ContentIdProto> ids_to_keep) = 0;

  // Removes the data if the data store size exceeds its capacity.
  virtual absl::Status Cleanup() { return absl::OkStatus(); }

  // Allows to interrupt methods by setting |interrupt_|. The pointee is not
  // owned and must outlive this writer.
  void RegisterInterrupt(std::atomic<bool>* interrupt) {
    interrupt_ = interrupt;
  }

 protected:
  // Shows whether a function can be cancelled. Used in Cleanup(). Not owned.
  std::atomic<bool>* interrupt_ = nullptr;
};  // class DataStoreWriter
} // namespace cdc_ft
#endif // DATA_STORE_DATA_STORE_WRITER_H_

View File

@@ -0,0 +1,362 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/disk_data_store.h"

#include <algorithm>
#include <filesystem>
#include <memory>
#include <string>
#include <vector>

#include "common/log.h"
#include "common/path.h"
#include "common/status.h"
#include "common/status_macros.h"
namespace cdc_ft {
namespace {
// Hex digits used to build the cache subdirectory names.
static constexpr char kDirNames[16] = {'0', '1', '2', '3', '4', '5', '6', '7',
                                       '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

// Generates directory names of |length| symbols from kDirNames.
// If length = 2, the names are 00, 01, 02, etc. (each name encodes its index
// with the least significant nibble first).
std::vector<std::string> GenerateDirNames(size_t length) {
  const size_t num_names = size_t{1} << (length * 4);
  std::vector<std::string> names;
  names.reserve(num_names);
  for (size_t value = 0; value < num_names; ++value) {
    std::string name(length, '0');
    size_t rest = value;
    for (size_t pos = 0; pos < length; ++pos) {
      name[pos] = kDirNames[rest & 0xfu];
      rest >>= 4;
    }
    names.push_back(std::move(name));
  }
  return names;
}
// Adds |count| path separators to |input| after each |distance| symbols
// starting from the beginning. At least one symbol is left at the end
// for the file name.
// AddSeparators("abc", 1, 3) -> a\b\c
// AddSeparators("abc", 1, 0) -> abc
// AddSeparators("abc", 2, 100) -> ab\c
static std::string AddPathSeparators(const std::string& input, size_t distance,
size_t count) {
if (input.empty() || distance == 0 || count == 0) {
return input;
}
count = std::min((input.size() - 1) / distance, count);
std::string path;
path.reserve(input.size() + count);
std::string::const_iterator it_pos = input.begin();
while (count > 0 && it_pos < input.end()) {
path.append(it_pos, it_pos + distance);
path.push_back(path::PathSeparator());
it_pos += distance;
--count;
}
if (it_pos < input.end()) {
path.append(it_pos, input.end());
}
return path;
}
} // namespace
// Private constructor; use Create() instead. Normalizes |root_dir_| to end
// with a path separator (relative paths are later derived via substr on it).
DiskDataStore::DiskDataStore(unsigned int depth, std::string cache_root_dir,
                             bool create_dirs, SystemClock* clock)
    : depth_(depth),
      root_dir_(std::move(cache_root_dir)),
      create_dirs_(create_dirs),
      clock_(clock) {
  // A non-empty cache root is a hard precondition.
  assert(!root_dir_.empty());
  path::EnsureEndsWithPathSeparator(&root_dir_);
}
// Factory method. The constructor is private, so the unique_ptr is built
// around a raw new instead of make_unique.
absl::StatusOr<std::unique_ptr<DiskDataStore>> DiskDataStore::Create(
    unsigned int depth, std::string cache_root_dir, bool create_dirs,
    SystemClock* clock) {
  std::unique_ptr<DiskDataStore> store(
      new DiskDataStore(depth, std::move(cache_root_dir), create_dirs, clock));
  // Optionally pre-create the whole directory hierarchy up front.
  if (create_dirs) {
    RETURN_IF_ERROR(store->CreateDirHierarchy());
  }
  return store;
}
// Defaulted out-of-line; the empty body carried no cleanup logic.
DiskDataStore::~DiskDataStore() = default;
// Writes the chunk to its cache file, refreshes its mtime for LRU tracking,
// and accounts for the added bytes in |size_|.
absl::Status DiskDataStore::Put(const ContentIdProto& content_id,
                                const void* data, size_t size) {
  const std::string file_path = GetCacheFilePath(content_id);
  // Without a pre-created hierarchy, create the chunk's directory on demand.
  if (!create_dirs_) {
    RETURN_IF_ERROR(path::CreateDirRec(path::DirName(file_path)));
  }
  RETURN_IF_ERROR(path::WriteFile(file_path, data, size));
  UpdateModificationTime(file_path);
  size_.fetch_add(size, std::memory_order_relaxed);
  return absl::OkStatus();
}
// Reads |size| bytes at |offset| from the chunk's cache file into |data| and
// refreshes the file's mtime so the LRU cleanup sees it as recently used.
absl::StatusOr<size_t> DiskDataStore::Get(const ContentIdProto& content_id,
                                          void* data, size_t offset,
                                          size_t size) {
  // A zero-sized read always succeeds without touching the file.
  if (size == 0) return 0;
  assert(data);
  const std::string file_path = GetCacheFilePath(content_id);
  size_t read_size;
  ASSIGN_OR_RETURN(read_size, path::ReadFile(file_path, data, offset, size),
                   "Failed to read chunk %s of size %d at offset %d",
                   ContentId::ToHexString(content_id), size, offset);
  UpdateModificationTime(file_path);
  return read_size;
}
// Reads the complete chunk into |data|, validating that the number of bytes
// read matches the file size on disk.
absl::Status DiskDataStore::Get(const ContentIdProto& content_id,
                                Buffer* data) {
  assert(data);
  const std::string file_path = GetCacheFilePath(content_id);
  size_t file_size = 0;
  RETURN_IF_ERROR(path::FileSize(file_path, &file_size),
                  "Failed to stat file size for '%s'", file_path);
  data->resize(file_size);
  size_t read_size = 0;
  ASSIGN_OR_RETURN(read_size,
                   path::ReadFile(file_path, data->data(), 0, file_size),
                   "Failed to read %s of size %d",
                   ContentId::ToHexString(content_id), file_size);
  // A short read indicates a corrupted or concurrently modified file.
  if (read_size != file_size) {
    return absl::DataLossError(
        absl::StrFormat("Only %u bytes out of %u are read for %s", read_size,
                        file_size, ContentId::ToHexString(content_id)));
  }
  UpdateModificationTime(file_path);
  return absl::OkStatus();
}
// Trivial accessors/mutators; see the header for the semantics of each value.
int64_t DiskDataStore::Capacity() const { return capacity_; }
double DiskDataStore::FillFactor() const { return fill_factor_; }
unsigned int DiskDataStore::Depth() const { return depth_; }
size_t DiskDataStore::Size() const { return size_; }
const std::string& DiskDataStore::RootDir() const { return root_dir_; }
// Note: setting the capacity does not trigger a cleanup.
void DiskDataStore::SetCapacity(int64_t capacity) { capacity_ = capacity; }
// Sets the cache fill factor (valid range: (0, 1]) and immediately runs a
// cleanup, since a lower factor may shrink the allowed cache size.
absl::Status DiskDataStore::SetFillFactor(double fill_factor) {
  const bool valid = fill_factor > 0 && fill_factor <= 1;
  if (!valid) {
    return absl::FailedPreconditionError(
        absl::StrFormat("Failed to set cache fill factor to %f.", fill_factor));
  }
  fill_factor_ = fill_factor;
  return Cleanup();
}
// Deletes the whole cache directory tree, resets the size accounting, and
// recreates the directory hierarchy if it is supposed to be pre-created.
absl::Status DiskDataStore::Wipe() {
  RETURN_IF_ERROR(path::RemoveDirRec(root_dir_),
                  "RemoveDirRec() for '%s' failed", root_dir_);
  size_ = 0;
  if (create_dirs_) {
    RETURN_IF_ERROR(CreateDirHierarchy());
  }
  return absl::OkStatus();
}
// Deletes every cached chunk whose ID is not in |ids_to_keep| and verifies
// that all chunks in |ids_to_keep| are actually present on disk.
// Fixes vs. previous version: removes the unused local |to_delete| and the
// no-op std::move() on the const member |file.path| (a const member cannot
// be moved from; the call silently degraded to a copy anyway).
absl::Status DiskDataStore::Prune(
    std::unordered_set<ContentIdProto> ids_to_keep) {
  CacheFilesWithSize files_with_size;
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles(),
                   "Failed to collect cache files");
  // Delete the set of chunks not in |ids_to_keep|.
  for (const CacheFile& file : files_with_size.files) {
    // Don't touch files that don't match the chunk naming scheme
    // (e.g. user-added files).
    ContentIdProto id;
    if (!ParseCacheFilePath(file.path, &id)) continue;
    if (ids_to_keep.find(id) == ids_to_keep.end()) {
      RETURN_IF_ERROR(Remove(id));
      size_.fetch_sub(file.size, std::memory_order_relaxed);
    } else {
      // Mark as seen; whatever remains in |ids_to_keep| is missing on disk.
      ids_to_keep.erase(id);
    }
  }
  // Verify that all chunks in |ids_to_keep| are present in the cache.
  if (!ids_to_keep.empty()) {
    return absl::NotFoundError(absl::StrFormat(
        "%u chunks, e.g. '%s', not found in the store", ids_to_keep.size(),
        ContentId::ToHexString(*ids_to_keep.begin())));
  }
  return absl::OkStatus();
}
// Deletes the chunk's cache file. Per the DataStoreWriter contract, removal
// of a non-existing chunk is also a success.
absl::Status DiskDataStore::Remove(const ContentIdProto& content_id) {
  return path::RemoveFile(GetCacheFilePath(content_id));
}
// Cheap existence check: a chunk is present iff its cache file exists.
bool DiskDataStore::Contains(const ContentIdProto& content_id) {
  return path::Exists(GetCacheFilePath(content_id));
}
// LRU cleanup: deletes the least recently used cache files until the total
// size drops below capacity * fill_factor.
absl::Status DiskDataStore::Cleanup() {
  // A negative capacity means unlimited disk usage (see header): no cleanup.
  if (capacity_ < 0) {
    return absl::OkStatus();
  }
  // Clean below capacity * fill_factor to create headroom for new chunks.
  size_t size_threshold = static_cast<size_t>(capacity_) * fill_factor_;
  // Skip the expensive directory scan only if |size_| is known to be
  // accurate, i.e. a previous CollectCacheFiles() run initialized it.
  if (size_initialized_.load() && size_ <= size_threshold) {
    return absl::OkStatus();
  }
  CacheFilesWithSize files_with_size;
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles());
  LOG_DEBUG("Cache size before the cleanup: %u bytes", size_.load());
  std::vector<CacheFile>& files = files_with_size.files;
  // Sort in the LRU order: the old files stored first.
  std::sort(files.begin(), files.end(),
            [](const CacheFile& file1, const CacheFile& file2) {
              // Also sort by path for deterministic results in tests.
              if (file1.mtime == file2.mtime) return file1.path < file2.path;
              return file1.mtime < file2.mtime;
            });
  size_t file_index = 0;
  const size_t num_of_files = files.size();
  // Delete oldest files first until we are back under the threshold.
  while (size_ > size_threshold && file_index < num_of_files) {
    std::string path = path::Join(root_dir_, files[file_index].path);
    RETURN_IF_ERROR(path::RemoveFile(path));
    size_.fetch_sub(files[file_index].size, std::memory_order_relaxed);
    ++file_index;
    // A registered interrupt (e.g. new read/write activity) aborts cleanup.
    if (interrupt_ && *interrupt_) {
      return absl::CancelledError("Cache cleanup has been cancelled");
    }
  }
  LOG_DEBUG("Cache size after the cleanup: %u bytes", size_.load());
  return absl::OkStatus();
}
// Returns the content IDs of all chunks currently stored in the cache,
// skipping files that do not match the chunk naming scheme.
// Fix vs. previous version: drops the no-op std::move() on the const member
// |file.path| (moving from a const member is a copy anyway).
absl::StatusOr<std::vector<ContentIdProto>> DiskDataStore::List() {
  CacheFilesWithSize files_with_size;
  // Pass continue_on_interrupt=true so listing ignores |interrupt_|.
  ASSIGN_OR_RETURN(files_with_size, CollectCacheFiles(true),
                   "Failed to collect cache files");
  std::vector<ContentIdProto> ids;
  ids.reserve(files_with_size.files.size());
  for (const CacheFile& file : files_with_size.files) {
    ContentIdProto id;
    if (ParseCacheFilePath(file.path, &id)) ids.push_back(std::move(id));
  }
  return ids;
}
// Walks the whole cache tree and tallies up file sizes and counts.
// Expensive; see header.
absl::StatusOr<DiskDataStore::Statistics> DiskDataStore::CalculateStatistics()
    const {
  Statistics stats;
  auto on_entry = [&stats](const std::string& /*dir*/,
                           const std::string& /*filename*/,
                           int64_t /*modified_time*/, uint64_t size,
                           bool is_directory) -> absl::Status {
    if (is_directory) return absl::OkStatus();
    stats.size += size;
    ++stats.number_of_chunks;
    return absl::OkStatus();
  };
  RETURN_IF_ERROR(path::SearchFiles(root_dir_, true, on_entry));
  return stats;
}
// Scans the cache tree and returns all files (paths relative to |root_dir_|)
// with their mtimes and sizes. On success also publishes the authoritative
// total size into |size_| and marks it initialized.
absl::StatusOr<DiskDataStore::CacheFilesWithSize>
DiskDataStore::CollectCacheFiles(bool continue_on_interrupt) {
  CacheFilesWithSize cache_files;
  // An absent cache directory simply means an empty cache.
  if (!path::DirExists({root_dir_})) return cache_files;
  auto on_entry = [&](const std::string& dir, const std::string& filename,
                      int64_t modified_time, uint64_t size,
                      bool is_directory) -> absl::Status {
    if (!is_directory) {
      CacheFile file;
      // Store paths relative to |root_dir_|.
      file.path = path::Join(dir.substr(root_dir_.size()), filename);
      file.mtime = modified_time;
      file.size = size;
      cache_files.size += size;
      cache_files.files.push_back(std::move(file));
    }
    if (!continue_on_interrupt && interrupt_ && *interrupt_) {
      return absl::CancelledError("Cache cleanup has been cancelled");
    }
    return absl::OkStatus();
  };
  RETURN_IF_ERROR(path::SearchFiles(root_dir_, true, on_entry));
  size_ = cache_files.size;
  size_initialized_ = true;
  return cache_files;
}
// Maps a content ID to its file path: the hex ID is split into |depth_|
// directory levels of kDirNameLength symbols each, e.g. "aabbcc..." with
// depth 2 becomes "<root>/aa/bb/cc...".
std::string DiskDataStore::GetCacheFilePath(
    const ContentIdProto& content_id) const {
  const std::string hex_id = ContentId::ToHexString(content_id);
  return path::Join(root_dir_,
                    AddPathSeparators(hex_id, kDirNameLength, depth_));
}
// Parses a chunk file path like "aa/bb/ccddeeff..." back into a content ID.
// Returns false for files that do not follow the chunk naming scheme.
bool DiskDataStore::ParseCacheFilePath(std::string path,
                                       ContentIdProto* content_id) const {
  if (depth_ > 0) {
    // Strip every path separator to undo AddPathSeparators().
    auto is_separator = [](char c) {
      return c == path::PathSeparator() || c == path::OtherPathSeparator();
    };
    path.erase(std::remove_if(path.begin(), path.end(), is_separator),
               path.end());
  }
  return ContentId::FromHexString(path, content_id);
}
// Touches |path| with the current clock time so the LRU cleanup treats the
// chunk as recently used. Best effort: the time might be updated in
// parallel, so failures are intentionally ignored.
void DiskDataStore::UpdateModificationTime(const std::string& path) {
  const auto now = std::chrono::system_clock::to_time_t(clock_->Now());
  path::SetFileTime(path, now).IgnoreError();
}
// Creates the root directory and, below it, |depth_| levels of
// subdirectories. The per-level name list ("00".."ff") is generated lazily
// once and reused.
absl::Status DiskDataStore::CreateDirHierarchy() {
  if (depth_ > 0 && dirs_.empty()) {
    dirs_ = GenerateDirNames(kDirNameLength);
  }
  RETURN_IF_ERROR(path::CreateDirRec(root_dir_));
  return CreateDirLevelRec(root_dir_, depth_);
}
// Recursively creates all |dirs_| subdirectories below |parent| until
// |depth| levels have been created (depth 0 is the recursion anchor).
// Fix vs. previous version: removed the stray ';' after the namespace
// closing brace at the end of the file (an empty declaration flagged by
// -Wextra-semi).
absl::Status DiskDataStore::CreateDirLevelRec(const std::string& parent,
                                              unsigned int depth) {
  if (depth == 0) {
    return absl::OkStatus();
  }
  for (const std::string& dir : dirs_) {
    std::string name = path::Join(parent, dir);
    RETURN_IF_ERROR(path::CreateDir(name));
    RETURN_IF_ERROR(CreateDirLevelRec(name, depth - 1),
                    "CreateDirLevelRec() for %s failed at level %d:", name,
                    depth - 1);
  }
  return absl::OkStatus();
}

}  // namespace cdc_ft

View File

@@ -0,0 +1,191 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_DISK_DATA_STORE_H_
#define DATA_STORE_DISK_DATA_STORE_H_
#include <atomic>
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "common/buffer.h"
#include "common/clock.h"
#include "common/platform.h"
#include "data_store/data_store_writer.h"
#include "manifest/content_id.h"
namespace cdc_ft {
// File-based LRU cache to store data chunks on disk. The LRU strategy is based
// on each file's mtime, which gets updated on each access.
// Not thread-safe.
class DiskDataStore : public DataStoreWriter {
 public:
  // Aggregate disk usage numbers, see CalculateStatistics().
  struct Statistics {
    size_t size = 0;
    size_t number_of_chunks = 0;
  };

  static constexpr uint64_t kDefaultCapacity{150ull << 30};  // 150 GiB

  // Creates and returns a DiskDataStore that generates the cache directory
  // hierarchy in |cache_root_dir| of |depth| at startup if |create_dirs| is
  // set.
  // Returns an error status if the cache directories cannot be created.
  // Uses |clock| as an internal clock for the file modification times.
  static absl::StatusOr<std::unique_ptr<DiskDataStore>> Create(
      unsigned int depth, std::string cache_root_dir, bool create_dirs,
      SystemClock* clock = DefaultSystemClock::GetInstance());

  // Non-copyable.
  DiskDataStore(const DiskDataStore& other) = delete;
  DiskDataStore& operator=(const DiskDataStore& other) = delete;

  ~DiskDataStore();

  // DataStoreReader:
  absl::StatusOr<size_t> Get(const ContentIdProto& content_id, void* data,
                             size_t offset, size_t size) override;
  absl::Status Get(const ContentIdProto& content_id, Buffer* data) override;

  // DataStoreWriter:
  absl::Status Put(const ContentIdProto& content_id, const void* data,
                   size_t size) override;
  absl::Status Remove(const ContentIdProto& content_id) override;
  absl::Status Wipe() override;
  absl::Status Prune(std::unordered_set<ContentIdProto> ids_to_keep) override;
  bool Contains(const ContentIdProto& content_id) override;

  // Removes chunks in the LRU order if the cache size exceeds its capacity.
  // Cleans the cache up until its size drops below the cache capacity
  // limited by the fill factor (capacity * fill factor).
  absl::Status Cleanup() override;

  // Returns a list of all contained content ids independent of |interrupt_|.
  absl::StatusOr<std::vector<ContentIdProto>> List();

  // Returns the defined cache capacity in bytes.
  // If 0, the cache is disabled.
  // If < 0, the disk space is not limited and the whole disk can be used for
  // storing data in the cache.
  int64_t Capacity() const;

  // Returns the fill factor that defines the maximum portion of the capacity,
  // which can be occupied by the cache after cleanup.
  double FillFactor() const;

  // Returns the depth of the hierarchy of the cache directories.
  unsigned int Depth() const;

  // Returns the current total |size_| of the stored data.
  size_t Size() const;

  // Returns the path to the root cache directory.
  const std::string& RootDir() const;

  // Sets the cache capacity in bytes.
  // No cleanup is performed.
  void SetCapacity(int64_t capacity);

  // Sets the cache fill factor.
  // |factor| should be a positive number (0,1].
  absl::Status SetFillFactor(double factor);

  // Calculates cache statistics including the total amount of disk space used
  // for storing chunks measured in bytes and the number of chunks.
  // Returns an error, if the size could not be calculated.
  // This is an expensive operation.
  absl::StatusOr<Statistics> CalculateStatistics() const;

  // The number of symbols in the cache's directory names.
  static constexpr int kDirNameLength = 2;

 private:
  friend class DataProviderTest;

  // A single cache file, with its path relative to |root_dir_|.
  struct CacheFile {
    std::string path;
    int64_t mtime = 0;
    size_t size = 0;
    void swap(CacheFile& other) {
      std::swap(path, other.path);
      std::swap(mtime, other.mtime);
      std::swap(size, other.size);
    }
  };

  // All cache files plus their accumulated size in bytes.
  struct CacheFilesWithSize {
    size_t size = 0;
    std::vector<CacheFile> files;
  };

  // Private; use Create() instead.
  DiskDataStore(unsigned int depth, std::string cache_root_dir,
                bool create_dirs, SystemClock* clock);

  // Returns a vector of CacheFile with their total size in bytes.
  // In addition, initializes the size if the method succeeds.
  // Returns an error status, if an error occured.
  // |continue_on_interrupt| shows whether the method should be cancelled on new
  // read/write requests.
  absl::StatusOr<CacheFilesWithSize> CollectCacheFiles(
      bool continue_on_interrupt = false);

  // Returns the path to the file, which stores the data chunk for |content_id|.
  std::string GetCacheFilePath(const ContentIdProto& content_id) const;

  // Parses the chunk file |path| into its content id if possible.
  // |path| is expected to look similar to "aa/bb/ccddeeff...".
  // Returns false if parsing fails.
  bool ParseCacheFilePath(std::string path, ContentIdProto* content_id) const;

  // Updates modification time of |path|.
  void UpdateModificationTime(const std::string& path);

  // Creates the cache directory hierarchy.
  absl::Status CreateDirHierarchy();

  // Creates cache directories in the |parent| on the level |depth| recursively.
  absl::Status CreateDirLevelRec(const std::string& parent, unsigned int depth);

  // When the cache is cleaned up, it is advantageous to make some more space
  // available for new chunks and not only to clean the redundant chunks up,
  // which make the cache exceed its capacity.
  static constexpr double kDefaultFillFactor = 0.8;

  const unsigned int depth_;
  std::string root_dir_;
  const bool create_dirs_;
  const SystemClock* clock_;
  std::atomic<int64_t> capacity_{kDefaultCapacity};
  std::atomic<double> fill_factor_{kDefaultFillFactor};
  // The total data size is updated at Put(), Prune(), Wipe(), and Cleanup().
  // It is not guaranteed to be correct between cleanups:
  // - Put() does not consider the size of the file metadata.
  // - before the first Cleanup if the cache had already some data stored on the
  // disk from previous AS runs.
  std::atomic<size_t> size_{0};
  // Shows if the |size_| was already initialized correctly in the Cleanup().
  // Without it, Cleanup() can be skipped if some new data has been written
  // before the first Cleanup() took place.
  std::atomic<bool> size_initialized_{false};
  // Lazily generated directory names for one hierarchy level ("00".."ff").
  std::vector<std::string> dirs_;
};  // class DiskDataStore
}; // namespace cdc_ft
#endif // DATA_STORE_DISK_DATA_STORE_H_

View File

@@ -0,0 +1,453 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/disk_data_store.h"
#include "common/path.h"
#include "common/status_test_macros.h"
#include "common/testing_clock.h"
#include "gtest/gtest.h"
#include "manifest/content_id.h"
namespace cdc_ft {
namespace {
// Two distinct payloads stored as cache chunks throughout the tests.
constexpr uint8_t kFirstData[] = {10, 20, 30, 40, 50, 60, 70, 80, 90};
constexpr uint8_t kSecondData[] = {100, 101, 102, 103, 104, 105, 106};
constexpr size_t kFirstDataSize = sizeof(kFirstData);
constexpr size_t kSecondDataSize = sizeof(kSecondData);
// Cache root directory, created under the system temp dir for each test.
constexpr char kTestCacheDirName[] = ".cdc_ft_cache";
class DiskDataStoreTest : public ::testing::Test {
public:
DiskDataStoreTest() {
first_content_id_ = ContentId::FromArray(kFirstData, kFirstDataSize);
second_content_id_ = ContentId::FromArray(kSecondData, kSecondDataSize);
}
void SetUp() override {
cache_dir_path_ = path::Join(path::GetTempDir(), kTestCacheDirName);
EXPECT_OK(path::RemoveDirRec(cache_dir_path_));
}
void TearDown() override { EXPECT_OK(path::RemoveDirRec(cache_dir_path_)); }
std::unique_ptr<DiskDataStore> CreateCache(unsigned int depth,
bool create_dirs = false) {
absl::StatusOr<std::unique_ptr<DiskDataStore>> cache =
DiskDataStore::Create(depth, cache_dir_path_, create_dirs, &clock_);
EXPECT_OK(cache);
return std::move(*cache);
}
protected:
ContentIdProto first_content_id_;
ContentIdProto second_content_id_;
TestingSystemClock clock_;
std::string cache_dir_path_;
};
// Verifies that a freshly created cache is empty and has sane defaults.
TEST_F(DiskDataStoreTest, DiskDataStore) {
  auto cache = CreateCache(0);
  EXPECT_EQ(0u, cache->Size());
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(0u, statistics->size);
  EXPECT_EQ(0u, statistics->number_of_chunks);
  EXPECT_GT(cache->Capacity(), 0);
  EXPECT_GT(cache->FillFactor(), 0);
  EXPECT_LT(cache->FillFactor(), 1);
}
// Verifies that a stored chunk can be read back unchanged and that Size()
// and statistics reflect the stored data both before and after the read.
TEST_F(DiskDataStoreTest, PutGet) {
  auto cache = CreateCache(2);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_EQ(kFirstDataSize, cache->Size());
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kFirstDataSize, statistics->size);
  EXPECT_EQ(1u, statistics->number_of_chunks);
  EXPECT_TRUE(cache->Contains(first_content_id_));
  uint8_t ret_data[kFirstDataSize];
  absl::StatusOr<uint64_t> bytes_read =
      cache->Get(first_content_id_, &ret_data, 0, kFirstDataSize);
  EXPECT_EQ(kFirstDataSize, cache->Size());
  ASSERT_OK(bytes_read);
  ASSERT_EQ(kFirstDataSize, *bytes_read);
  EXPECT_TRUE(std::equal(std::begin(kFirstData), std::end(kFirstData),
                         std::begin(ret_data)));
  statistics = cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kFirstDataSize, statistics->size);
  EXPECT_EQ(1u, statistics->number_of_chunks);
}
// Verifies the Buffer-based Get() overload returns the full chunk.
TEST_F(DiskDataStoreTest, GetBuffer) {
  auto cache = CreateCache(1);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  Buffer buffer;
  EXPECT_OK(cache->Get(first_content_id_, &buffer));
  Buffer exp_buffer({10, 20, 30, 40, 50, 60, 70, 80, 90});
  EXPECT_EQ(exp_buffer, buffer);
}
// Verifies Wipe() removes all chunks and resets size and statistics.
TEST_F(DiskDataStoreTest, Wipe) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_EQ(kFirstDataSize, cache->Size());
  EXPECT_OK(cache->Wipe());
  EXPECT_EQ(0u, cache->Size());
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(0u, statistics->size);
  EXPECT_EQ(0u, statistics->number_of_chunks);
  EXPECT_FALSE(cache->Contains(first_content_id_));
}
// Verifies Prune() keeps exactly the requested ids and removes the rest.
TEST_F(DiskDataStoreTest, PruneSucceeds) {
  auto cache = CreateCache(2);
  ContentIdProto content_ids[4];
  for (size_t n = 0; n < std::size(content_ids); ++n) {
    content_ids[n] = ContentId::FromArray(&n, sizeof(n));
    EXPECT_OK(cache->Put(content_ids[n], &n, sizeof(n)));
  }
  std::unordered_set<ContentIdProto> ids_to_keep = {content_ids[0],
                                                    content_ids[2]};
  EXPECT_OK(cache->Prune(std::move(ids_to_keep)));
  EXPECT_TRUE(cache->Contains(content_ids[0]));
  EXPECT_TRUE(cache->Contains(content_ids[2]));
  EXPECT_EQ(2 * sizeof(size_t), cache->Size());
  EXPECT_FALSE(cache->Contains(content_ids[1]));
  EXPECT_FALSE(cache->Contains(content_ids[3]));
}
// Verifies Prune() returns NotFound when an id to keep is absent from the
// store, while still removing chunks that were not requested to be kept.
TEST_F(DiskDataStoreTest, PruneFailsNotFound) {
  auto cache = CreateCache(2);
  ContentIdProto content_ids[2];
  for (size_t n = 0; n < std::size(content_ids); ++n)
    content_ids[n] = ContentId::FromArray(&n, sizeof(n));
  // Only content_ids[0] is stored (as an empty chunk).
  EXPECT_OK(cache->Put(content_ids[0], nullptr, 0));
  std::unordered_set<ContentIdProto> ids_to_keep = {content_ids[1]};
  EXPECT_TRUE(absl::IsNotFound(cache->Prune(std::move(ids_to_keep))));
  EXPECT_FALSE(cache->Contains(content_ids[0]));
}
// Verifies that Cleanup() empties the cache once the capacity is set to 0.
TEST_F(DiskDataStoreTest, SetCapacity) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  cache->SetCapacity(0);
  EXPECT_OK(cache->Cleanup());
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(0u, statistics->size);
  EXPECT_EQ(0u, statistics->number_of_chunks);
  EXPECT_FALSE(cache->Contains(first_content_id_));
}
// Verifies SetFillFactor() accepts values in (0, 1] and rejects others.
TEST_F(DiskDataStoreTest, SetFillFactor) {
  auto cache = CreateCache(2);
  EXPECT_OK(cache->SetFillFactor(0.1));
  EXPECT_NOT_OK(cache->SetFillFactor(0));
  EXPECT_NOT_OK(cache->SetFillFactor(100));
  EXPECT_OK(cache->SetFillFactor(1));
  EXPECT_EQ(1, cache->FillFactor());
}
// Verifies both Get() overloads return NotFound for an unknown id.
TEST_F(DiskDataStoreTest, GetNonExisting) {
  auto cache = CreateCache(0);
  EXPECT_FALSE(cache->Contains(first_content_id_));
  uint8_t ret_data[kFirstDataSize];
  absl::StatusOr<size_t> read_bytes =
      cache->Get(first_content_id_, &ret_data, 0, kFirstDataSize);
  EXPECT_TRUE(absl::IsNotFound(read_bytes.status()));
  Buffer buffer;
  EXPECT_TRUE(absl::IsNotFound(cache->Get(first_content_id_, &buffer)));
}
// Verifies that Cleanup() evicts the older of two stored chunks when the
// capacity only leaves room for one (first chunk was written 1000s earlier).
TEST_F(DiskDataStoreTest, PutTwoRemoveOne) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_EQ(kFirstDataSize, cache->Size());
  clock_.Advance(1000);
  EXPECT_OK(cache->Put(second_content_id_, kSecondData, kSecondDataSize));
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kFirstDataSize + kSecondDataSize, statistics->size);
  EXPECT_EQ(statistics->size, cache->Size());
  EXPECT_EQ(2u, statistics->number_of_chunks);
  // Capacity fits only one chunk; the older (first) one should be evicted.
  cache->SetCapacity(kFirstDataSize + 4);
  EXPECT_OK(cache->Cleanup());
  statistics = cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kSecondDataSize, statistics->size);
  EXPECT_EQ(statistics->size, cache->Size());
  EXPECT_EQ(1u, statistics->number_of_chunks);
  EXPECT_FALSE(cache->Contains(first_content_id_));
  EXPECT_TRUE(cache->Contains(second_content_id_));
}
// Verifies that a Get() refreshes a chunk's access time, so the most
// recently *read* chunk survives a capacity-triggered Cleanup() even though
// it was written first.
TEST_F(DiskDataStoreTest, PutTwoReadOldRemoveOne) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  clock_.Advance(1000);
  EXPECT_OK(cache->Put(second_content_id_, kSecondData, kSecondDataSize));
  clock_.Advance(1000);
  // Reading the first chunk makes it the most recently used one.
  uint8_t ret_data[kFirstDataSize];
  EXPECT_OK(
      cache->Get(first_content_id_, &ret_data, 0, kFirstDataSize).status());
  // second_key should be removed after the cleanup.
  cache->SetCapacity(kFirstDataSize + 4);
  EXPECT_OK(cache->Cleanup());
  EXPECT_TRUE(cache->Contains(first_content_id_));
  EXPECT_FALSE(cache->Contains(second_content_id_));
  uint8_t ret_data2[kFirstDataSize];
  absl::StatusOr<uint64_t> bytes_read =
      cache->Get(first_content_id_, &ret_data2, 0, kFirstDataSize);
  ASSERT_OK(bytes_read);
  ASSERT_EQ(kFirstDataSize, *bytes_read);
  // Bug fix: compare against |ret_data2| (the buffer filled by the read
  // after cleanup); the original asserted on |ret_data| from the earlier
  // read, so the post-cleanup data was never actually verified.
  EXPECT_TRUE(std::equal(std::begin(kFirstData), std::end(kFirstData),
                         std::begin(ret_data2)));
}
// Verifies a zero-length read succeeds and returns 0 bytes.
TEST_F(DiskDataStoreTest, GetWithZeroLength) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  uint8_t ret_data[1];
  absl::StatusOr<size_t> read_bytes =
      cache->Get(first_content_id_, &ret_data, 0, 0);
  ASSERT_OK(read_bytes);
  ASSERT_EQ(0u, *read_bytes);
}
// Verifies a read at a non-zero offset returns the tail of the chunk.
TEST_F(DiskDataStoreTest, GetWithOffset) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  size_t const offset = 3;
  size_t const len = kFirstDataSize - offset;
  uint8_t ret_data[len];
  absl::StatusOr<size_t> read_bytes =
      cache->Get(first_content_id_, &ret_data, offset, len);
  ASSERT_OK(read_bytes);
  ASSERT_EQ(len, *read_bytes);
  EXPECT_TRUE(std::equal(std::begin(kFirstData) + offset, std::end(kFirstData),
                         std::begin(ret_data)));
}
// Verifies a read starting past the end of the chunk returns 0 bytes.
TEST_F(DiskDataStoreTest, GetWithWrongOffset) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  uint8_t ret_data[kFirstDataSize];
  absl::StatusOr<size_t> read_bytes =
      cache->Get(first_content_id_, &ret_data, 1000, kFirstDataSize);
  ASSERT_OK(read_bytes);
  ASSERT_EQ(0, *read_bytes);
}
// Verifies a read larger than the chunk is truncated to the chunk size.
TEST_F(DiskDataStoreTest, GetWithTooBigLength) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  uint8_t ret_data[kFirstDataSize + 10];
  absl::StatusOr<size_t> read_bytes =
      cache->Get(first_content_id_, &ret_data, 0, kFirstDataSize + 10);
  ASSERT_OK(read_bytes);
  ASSERT_EQ(kFirstDataSize, *read_bytes);
}
// Verifies Remove() deletes a chunk and is a no-op for a missing one.
TEST_F(DiskDataStoreTest, Remove) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_OK(cache->Remove(first_content_id_));
  EXPECT_FALSE(cache->Contains(first_content_id_));
  EXPECT_OK(cache->Remove(first_content_id_));
}
// Verifies that a cache created on top of an existing root directory sees
// the chunks stored by a previous instance and reports matching statistics
// and settings.
TEST_F(DiskDataStoreTest, CreateCacheIfRootDirExists) {
  auto cache1 = CreateCache(1);
  EXPECT_OK(cache1->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_OK(cache1->Put(second_content_id_, kSecondData, kSecondDataSize));
  auto cache2 = CreateCache(1);
  absl::StatusOr<DiskDataStore::Statistics> statistics1 =
      cache1->CalculateStatistics();
  ASSERT_OK(statistics1);
  absl::StatusOr<DiskDataStore::Statistics> statistics2 =
      cache2->CalculateStatistics();
  ASSERT_OK(statistics2);
  EXPECT_EQ(statistics1->size, statistics2->size);
  EXPECT_EQ(statistics1->number_of_chunks, statistics2->number_of_chunks);
  EXPECT_EQ(cache2->Capacity(), cache1->Capacity());
  EXPECT_EQ(cache2->FillFactor(), cache1->FillFactor());
  EXPECT_EQ(cache2->Depth(), cache1->Depth());
  EXPECT_TRUE(cache2->Contains(first_content_id_));
  EXPECT_TRUE(cache2->Contains(second_content_id_));
}
// Verifies that create_dirs=true pre-creates the full directory fan-out:
// (16^kDirNameLength)^depth subdirectories.
TEST_F(DiskDataStoreTest, CacheWithDirectories) {
  unsigned int depth = 1;
  unsigned int dir_count = 0;
  auto cache = CreateCache(depth, true);
  EXPECT_EQ(depth, cache->Depth());
  // Counts the directories below the cache root.
  auto handler = [&dir_count](const std::string& /*dir*/,
                              const std::string& /*filename*/,
                              int64_t /*modified_time*/, uint64_t /*size*/,
                              bool is_directory) -> absl::Status {
    if (is_directory) {
      ++dir_count;
    }
    return absl::OkStatus();
  };
  EXPECT_OK(path::SearchFiles(cache->RootDir(), true, handler));
  EXPECT_EQ(dir_count,
            std::pow(std::pow(16, DiskDataStore::kDirNameLength), depth));
}
// Verifies that with create_dirs=false only the directories needed for the
// single stored chunk are created (one per depth level).
TEST_F(DiskDataStoreTest, CacheWithDirectoriesOnDemand) {
  unsigned int depth = 4;
  unsigned int dir_count = 0;
  auto cache = CreateCache(depth, false);
  EXPECT_EQ(depth, cache->Depth());
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  // Counts the directories below the cache root.
  auto handler = [&dir_count](const std::string& /*dir*/,
                              const std::string& /*filename*/,
                              int64_t /*modified_time*/, uint64_t /*size*/,
                              bool is_directory) -> absl::Status {
    if (is_directory) {
      ++dir_count;
    }
    return absl::OkStatus();
  };
  EXPECT_OK(path::SearchFiles(cache->RootDir(), true, handler));
  EXPECT_EQ(dir_count, 4u);
}
// Verifies that Put() with an existing id overwrites the stored data.
TEST_F(DiskDataStoreTest, OverwriteExistingEntry) {
  auto cache = CreateCache(0, true);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  EXPECT_OK(cache->Put(first_content_id_, kSecondData, kSecondDataSize));
  uint8_t ret_data[kSecondDataSize];
  absl::StatusOr<uint64_t> bytes_read =
      cache->Get(first_content_id_, &ret_data, 0, kSecondDataSize);
  ASSERT_OK(bytes_read);
  ASSERT_EQ(kSecondDataSize, *bytes_read);
  EXPECT_TRUE(std::equal(std::begin(kSecondData), std::end(kSecondData),
                         std::begin(ret_data)));
}
// Verifies List() returns the ids of all stored chunks (order-agnostic).
TEST_F(DiskDataStoreTest, List) {
  auto cache = CreateCache(0, true);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, 1));
  EXPECT_OK(cache->Put(second_content_id_, kSecondData, 1));
  absl::StatusOr<std::vector<ContentIdProto>> ids = cache->List();
  ASSERT_OK(ids);
  ASSERT_EQ(ids->size(), 2);
  // Normalize the (unspecified) order before comparing.
  if (ids->at(0) == second_content_id_) std::swap(ids->at(0), ids->at(1));
  EXPECT_TRUE(ids->at(0) == first_content_id_);
  EXPECT_TRUE(ids->at(1) == second_content_id_);
}
// Verifies a set interrupt flag cancels Cleanup() without deleting data and
// that clearing the flag re-enables cleanup.
TEST_F(DiskDataStoreTest, InterruptCleanup) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  cache->SetCapacity(0);
  std::atomic<bool> interrupt{true};
  cache->RegisterInterrupt(&interrupt);
  EXPECT_TRUE(absl::IsCancelled(cache->Cleanup()));
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kFirstDataSize, statistics->size);
  EXPECT_EQ(1u, statistics->number_of_chunks);
  EXPECT_TRUE(cache->Contains(first_content_id_));
  // Resetting interrupt should enable Cleanup().
  interrupt = false;
  EXPECT_OK(cache->Cleanup());
  statistics = cache->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(0u, statistics->size);
  EXPECT_EQ(0u, statistics->number_of_chunks);
  EXPECT_FALSE(cache->Contains(first_content_id_));
}
// Verifies that Cleanup() on a second store instance accounts for chunks
// that were already on disk from a previous instance and evicts the older
// one when the capacity is exceeded.
TEST_F(DiskDataStoreTest, CleanupForPrefilledCacheSuccess) {
  auto cache = CreateCache(0);
  EXPECT_OK(cache->Put(first_content_id_, kFirstData, kFirstDataSize));
  clock_.Advance(1000);
  // Second instance shares the same root dir as |cache|.
  absl::StatusOr<std::unique_ptr<DiskDataStore>> filled_cache =
      DiskDataStore::Create(0, cache_dir_path_, false, &clock_);
  EXPECT_OK(filled_cache);
  EXPECT_OK(
      (*filled_cache)->Put(second_content_id_, kSecondData, kSecondDataSize));
  (*filled_cache)->SetCapacity(kFirstDataSize + 4);
  EXPECT_OK((*filled_cache)->Cleanup());
  absl::StatusOr<DiskDataStore::Statistics> statistics =
      (*filled_cache)->CalculateStatistics();
  ASSERT_OK(statistics);
  EXPECT_EQ(kSecondDataSize, statistics->size);
  EXPECT_EQ(1u, statistics->number_of_chunks);
}
} // namespace
} // namespace cdc_ft

103
data_store/grpc_reader.cc Normal file
View File

@@ -0,0 +1,103 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/grpc_reader.h"
#include <algorithm>
#include "cdc_fuse_fs/asset_stream_client.h"
#include "common/status.h"
#include "common/status_macros.h"
namespace cdc_ft {
// Creates a reader that fetches chunks via an AssetStreamClient on |channel|.
// |enable_stats| is forwarded to the client.
GrpcReader::GrpcReader(std::shared_ptr<grpc::Channel> channel,
                       bool enable_stats)
    : client_(std::make_unique<AssetStreamClient>(std::move(channel),
                                                  enable_stats)) {}

// Out-of-line so AssetStreamClient can remain forward-declared in the header.
GrpcReader::~GrpcReader() = default;
// Forwards the ids of locally cached chunks to the workstation; per the
// header, this is for statistical purposes only.
absl::Status GrpcReader::SendCachedContentIds(
    std::vector<ContentIdProto> content_ids) {
  return client_->SendCachedContentIds(std::move(content_ids));
}
// Fetches the whole chunk |id| via gRPC and copies at most |size| bytes
// starting at |offset| into |data|. Returns the number of bytes copied (0 if
// |offset| is past the end of the chunk), or an error if streaming failed.
//
// Fix: the parameters were declared in the order (size, offset), which is
// inconsistent with the sibling DataStoreReader implementation
// MemDataStore::Get(id, data, offset, size) and with every call site in the
// tests, which pass the offset as the third argument. Declaring them as
// (offset, size) makes the positional arguments match their use in the body.
absl::StatusOr<size_t> GrpcReader::Get(const ContentIdProto& id, void* data,
                                       uint64_t offset, uint64_t size) {
  absl::StatusOr<std::string> result = client_->GetContent(id);
  if (!result.ok()) {
    return WrapStatus(result.status(), "Failed to stream data for id %s",
                      ContentId::ToHexString(id));
  }
  // Reading past the end yields 0 bytes, matching the other readers.
  if (offset >= result->size()) {
    return 0;
  }
  uint64_t bytes_to_copy = std::min<uint64_t>(result->size() - offset, size);
  memcpy(data, result->data() + offset, bytes_to_copy);
  return bytes_to_copy;
}
// Fetches all not-yet-done chunks in |chunks| from the workstation in one
// batched gRPC request and copies the requested byte ranges into the
// per-chunk target buffers. Marks each successfully served chunk done.
absl::Status GrpcReader::Get(ChunkTransferList* chunks) {
  // Collect the ids of all chunks that still need to be transferred.
  RepeatedContentIdProto chunk_ids;
  for (const ChunkTransferTask& chunk : *chunks) {
    if (!chunk.done) *chunk_ids.Add() = chunk.id;
  };
  const int chunk_id_count = chunk_ids.size();
  RepeatedStringProto chunk_data;
  ASSIGN_OR_RETURN(chunk_data, client_->GetContent(std::move(chunk_ids)),
                   "Failed to stream data chunks [%s]",
                   chunks->UndoneToHexString());
  // The server must return exactly one data blob per requested id.
  if (chunk_data.size() != chunk_id_count) {
    return MakeStatus(
        "Incomplete response received for chunks [%s], expected %u, got %u",
        chunks->UndoneToHexString(), chunk_id_count, chunk_data.size());
  }
  // |chunk_data| is ordered like the undone entries of |chunks|.
  int i = 0;
  for (ChunkTransferTask& chunk : *chunks) {
    if (chunk.done) continue;
    // Move the complete chunk data over to the chunks list.
    chunk.chunk_data = std::move(chunk_data[i++]);
    // Verify the chunk size.
    if (chunk.chunk_data.size() < chunk.offset + chunk.size) {
      return MakeStatus(
          "Truncated chunk '%s' received, expected %u + %u = %u bytes, got %u",
          ContentId::ToHexString(chunk.id), chunk.offset, chunk.size,
          chunk.offset + chunk.size, chunk.chunk_data.size());
    }
    // Copy the part of the chunk data to the target buffer.
    // |data| may be null if the caller only wants |chunk_data| populated.
    if (chunk.data) {
      memcpy(chunk.data, chunk.chunk_data.data() + chunk.offset, chunk.size);
    }
    chunk.done = true;
  }
  return absl::OkStatus();
}
// Fetches the complete chunk |id| via gRPC into |data|, replacing any
// previous contents of the buffer.
absl::Status GrpcReader::Get(const ContentIdProto& id, Buffer* data) {
  absl::StatusOr<std::string> content = client_->GetContent(id);
  if (!content.ok()) {
    return WrapStatus(content.status(), "Failed to stream data for id %s",
                      ContentId::ToHexString(id));
  }
  data->clear();
  data->append(content->data(), content->size());
  return absl::OkStatus();
}
} // namespace cdc_ft

57
data_store/grpc_reader.h Normal file
View File

@@ -0,0 +1,57 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_GRPC_READER_H_
#define DATA_STORE_GRPC_READER_H_
#include "absl/status/statusor.h"
#include "data_store/data_store_reader.h"
#include "grpcpp/channel.h"
#include "manifest/content_id.h"
namespace cdc_ft {
class AssetStreamClient;
// Implementation of a DataStoreReader that loads chunks through gRPC
// exclusively. Does not have any local caching.
class GrpcReader : public DataStoreReader {
 public:
  // |channel| is a grpc channel to connect to.
  // |enable_stats| determines whether additional statistics are sent.
  GrpcReader(std::shared_ptr<grpc::Channel> channel, bool enable_stats);
  virtual ~GrpcReader();
  GrpcReader(const GrpcReader&) = delete;
  GrpcReader& operator=(const GrpcReader&) = delete;
  // Sends the IDs of all cached chunks to the workstation for statistical
  // purposes.
  absl::Status SendCachedContentIds(std::vector<ContentIdProto> content_ids);
  // DataStoreReader:
  // Reads up to |size| bytes at |offset| from chunk |key| into |data|.
  // Fix: parameter names reordered to (offset, size) to match the sibling
  // implementations (e.g. MemDataStore::Get) and the call sites, which all
  // pass the offset as the third argument.
  absl::StatusOr<size_t> Get(const ContentIdProto& key, void* data,
                             uint64_t offset, uint64_t size) override;
  absl::Status Get(ChunkTransferList* chunks) override;
  absl::Status Get(const ContentIdProto& key, Buffer* data) override;

 private:
  std::unique_ptr<AssetStreamClient> client_;
};
} // namespace cdc_ft
#endif // DATA_STORE_GRPC_READER_H_

View File

@@ -0,0 +1,151 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/mem_data_store.h"
#include "common/status.h"
namespace cdc_ft {
// Defaulted out-of-line; the store owns no resources beyond its containers.
MemDataStore::MemDataStore() = default;
MemDataStore::~MemDataStore() = default;
// Stores |data| keyed by the hash of its contents and returns that id.
// Re-adding identical data simply replaces the (identical) stored bytes.
ContentIdProto MemDataStore::AddData(std::vector<char> data) {
  const ContentIdProto id = ContentId::FromArray(data.data(), data.size());
  data_lookup_.insert_or_assign(id, std::move(data));
  return id;
}
// Serializes |message| and stores the resulting bytes, returning their id.
ContentIdProto MemDataStore::AddProto(
    const google::protobuf::MessageLite& message) {
  std::vector<char> serialized(message.ByteSizeLong());
  message.SerializeToArray(serialized.data(),
                           static_cast<int>(serialized.size()));
  return AddData(std::move(serialized));
}
// Copies at most |size| bytes at |offset| from chunk |id| into |data|.
// Returns the number of bytes copied (0 if |offset| is past the end), or
// NotFound if the id is unknown.
absl::StatusOr<size_t> MemDataStore::Get(const ContentIdProto& id, void* data,
                                         size_t offset, size_t size) {
  auto it = data_lookup_.find(id);
  if (it == data_lookup_.end()) {
    return absl::NotFoundError(absl::StrFormat("Failed to find data id '%s'",
                                               ContentId::ToHexString(id)));
  }
  const std::vector<char>& stored = it->second;
  if (offset >= stored.size()) return 0;
  const uint64_t count = std::min<uint64_t>(stored.size() - offset, size);
  memcpy(data, stored.data() + offset, count);
  return count;
}
// Copies the complete chunk |id| into |data|, replacing its previous
// contents. Returns NotFound if the id is unknown.
absl::Status MemDataStore::Get(const ContentIdProto& id, Buffer* data) {
  auto it = data_lookup_.find(id);
  if (it == data_lookup_.end()) {
    return absl::NotFoundError(absl::StrFormat("Failed to find data id '%s'",
                                               ContentId::ToHexString(id)));
  }
  data->clear();
  data->append(it->second.data(), it->second.size());
  return absl::OkStatus();
}
// Serves as many undone chunks from |chunks| as the store can. Chunks with an
// unknown id are left undone (not an error). For each served chunk the full
// data is cached in |chunk_data| and the requested range is copied into the
// chunk's target buffer.
absl::Status MemDataStore::Get(ChunkTransferList* chunks) {
  for (ChunkTransferTask& chunk : *chunks) {
    if (chunk.done) continue;
    auto it = data_lookup_.find(chunk.id);
    if (it == data_lookup_.end()) continue;
    // Copy the potentially prefetched string for caching.
    chunk.chunk_data = std::string(it->second.data(), it->second.size());
    if (!chunk.size) {
      chunk.done = true;
      continue;
    }
    if (chunk.offset >= chunk.chunk_data.size()) {
      return absl::OutOfRangeError(absl::StrFormat(
          "Chunk '%s': requested offset %u is larger or equal than size %u",
          ContentId::ToHexString(chunk.id), chunk.offset,
          chunk.chunk_data.size()));
    }
    uint64_t bytes_to_copy =
        std::min<uint64_t>(chunk.chunk_data.size() - chunk.offset, chunk.size);
    if (bytes_to_copy < chunk.size) {
      return absl::DataLossError(
          absl::StrFormat("Chunk '%s': requested size %u at offset %u is "
                          "larger than chunk size %u",
                          ContentId::ToHexString(chunk.id), chunk.size,
                          chunk.offset, chunk.chunk_data.size()));
    }
    // Fix: guard against a null target buffer, matching
    // GrpcReader::Get(ChunkTransferList*), which only copies when
    // |chunk.data| is set. Previously a cache-only request (null |data|)
    // would memcpy into a null pointer.
    if (chunk.data) {
      memcpy(chunk.data, chunk.chunk_data.data() + chunk.offset,
             bytes_to_copy);
    }
    chunk.done = true;
  }
  return absl::OkStatus();
}
// Returns true if a chunk with the given id is stored.
bool MemDataStore::Contains(const ContentIdProto& content_id) {
  return data_lookup_.count(content_id) > 0;
}
absl::Status MemDataStore::Put(const ContentIdProto& content_id,
const void* data, size_t size) {
data_lookup_[content_id] =
std::vector<char>(reinterpret_cast<const char*>(data),
reinterpret_cast<const char*>(data) + size);
return absl::OkStatus();
}
// Removes the chunk with the given id; a no-op if it is not stored.
absl::Status MemDataStore::Remove(const ContentIdProto& content_id) {
  data_lookup_.erase(content_id);
  return absl::OkStatus();
}
// Removes all stored chunks.
absl::Status MemDataStore::Wipe() {
  data_lookup_.clear();
  return absl::OkStatus();
}
// Deletes every stored chunk that is not listed in |ids_to_keep|. Returns
// NotFound if some ids in |ids_to_keep| were not present in the store.
absl::Status MemDataStore::Prune(
    std::unordered_set<ContentIdProto> ids_to_keep) {
  // Find the set of chunks not in |ids_to_keep|.
  // |ids_to_keep| is simultaneously reduced to the ids that were NOT found
  // in the store, so it doubles as the missing-id set for the check below.
  std::vector<ContentIdProto> to_delete;
  for (const auto& [id, _] : data_lookup_) {
    if (ids_to_keep.find(id) == ids_to_keep.end())
      to_delete.push_back(id);
    else
      ids_to_keep.erase(id);
  }
  // Delete chunks not in |ids_to_keep|.
  for (const ContentIdProto& id : to_delete) {
    data_lookup_.erase(id);
  }
  // Verify that all chunks in |ids_to_keep| are present in the store.
  if (!ids_to_keep.empty()) {
    return absl::NotFoundError(absl::StrFormat(
        "%u chunks, e.g. '%s', not found in the store", ids_to_keep.size(),
        ContentId::ToHexString(*ids_to_keep.begin())));
  }
  return absl::OkStatus();
}
} // namespace cdc_ft

View File

@@ -0,0 +1,82 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATA_STORE_MEM_DATA_STORE_H_
#define DATA_STORE_MEM_DATA_STORE_H_
#include <string>
#include <vector>
#include "data_store/data_store_writer.h"
#include "manifest/content_id.h"
namespace cdc_ft {
// In-memory implementation of a DataStoreWriter. Data needs to be pre-
// populated manually using AddData() and AddProto(). Useful for testing.
class MemDataStore : public DataStoreWriter {
 public:
  // Maps content ids to raw chunk bytes.
  using ChunkMap = std::unordered_map<ContentIdProto, std::vector<char>>;
  MemDataStore();
  MemDataStore(const MemDataStore&) = delete;
  MemDataStore& operator=(const MemDataStore&) = delete;
  virtual ~MemDataStore();
  // TODO: Extract AddData in a helper function.
  // Adds |data| to the memory-backed storage and returns the id to it.
  ContentIdProto AddData(std::vector<char> data);
  // TODO: Extract AddProto in a helper function.
  // Serializes |message|, adds it to the memory-backed storage, and returns the
  // id to it.
  ContentIdProto AddProto(const google::protobuf::MessageLite& message);
  // Note: DO NOT MIX Add* and Get* methods in a multi-threaded environment!
  // Get* methods are thread-safe as they are read-only, but Add* methods write
  // to the data. They are not thread-safe.
  // DataStoreReader:
  absl::StatusOr<size_t> Get(const ContentIdProto& id, void* data,
                             size_t offset, size_t size) override;
  absl::Status Get(const ContentIdProto& content_id, Buffer* data) override;
  absl::Status Get(ChunkTransferList* chunks) override;
  // DataStoreWriter:
  bool Contains(const ContentIdProto& content_id) override;
  absl::Status Put(const ContentIdProto& content_id, const void* data,
                   size_t size) override;
  absl::Status Remove(const ContentIdProto& content_id) override;
  absl::Status Wipe() override;
  absl::Status Prune(std::unordered_set<ContentIdProto> ids_to_keep) override;
  // Direct access to the chunks for testing.
  const ChunkMap& Chunks() const { return data_lookup_; }
  ChunkMap& Chunks() { return data_lookup_; }
 private:
  // Maps content IDs to chunks.
  ChunkMap data_lookup_;
};
} // namespace cdc_ft
#endif // DATA_STORE_MEM_DATA_STORE_H_

View File

@@ -0,0 +1,188 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "data_store/mem_data_store.h"
#include "common/status_test_macros.h"
#include "gtest/gtest.h"
#include "manifest/content_id.h"
namespace cdc_ft {
namespace {
// Verifies that two stored chunks can be read back independently and fully.
TEST(MemDataStoreTest, GetWithMultipleIds) {
  std::vector<char> expected_data1 = {1, 3, 3, 7};
  std::vector<char> expected_data2 = {15, 0, 0, 13, 0};
  MemDataStore p;
  ContentIdProto id1 = p.AddData(expected_data1);
  ContentIdProto id2 = p.AddData(expected_data2);
  std::vector<char> data1;
  std::vector<char> data2;
  data1.resize(expected_data1.size());
  data2.resize(expected_data2.size());
  absl::StatusOr<uint64_t> bytes_read1 =
      p.Get(id1, data1.data(), 0, data1.size());
  absl::StatusOr<uint64_t> bytes_read2 =
      p.Get(id2, data2.data(), 0, data2.size());
  ASSERT_OK(bytes_read1);
  ASSERT_OK(bytes_read2);
  EXPECT_EQ(*bytes_read1, data1.size());
  EXPECT_EQ(*bytes_read2, data2.size());
  EXPECT_EQ(expected_data1, data1);
  EXPECT_EQ(expected_data2, data2);
}
// Verifies reading a sub-range that lies fully inside the chunk.
TEST(MemDataStoreTest, GetWithRangeInsideOfData) {
  MemDataStore p;
  ContentIdProto id = p.AddData({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
  std::vector<char> data;
  data.resize(5);
  absl::StatusOr<uint64_t> bytes_read =
      p.Get(id, data.data(), /*offset=*/2, data.size());
  ASSERT_OK(bytes_read);
  EXPECT_EQ(*bytes_read, data.size());
  EXPECT_EQ(data, std::vector<char>({2, 3, 4, 5, 6}));
}
// Verifies a read crossing the chunk end is truncated to the available data.
TEST(MemDataStoreTest, GetWithRangePartlyOutsideOfData) {
  MemDataStore p;
  ContentIdProto id = p.AddData({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
  std::vector<char> data;
  data.resize(5);
  absl::StatusOr<uint64_t> bytes_read =
      p.Get(id, data.data(), /*offset=*/7, data.size());
  ASSERT_OK(bytes_read);
  ASSERT_EQ(*bytes_read, 3);
  data.resize(3);
  EXPECT_EQ(data, std::vector<char>({7, 8, 9}));
}
// Verifies a read starting past the chunk end succeeds with 0 bytes.
TEST(MemDataStoreTest, GetWithRangeOutsideOfData) {
  MemDataStore p;
  ContentIdProto id = p.AddData({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
  std::vector<char> data;
  data.resize(5);
  absl::StatusOr<uint64_t> bytes_read =
      p.Get(id, data.data(), /*offset=*/12, data.size());
  ASSERT_OK(bytes_read);
  EXPECT_EQ(*bytes_read, 0);
}
// Verifies the Buffer overload returns the complete chunk.
TEST(MemDataStoreTest, GetWholeChunk) {
  std::vector<char> expected_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  Buffer expected_buffer;
  expected_buffer.append(expected_data.data(), expected_data.size());
  MemDataStore p;
  ContentIdProto id = p.AddData(std::move(expected_data));
  Buffer data;
  EXPECT_OK(p.Get(id, &data));
  EXPECT_EQ(data, expected_buffer);
}
// Verifies that protos stored via AddProto() round-trip through GetProto().
TEST(MemDataStoreTest, GetProtoWithMultipleKeys) {
  AssetProto expected_proto1;
  AssetProto expected_proto2;
  expected_proto1.set_type(AssetProto::DIRECTORY);
  expected_proto2.set_type(AssetProto::FILE);
  expected_proto1.set_name("dir");
  expected_proto2.set_name("file");
  // Use a MemDataStore to get test data in.
  // Note that GetProto is implemented by DataStoreReader.
  MemDataStore p;
  ContentIdProto key1 = p.AddProto(expected_proto1);
  ContentIdProto key2 = p.AddProto(expected_proto2);
  AssetProto proto1;
  AssetProto proto2;
  EXPECT_OK(p.GetProto(key1, &proto1));
  EXPECT_OK(p.GetProto(key2, &proto2));
  EXPECT_EQ(expected_proto1.type(), proto1.type());
  // Bug fix: this assertion previously duplicated the proto1 type check, so
  // proto2's type was never verified.
  EXPECT_EQ(expected_proto2.type(), proto2.type());
  EXPECT_EQ(expected_proto1.name(), proto1.name());
  EXPECT_EQ(expected_proto2.name(), proto2.name());
}
// Verifies the Put()/Contains()/Get() round-trip on raw bytes.
TEST(MemDataStoreTest, PutGet) {
  std::vector<char> expected_data = {1, 3, 3, 7};
  ContentIdProto content_id =
      ContentId::FromArray(expected_data.data(), expected_data.size());
  MemDataStore p;
  ASSERT_OK(p.Put(content_id, expected_data.data(), expected_data.size()));
  ASSERT_TRUE(p.Contains(content_id));
  std::vector<char> data;
  data.resize(expected_data.size());
  absl::StatusOr<uint64_t> bytes_read =
      p.Get(content_id, data.data(), 0, data.size());
  ASSERT_OK(bytes_read);
  EXPECT_EQ(*bytes_read, data.size());
  EXPECT_EQ(expected_data, data);
}
// Verifies Prune() keeps exactly the requested ids and removes the rest.
TEST(MemDataStoreTest, PruneSucceeds) {
  MemDataStore p;
  ContentIdProto content_ids[4];
  for (size_t n = 0; n < std::size(content_ids); ++n) {
    content_ids[n] = ContentId::FromArray(&n, sizeof(n));
    EXPECT_OK(p.Put(content_ids[n], &n, sizeof(n)));
  }
  std::unordered_set<ContentIdProto> ids_to_keep = {content_ids[0],
                                                    content_ids[2]};
  EXPECT_OK(p.Prune(std::move(ids_to_keep)));
  EXPECT_TRUE(p.Contains(content_ids[0]));
  EXPECT_TRUE(p.Contains(content_ids[2]));
  EXPECT_FALSE(p.Contains(content_ids[1]));
  EXPECT_FALSE(p.Contains(content_ids[3]));
}
// Verifies Prune() returns NotFound when an id to keep is absent, while
// still removing chunks that were not requested to be kept.
TEST(MemDataStoreTest, PruneFailsNotFound) {
  MemDataStore p;
  ContentIdProto content_ids[2];
  for (size_t n = 0; n < std::size(content_ids); ++n)
    content_ids[n] = ContentId::FromArray(&n, sizeof(n));
  // Only content_ids[0] is stored (as an empty chunk).
  EXPECT_OK(p.Put(content_ids[0], nullptr, 0));
  std::unordered_set<ContentIdProto> ids_to_keep = {content_ids[1]};
  EXPECT_TRUE(absl::IsNotFound(p.Prune(std::move(ids_to_keep))));
  EXPECT_FALSE(p.Contains(content_ids[0]));
}
} // namespace
} // namespace cdc_ft