mirror of
https://github.com/nestriness/cdc-file-transfer.git
synced 2026-01-30 12:35:35 +02:00
This change introduces dynamic manifest updates to asset streaming.

Asset streaming describes the directory to be streamed in a manifest, which is a proto definition of all content metadata. This information is sufficient to answer `stat` and `readdir` calls in the FUSE layer without additional round-trips to the workstation.

When a directory is streamed for the first time, the corresponding manifest is created in two steps:

1. The directory is traversed recursively and the inode information of all contained files and directories is written to the manifest.
2. The content of all identified files is processed to generate each file's chunk list. This list is part of the definition of a file in the manifest.
   * The chunk boundaries are identified using our implementation of the FastCDC algorithm.
   * The hash of each chunk is calculated using the BLAKE3 hash function.
   * The length and hash of each chunk are appended to the file's chunk list.

Prior to this change, when the user mounted a workstation directory on a client, the asset streaming server pushed an intermediate manifest to the gamelet as soon as step 1 was completed. At this point, the FUSE client started serving the virtual file system and was ready to answer `stat` and `readdir` calls. In case the FUSE client received any call that required file contents, such as `read`, it would block the caller until the server completed step 2 above and pushed the final manifest to the client.

This works well for large directories (> 100GB) with a reasonable number of files (< 100k). But when dealing with millions of tiny files, creating the full manifest can take several minutes.

With this change, we introduce dynamic manifest updates. When the FUSE layer receives an `open` or `readdir` request for a file or directory that is incomplete, it sends an RPC to the workstation about what information is missing from the manifest. The workstation identifies the corresponding file chunker or directory scanner tasks and moves them to the front of the queue. As soon as the task is completed, the workstation pushes an updated intermediate manifest to the client which now includes the information to serve the FUSE request. The queued FUSE request is resumed and returns the result to the caller.

While this does not reduce the required time to build the final manifest, it splits up the work into smaller tasks. This allows us to interrupt the current work and prioritize those tasks which are required to handle an incoming request from the client. While this still takes a round-trip to the workstation plus the processing time for the task, an updated manifest is received within a few seconds, which is much better than blocking for several minutes.

This latency is only visible when serving data while the manifest is still being created. The situation improves as the manifest creation on the workstation progresses. As soon as the final manifest is pushed, all metadata can be served directly without having to wait for pending tasks.
317 lines
12 KiB
C++
317 lines
12 KiB
C++
/*
|
|
* Copyright 2022 Google LLC
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef MANIFEST_MANIFEST_UPDATER_H_
|
|
#define MANIFEST_MANIFEST_UPDATER_H_
|
|
|
|
#include <list>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "absl/status/statusor.h"
|
|
#include "common/buffer.h"
|
|
#include "manifest/asset_builder.h"
|
|
#include "manifest/file_chunk_map.h"
|
|
#include "manifest/manifest_proto_defs.h"
|
|
#include "manifest/pending_assets_queue.h"
|
|
|
|
namespace cdc_ft {
|
|
namespace fastcdc {
struct Config;
}  // namespace fastcdc

// Forward declarations. This header refers to these types only through
// pointers (or std::unique_ptr), so their full definitions are not required
// here.
class AssetBuilder;
class DataStoreWriter;
class DirScannerTask;
class FileChunkerTask;
class ManifestBuilder;
class Threadpool;
|
|
|
|
// Configuration passed to ManifestUpdater: the directory to build the manifest
// from and the parameters used for chunking.
struct UpdaterConfig {
  // Source directory from which the manifest is built recursively.
  std::string src_dir;

  // Minimum allowed chunk size. Default: 128 << 10 (131072).
  size_t min_chunk_size = 128 << 10;

  // Target average chunk size. Default: 256 << 10 (262144).
  size_t avg_chunk_size = 256 << 10;

  // Maximum allowed chunk size. Default: 1024 << 10 (1048576).
  size_t max_chunk_size = 1024 << 10;

  // Size of the chunker thread pool. Defaults to the number of available CPUs
  // (presumably selected by the value 0; the resolution happens outside of
  // this header -- TODO confirm).
  uint32_t num_threads = 0;
};
|
|
|
|
// Counters describing the outcome of a manifest update. All counters start at
// zero.
struct UpdaterStats {
  // Total no. of assets that were added or updated.
  size_t total_assets_added_or_updated = 0;

  // Total no. of assets of type FILE that were added or updated.
  size_t total_files_added_or_updated = 0;

  // Total no. of files where processing failed.
  size_t total_files_failed = 0;

  // Total no. of directories where processing failed.
  size_t total_dirs_failed = 0;

  // Total no. of assets that were deleted (not counting subdirectory files).
  size_t total_assets_deleted = 0;

  // Total no. of chunks created.
  size_t total_chunks = 0;

  // Total no. of bytes processed from the files added or updated.
  size_t total_processed_bytes = 0;
};
|
|
|
|
struct AssetInfo {
|
|
// Unix path to the asset relative to the source directory.
|
|
std::string path;
|
|
|
|
// Type (file, dir, etc.)
|
|
AssetProto::Type type = AssetProto::FILE;
|
|
|
|
// Modification time in seconds since Epoch.
|
|
int64_t mtime = 0;
|
|
|
|
// File size (0 for directories).
|
|
uint64_t size = 0;
|
|
|
|
// File chunks (empty for directories). This list is ignored when comparing
|
|
// one AssetInfo to another.
|
|
std::vector<FileChunk> chunks;
|
|
|
|
// Appends the chunks from |list| to |chunks|.
|
|
void AppendCopyChunks(const RepeatedChunkRefProto& list,
|
|
uint64_t list_offset);
|
|
|
|
// Appends the chunks from |list| to |chunks|, but moves the data out of
|
|
// |list| instead of copying, wherever possible.
|
|
void AppendMoveChunks(RepeatedChunkRefProto* list, uint64_t list_offset);
|
|
|
|
bool operator==(const AssetInfo& other) const {
|
|
return path == other.path && type == other.type && mtime == other.mtime &&
|
|
size == other.size;
|
|
}
|
|
bool operator!=(const AssetInfo& other) const { return !(*this == other); }
|
|
|
|
// Compares by file path.
|
|
bool operator<(const AssetInfo& other) const { return path < other.path; }
|
|
};
|
|
|
|
// Incrementally updates a manifest
|
|
class ManifestUpdater {
|
|
public:
|
|
// Selects the update operation to be performed.
|
|
enum class Operator { kAdd, kUpdate, kDelete, kKeep };
|
|
|
|
// Represents an update operation that shall be applied to the owned manifest.
|
|
struct Operation {
|
|
Operation() {}
|
|
Operation(Operator op, AssetInfo info) : op(op), info(std::move(info)) {}
|
|
|
|
Operator op;
|
|
AssetInfo info;
|
|
};
|
|
|
|
using OperationList = std::vector<Operation>;
|
|
|
|
// Permissions for executable files.
|
|
static constexpr uint32_t kExecutablePerms = 0755u;
|
|
|
|
// Id of the chunk that stores the manifest id.
|
|
static ContentIdProto GetManifestStoreId();
|
|
|
|
// Returns an error if |dir| does not exist or it is not a directory.
|
|
static absl::Status IsValidDir(std::string dir);
|
|
|
|
using PushManifestHandler =
|
|
std::function<void(const ContentIdProto& manifest_id)>;
|
|
|
|
// |data_store| is used to store manifest chunks. File data chunks are not
|
|
// stored explicitly as they can be read from the original files.
|
|
// |cfg| determines the source directory to update the manifest from as well
|
|
// as configuration details about chunking.
|
|
ManifestUpdater(DataStoreWriter* data_store, UpdaterConfig cfg);
|
|
~ManifestUpdater();
|
|
|
|
ManifestUpdater(const ManifestUpdater&) = delete;
|
|
ManifestUpdater& operator=(const ManifestUpdater&) = delete;
|
|
|
|
// Reads the full source directory and syncs the manifest to it. Prunes old,
|
|
// unreferenced manifest chunks. Updates and flushes |file_chunks|.
|
|
//
|
|
// If a valid |push_handler| is passed, then a manifest is flushed at least
|
|
// twice and the handler is called:
|
|
// - after the root directory has been added, but before all files and
|
|
// directories have been processed. That means, the manifest does not yet
|
|
// contains all assets, all incomplete assets are set to in-progress.
|
|
// - after an asset that was prioritized with AddPriorityAssets() has been
|
|
// completed.
|
|
// - at the end of the update process in case of success
|
|
absl::Status UpdateAll(FileChunkMap* file_chunks,
|
|
PushManifestHandler push_handler = nullptr);
|
|
|
|
// Updates the manifest by applying the |operations| list. Deletions are
|
|
// handled first to make the outcome independent of the order in the list.
|
|
// Also updates and flushes |file_chunks| with the changes made. The
|
|
// |push_handler| is called at least twice during the operation and at the
|
|
// end, see UpdateAll() for more details.
|
|
//
|
|
// All paths should be Unix paths. If |recursive| is true, then a directory
|
|
// scanner task is enqueued for each directory that is added to the manifest.
|
|
// This is only needed during UpdateAll(). When the manifest is updated in
|
|
// response to file watcher changes, then |recursive| should be set to false.
|
|
absl::Status Update(OperationList* operations, FileChunkMap* file_chunks,
|
|
PushManifestHandler push_handler, bool recursive = false);
|
|
|
|
// Content id of the current manifest.
|
|
const ContentIdProto& ManifestId() const { return manifest_id_; }
|
|
|
|
// Returns stats created from the last call to UpdateAll() or Update().
|
|
const UpdaterStats& Stats() const { return stats_; }
|
|
|
|
// Returns the manifest updater configuration.
|
|
const UpdaterConfig& Config() const { return cfg_; }
|
|
|
|
// Returns an empty manifest.
|
|
ContentIdProto DefaultManifestId();
|
|
|
|
// Appends the given |rel_paths| to the list of assets to prioritize. All
|
|
// paths must be given as Unix paths.
|
|
void AddPriorityAssets(std::vector<std::string> rel_paths)
|
|
ABSL_LOCKS_EXCLUDED(priority_mutex_);
|
|
|
|
private:
|
|
// Holds the number of queued tasks returned by QueueTasks().
|
|
struct QueueTasksResult {
|
|
size_t dir_scanners = 0, file_chunkers = 0;
|
|
};
|
|
|
|
// Adds enough pending assets from |queue_| as tasks to the |pool| to keep all
|
|
// worker threads busy. If |drain_dir_scanner_tasks| is true, only
|
|
// FileChunkerTasks are queued, others are skipped. Returns the number of
|
|
// tasks that were queued as a QueueTaskResult.
|
|
QueueTasksResult QueueTasks(bool drain_dir_scanner_tasks, Threadpool* pool,
|
|
const fastcdc::Config* cdc_cfg);
|
|
|
|
// Modifies the list of queued tasks to prioritize those assets that were
|
|
// previously selected using the AddPriorityAssets() method.
|
|
void PrioritizeQueuedAssets() ABSL_LOCKS_EXCLUDED(priority_mutex_);
|
|
|
|
// Returns true if all of the following conditions are satisfied:
|
|
// - |push_manifest_handler| is valid
|
|
// - the flush deadline that was set by a prioritized asset is due
|
|
// - the manifest was not flushed recently
|
|
bool WantManifestFlushed(PushManifestHandler push_manifest_handler) const;
|
|
|
|
// Checks if it is safe and desired to flush the manifest, then calls
|
|
// FlushAndPushManifest() if that is the case.
|
|
// |dir_scanner_tasks_queued| must be the number of currently queued
|
|
// DirScannerTasks.
|
|
// |file_chunks| is updated by the flush operation, if it is executed.
|
|
// |push_manifest_handler| is invoked if the manifest gets flushed and pushed.
|
|
absl::Status MaybeFlushAndPushManifest(
|
|
size_t dir_scanner_tasks_queued, FileChunkMap* file_chunks,
|
|
std::unordered_set<ContentIdProto>* manifest_content_ids,
|
|
PushManifestHandler push_manifest_handler);
|
|
|
|
// Flushes the in-progress manifest and the updates queued in |file_chunks|.
|
|
// If |push_manifest_handler| is not nullptr, it is invoked with the resulting
|
|
// manifest ID.
|
|
absl::Status FlushAndPushManifest(
|
|
FileChunkMap* file_chunks,
|
|
std::unordered_set<ContentIdProto>* manifest_content_ids,
|
|
PushManifestHandler push_manifest_handler);
|
|
|
|
// Applies the |operations| list to the manifest owned by the manifest
|
|
// builder. First, all deletions are handled and the corresponding files are
|
|
// removed from the |file_chunks| map, then all added or updated assets are
|
|
// processed. This guarantees that the outcome is independent of the order in
|
|
// the list.
|
|
//
|
|
// If |parent| is non-null, then it must be of type DIRECTORY and all added
|
|
// assets are made direct children of |parent|. The function does *not* verify
|
|
// that all children have |parent| as directory path. This is used to
|
|
// efficently handle the result of a DirScannerTask.
|
|
//
|
|
// Enqueues tasks to chunk the given files for files that were added or
|
|
// updated. If |recursive| is true, then it will also enqueue directory
|
|
// scanner tasks for all given directories. All follow-up tasks have the given
|
|
// |deadline| set, which determines the deadline after which the manifest
|
|
// should be flushed.
|
|
absl::Status ApplyOperations(std::vector<Operation>* operations,
|
|
FileChunkMap* file_chunks, AssetBuilder* parent,
|
|
absl::Time deadline, bool recursive);
|
|
|
|
// Handles the results of a completed FileChunkerTask.
|
|
absl::Status HandleFileChunkerResult(FileChunkerTask* task,
|
|
FileChunkMap* file_chunks);
|
|
|
|
// Handles the results of a completed DirScannerTask.
|
|
absl::Status HandleDirScannerResult(
|
|
DirScannerTask* task, FileChunkMap* file_chunks,
|
|
std::unordered_set<ContentIdProto>* manifest_content_ids);
|
|
|
|
// Queue of pending assets waiting for completion.
|
|
PendingAssetsQueue queue_;
|
|
|
|
// Pool of pre-allocated buffers
|
|
std::vector<Buffer> buffers_;
|
|
|
|
// Store for manifest chunks and the manifest id itself.
|
|
DataStoreWriter* const data_store_;
|
|
|
|
// Source directory to build the manifest from and configuration details.
|
|
UpdaterConfig cfg_;
|
|
|
|
// ID of the manifest chunk.
|
|
ContentIdProto manifest_id_;
|
|
|
|
// Stats for the last Update*() operation.
|
|
UpdaterStats stats_;
|
|
|
|
// The builder used for updating the manifest.
|
|
std::unique_ptr<ManifestBuilder> manifest_builder_;
|
|
|
|
// Holds the assets that should be prioritized while updating the manifest.
|
|
std::vector<PriorityAsset> priority_assets_ ABSL_GUARDED_BY(priority_mutex_);
|
|
absl::Mutex priority_mutex_;
|
|
|
|
// Deadline by which the manifest should be flushed again.
|
|
absl::Time flush_deadline_ = absl::InfiniteFuture();
|
|
|
|
// The time when the manifest was flushed last.
|
|
absl::Time last_manifest_flush_;
|
|
|
|
// How much time we allow at least for processing a prioritized asset. The
|
|
// manifest won't be flushed for that time, to allow more assets to be
|
|
// finalized before the manifest is sent to the client.
|
|
static constexpr absl::Duration kMinAssetProcessingTime =
|
|
absl::Milliseconds(200);
|
|
|
|
// How often we allow an intermediate manifest to be flushed and pushed.
|
|
static constexpr absl::Duration kMinDelayBetweenFlush =
|
|
absl::Milliseconds(500);
|
|
};
|
|
|
|
}; // namespace cdc_ft
|
|
|
|
#endif // MANIFEST_MANIFEST_UPDATER_H_
|