Releasing the former Stadia file transfer tools

The tools allow efficient and fast synchronization of large directory
trees from a Windows workstation to a Linux target machine.

cdc_rsync* supports efficient copying of files by using content-defined
chunking (CDC) to identify chunks within files that can be reused.

asset_stream_manager + cdc_fuse_fs support efficient streaming of a
local directory to a remote virtual file system based on FUSE. They
also employ CDC to identify and reuse unchanged data chunks.
Christian Schneider
2022-10-07 10:47:04 +02:00
commit 4326e972ac
364 changed files with 49410 additions and 0 deletions
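
As a rough illustration of the content-defined chunking idea used by these
tools, here is a minimal sketch (not this repository's fastcdc implementation;
the function names, gear values, mask, and size limits are made up for the
example). A rolling "gear" hash is updated for every byte, and a chunk boundary
is declared wherever the hash matches a bit mask, so boundaries depend on the
data itself rather than on file offsets. FastCDC additionally varies the mask
around the target chunk size (the `mask_s`/`mask_l` values seen in the indexer
below), which this sketch omits:

```
// Sketch of content-defined chunking with a rolling gear hash. All constants
// and names are illustrative, not the values used by this project.
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t gear[256];  // pseudo-random per-byte values, filled in main()

std::vector<size_t> CutPoints(const uint8_t* data, size_t len, uint64_t mask,
                              size_t min_size, size_t max_size) {
  std::vector<size_t> cuts;
  uint64_t h = 0;
  size_t start = 0;
  for (size_t i = 0; i < len; ++i) {
    h = (h << 1) + gear[data[i]];
    size_t chunk_len = i + 1 - start;
    if ((chunk_len >= min_size && (h & mask) == 0) || chunk_len >= max_size) {
      cuts.push_back(i + 1);  // boundary: chunk covers [start, i + 1)
      start = i + 1;
      h = 0;
    }
  }
  if (start < len) cuts.push_back(len);  // trailing partial chunk
  return cuts;
}

int main() {
  // Fill the gear table from a simple xorshift64 generator.
  uint64_t s = 0x9E3779B97F4A7C15ull;
  for (uint64_t& g : gear) {
    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    g = s;
  }
  std::vector<uint8_t> data(1 << 20);
  for (size_t i = 0; i < data.size(); ++i)
    data[i] = static_cast<uint8_t>(i * 2654435761u);
  auto cuts = CutPoints(data.data(), data.size(), (1u << 13) - 1, 2048, 65536);
  std::printf("%zu chunks\n", cuts.size());
  return 0;
}
```

Because boundaries are derived from the content, an insertion near the start of
a file only disturbs the chunks around the edit; later chunks keep their
boundaries and hashes and can be reused.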

cdc_indexer/BUILD

@@ -0,0 +1,35 @@
package(default_visibility = ["//visibility:public"])
cc_binary(
name = "cdc_indexer",
srcs = ["main.cc"],
deps = [
":indexer_lib",
"//absl_helper:jedec_size_flag",
"//common:path",
"@com_google_absl//absl/flags:config",
"@com_google_absl//absl/flags:flag",
"@com_google_absl//absl/flags:parse",
"@com_google_absl//absl/flags:usage",
"@com_google_absl//absl/random",
"@com_google_absl//absl/time",
],
)
cc_library(
name = "indexer_lib",
srcs = ["indexer.cc"],
hdrs = ["indexer.h"],
deps = [
"//common:dir_iter",
"//common:path",
"//common:status_macros",
"//fastcdc",
"@com_github_blake3//:blake3",
"@com_google_absl//absl/functional:bind_front",
"@com_google_absl//absl/random",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/time",
],
)

cdc_indexer/README.md

@@ -0,0 +1,72 @@
# CDC Indexer
This directory contains a CDC indexer based on our implementation of
[FastCDC](https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf).
Run the sample with Bazel:
```
bazel run -c opt //cdc_indexer -- --inputs '/path/to/files'
```
The CDC algorithm can be tweaked with a few compile-time constants for
experimentation. See the file `indexer.h` for preprocessor macros that can be
overridden at build time, for example:
```
bazel build -c opt --copt=-DCDC_GEAR_TABLE=1 //cdc_indexer
```
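The macros defined at the top of `indexer.h` are `CDC_GEAR_TABLE` (selects the
32-bit or 64-bit gear table), `CDC_MASK_STAGES`, and
`CDC_MASK_BIT_LSHIFT_AMOUNT`.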
At the end of the operation, the indexer outputs a summary of the results such
as the following:
```
00:02 7.44 GB in 2 files processed at 3.1 GB/s, 50% deduplication
Operation succeeded.
Chunk size (min/avg/max): 128 KB / 256 KB / 1024 KB | Threads: 12
gear_table: 64 bit | mask_s: 0x49249249249249 | mask_l: 0x1249249249
Duration: 00:03
Total files: 2
Total chunks: 39203
Unique chunks: 20692
Total data: 9.25 GB
Unique data: 4.88 GB
Throughput: 3.07 GB/s
Avg. chunk size: 247 KB
Deduplication: 47.2%
160 KB ######### 1419 ( 7%)
192 KB ######## 1268 ( 6%)
224 KB ################### 2996 (14%)
256 KB ######################################## 6353 (31%)
288 KB ###################### 3466 (17%)
320 KB ########################## 4102 (20%)
352 KB ###### 946 ( 5%)
384 KB 75 ( 0%)
416 KB 27 ( 0%)
448 KB 7 ( 0%)
480 KB 5 ( 0%)
512 KB 1 ( 0%)
544 KB 4 ( 0%)
576 KB 2 ( 0%)
608 KB 3 ( 0%)
640 KB 3 ( 0%)
672 KB 3 ( 0%)
704 KB 2 ( 0%)
736 KB 0 ( 0%)
768 KB 0 ( 0%)
800 KB 1 ( 0%)
832 KB 0 ( 0%)
864 KB 0 ( 0%)
896 KB 0 ( 0%)
928 KB 0 ( 0%)
960 KB 0 ( 0%)
992 KB 0 ( 0%)
1024 KB 9 ( 0%)
```
For testing multiple combinations and comparing the results, the indexer also
features a flag `--results_file="results.csv"` which appends the raw data to the
given file in CSV format. Combine this flag with `--description` to label each
experiment with additional columns.
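For example (the paths and labels are placeholders), a labeled run that appends
one row to `results.csv` could look like this:
```
bazel run -c opt //cdc_indexer -- --inputs '/path/to/files' \
    --avg_chunk_size=256K --results_file=results.csv \
    --description='64bit gear,baseline'
```
The description then occupies the leading columns of each row, followed by the
chunking parameters and the measured results.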

cdc_indexer/indexer.cc

@@ -0,0 +1,434 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cdc_indexer/indexer.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include "absl/functional/bind_front.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "blake3.h"
#include "common/dir_iter.h"
#include "common/errno_mapping.h"
#include "common/path.h"
#include "common/status_macros.h"
namespace cdc_ft {
struct IndexerJob {
std::string filepath;
};
class Indexer::Impl {
public:
Impl(const IndexerConfig& cfg, const std::vector<std::string>& inputs);
const IndexerConfig& Config() const;
// Calls the given `progress` function periodically until `SetDone(true)` is
// called.
void TriggerProgress(ProgressFn fn);
bool GetNextJob(IndexerJob* job);
bool HasError() const;
absl::Status Error() const;
void SetError(absl::Status err);
void SetDone(bool done);
inline const IndexerConfig& Cfg() const { return cfg_; }
inline Indexer::OpStats Stats() const;
inline Indexer::ChunkSizeMap ChunkSizes() const;
void AddChunk(const uint8_t* data, size_t len);
void AddFile();
private:
friend class Indexer;
// Calculates a hash value for the given data.
inline hash_t Hash(const uint8_t* data, size_t len);
inline hash_t HashBlake3(const uint8_t* data, size_t len);
inline hash_t HashXxhash(const uint8_t* data, size_t len);
// Finds the smallest size bucket that is >= size. Buckets grow in powers of
// two up to the min. chunk size, then in increments of max_chunk_size_step.
// If size exceeds the largest bucket, UINT64_MAX is returned.
inline size_t SizeBucket(size_t size) const;
IndexerConfig cfg_;
bool done_;
// The following members are all guarded by jobs_mutex_.
std::queue<std::string> inputs_;
DirectoryIterator dir_iter_;
std::mutex jobs_mutex_;
// Guarded by chunks_mutex_
Indexer::ChunkMap chunks_;
std::mutex chunks_mutex_;
// Guarded by stats_mutex_.
Indexer::OpStats stats_;
mutable std::mutex stats_mutex_;
// Guarded by chunk_sizes_mutex_;
Indexer::ChunkSizeMap chunk_sizes_;
mutable std::mutex chunk_sizes_mutex_;
// Guarded by result_mutex_
absl::Status result_;
mutable std::mutex result_mutex_;
};
class Indexer::Worker {
public:
Worker(Impl* impl);
void Run();
private:
absl::Status IndexFile(const std::string& filepath);
Impl* impl_;
absl::Cord buf_;
const fastcdc::Config cdc_cfg_;
};
// This class holds a `Worker` object and the associated `std::thread` object
// that executes it.
class Indexer::WorkerThread {
public:
WorkerThread() : worker(nullptr), thrd(nullptr) {}
~WorkerThread() {
if (thrd) {
if (thrd->joinable()) thrd->join();
delete thrd;
}
if (worker) {
delete worker;
}
}
Worker* worker;
std::thread* thrd;
};
Indexer::Impl::Impl(const IndexerConfig& cfg,
const std::vector<std::string>& inputs)
: cfg_(cfg), done_(false) {
// Perform some sanity checks on the config.
if (cfg_.num_threads == 0)
cfg_.num_threads = std::thread::hardware_concurrency();
if (cfg_.read_block_size == 0) cfg_.read_block_size = 4 << 10;
if (cfg_.avg_chunk_size == 0) cfg_.avg_chunk_size = 256 << 10;
if (cfg_.min_chunk_size == 0 || cfg_.min_chunk_size > cfg_.avg_chunk_size)
cfg_.min_chunk_size = cfg_.avg_chunk_size >> 1;
if (cfg_.max_chunk_size == 0 || cfg_.max_chunk_size < cfg_.avg_chunk_size)
cfg_.max_chunk_size = cfg_.avg_chunk_size << 1;
if (cfg_.max_chunk_size_step == 0)
cfg_.max_chunk_size_step =
cfg_.min_chunk_size > 0 ? cfg_.min_chunk_size : 128u;
// Populate the CDC bitmasks which the Chunker creates. This is only done so
// that they can be written to the output; setting them in the IndexerConfig
// has no effect.
fastcdc::Config ccfg(cfg_.min_chunk_size, cfg_.avg_chunk_size,
cfg_.max_chunk_size);
Indexer::Chunker chunker(ccfg, nullptr);
cfg_.mask_s = chunker.Stage(0).mask;
cfg_.mask_l = chunker.Stage(chunker.StagesCount() - 1).mask;
// Collect inputs.
for (auto it = inputs.begin(); it != inputs.end(); ++it) {
inputs_.push(*it);
}
}
const IndexerConfig& Indexer::Impl::Config() const { return cfg_; }
// Executes the `progress` function in a loop, approximately every 200ms. Call
// `SetDone(true)` to stop this function.
void Indexer::Impl::TriggerProgress(Indexer::ProgressFn fn) {
if (!fn) return;
const int64_t interval = 200;
absl::Time started = absl::Now();
// Keep going until we're done or an error has occurred.
while (!done_ && !HasError()) {
absl::Time loop_started = absl::Now();
stats_mutex_.lock();
stats_.elapsed = loop_started - started;
stats_mutex_.unlock();
fn(Stats());
// Aim for one update every interval.
auto loop_elapsed = absl::ToInt64Milliseconds(absl::Now() - loop_started);
if (loop_elapsed < interval)
std::this_thread::sleep_for(
std::chrono::milliseconds(interval - loop_elapsed));
}
}
bool Indexer::Impl::GetNextJob(IndexerJob* job) {
// Stop if an error occurred.
if (HasError()) return false;
const std::lock_guard<std::mutex> lock(jobs_mutex_);
DirectoryEntry dent;
while (!dent.Valid()) {
// Open the next directory, if needed.
if (!dir_iter_.Valid()) {
if (inputs_.empty()) {
// We are done.
return false;
} else {
std::string input = inputs_.front();
std::string uinput = path::ToUnix(input);
inputs_.pop();
// Return files as jobs.
if (path::FileExists(uinput)) {
job->filepath = uinput;
return true;
}
// Otherwise read the directory.
if (!dir_iter_.Open(input, DirectorySearchFlags::kFiles)) {
// Ignore permission errors.
if (absl::IsPermissionDenied(dir_iter_.Status())) {
continue;
}
if (!dir_iter_.Status().ok()) {
SetError(dir_iter_.Status());
}
return false;
}
}
}
if (dir_iter_.NextEntry(&dent)) {
break;
} else if (!dir_iter_.Status().ok()) {
SetError(dir_iter_.Status());
return false;
}
}
path::Join(&job->filepath, dir_iter_.Path(), dent.RelPathName());
return true;
}
void Indexer::Impl::SetDone(bool done) { done_ = done; }
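// Example (worked through with min_chunk_size = 128 KiB and
// max_chunk_size_step = 32 KiB): for a 150 KiB chunk, the bucket doubles
// 1 KiB -> 2 KiB -> ... -> 128 KiB, then steps 128 KiB -> 160 KiB, so the
// chunk is counted in the 160 KiB bucket.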
inline size_t Indexer::Impl::SizeBucket(size_t size) const {
size_t bucket = 1024;
// Go in steps of powers of two until min. chunk size is reached.
while (bucket < size && bucket < cfg_.min_chunk_size && bucket < (1llu << 63))
bucket <<= 1;
// Go in steps of the configurable step size afterwards.
while (bucket < size && bucket < (1llu << 63))
bucket += cfg_.max_chunk_size_step;
return bucket >= size ? bucket : UINT64_MAX;
}
inline Indexer::OpStats Indexer::Impl::Stats() const {
const std::lock_guard<std::mutex> lock(stats_mutex_);
return stats_;
}
inline Indexer::ChunkSizeMap Indexer::Impl::ChunkSizes() const {
const std::lock_guard<std::mutex> lock(chunk_sizes_mutex_);
return chunk_sizes_;
}
Indexer::hash_t Indexer::Impl::HashBlake3(const uint8_t* data, size_t len) {
blake3_hasher state;
uint8_t out[BLAKE3_OUT_LEN];
blake3_hasher_init(&state);
blake3_hasher_update(&state, data, len);
blake3_hasher_finalize(&state, out, BLAKE3_OUT_LEN);
return Indexer::hash_t(reinterpret_cast<const char*>(out), BLAKE3_OUT_LEN);
}
Indexer::hash_t Indexer::Impl::Hash(const uint8_t* data, size_t len) {
switch (cfg_.hash_type) {
case IndexerConfig::HashType::kNull:
return hash_t();
case IndexerConfig::HashType::kBlake3:
return HashBlake3(data, len);
case IndexerConfig::HashType::kUndefined:
break;
}
std::cerr << "Unknown hash type" << std::endl;
return std::string();
}
void Indexer::Impl::AddChunk(const uint8_t* data, size_t len) {
std::string hash = Hash(data, len);
// See if the chunk already exists, insert it if not.
chunks_mutex_.lock();
bool new_chunk = chunks_.find(hash) == chunks_.end();
if (new_chunk) {
chunks_.emplace(hash, Chunk{hash, len});
}
chunks_mutex_.unlock();
// Update the stats.
stats_mutex_.lock();
stats_.total_bytes += len;
++stats_.total_chunks;
if (new_chunk) {
stats_.unique_bytes += len;
++stats_.unique_chunks;
}
stats_mutex_.unlock();
// Update chunk sizes distribution.
if (new_chunk) {
size_t bucket = SizeBucket(len);
chunk_sizes_mutex_.lock();
chunk_sizes_[bucket]++;
chunk_sizes_mutex_.unlock();
}
}
void Indexer::Impl::AddFile() {
const std::lock_guard<std::mutex> lock(stats_mutex_);
++stats_.total_files;
}
bool Indexer::Impl::HasError() const {
const std::lock_guard<std::mutex> lock(result_mutex_);
return !result_.ok();
}
absl::Status Indexer::Impl::Error() const {
const std::lock_guard<std::mutex> lock(result_mutex_);
return result_;
}
void Indexer::Impl::SetError(absl::Status err) {
// Ignore attempts to set a non-error.
if (err.ok()) return;
const std::lock_guard<std::mutex> lock(result_mutex_);
// Don't overwrite any previous error.
if (result_.ok()) result_ = err;
}
Indexer::Worker::Worker(Indexer::Impl* impl)
: impl_(impl),
cdc_cfg_(impl_->Cfg().min_chunk_size, impl_->Cfg().avg_chunk_size,
impl_->Cfg().max_chunk_size) {}
void Indexer::Worker::Run() {
IndexerJob job;
while (impl_->GetNextJob(&job)) {
absl::Status err = IndexFile(job.filepath);
if (!err.ok()) {
impl_->SetError(err);
return;
}
}
}
absl::Status Indexer::Worker::IndexFile(const std::string& filepath) {
std::FILE* fin = std::fopen(filepath.c_str(), "rb");
if (!fin) {
return ErrnoToCanonicalStatus(
errno, absl::StrFormat("failed to open file '%s'", filepath));
}
path::FileCloser closer(fin);
std::fseek(fin, 0, SEEK_SET);
auto hdlr = absl::bind_front(&Indexer::Impl::AddChunk, impl_);
Indexer::Chunker chunker(cdc_cfg_, hdlr);
std::vector<uint8_t> buf(impl_->Cfg().read_block_size, 0);
int err = 0;
while (!std::feof(fin)) {
size_t cnt = std::fread(buf.data(), sizeof(uint8_t), buf.size(), fin);
err = std::ferror(fin);
if (err) {
return ErrnoToCanonicalStatus(
err, absl::StrFormat("failed to read from file '%s'", filepath));
}
if (cnt) {
chunker.Process(buf.data(), cnt);
}
}
chunker.Finalize();
impl_->AddFile();
return absl::OkStatus();
}
IndexerConfig::IndexerConfig()
: read_block_size(32 << 10),
min_chunk_size(0),
avg_chunk_size(0),
max_chunk_size(0),
max_chunk_size_step(0),
num_threads(0),
mask_s(0),
mask_l(0) {}
Indexer::Indexer() : impl_(nullptr) {}
Indexer::~Indexer() {
if (impl_) delete impl_;
}
absl::Status Indexer::Run(const IndexerConfig& cfg,
const std::vector<std::string>& inputs,
Indexer::ProgressFn fn) {
if (impl_) delete impl_;
impl_ = new Impl(cfg, inputs);
// Start the file creation workers.
std::vector<WorkerThread> workers(impl_->Config().num_threads);
for (auto it = workers.begin(); it != workers.end(); ++it) {
auto worker = new Worker(impl_);
it->worker = worker;
it->thrd = new std::thread(&Worker::Run, worker);
}
// Start the progress function worker.
std::thread prog(&Impl::TriggerProgress, impl_, fn);
// Wait for the workers to finish.
for (auto it = workers.begin(); it != workers.end(); ++it) {
it->thrd->join();
}
// Wait for the progress worker to finish.
impl_->SetDone(true);
prog.join();
return Error();
}
absl::Status Indexer::Error() const {
return impl_ ? impl_->Error() : absl::Status();
}
IndexerConfig Indexer::Config() const {
if (impl_) return impl_->Cfg();
return IndexerConfig();
}
Indexer::OpStats Indexer::Stats() const {
if (impl_) return impl_->Stats();
return OpStats();
}
Indexer::ChunkSizeMap Indexer::ChunkSizes() const {
if (impl_) return impl_->ChunkSizes();
return Indexer::ChunkSizeMap();
}
inline Indexer::OpStats::OpStats()
: total_files(0),
total_chunks(0),
unique_chunks(0),
total_bytes(0),
unique_bytes(0) {}
}; // namespace cdc_ft

cdc_indexer/indexer.h

@@ -0,0 +1,145 @@
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CDC_INDEXER_INDEXER_H_
#define CDC_INDEXER_INDEXER_H_
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "absl/status/status.h"
#include "absl/time/time.h"
#include "fastcdc/fastcdc.h"
// Compile-time parameters for the FastCDC algorithm.
#define CDC_GEAR_32BIT 1
#define CDC_GEAR_64BIT 2
#ifndef CDC_GEAR_TABLE
#define CDC_GEAR_TABLE CDC_GEAR_64BIT
#endif
#ifndef CDC_MASK_STAGES
#define CDC_MASK_STAGES 7
#endif
#ifndef CDC_MASK_BIT_LSHIFT_AMOUNT
#define CDC_MASK_BIT_LSHIFT_AMOUNT 3
#endif
namespace cdc_ft {
struct IndexerConfig {
// The hash function to use.
enum class HashType {
kUndefined = 0,
// No hashing performed, always return an empty string.
kNull,
// Use BLAKE3 (cryptographic)
kBlake3,
};
IndexerConfig();
// Read file contents in the given block size from disk. The constructor
// initializes this to 32K; a value of 0 is replaced with 4K.
size_t read_block_size;
// The minimum allowed chunk size, defaults to avg_chunk_size/2.
size_t min_chunk_size;
// The target average chunk size.
size_t avg_chunk_size;
// The maximum allowed chunk size, defaults to 2*avg_chunk_size.
size_t max_chunk_size;
// Max. step size for bucketing the chunk size distribution.
size_t max_chunk_size_step;
// How many operations to run in parallel. If this value is zero, then
// `std::thread::hardware_concurrency()` is used.
uint32_t num_threads;
// Which hash function to use.
HashType hash_type;
// The masks will be populated by the indexer, setting them here has no
// effect. They are in this struct so that they can be conveniently accessed
// when printing the operation summary (and since they are derived from the
// configuration, they are technically part of it).
uint64_t mask_s;
uint64_t mask_l;
};
class Indexer {
public:
using hash_t = std::string;
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
typedef fastcdc::Chunker32<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
Chunker;
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
typedef fastcdc::Chunker64<CDC_MASK_STAGES, CDC_MASK_BIT_LSHIFT_AMOUNT>
Chunker;
#else
#error "Unknown gear table"
#endif
// Represents a chunk.
struct Chunk {
hash_t hash;
size_t size;
};
// Chunk storage, keyed by hash. std::unordered_map reduces the hash string to
// a 64-bit value internally, which is only acceptable for an experimental
// program like this.
typedef std::unordered_map<hash_t, Chunk> ChunkMap;
// Used for counting number of chunks in size buckets.
typedef std::unordered_map<size_t, uint64_t> ChunkSizeMap;
// Statistics about the current operation.
struct OpStats {
OpStats();
size_t total_files;
size_t total_chunks;
size_t unique_chunks;
size_t total_bytes;
size_t unique_bytes;
absl::Duration elapsed;
};
// Defines a callback function that can be used to display progress updates
// while the Indexer is busy.
typedef void(ProgressFn)(const OpStats& stats);
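// Example (illustrative, not part of the API): any plain function with this
// signature can be passed to Run(), e.g.
//   void PrintFiles(const Indexer::OpStats& stats) {
//     std::printf("\r%zu files", stats.total_files);
//   }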
Indexer();
~Indexer();
// Starts the indexing operation for the given configuration `cfg` and
// `inputs`. The optional callback function `fn` is called periodically with
// statistics about the ongoing operation.
absl::Status Run(const IndexerConfig& cfg,
const std::vector<std::string>& inputs, ProgressFn fn);
// Returns the status of the ongoing or completed operation.
absl::Status Error() const;
// Returns the configuration that was passed to Run().
IndexerConfig Config() const;
// Returns the statistics about the ongoing or completed operation.
OpStats Stats() const;
// Returns a map of chunk sizes to the number of occurrences. The sizes are
// combined to buckets according to the given `IndexerConfig` of the Run()
// operation.
ChunkSizeMap ChunkSizes() const;
private:
class Impl;
class Worker;
class WorkerThread;
Impl* impl_;
};
}; // namespace cdc_ft
#endif // CDC_INDEXER_INDEXER_H_

cdc_indexer/main.cc

@@ -0,0 +1,435 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include "absl/flags/flag.h"
#include "absl/flags/parse.h"
#include "absl/flags/usage.h"
#include "absl/flags/usage_config.h"
#include "absl/random/random.h"
#include "absl/status/status.h"
#include "absl/strings/match.h"
#include "absl/strings/str_format.h"
#include "absl_helper/jedec_size_flag.h"
#include "cdc_indexer/indexer.h"
#include "common/errno_mapping.h"
#include "common/path.h"
ABSL_FLAG(std::vector<std::string>, inputs, std::vector<std::string>(),
"List of input files or directory to read from.");
ABSL_FLAG(uint32_t, num_threads, 0,
"How many threads should read files in parallel, use 0 to "
"auto-dertermine the best concurrency for this machine.");
ABSL_FLAG(cdc_ft::JedecSize, min_chunk_size, cdc_ft::JedecSize(0),
"The minimum chunk size to size the files into. Defaults to half of "
"the average chunk size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, avg_chunk_size, cdc_ft::JedecSize(256 << 10),
"The average chunk size to size the files into. Supports common "
"unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, max_chunk_size, cdc_ft::JedecSize(0),
"The maximum chunk size to size the files into. Defaults to twice "
"the average chunk size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(cdc_ft::JedecSize, read_block_size, cdc_ft::JedecSize(0),
"The block size to read the input file(s) from disk. Defaults to the "
"value of --max_chunk_size. Supports common unit suffixes K, M, G.");
ABSL_FLAG(std::string, hash, "blake3",
"Which hash function to use. Supported values are \"blake3\" and "
"\"null\".");
ABSL_FLAG(std::string, results_file, "",
"File name to append results to in CVS format.");
ABSL_FLAG(std::string, description, "",
"A descriptive string of the experiment that was run. If given, this "
"will be prepended literally to the results_file. Multiple columns "
"can be separated with commas.");
namespace cdc_ft {
namespace {
const char* GearTable() {
// The following macros are defined in indexer.h.
#if CDC_GEAR_TABLE == CDC_GEAR_32BIT
return "32 bit";
#elif CDC_GEAR_TABLE == CDC_GEAR_64BIT
return "64 bit";
#else
#error "Unknown gear table"
return "unknown";
#endif
}
void SetupFlagsHelp() {
absl::SetProgramUsageMessage(
"CDC indexer to measure and report data redundancy.");
absl::FlagsUsageConfig fuc;
// Filter flags to show when the --help flag is set.
fuc.contains_help_flags = [](absl::string_view f) {
return absl::EndsWith(f, "main.cc");
};
absl::SetFlagsUsageConfig(fuc);
}
// Returns a human-readable representation of the given size, such as "4 KB".
template <typename T>
std::string HumanBytes(T size, int precision = 0) {
const size_t threshold = 2048;
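// Staying in the current unit until the value exceeds `threshold` avoids
// imprecise low values at precision 0: 1.5 GiB prints as "1536 MB" rather
// than rolling over to "2 GB" at exactly 1024.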
if (size < 1024)
return absl::StrFormat("%d bytes", static_cast<size_t>(size));
double s = static_cast<double>(size) / 1024;
std::string units = "KB";
if (s > threshold) {
s /= 1024;
units = "MB";
}
if (s > threshold) {
s /= 1024;
units = "GB";
}
if (s > threshold) {
s /= 1024;
units = "TB";
}
if (s > threshold) {
s /= 1024;
units = "PB";
}
return absl::StrFormat("%.*f %s", precision, s, units);
}
// Returns a human-readable representation of a duration as minutes and
// seconds in the format "mm:ss".
std::string HumanDuration(const absl::Duration& d) {
auto sec = absl::ToInt64Seconds(d);
return absl::StrFormat("%02d:%02d", sec / 60, std::abs(sec) % 60);
}
std::string HashTypeToString(IndexerConfig::HashType type) {
switch (type) {
case IndexerConfig::HashType::kNull:
return "(no hashing)";
case IndexerConfig::HashType::kBlake3:
return "BLAKE3";
default:
return "unknown";
}
}
// Prints progress information on stdout.
void ShowProgress(const Indexer::OpStats& stats) {
static absl::Time op_start = absl::Now();
static absl::Time last_progress = op_start;
static size_t last_total_bytes = 0;
auto now = absl::Now();
auto elapsed = now - last_progress;
if (elapsed < absl::Milliseconds(500)) return;
double bps =
(stats.total_bytes - last_total_bytes) / absl::ToDoubleSeconds(elapsed);
double dedup_pct = (stats.total_bytes - stats.unique_bytes) /
static_cast<double>(stats.total_bytes) * 100.0;
std::cout << '\r' << HumanDuration(now - op_start) << " " << std::setw(2)
<< HumanBytes(stats.total_bytes, 2) << " in " << stats.total_files
<< " files processed at " << HumanBytes(bps, 1) << "/s"
<< ", " << static_cast<int>(dedup_pct) << "% deduplication"
<< std::flush;
last_progress = now;
last_total_bytes = stats.total_bytes;
}
void ShowSummary(const IndexerConfig& cfg, const Indexer::OpStats& stats,
absl::Duration elapsed) {
const int title_w = 20;
const int num_w = 16;
double dedup_pct = (stats.total_bytes - stats.unique_bytes) /
static_cast<double>(stats.total_bytes) * 100.0;
double bps = stats.total_bytes / absl::ToDoubleSeconds(elapsed);
std::cout << "Chunk size (min/avg/max): " << HumanBytes(cfg.min_chunk_size)
<< " / " << HumanBytes(cfg.avg_chunk_size) << " / "
<< HumanBytes(cfg.max_chunk_size)
<< " | Hash: " << HashTypeToString(cfg.hash_type)
<< " | Threads: " << cfg.num_threads << std::endl;
std::cout << "gear_table: " << GearTable() << " | mask_s: 0x" << std::hex
<< cfg.mask_s << " | mask_l: 0x" << cfg.mask_l << std::dec
<< std::endl;
std::cout << std::setw(title_w) << "Duration:" << std::setw(num_w)
<< HumanDuration(elapsed) << std::endl;
std::cout << std::setw(title_w) << "Total files:" << std::setw(num_w)
<< stats.total_files << std::endl;
std::cout << std::setw(title_w) << "Total chunks:" << std::setw(num_w)
<< stats.total_chunks << std::endl;
std::cout << std::setw(title_w) << "Unique chunks:" << std::setw(num_w)
<< stats.unique_chunks << std::endl;
std::cout << std::setw(title_w) << "Total data:" << std::setw(num_w)
<< HumanBytes(stats.total_bytes, 2) << std::endl;
std::cout << std::setw(title_w) << "Unique data:" << std::setw(num_w)
<< HumanBytes(stats.unique_bytes, 2) << std::endl;
std::cout << std::setw(title_w) << "Throughput:" << std::setw(num_w - 2)
<< HumanBytes(bps, 2) << "/s" << std::endl;
std::cout << std::setw(title_w) << "Avg. chunk size:" << std::setw(num_w)
<< HumanBytes(static_cast<double>(stats.unique_bytes) /
stats.unique_chunks)
<< std::endl;
std::cout << std::setw(title_w) << "Deduplication:" << std::setw(num_w - 1)
<< std::setprecision(4) << dedup_pct << "%" << std::endl;
}
void ShowChunkSize(size_t size, uint64_t cnt, uint64_t max_count,
uint64_t total_count) {
const int key_w = 7;
const int hbar_w = 40;
const int num_w = 10;
const int pct_w = 2;
double pct = 100.0 * static_cast<double>(cnt) / total_count;
double hscale = static_cast<double>(cnt) / max_count;
int blocks = round(hscale * hbar_w);
std::cout << std::setw(key_w) << HumanBytes(size) << " ";
for (int i = 0; i < blocks; i++) std::cout << "#";
for (int i = hbar_w - blocks; i > 0; i--) std::cout << " ";
std::cout << " " << std::setw(num_w) << cnt << " (" << std::setw(pct_w)
<< round(pct) << "%)" << std::endl;
}
std::vector<size_t> ChunkSizeBuckets(const IndexerConfig& cfg,
const Indexer::ChunkSizeMap& sizes,
size_t fixed_min_size,
size_t fixed_max_size,
uint64_t* max_count_out,
uint64_t* total_count_out) {
size_t min_size = 1u << 31;
size_t max_size = 0;
uint64_t max_count = 0;
uint64_t total_count = 0, found_count = 0;
uint64_t outside_min_max_count = 0;
std::vector<size_t> buckets;
// Find out min/max chunk sizes
for (auto [chunk_size, count] : sizes) {
if (chunk_size < min_size) min_size = chunk_size;
if (chunk_size > max_size) max_size = chunk_size;
if (count > max_count) max_count = count;
if (chunk_size < fixed_min_size) outside_min_max_count += count;
if (fixed_max_size > 0 && chunk_size > fixed_max_size)
outside_min_max_count += count;
total_count += count;
}
if (fixed_min_size > 0) min_size = fixed_min_size;
// Use steps of powers of two until min. chunk size is reached.
uint64_t size;
uint64_t pow_end_size = std::min(cfg.min_chunk_size, max_size);
for (size = min_size; size < pow_end_size; size <<= 1) {
buckets.push_back(size);
auto it = sizes.find(size);
if (it != sizes.end()) found_count += it->second;
}
if (fixed_max_size > max_size) max_size = fixed_max_size;
// Use step increments of max_chunk_size_step afterwards.
for (; size <= max_size; size += cfg.max_chunk_size_step) {
buckets.push_back(size);
auto it = sizes.find(size);
if (it != sizes.end()) found_count += it->second;
}
// Make sure we found every bucket.
assert(total_count == found_count + outside_min_max_count);
if (max_count_out) *max_count_out = max_count;
if (total_count_out) *total_count_out = total_count;
return buckets;
}
void ShowChunkSizes(const IndexerConfig& cfg,
const Indexer::ChunkSizeMap& sizes) {
uint64_t max_count = 0;
uint64_t total_count = 0;
auto buckets = ChunkSizeBuckets(cfg, sizes, 0, 0, &max_count, &total_count);
for (auto size : buckets) {
auto it = sizes.find(size);
uint64_t cnt = it != sizes.end() ? it->second : 0;
ShowChunkSize(size, cnt, max_count, total_count);
}
}
absl::Status WriteResultsFile(const std::string& filepath,
const std::string& description,
const IndexerConfig& cfg,
const Indexer::OpStats& stats,
const Indexer::ChunkSizeMap& sizes) {
bool exists = path::FileExists(filepath);
std::FILE* fout = std::fopen(filepath.c_str(), "a");
if (!fout) {
return ErrnoToCanonicalStatus(
errno, absl::StrFormat("Couldn't write to file '%s'", filepath));
}
path::FileCloser closer(fout);
static constexpr int num_columns = 15;
static const char* columns[num_columns] = {
"gear_table",
"mask_s",
"mask_l",
"Min chunk size [KiB]",
"Avg chunk size [KiB]",
"Max chunk size [KiB]",
"Read speed [MiB/s]",
"Files",
"Total chunks",
"Unique chunks",
"Total size [MiB]",
"Unique size [MiB]",
"Dedup size [MiB]",
"Dedup ratio",
"Res avg chunk size [KiB]",
};
auto buckets = ChunkSizeBuckets(cfg, sizes, cfg.min_chunk_size,
cfg.max_chunk_size, nullptr, nullptr);
// Write column headers if this is a new file.
if (!exists) {
// Write one header cell per description column; only the first is labeled.
int desc_cols = description.empty() ? 0 : 1;
desc_cols += std::count(description.begin(), description.end(), ',');
for (int i = 0; i < desc_cols; i++) {
std::fprintf(fout, i == 0 ? "Description," : ",");
}
// Write fixed column headers.
for (int i = 0; i < num_columns; i++) {
std::fprintf(fout, "%s,", columns[i]);
}
// Write chunk distribution column headers
for (auto size : buckets) {
std::fprintf(fout, "%s,", HumanBytes(size).c_str());
}
std::fprintf(fout, "\n");
}
// Count all chunks below min_chunk_size and above max_chunk_size, as they
// won't be included in the buckets list automatically.
uint64_t below_min_cnt = 0, above_max_cnt = 0;
for (auto [chunk_size, count] : sizes) {
if (chunk_size < cfg.min_chunk_size) below_min_cnt += count;
if (chunk_size > cfg.max_chunk_size) above_max_cnt += count;
}
static constexpr double mib = static_cast<double>(1 << 20);
// Write user-supplied description
if (!description.empty()) std::fprintf(fout, "%s,", description.c_str());
// Write chunking params.
std::fprintf(fout, "%s,0x%zx,0x%zx,", GearTable(), cfg.mask_s, cfg.mask_l);
std::fprintf(fout, "%zu,%zu,%zu,", cfg.min_chunk_size >> 10,
cfg.avg_chunk_size >> 10, cfg.max_chunk_size >> 10);
// Write speed, files, chunks.
double mibps =
(stats.total_bytes / mib) / absl::ToDoubleSeconds(stats.elapsed);
std::fprintf(fout, "%f,%zu,%zu,%zu,", mibps, stats.total_files,
stats.total_chunks, stats.unique_chunks);
// Write total and unique sizes.
std::fprintf(fout, "%f,%f,%f,", stats.total_bytes / mib,
stats.unique_bytes / mib,
(stats.total_bytes - stats.unique_bytes) / mib);
// Write dedup ratio and avg. chunk size.
double dedup_ratio = (stats.total_bytes - stats.unique_bytes) /
static_cast<double>(stats.total_bytes);
size_t avg_size = stats.unique_bytes / stats.unique_chunks;
std::fprintf(fout, "%f,%zu,", dedup_ratio, avg_size >> 10);
// Write chunk distribution
size_t index = 0;
for (auto size : buckets) {
auto it = sizes.find(size);
uint64_t cnt = it != sizes.end() ? it->second : 0;
if (index == 0) {
cnt += below_min_cnt;
} else if (index + 1 == buckets.size()) {
cnt += above_max_cnt;
}
++index;
std::fprintf(fout, "%f,", static_cast<double>(cnt) / stats.unique_chunks);
}
std::fprintf(fout, "\n");
return absl::OkStatus();
}
IndexerConfig::HashType GetHashType(const std::string& name) {
if (name == "null") return IndexerConfig::HashType::kNull;
if (name == "blake3") return IndexerConfig::HashType::kBlake3;
std::cerr << "Unknown hash type: \"" << name << "\"" << std::endl;
return IndexerConfig::HashType::kUndefined;
}
} // namespace
} // namespace cdc_ft
int main(int argc, char* argv[]) {
cdc_ft::SetupFlagsHelp();
absl::ParseCommandLine(argc, argv);
std::vector<std::string> inputs = absl::GetFlag(FLAGS_inputs);
if (inputs.empty()) {
std::cout << "Execute the following command to get help on the usage:"
<< std::endl
<< argv[0] << " --help" << std::endl;
return 0;
}
cdc_ft::IndexerConfig cfg;
cfg.num_threads = absl::GetFlag(FLAGS_num_threads);
cfg.min_chunk_size = absl::GetFlag(FLAGS_min_chunk_size).Size();
cfg.avg_chunk_size = absl::GetFlag(FLAGS_avg_chunk_size).Size();
cfg.max_chunk_size = absl::GetFlag(FLAGS_max_chunk_size).Size();
cfg.read_block_size = absl::GetFlag(FLAGS_read_block_size).Size();
cfg.hash_type = cdc_ft::GetHashType(absl::GetFlag(FLAGS_hash));
if (!cfg.min_chunk_size) cfg.min_chunk_size = cfg.avg_chunk_size >> 1;
if (!cfg.max_chunk_size) cfg.max_chunk_size = cfg.avg_chunk_size << 1;
if (!cfg.read_block_size) cfg.read_block_size = cfg.max_chunk_size;
cfg.max_chunk_size_step = std::max<size_t>(cfg.min_chunk_size >> 2, 1024u);
assert(cfg.avg_chunk_size > 0);
assert(cfg.avg_chunk_size > cfg.min_chunk_size);
assert(cfg.avg_chunk_size < cfg.max_chunk_size);
assert(cfg.hash_type != cdc_ft::IndexerConfig::HashType::kUndefined);
cdc_ft::Indexer idx;
std::cout << "Starting indexer on " << inputs.size() << " inputs."
<< std::endl;
static absl::Time start = absl::Now();
absl::Status res = idx.Run(cfg, inputs, cdc_ft::ShowProgress);
auto elapsed = absl::Now() - start;
std::cout << std::endl;
if (res.ok()) {
std::cout << "Operation succeeded." << std::endl << std::endl;
cdc_ft::ShowSummary(idx.Config(), idx.Stats(), elapsed);
std::cout << std::endl;
cdc_ft::ShowChunkSizes(idx.Config(), idx.ChunkSizes());
std::string results_file = absl::GetFlag(FLAGS_results_file);
if (!results_file.empty()) {
res = cdc_ft::WriteResultsFile(
results_file, absl::GetFlag(FLAGS_description), idx.Config(),
idx.Stats(), idx.ChunkSizes());
if (!res.ok())
std::cerr << "Failed to write results to '" << results_file
<< "': " << res.message() << std::endl;
}
} else {
std::cerr << "Error: (" << res.code() << ") " << res.message() << std::endl;
}
return static_cast<int>(res.code());
}