pax_global_header 0000666 0000000 0000000 00000000064 15212466604 0014520 g ustar 00root root 0000000 0000000 52 comment=ab59502f7174dc06088f3e500930b3d111e61aa6
desync-1.0.2/ 0000775 0000000 0000000 00000000000 15212466604 0013005 5 ustar 00root root 0000000 0000000 desync-1.0.2/.github/ 0000775 0000000 0000000 00000000000 15212466604 0014345 5 ustar 00root root 0000000 0000000 desync-1.0.2/.github/workflows/ 0000775 0000000 0000000 00000000000 15212466604 0016402 5 ustar 00root root 0000000 0000000 desync-1.0.2/.github/workflows/release.yaml 0000664 0000000 0000000 00000001031 15212466604 0020701 0 ustar 00root root 0000000 0000000
name: Release
on:
push:
tags:
- '*'
permissions:
contents: write
jobs:
release:
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: actions/setup-go@v6
with:
go-version-file: go.mod
- uses: goreleaser/goreleaser-action@v7
with:
distribution: goreleaser
version: '~> v2'
args: release --clean
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
desync-1.0.2/.github/workflows/validate.yaml 0000664 0000000 0000000 00000002706 15212466604 0021064 0 ustar 00root root 0000000 0000000
name: Validate
on:
# Post-merge validation only. Feature branches are validated by their
# pull_request run, so a branch push with an open PR no longer triggers
# a duplicate (previously cancelled) push run.
push:
branches:
- master
# All pull requests.
pull_request:
branches:
- '**'
# Least-privilege token; the validate workflow only needs to read the repo.
permissions:
contents: read
# Group runs by branch/PR so that a run superseded by a newer commit on the
# same branch or pull request is cancelled instead of running the full
# matrix twice.
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
jobs:
build:
name: Validate on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, windows-latest, macos-latest ]
timeout-minutes: 10
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version-file: go.mod
- run: go test ./...
- run: go build -o cmd/desync/ ./cmd/desync
- name: Race detector
if: runner.os == 'Linux'
run: go test -race ./...
- name: Static analysis
if: runner.os == 'Linux'
run: |
go vet ./...
test -z "$(gofmt -l .)" || { echo "Files need gofmt:"; gofmt -l .; exit 1; }
go mod tidy -diff
desync-1.0.2/.gitignore 0000664 0000000 0000000 00000000006 15212466604 0014771 0 ustar 00root root 0000000 0000000 dist/
desync-1.0.2/.goreleaser.yml 0000664 0000000 0000000 00000000510 15212466604 0015732 0 ustar 00root root 0000000 0000000 version: 2
before:
hooks:
- go mod tidy
- go generate ./...
builds:
- main: ./cmd/desync
goos:
- linux
- darwin
- windows
goarch:
- amd64
- arm64
checksum:
name_template: 'checksums.txt'
snapshot:
version_template: "{{ incpatch .Version }}-next"
changelog:
sort: asc
desync-1.0.2/.pre-commit-config.yaml 0000664 0000000 0000000 00000000710 15212466604 0017264 0 ustar 00root root 0000000 0000000 repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-added-large-files
- repo: https://github.com/dnephin/pre-commit-golang
rev: master
hooks:
- id: go-fmt
#- id: go-vet
#- id: go-imports
#- id: golangci-lint
#- id: go-critic
#- id: go-unit-tests
- id: go-build
- id: go-mod-tidy desync-1.0.2/CLAUDE.md 0000664 0000000 0000000 00000012625 15212466604 0014272 0 ustar 00root root 0000000 0000000 # CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
desync is a Go library and CLI tool that re-implements [casync](https://github.com/systemd/casync) features for content-addressed binary distribution. It chunks large files using a rolling hash, deduplicates and compresses chunks (zstd), and distributes them via multiple store backends. Chunks are identified by SHA512/256 checksums (`ChunkID [32]byte`).
## Build and Test Commands
```bash
# Build the CLI binary (output into cmd/desync/)
go build -o cmd/desync/ ./cmd/desync
# Run all tests (library + CLI)
go test ./...
# Run library tests only
go test
# Run CLI tests only
go test ./cmd/desync
# Run a single test
go test -run TestFunctionName
go test ./cmd/desync -run TestFunctionName
# Install the binary
go install ./cmd/desync
# Format code (enforced by pre-commit)
go fmt ./...
# Tidy modules (enforced by pre-commit)
go mod tidy
```
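CI runs `go test ./...` and `go build -o cmd/desync/ ./cmd/desync` on ubuntu, windows, and macOS; the race detector and static analysis (`go vet`, `gofmt`, `go mod tidy -diff`) run on Linux only.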
## Architecture
### Core Interfaces (store.go)
The system is built around composable store interfaces:
- **`Store`** — read-only: `GetChunk(ChunkID)`, `HasChunk(ChunkID)`
- **`WriteStore`** — adds `StoreChunk(*Chunk)`
- **`PruneStore`** — adds `Prune(ctx, ids)`
- **`IndexStore`** — read indexes: `GetIndexReader(name)`, `GetIndex(name)`
- **`IndexWriteStore`** — adds `StoreIndex(name, idx)`
### Store Implementations
Each backend implements the store interfaces:
- **LocalStore** (`local.go`) — filesystem-based chunk storage
- **RemoteHTTP** (`remotehttp.go`) — HTTP(S) with TLS/mutual auth support
- **S3Store** (`s3.go`) — S3-compatible storage (AWS, MinIO)
- **SFTPStore** (`sftp.go`) — SFTP over SSH
- **RemoteSSH** (`remotessh.go`) — casync protocol over SSH (read-only)
- **GCStore** (`gcs.go`) — Google Cloud Storage
### Store Composition
Stores are composed for routing, caching, and failover:
- **StoreRouter** (`storerouter.go`) — tries multiple stores in order
- **FailoverGroup** (`failover.go`) — failover with active store rotation
- **Cache** (`cache.go`) — local (fast) + remote (slow) with auto-caching
- **RepairableCache** — converts ChunkInvalid to ChunkMissing for self-repair
### Data Pipeline
**Chunking:** `Chunker` (`chunker.go`) uses a rolling hash (buzhash, 48-byte window) for content-defined chunking with configurable min/avg/max sizes (default 16KB/64KB/256KB).
**Converter pipeline** (`coverter.go`): Layered data transformations applied in order for writes, reverse for reads. Currently only compression (zstd via `Compressor`), but designed for adding encryption.
**Assembly:** `AssembleFile()` (`assemble.go`) reconstructs files from an index and chunk stores, supporting self-seeding and file seeds for efficient cloning with reflink support (Btrfs/XFS).
### Index Format
Index files (`.caibx`/`.caidx`) contain a table of `IndexChunk` entries mapping `ChunkID` to byte offsets and sizes. Parsed in `index.go`.
### Seeds
Seeds optimize extraction by reusing data from existing files:
- **FileSeed** (`fileseed.go`) — existing file + its index
- **SelfSeed** (`selfseed.go`) — file being written seeds later chunks
- **NullSeed** (`nullseed.go`) — all-zero chunk optimization
### CLI Structure (cmd/desync/)
Uses `cobra` for command framework. Key commands: `extract`, `make`, `tar`, `untar`, `chop`, `cache`, `verify`, `chunk-server`, `index-server`, `mount-index`, `prune`, `info`, `cat`.
Store factory in `cmd/desync/store.go` creates store instances from URL/path strings. Multiple stores are specified via CLI flags; failover groups use `|` separator.
### Configuration
Config file at `$HOME/.config/desync/config.json` for S3 credentials (per-endpoint with glob patterns), store options, and TLS settings. Long-running processes (chunk-server, mount-index) support dynamic store config via JSON files reloaded on SIGHUP.
Key environment variables: `S3_ACCESS_KEY`, `S3_SECRET_KEY`, `S3_REGION`, `CASYNC_SSH_PATH`, `DESYNC_HTTP_AUTH`.
## Key Patterns
- **Interface-driven composition** — small focused interfaces composed via routers/caches
- **Lazy evaluation** — chunks decompress only when plain data is accessed
- **Context-based cancellation** — goroutine lifecycle via `context.Context`
- **`t.Fatal()` restriction** — do not call `t.Fatal()`/`t.FailNow()` from non-main goroutines (see PR #291)
- **Avoid recompression** — if a chunk already has compressed form, don't recompress (see PR #289)
## Code Style
- **Tests use testify** — write new (or modified) Go tests with `github.com/stretchr/testify`: `require` for fatal checks, `assert` for non-fatal. Don't hand-roll `if … { t.Fatalf(…) }` conditionals. Existing plain-`testing` tests don't need a mass rewrite — apply this to new tests and tests you're already changing. Note: `require.*` calls `FailNow()`, so per the non-main-goroutine `t.Fatal()` restriction in Key Patterns, only use `require.*` from the test goroutine — in spawned goroutines use `assert.*` or pass errors back over a channel.
- **Build into the main package's directory** — always `go build -o cmd/desync/ ./cmd/desync`, never a bare `go build ./cmd/desync`. The bare form drops an untracked `desync` binary in the repo root (it is not git-ignored). Build artifacts belong next to their `main` package, never the project root.
## Module
Module path: `github.com/folbricht/desync`, Go 1.24.0.
desync-1.0.2/LICENSE 0000664 0000000 0000000 00000002745 15212466604 0014022 0 ustar 00root root 0000000 0000000 BSD 3-Clause License
Copyright (c) 2017, folbricht
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
desync-1.0.2/README.md 0000664 0000000 0000000 00000132165 15212466604 0014274 0 ustar 00root root 0000000 0000000 # desync
Content-addressed binary distribution, reimplemented in Go.
[Go Reference](https://pkg.go.dev/github.com/folbricht/desync)
[Validate](https://github.com/folbricht/desync/actions/workflows/validate.yaml)
[License](LICENSE)
desync is a Go library and CLI tool that re-implements [casync](https://github.com/systemd/casync) features for content-addressed binary distribution. It chunks large files using a rolling hash, deduplicates and compresses chunks with [zstd](https://github.com/facebook/zstd), and distributes them via multiple store backends. It maintains compatibility with casync's data structures, protocols and types (chunk stores, index files, archives) to function as a drop-in replacement.
## Key Features
- **Parallel chunking** — identical output to casync, up to 10x faster
- **Multiple store backends** — local, HTTP(S), S3/GCS, SFTP, SSH
- **Store chaining and caching** — combine stores with failover groups
- **Seeds and reflinks** — clone blocks from existing files on Btrfs/XFS
- **Built-in servers** — HTTP(S) chunk server and index server with proxy support
- **FUSE mounting** — mount blob indexes as files
- **Tar interoperability** — create/extract catar from standard tar streams
- **Cross-platform** — Linux, macOS, Windows (subset), BSD
## Table of Contents
- [Installation](#installation)
- [Quick Start](#quick-start)
- [Concepts](#concepts)
- [Terminology](#terminology)
- [Parallel Chunking](#parallel-chunking)
- [Seeds and Reflinks](#seeds-and-reflinks)
- [Tar Interoperability](#tar-interoperability)
- [Store Backends](#store-backends)
- [Capabilities](#capabilities)
- [Store Architecture](#store-architecture)
- [Chaining and Caching](#chaining-and-caching)
- [Failover Groups](#failover-groups)
- [S3 Store URLs](#s3-store-urls)
- [Compressed vs Uncompressed](#compressed-vs-uncompressed)
- [Remote Indexes](#remote-indexes)
- [CLI Reference](#cli-reference)
- [Commands](#commands)
- [Common Options](#common-options)
- [Environment Variables](#environment-variables)
- [Configuration](#configuration)
- [Dynamic Store Configuration](#dynamic-store-configuration)
- [Configuration Reference](#configuration-reference)
- [Example Config](#example-config)
- [Examples](#examples)
- [Extraction](#extraction)
- [Chunking](#chunking)
- [Cache and Store Management](#cache-and-store-management)
- [Archives](#archives)
- [Server Examples](#server-examples)
- [Update Size Estimation](#update-size-estimation)
- [Platform Support](#platform-support)
- [Design Philosophy](#design-philosophy)
- [Links](#links)
## Installation
Install the latest release into `$HOME/go/bin`:
```text
go install -v github.com/folbricht/desync/cmd/desync@latest
```
Or build from source:
```text
git clone https://github.com/folbricht/desync.git
cd desync/cmd/desync && go install
```
## Quick Start
**Chunk a file** — split a blob into chunks and create an index:
```text
desync make -s /tmp/store index.caibx /path/to/largefile
```
**Extract a file** — reassemble a blob from its index and chunk store:
```text
desync extract -s /tmp/store index.caibx /path/to/largefile
```
**Extract with remote store and local cache** — fetch chunks over HTTP, cache locally:
```text
desync extract -s http://server/store -c /tmp/cache index.caibx /path/to/largefile
```
## Concepts
### Terminology
| Term | Description |
| --- | --- |
| **chunk** | A section of data from a file, typically 16KB-256KB. Identified by the SHA512-256 checksum of its uncompressed data. Stored compressed with zstd (`.cacnk` extension). Boundaries are determined by a [rolling hash algorithm](http://0pointer.net/blog/casync-a-tool-for-distributing-file-system-images.html). |
| **chunk store** | Location (local or remote) that stores chunks. Can be a local directory, or accessed via HTTP, S3, GCS, SFTP, or SSH. |
| **index** | Data structure mapping chunk IDs to byte offsets within a file. A small representation of a much larger file. Produced by `make`. Given an index and a chunk store, the original file can be reassembled or FUSE-mounted. |
| **index store** | Location for index files. Can be local, SFTP, S3, GCS, or HTTP. |
| **catar** | Archive of a directory tree, similar to tar (`.catar` extension). |
| **caidx** | Index file of a chunked catar archive. |
| **caibx** | Index of a chunked regular blob. |
### Parallel Chunking
One of the significant differences to casync is that desync attempts to make chunking faster by utilizing more CPU resources, chunking data in parallel. Depending on the chosen degree of concurrency, the file is split into N equal parts and each part is chunked independently. While the chunking of each part is ongoing, part1 is trying to align with part2, and part3 is trying to align with part4 and so on. Alignment is achieved once a common split point is found in the overlapping area. If a common split point is found, the process chunking the previous part stops, e.g. part1 chunker stops, part2 chunker keeps going until it aligns with part3 and so on until all split points have been found. Once all split points have been determined, the file is opened again (N times) to read, compress and store the chunks.
While in most cases this process achieves significantly reduced chunking times at the cost of CPU, there are edge cases where chunking is only about as fast as upstream casync (with more CPU usage). This is the case if no split points can be found in the data between min and max chunk size as is the case if most or all of the file consists of 0-bytes. In this situation, the concurrent chunking processes for each part will not align with each other and a lot of effort is wasted.
| Command | Mostly/All 0-bytes | Typical data |
| --- | --- | --- |
| `make` | Slow (worst-case) — likely comparable to casync | Fast — parallel chunking |
| `extract` | Extremely fast — effectively the speed of a `truncate()` syscall | Fast — done in parallel, usually limited by I/O |
While casync supports very small min chunk sizes, optimizations in desync require min chunk sizes larger than the window size of the rolling hash used (currently 48 bytes). The tool's default chunk sizes match the defaults used in casync: min 16KB, avg 64KB, max 256KB.
### Seeds and Reflinks
Copy-on-write filesystems such as Btrfs and XFS support cloning of blocks between files in order to save disk space as well as improve extraction performance. To utilize this feature, desync uses several seeds to clone sections of files rather than reading the data from chunk stores and copying it in place:
- **Null Seed** — a built-in seed for chunks of max size containing only 0 bytes. This can significantly reduce disk usage of files with large 0-byte ranges, such as VM images, effectively turning an eager-zeroed VM disk into a sparse disk.
- **Self Seed** — as chunks are written to the destination file, the file itself becomes a seed. If a chunk or series of chunks appears again later in the file, it is cloned from the position written previously, saving storage for files with repetitive sections.
- **File Seeds** — seed files and their indexes can be provided when extracting. For example, `image-v1.vmdk` and `image-v1.vmdk.caibx` can be used as seed for extracting `image-v2.vmdk`. The additional disk space required will be only the delta between the two versions.
```mermaid
graph LR
subgraph "External Seeds"
S1["Seed 1 (file + index)"]
S2["Seed 2 (file + index)"]
end
subgraph "Built-in Seeds"
NS["Null Seed (zero chunks)"]
SS["Self Seed (growing file)"]
end
CS["Chunk Store (fallback)"]
Result["Result File"]
S1 -- "clone/copy matching chunks" --> Result
S2 -- "clone/copy matching chunks" --> Result
NS -- "clone/copy zero regions" --> Result
SS -- "clone/copy repeated sections" --> Result
CS -. "fetch remaining chunks" .-> Result
style S1 fill:#4a90d9,stroke:#2a6cb0,color:#fff
style S2 fill:#4a90d9,stroke:#2a6cb0,color:#fff
style NS fill:#6ab04c,stroke:#4a8a2c,color:#fff
style SS fill:#6ab04c,stroke:#4a8a2c,color:#fff
style CS fill:#e17055,stroke:#c0392b,color:#fff
style Result fill:#f6b93b,stroke:#d4951a,color:#fff
```
Even if cloning is not available, seeds are still useful. desync automatically determines if reflinks are available (and the block size used in the filesystem). If cloning is not supported, sections are copied instead of cloned. Copying still improves performance and reduces the load created by retrieving chunks over the network and decompressing them.
### Tar Interoperability
In addition to packing local filesystem trees into catar archives, desync can read standard tar archive streams. Various tar formats such as GNU and BSD tar are supported. See the Go [archive/tar](https://pkg.go.dev/archive/tar) package for details on supported formats. When reading from tar archives, the content is not re-ordered and written to the catar in the same order. Since the catar format does not support hardlinks, the input tar stream needs to follow hardlinks for desync to process them correctly. See the `--hard-dereference` option in the tar utility.
catar archives can also be extracted to GNU tar archive streams. All files in the output stream are ordered the same as in the catar.
## Store Backends
### Capabilities
| Operation | Local | S3 | GCS | HTTP | SFTP | SSH (casync protocol) |
| --- | :---: | :---: | :---: | :---: | :---: | :---: |
| Read chunks | yes | yes | yes | yes | yes | yes |
| Write chunks | yes | yes | yes | yes | yes | no |
| Use as cache | yes | yes | yes | yes | yes | no |
| Prune | yes | yes | yes | no | yes | no |
| Verify | yes | no | no | no | no | no |
### Store Architecture
```mermaid
graph LR
Client["Client"]
Cache["Cache Store"]
Router["Store Router"]
S1["Store 1"]
FG["Failover Group"]
S2a["Store 2a"]
S2b["Store 2b"]
Client --> Cache
Cache -- "miss" --> Router
Cache -- "hit" --> Client
Router --> S1
Router --> FG
FG --> S2a
S2a -. "on failure" .-> S2b
S1 -- "found" --> Cache
FG -- "found" --> Cache
style Client fill:#6c5ce7,stroke:#4b3ec4,color:#fff
style Cache fill:#6ab04c,stroke:#4a8a2c,color:#fff
style Router fill:#4a90d9,stroke:#2a6cb0,color:#fff
style S1 fill:#f6b93b,stroke:#d4951a,color:#fff
style FG fill:#e17055,stroke:#c0392b,color:#fff
style S2a fill:#f6b93b,stroke:#d4951a,color:#fff
style S2b fill:#f6b93b,stroke:#d4951a,color:#fff
```
### Chaining and Caching
One of the main features of desync is the ability to combine/chain multiple chunk stores of different types and also combine them with a cache store. Stores are chained in the command line like so: `-s <store1> -s <store2> -s <store3>`. A chunk will first be requested from `store1`, and if not found there, the request will be routed to `store2` and so on. Typically, the fastest chunk store should be listed first to improve performance.
It is also possible to combine multiple chunk stores with a cache. In most cases the cache would be a local store, but that is not a requirement. When combining stores and a cache like so: `-s <store1> -s <store2> -c <cache>`, a chunk request will first be routed to the cache store, then to store1 followed by store2. Any chunk that is not yet in the cache will be stored there upon first request.
The `-c <store>` option can be used to either specify an existing store to act as cache or to populate a new store. Whenever a chunk is requested, it is first looked up in the cache before routing the request to the next (possibly remote) store. Any chunks downloaded from the main stores are added to the cache. In addition, when a chunk is read from the cache and it is a local store, mtime of the chunk is updated to allow for basic garbage collection based on file age. The cache store is expected to be writable. If the cache contains an invalid chunk (checksum does not match the chunk ID), the operation will fail. Invalid chunks are not skipped or removed from the cache automatically. `verify -r` can be used to evict bad chunks from a local store or cache.
### Failover Groups
Given stores with identical content (same chunks in each), it is possible to group them in a way that provides resilience to failures. Store groups are specified in the command line using `|` as separator in the same `-s` option. For example using `-s "http://server1/|http://server2/"`, requests will normally be sent to `server1`, but if a failure is encountered, all subsequent requests will be routed to `server2`. There is no automatic fail-back. A failure in `server2` will cause it to switch back to `server1`. Any number of stores can be grouped this way. Note that a missing chunk is treated as a failure immediately, no other servers will be tried, hence the need for all grouped stores to hold the same content.
### S3 Store URLs
desync supports reading from and writing to chunk stores that offer an S3 API, for example hosted in AWS or running on a local server. When using such a store, credentials are passed into the tool either via environment variables `S3_ACCESS_KEY`, `S3_SECRET_KEY` and `S3_SESSION_TOKEN` (if needed) or, if multiples are required, in the config file. Care is required when building those URLs. Below are a few examples:
#### AWS
This store is hosted in `eu-west-3` in AWS. `s3` signals that the S3 protocol is to be used, `https` should be specified for SSL connections. The first path element of the URL contains the bucket, `desync.bucket` in this example. Note, when using AWS, no port should be given in the URL!
```text
s3+https://s3-eu-west-3.amazonaws.com/desync.bucket
```
It's possible to use prefixes (or "directories") to object names like so:
```text
s3+https://s3-eu-west-3.amazonaws.com/desync.bucket/prefix
```
#### Other service with S3 API
This is a store running on the local machine on port 9000 without SSL.
```text
s3+http://127.0.0.1:9000/store
```
#### Setting S3 bucket addressing style
desync uses [minio](https://github.com/minio/minio-go) as an S3 client library. It has an auto-detection mechanism for determining the addressing style of the buckets which should work for Amazon and Google S3 services but could potentially fail for your custom implementation. You can manually specify the addressing style by appending the "lookup" query parameter to the URL.
By default, the value of `?lookup=auto` is implied.
```text
s3+http://127.0.0.1:9000/bucket/prefix?lookup=path
s3+https://s3.internal.company/bucket/prefix?lookup=dns
s3+https://example.com/bucket/prefix?lookup=auto
```
### Compressed vs Uncompressed
By default, desync reads and writes chunks in compressed form to all supported stores. This is in line with upstream casync's goal of storing in the most efficient way. It is however possible to change this behavior by providing desync with a config file (see [Configuration](#configuration)). Disabling compression and storing chunks uncompressed may reduce latency in some use-cases and improve performance. desync supports reading and writing uncompressed chunks to SFTP, S3, HTTP and local stores and caches. If more than one store is used, each of those can be configured independently, for example it's possible to read compressed chunks from S3 while using a local uncompressed cache for best performance. However, care needs to be taken when using the `chunk-server` command and building chains of chunk store proxies to avoid shifting the decompression load onto the server (it's possible this is actually desirable).
In the setup below, a client reads chunks from an HTTP chunk server which itself gets chunks from S3.
```text
<Client> ---> <HTTP chunk server> ---> <S3 store>
```
If the client configures the HTTP chunk server to be uncompressed (`chunk-server` needs to be started with the `-u` option), and the chunk server reads compressed chunks from S3, then the chunk server will have to decompress every chunk that's requested before responding to the client. If the chunk server was reading uncompressed chunks from S3, there would be no overhead.
Compressed and uncompressed chunks can live in the same store and don't interfere with each other. A store that's configured for compressed chunks by configuring it client-side will not see the uncompressed chunks that may be present. `prune` and `verify` too will ignore any chunks written in the other format. Both kinds of chunks can be accessed by multiple clients concurrently and independently.
### Remote Indexes
Indexes can be stored and retrieved from remote locations via SFTP, S3, and HTTP. Storing indexes remotely is optional and deliberately separate from chunk storage. While it's possible to store indexes in the same location as chunks in the case of SFTP and S3, this should only be done in secured environments. The built-in HTTP chunk store (`chunk-server` command) cannot be used as an index server. Use the `index-server` command instead to start an index server that serves indexes and can optionally store them as well (with `-w`).
Using remote indexes, it is possible to use desync completely file-less. For example when wanting to share a large file with `mount-index`, one could read the index from an index store like this:
```text
desync mount-index -s http://chunk.store/store http://index.store/myindex.caibx /mnt/image
```
No file would need to be stored on disk in this case.
## CLI Reference
The CLI tool uses the desync library and makes most features available in a consistent fashion. It does not match upstream casync's syntax exactly, but tries to be similar.
### Commands
#### Chunking and Extraction
| Command | Description |
| --- | --- |
| `make` | Split a blob into chunks and create an index file |
| `extract` | Build a blob from an index file, optionally using seed indexes+blobs |
| `verify-index` | Verify that an index file matches a given blob |
| `mount-index` | FUSE mount a blob index as a single file |
| `cat` | Stream a blob to stdout or a file |
| `chunk` | Chunk input file and print chunk boundaries plus chunk IDs |
#### Archives
| Command | Description |
| --- | --- |
| `tar` | Pack a catar file, optionally chunk and create an index |
| `untar` | Unpack a catar file or index referencing a catar |
| `mtree` | Print the content of a catar, caidx, or local directory in mtree format |
#### Servers
| Command | Description |
| --- | --- |
| `chunk-server` | Start an HTTP(S) chunk server/store |
| `index-server` | Start an HTTP(S) index server/store |
| `pull` | Serve chunks using the casync protocol over stdin/stdout |
#### Inspection
| Command | Description |
| --- | --- |
| `info` | Show information about an index file |
| `inspect-chunks` | Show detailed information about chunks in an index and optional local store |
| `list-chunks` | List all chunk IDs in an index file |
#### Maintenance
| Command | Description |
| --- | --- |
| `verify` | Verify the integrity of a local store |
| `cache` | Populate a cache from index files without extracting |
| `chop` | Split a blob according to an existing index and store chunks |
| `prune` | Remove unreferenced chunks from a store (use with caution) |
#### Utility
| Command | Description |
| --- | --- |
| `config` | Show or write the config file |
| `manpage` | Generate manpages for desync |
### Common Options
Not all options apply to all commands.
**Global options:**
| Option | Description |
| --- | --- |
| `--config <file>` | Path to config file. Default: `$HOME/.config/desync/config.json`. |
| `--digest <algorithm>` | Digest algorithm: `sha512-256` (default) or `sha256`. |
| `--verbose` | Enable verbose/debug logging. |
**Store options:**
| Option | Description |
| --- | --- |
| `-s <store>` | Location of the chunk store, can be local directory or a URL like `ssh://hostname/path/to/store`. Multiple stores can be specified, they'll be queried in order. The `chop`, `make`, `tar` and `prune` commands support updating chunk stores in S3, while `verify` only operates on a local store. |
| `-c <store>` | Location of a chunk store to be used as cache. Needs to be writable. |
| `-n <int>` | Number of concurrent goroutines. Default: 10. |
| `-t` | Trust all certificates presented by HTTPS stores. Allows the use of self-signed certs. |
| `--ca-cert <file>` | Trust authorities in this file instead of the OS trust store. |
| `--client-cert <file>` | Client certificate for mutual TLS authentication. |
| `--client-key <file>` | Client key for mutual TLS authentication. |
| `-e` / `--error-retry <int>` | Number of times to retry on network error. |
| `-b` / `--error-retry-base-interval <duration>` | Initial retry delay; attempt N waits N times this interval. |
**Extract options:**
| Option | Description |
| --- | --- |
| `--seed <indexfile>` | Specifies a seed file and index for the `extract` command. The tool expects the matching file to have the same name as the index file, without the `.caibx` extension. |
| `--seed-dir <dir>` | Specifies a directory containing seed files and their indexes for `extract`. Each index file (`*.caibx`) needs a matching blob without the extension. |
| `-k` / `--in-place` | Keep partially assembled files in place when `extract` fails or is interrupted. Also use this option to write to block devices. |
| `--print-stats` | Print extraction statistics (`extract`) or chunking statistics (`make`) to stderr. |
| `--skip-invalid-seeds` | Skip seeds with invalid chunks instead of failing. |
| `--regenerate-invalid-seeds` | Regenerate seed indexes when invalid chunks are found. |
**Chunking and archive options:**
| Option | Description |
| --- | --- |
| `-m` | Specify the min/avg/max chunk sizes in KB. Only applicable to `make`. Defaults to 16:64:256. For best results: min = avg/4, max = 4*avg. |
| `-i` | When packing/unpacking an archive, don't create/read an archive file but instead use an index file (caidx). Only applicable to `tar` and `untar`. |
| `--input-format ` | Input format for `tar`: `disk` (default) or `tar`. |
| `--output-format ` | Output format for `untar`: `disk` (default) or `gnu-tar`. |
| `--ignore ` | Index file(s) whose chunks should be skipped. Applies to `chop` and `cache`. |
**Server options:**
| Option | Description |
| --- | --- |
| `-l ` | Listening address for the HTTP chunk server. Can be used multiple times for more than one interface or port. |
| `-w` / `--writeable` | Enable write support. Applies to `chunk-server` and `index-server`. |
| `-u` / `--uncompressed` | Serve uncompressed chunks. Applies to `chunk-server`. |
| `--store-file ` | Read store arguments from a JSON file; supports SIGHUP reload. Applies to `chunk-server` and `mount-index`. |
| `--key ` | Key file in PEM format for HTTPS `chunk-server` and `index-server`. Requires `--cert`. |
| `--cert ` | Certificate file in PEM format for HTTPS `chunk-server` and `index-server`. Requires `--key`. |
| `--mutual-tls` | Require a valid client certificate, verified against `--client-ca` (which is mandatory when this is set). Applies to `chunk-server` and `index-server`. |
| `--client-ca ` | Acceptable client certificate or CA for mutual TLS. Required when `--mutual-tls` is set; otherwise client certs would be verified against the system trust store. |
| `--authorization ` | Expected value of the Authorization header in client requests. |
| `--log ` | Request log file, or `-` for STDOUT. Applies to `chunk-server` and `index-server`. |
**Other options:**
| Option | Description |
| --- | --- |
| `-r` | Repair a local store by removing invalid chunks. Only valid for `verify`. |
| `-y` | Answer with `yes` when asked for confirmation. Only supported by `prune`. |
| `-f` / `--format ` | Output format for `info`: `plain` (default) or `json`. |
### Environment Variables
| Variable | Description |
| --- | --- |
| `CASYNC_SSH_PATH` | Overrides the default `ssh` command when connecting to remote SSH or SFTP chunk stores. |
| `CASYNC_REMOTE_PATH` | Defines the command to run on the chunk store when using SSH. Default: `casync`. |
| `S3_ACCESS_KEY`, `S3_SECRET_KEY`, `S3_SESSION_TOKEN`, `S3_REGION` | S3 store credentials when using a single store. If `S3_ACCESS_KEY` and `S3_SECRET_KEY` are not defined, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN` are also considered. These take precedence over config file values. |
| `DESYNC_PROGRESSBAR_ENABLED` | Enables the progress bar if set to any non-empty value. By default, the progress bar is only shown when STDERR is a terminal. |
| `DESYNC_ENABLE_PARSABLE_PROGRESS` | Prints operation name, completion percentage, and estimated remaining time to STDERR. Similar to the default progress bar but without the visual bar. |
| `DESYNC_HTTP_AUTH` | Sets the expected `Authorization` header value from clients when using `chunk-server` or `index-server`. Needs the full string including type and encoding, e.g. `"Basic dXNlcjpwYXNzd29yZAo="`. Command-line values take precedence. |
## Configuration
For most use cases, the tool's default configuration is sufficient. A config file at `$HOME/.config/desync/config.json` allows customization of timeouts, error retry behavior, or credentials that can't be set via command-line options or environment variables. All values have sensible defaults. Only add configuration for values that differ from the defaults.
To view the current configuration, use `desync config`. If no config file is present, this shows the defaults. To create a config file, use `desync config -w` to write the current configuration, then edit the file.
### Dynamic Store Configuration
Some long-running processes, namely `chunk-server` and `mount-index`, may require reconfiguration without restart. This can be achieved by starting them with the `--store-file` option which provides the arguments normally passed via `--store` and `--cache` from a JSON file instead. A SIGHUP to the process will trigger a reload of the configuration and replace the stores internally without restart. This can be done under load. If the configuration is found to be invalid, an error is printed to STDERR and the reload is ignored.
```json
{
"stores": [
"/path/to/store1",
"/path/to/store2"
],
"cache": "/path/to/cache"
}
```
This can be combined with store failover by providing the same syntax as used in the command-line, for example `{"stores":["/path/to/main|/path/to/backup"]}`. See [Server Examples](#server-examples) for details.
### Configuration Reference
- **`s3-credentials`** — Credentials for S3 stores. The key must be the URL scheme and host used for the store, excluding the path, but including the port if used in the store URL. Keys can contain glob patterns (`*`, `?`, `[…]`). See [filepath.Match](https://pkg.go.dev/path/filepath#Match) for wildcard details. Standard [AWS credentials files](https://docs.aws.amazon.com/cli/latest/userguide/cli-config-files.html) are also supported.
- **`store-options`** — Per-store customization of compression, timeouts, retry behavior, and keys. Not all options apply to every store type. The store location in the command line must match the key exactly for options to apply. Glob patterns are also supported; a config file where more than one key matches a single store is considered invalid.
| Option | Description | Default |
| --- | --- | --- |
| `timeout` | Time limit for chunk read/write in nanoseconds. Negative = infinite. | 1 minute |
| `error-retry` | Number of times to retry failed chunk requests. | 0 |
| `error-retry-base-interval` | Nanoseconds to wait before first retry. Attempt N waits N times this interval. | 0 |
| `client-cert` | Certificate file for mutual SSL. | — |
| `client-key` | Key file for mutual SSL. | — |
| `ca-cert` | Certificate file containing trusted certs or CAs. | — |
| `trust-insecure` | Trust any certificate presented by the server. | false |
| `skip-verify` | Disable data integrity verification on read. Only recommended when chaining stores with `chunk-server` using compressed stores. | false |
| `uncompressed` | Read and write uncompressed chunks. Both formats can coexist in the same store. | false |
| `http-auth` | Value of the `Authorization` header in HTTP requests, e.g. `"Bearer <token>"` or `"Basic dXNlcjpwYXNzd29yZAo="`. | — |
| `http-cookie` | Value of the `Cookie` header in HTTP requests, e.g. `"name=value; name2=value2"`. | — |
### Example Config
#### JSON config file
```json
{
"s3-credentials": {
"http://localhost": {
"access-key": "MYACCESSKEY",
"secret-key": "MYSECRETKEY"
},
"https://127.0.0.1:9000": {
"aws-credentials-file": "/Users/user/.aws/credentials"
},
"https://127.0.0.1:8000": {
"aws-credentials-file": "/Users/user/.aws/credentials",
"aws-profile": "profile_static"
},
"https://s3.us-west-2.amazonaws.com": {
"aws-credentials-file": "/Users/user/.aws/credentials",
"aws-region": "us-west-2",
"aws-profile": "profile_refreshable"
}
},
"store-options": {
"https://192.168.1.1/store": {
"client-cert": "/path/to/crt",
"client-key": "/path/to/key",
"error-retry": 1
},
"https://10.0.0.1/": {
"http-auth": "Bearer abcabcabc"
},
"https://example.com/*/*/": {
"http-auth": "Bearer dXNlcjpwYXNzd29yZA=="
},
"https://cdn.example.com/": {
"http-cookie": "PHPSESSID=298zf09hf012fh2; csrftoken=u32t4o3tb3gg43"
},
"/path/to/local/cache": {
"uncompressed": true
}
}
}
```
#### AWS credentials file
```ini
[default]
aws_access_key_id = DEFAULT_PROFILE_KEY
aws_secret_access_key = DEFAULT_PROFILE_SECRET
[profile_static]
aws_access_key_id = OTHERACCESSKEY
aws_secret_access_key = OTHERSECRETKEY
[profile_refreshable]
aws_access_key_id = PROFILE_REFRESHABLE_KEY
aws_secret_access_key = PROFILE_REFRESHABLE_SECRET
aws_session_token = PROFILE_REFRESHABLE_TOKEN
```
## Examples
### Extraction
Re-assemble somefile.tar using a remote chunk store and a blob index file.
```text
desync extract -s ssh://192.168.1.1/path/to/casync.store/ -c /tmp/store somefile.tar.caibx somefile.tar
```
Use multiple stores, specify the local one first to improve performance.
```text
desync extract -s /some/local/store -s ssh://192.168.1.1/path/to/casync.store/ somefile.tar.caibx somefile.tar
```
Extract version 3 of a disk image using the previous 2 versions as seeds for cloning (if supported) or copying. Note that when providing a seed like `--seed <file>.ext.caibx`, it is assumed that `<file>.ext` is available next to the index file and matches the index.
```text
desync extract -s /local/store \
--seed image-v1.qcow2.caibx \
--seed image-v2.qcow2.caibx \
image-v3.qcow2.caibx image-v3.qcow2
```
Extract an image using several seeds present in a directory. Each of the `.caibx` files in the directory needs to have a matching blob of the same name. It is possible for the source index file to be in the same directory also (it'll be skipped automatically).
```text
desync extract -s /local/store --seed-dir /path/to/images image-v3.qcow2.caibx image-v3.qcow2
```
Mix and match remote stores and use a local cache store to improve performance. Also group two identical HTTP stores with `|` to provide failover in case of errors on one.
```text
desync extract \
-s "http://192.168.1.101/casync.store/|http://192.168.1.102/casync.store/" \
-s ssh://192.168.1.1/path/to/casync.store/ \
-s https://192.168.1.3/ssl.store/ \
-c /path/to/cache \
somefile.tar.caibx somefile.tar
```
Extract a file in-place (`-k` option). If this operation fails, the file will remain partially complete and can be restarted without the need to re-download chunks from the remote SFTP store. Use `-k` when a local cache is not available and the extract may be interrupted.
```text
desync extract -k -s sftp://192.168.1.1/path/to/store file.caibx file.tar
```
Extract an image directly onto a block device. The `-k` or `--in-place` option is needed.
```text
desync extract -k -s /mnt/store image.caibx /dev/sdc
```
Extract a file using a remote index stored in an HTTP index store.
```text
desync extract -k -s sftp://192.168.1.1/path/to/store http://192.168.1.2/file.caibx file.tar
```
### Chunking
Split a blob, store the chunks and create an index file.
```text
desync make -s /some/local/store index.caibx /some/blob
```
Split a blob, create an index file and store the chunks in an S3 bucket named `store`.
```text
S3_ACCESS_KEY=mykey S3_SECRET_KEY=mysecret desync make -s s3+http://127.0.0.1:9000/store index.caibx /some/blob
```
Index an existing local file without creating chunks.
```text
desync make image.raw.caibx /tmp/image.raw
```
Verify the index you just created.
```text
desync verify-index image.raw.caibx /tmp/image.raw
```
### Cache and Store Management
Verify a local cache. Errors will be reported to STDOUT. Since `-r` is not given, nothing invalid will be removed.
```text
desync verify -s /some/local/store
```
Cache the chunks used in a couple of index files in a local store without actually writing the blob.
```text
desync cache -s ssh://192.168.1.1/path/to/casync.store/ -c /local/cache somefile.tar.caibx other.file.caibx
```
Copy all chunks referenced in an index file from a remote SSH store to a remote SFTP store.
```text
desync cache -s ssh://192.168.1.2/store -c sftp://192.168.1.3/path/to/store /path/to/index.caibx
```
Cache chunks from remote locally with non-standard port. Ignore existing files that are available locally from seed(s). This will only download chunks from the remote if they do not exist in the seed. Works with multiple seeds.
```text
desync cache -s http://cdn:9876 -c /tmp/chunkstore --ignore /tmp/indices/existing-image.raw.caibx /tmp/images/existing-image.raw
```
List the chunks referenced in a caibx.
```text
desync list-chunks somefile.tar.caibx
```
Chop an existing file according to an existing caibx and store the chunks in a local store. This can be used to populate a local cache from a possibly large blob that already exists on the target system.
```text
desync chop -s /some/local/store somefile.tar.caibx somefile.tar
```
Chop a blob according to an existing index, while ignoring any chunks that are referenced in another index. This can be used to improve performance when it is known that all chunks referenced in `image-v1.iso.caibx` are already present in the target store and can be ignored when chopping `image-v2.iso`.
```text
desync chop -s /some/local/store --ignore image-v1.iso.caibx image-v2.iso.caibx image-v2.iso
```
Prune a store to only contain chunks that are referenced in the provided index files. Possible data loss.
```text
desync prune -s /some/local/store index1.caibx index2.caibx
```
### Archives
Pack a directory tree into a catar file.
```text
desync tar archive.catar /some/dir
```
Pack a directory tree into an archive and chunk the archive, producing an index file.
```text
desync tar -i -s /some/local/store archive.caidx /some/dir
```
Unpack a catar file.
```text
desync untar archive.catar /some/dir
```
Unpack a directory tree using an index file referencing a chunked archive.
```text
desync untar -i -s /some/local/store archive.caidx /some/dir
```
Pack a directory tree currently available as tar archive into a catar. The tar input stream can also be read from STDIN by providing `-` instead of the file name.
```text
desync tar --input-format=tar archive.catar /path/to/archive.tar
```
Process a tar stream into a catar. Since catar archives don't support hardlinks, we need to make sure those are dereferenced in the input stream.
```text
tar --hard-dereference -C /path/to/dir -c . | desync tar --input-format tar archive.catar -
```
Unpack a directory tree from an index file and store the output filesystem in a GNU tar file rather than the local filesystem. Instead of an archive file, the output can be given as `-` which will write to STDOUT.
```text
desync untar -i -s /some/local/store --output-format=gnu-tar archive.caidx /path/to/archive.tar
```
### Server Examples
Start a chunk server serving up a local store via port 80.
```text
desync chunk-server -s /some/local/store
```
Start a chunk server on port 8080 acting as proxy for other remote HTTP and SSH stores and populate a local cache.
```text
desync chunk-server -s http://192.168.1.1/ -s ssh://192.168.1.2/store -c cache -l :8080
```
Start a chunk server with a store-file. This allows the configuration to be re-read on SIGHUP without restart.
```text
# Create store file
echo '{"stores": ["http://192.168.1.1/"], "cache": "/tmp/cache"}' > stores.json
# Start the server
desync chunk-server --store-file stores.json -l :8080
# Modify
echo '{"stores": ["http://192.168.1.2/"], "cache": "/tmp/cache"}' > stores.json
# Reload
killall -1 desync
```
Start a writable index server, chunk a file and store the index.
```text
server# desync index-server -s /mnt/indexes --writeable -l :8080
client# desync make -s /some/store http://192.168.1.1:8080/file.vmdk.caibx file.vmdk
```
Start a TLS chunk server on port 443 acting as proxy for a remote chunk store in AWS with local cache. The credentials for AWS are expected to be in the config file under key `https://s3-eu-west-3.amazonaws.com`.
```text
desync chunk-server -s s3+https://s3-eu-west-3.amazonaws.com/desync.bucket/prefix -c cache -l 127.0.0.1:https --cert cert.pem --key key.pem
```
FUSE mount an index file. This will make the indexed blob available as file underneath the mount point. The filename in the mount matches the name of the index with the extension removed. In this example `/some/mnt/` will contain one file `index`.
```text
desync mount-index -s /some/local/store index.caibx /some/mnt
```
FUSE mount a chunked and remote index file. First a (small) index file is read from the index-server which is used to re-assemble a larger index file and pipe it into the 2nd command that then mounts it.
```text
desync cat -s http://192.168.1.1/store http://192.168.1.2/small.caibx | desync mount-index -s http://192.168.1.1/store - /mnt/point
```
Long-running FUSE mount that may need to have its store setup changed without unmounting. This can be done by using the `--store-file` option rather than specifying store+cache in the command line. The process will then reload the file when a SIGHUP is sent.
```text
# Create the store file
echo '{"stores": ["http://192.168.1.1/"], "cache": "/tmp/cache"}' > stores.json
# Start the mount
desync mount-index --store-file stores.json index.caibx /some/mnt
# Modify the store setup
echo '{"stores": ["http://192.168.1.2/"], "cache": "/tmp/cache"}' > stores.json
# Reload
killall -1 desync
```
Show information about an index file to see how many of its chunks are present in a local store or an S3 store. The local store is queried first, S3 is only queried if the chunk is not present in the local store. The output will be in JSON format (`--format=json`) for easier processing in scripts.
```text
desync info --format=json -s /tmp/store -s s3+http://127.0.0.1:9000/store /path/to/index
```
Start an HTTP chunk server that will store uncompressed chunks locally, configured via JSON config file, and serve uncompressed chunks over the network (`-u` option). This chunk server could be used as a cache, minimizing latency by storing and serving uncompressed chunks. Clients will need to be configured to request uncompressed chunks from this server.
```text
# Chunk server
echo '{"store-options": {"/path/to/store/":{"uncompressed": true}}}' > /path/to/server.json
desync --config /path/to/server.json chunk-server -w -u -s /path/to/store/ -l :8080
# Client
echo '{"store-options": {"http://store.host:8080/":{"uncompressed": true}}}' > /path/to/client.json
desync --config /path/to/client.json cache -s sftp://remote.host/store -c http://store.host:8080/ /path/to/blob.caibx
```
HTTP chunk server using basic authorization. The server is configured to expect an `Authorization` header with the correct value in every request. The client configuration defines what the value should be on a per-server basis. The client config could be added to the default `$HOME/.config/desync/config.json` instead.
```text
# Server
DESYNC_HTTP_AUTH="Bearer abcabcabc" desync chunk-server -s /path/to/store -l :8080
# Client
echo '{"store-options": {"http://127.0.0.1:8080/":{"http-auth": "Bearer abcabcabc"}}}' > /path/to/client.json
desync --config /path/to/client.json extract -s http://127.0.0.1:8080/ /path/to/blob.caibx /path/to/blob
```
HTTPS chunk server using key and certificate signed by custom CA.
```text
# Building the CA and server certificate
openssl genrsa -out ca.key 4096
openssl req -x509 -new -nodes -key ca.key -sha256 -days 3650 -out ca.crt
openssl genrsa -out server.key 2048
openssl req -new -key server.key -out server.csr (Common Name should be the server name)
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt -days 3650 -sha256
# Chunk server
desync chunk-server -s /path/to/store --key server.key --cert server.crt -l :8443
# Client
desync extract --ca-cert ca.crt -s https://hostname:8443/ image.iso.caibx image.iso
```
HTTPS chunk server with client authentication (mutual-TLS).
```text
# Building the CA, server and client certificates
openssl genrsa -out ca.key 4096
openssl req -x509 -new -nodes -key ca.key -sha256 -days 3650 -out ca.crt
openssl genrsa -out server.key 2048
openssl req -new -key server.key -out server.csr (Common Name should be the server name)
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt -days 3650 -sha256
openssl genrsa -out client.key 2048
openssl req -new -key client.key -out client.csr
openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt -days 3650 -sha256
# Chunk server
desync chunk-server -s /path/to/store --key server.key --cert server.crt --mutual-tls --client-ca ca.crt -l :8443
# Client
desync extract --client-key client.key --client-cert client.crt --ca-cert ca.crt -s https://hostname:8443/ image.iso.caibx image.iso
```
### Update Size Estimation
Get the size of the chunks that are required for an update, when using *compressed* chunks (default). I.e. how much data a client needs to download.
```text
# Server
## Create the update index file
desync make --store /some/local/store update.caibx /some/blob
## Create a detailed JSON info file for the chunks
desync inspect-chunks --store /some/local/store update.caibx update_chunks_details.json
# Client
## Download the update_chunks_details.json file
## Get the update info
desync info --seed local_index.caibx --chunks-info update_chunks_details.json --format=json update.caibx
## The value in 'dedup-size-not-in-seed-nor-cache-compressed' will hold the size in bytes that needs to be downloaded
```
Get the size of the chunks that are required for an update, when using *uncompressed* chunks.
```text
# Server
## Create the update index file
desync make --store /some/local/store update.caibx /some/blob
# Client
## Get the update info
desync info --seed local_index.caibx --format=json update.caibx
## The value 'dedup-size-not-in-seed-nor-cache' will hold the size in bytes that needs to be downloaded
```
## Platform Support
| Platform | Status | Notes |
| --- | --- | --- |
| Linux | Full support | All features including FUSE, reflinks (Btrfs/XFS) |
| macOS | Supported | Minor incompatibilities possible when exchanging catar files with Linux (devices, filemodes) |
| Windows | Partial | Subset of commands. Device entries unsupported in tar; `--no-same-owner` and `--no-same-permissions` ignored in `untar`. |
| BSD | Untested | Expected to work |
## Design Philosophy
- **Performance over storage efficiency** — where upstream casync optimizes for storage efficiency (e.g. using local files as seeds, building temporary indexes), desync optimizes for runtime performance (maintaining a local explicit chunk store, avoiding the need to reindex) at the cost of storage efficiency.
- **Cross-platform over platform-specific features** — where upstream casync takes full advantage of Linux platform features, desync implements a minimum feature set. High-value platform-specific features (such as Btrfs reflinks) are added while maintaining the ability to build on other platforms.
- **Hash functions** — both SHA512/256 and SHA256 are supported.
- **Compression** — only zstd compression and uncompressed stores are supported.
- **Drop-in replacement for casync** — desync can serve as a drop-in replacement for casync on SSH servers for read-only chunk serving. Set `CASYNC_REMOTE_PATH=desync` on the client.
- **catar limitations** — SELinux and ACLs in existing catar files are ignored and won't be present in newly created catars. FCAPs are supported only as a verbatim copy of the `security.capability` XAttr.
## Links
- casync — [https://github.com/systemd/casync](https://github.com/systemd/casync)
- Go package documentation — [https://pkg.go.dev/github.com/folbricht/desync](https://pkg.go.dev/github.com/folbricht/desync)
- casync blog post — [http://0pointer.net/blog/casync-a-tool-for-distributing-file-system-images.html](http://0pointer.net/blog/casync-a-tool-for-distributing-file-system-images.html)
desync-1.0.2/archive.go 0000664 0000000 0000000 00000013255 15212466604 0014763 0 ustar 00root root 0000000 0000000 package desync
import (
"fmt"
"io"
"os"
"path"
"path/filepath"
"reflect"
"strings"
"time"
)
// Xattrs holds the extended attributes of an archive node. The map key is the
// attribute name and the map value is the attribute value.
type Xattrs map[string]string
// NodeDirectory represents a directory in a catar archive. It is one of the
// node types returned by ArchiveDecoder.Next.
type NodeDirectory struct {
// Name is the cumulative, slash-separated path of the directory relative to
// the archive root. The root directory itself is reported as ".".
Name string
// UID and GID are copied from the archive entry of the directory.
UID int
GID int
// Mode is the file mode recorded in the archive entry.
Mode os.FileMode
// MTime is the modification time recorded in the archive entry.
MTime time.Time
// Xattrs holds the extended attributes found for this entry, or nil if the
// archive contained none.
Xattrs Xattrs
}
// NodeFile holds file permissions and data in a catar archive. It is one of
// the node types returned by ArchiveDecoder.Next.
type NodeFile struct {
// UID and GID are copied from the archive entry of the file.
UID int
GID int
// Mode is the file mode recorded in the archive entry.
Mode os.FileMode
// Name is the slash-separated path of the file relative to the archive root.
Name string
// MTime is the modification time recorded in the archive entry.
MTime time.Time
// Xattrs holds the extended attributes found for this entry, or nil if the
// archive contained none.
Xattrs Xattrs
// Size is the size of the payload element minus 16 (presumably the length
// of the element header, leaving the number of content bytes - confirm
// against the format decoder).
Size uint64
// Data provides the file content. It should be read before the decoder is
// advanced again, since advancing invalidates the reader.
Data io.Reader
}
// NodeSymlink holds symlink information in a catar archive. It is one of the
// node types returned by ArchiveDecoder.Next.
type NodeSymlink struct {
// Name is the slash-separated path of the symlink relative to the archive
// root.
Name string
// UID and GID are copied from the archive entry of the symlink.
UID int
GID int
// Mode is the file mode recorded in the archive entry.
Mode os.FileMode
// MTime is the modification time recorded in the archive entry.
MTime time.Time
// Xattrs holds the extended attributes found for this entry, or nil if the
// archive contained none.
Xattrs Xattrs
// Target is the link target exactly as stored in the archive. The decoder
// does not validate or resolve it.
Target string
}
// NodeDevice holds device information in a catar archive. It is one of the
// node types returned by ArchiveDecoder.Next.
type NodeDevice struct {
// Name is the slash-separated path of the device node relative to the
// archive root.
Name string
// UID and GID are copied from the archive entry of the device node.
UID int
GID int
// Mode is the file mode recorded in the archive entry.
Mode os.FileMode
// Major and Minor are the device numbers stored in the archive.
Major uint64
Minor uint64
// Xattrs holds the extended attributes found for this entry, or nil if the
// archive contained none.
Xattrs Xattrs
// MTime is the modification time recorded in the archive entry.
MTime time.Time
}
// ArchiveDecoder is used to decode a catar archive. It wraps a FormatDecoder
// and turns the stream of low-level format elements into filesystem nodes
// (directories, files, symlinks and devices) with their full path.
type ArchiveDecoder struct {
// d is the underlying decoder that yields the raw format elements.
d FormatDecoder
// dir is the path of the directory currently being decoded, relative to the
// archive root. It starts out as ".".
dir string
// last holds an element that was read while finishing the previous node but
// belongs to the next one. It is processed first on the following call to
// Next.
last any
}
// NewArchiveDecoder returns a decoder that reads a catar archive from r. The
// decoder starts out positioned at the archive root (".").
func NewArchiveDecoder(r io.Reader) ArchiveDecoder {
	var dec ArchiveDecoder
	dec.d = NewFormatDecoder(r)
	dec.dir = "."
	return dec
}
// safeComponent checks that name, taken from a catar FormatFilename element,
// is a single usable path component. casync filenames never span more than
// one component, so the following are rejected with an InvalidFormat error:
//   - the empty string, "." and ".."
//   - names containing a forward or backward slash
//   - names that are absolute paths
//
// This keeps a crafted archive from placing or traversing entries outside the
// extraction root, including the embedded-slash trick (e.g. "evil/passwd"),
// no matter which writer consumes the decoded nodes.
func safeComponent(name string) error {
	if name == "" || name == "." || name == ".." {
		return InvalidFormat{Msg: fmt.Sprintf("invalid filename %q in archive", name)}
	}
	// Both separators are refused regardless of the platform we run on.
	if strings.ContainsAny(name, `/\`) {
		return InvalidFormat{Msg: fmt.Sprintf("filename %q contains a path separator", name)}
	}
	if filepath.IsAbs(name) || path.IsAbs(name) {
		return InvalidFormat{Msg: fmt.Sprintf("absolute filename %q in archive", name)}
	}
	return nil
}
// confined reports whether p, the cumulative slash-separated path of an
// archive entry, still points at or below the archive root (".") once it has
// been cleaned. Absolute paths are never confined.
func confined(p string) bool {
	if path.IsAbs(p) {
		return false
	}
	switch cleaned := path.Clean(p); cleaned {
	case ".":
		return true
	case "..":
		return false
	default:
		// Anything that still starts with "../" after cleaning climbs above
		// the root.
		return !strings.HasPrefix(cleaned, "../")
	}
}
// Next returns the next node from the archive: a NodeDirectory, NodeFile,
// NodeSymlink or NodeDevice. It returns nil (with a nil error) once the
// underlying decoder reports the end of the archive. If a NodeFile is
// returned, the caller should read the file body before calling Next() again
// as that invalidates the reader.
//
// A node is assembled from consecutive format elements: an optional filename,
// the entry itself, any extended attributes, and finally a payload, symlink
// or device element. An entry followed by none of those is a directory, and
// the decoder descends into it. A goodbye element moves the decoder back up
// one level.
//
// Errors from the underlying decoder are returned unchanged. InvalidFormat is
// returned for elements in an unexpected order, for unsafe filenames, and for
// paths that would leave the archive root.
func (a *ArchiveDecoder) Next() (any, error) {
	var (
		entry   *FormatEntry
		payload *FormatPayload
		symlink *FormatSymlink
		device  *FormatDevice
		xattrs  map[string]string
		name    string
		c       any
		err     error
	)

loop:
	for {
		// First process any elements left over from the last loop before reading
		// new ones from the decoder
		if a.last != nil {
			c = a.last
			a.last = nil
		} else {
			c, err = a.d.Next()
			if err != nil {
				return nil, err
			}
		}

		switch d := c.(type) {
		case FormatEntry:
			if entry != nil {
				return nil, InvalidFormat{}
			}
			entry = &d
		case FormatUser, FormatGroup, FormatSELinux, FormatACLUser, FormatACLGroup,
			FormatACLGroupObj, FormatACLDefault, FormatFCaps:
			// Not supported yet, these elements are skipped.
		case FormatPayload:
			if entry == nil {
				return nil, InvalidFormat{}
			}
			payload = &d
			// The payload is always the last element of a file.
			break loop
		case FormatXAttr:
			// Name and value are stored in one string, separated by a NUL byte.
			idx := strings.IndexRune(d.NameAndValue, '\000')
			if entry == nil || idx == -1 {
				return nil, InvalidFormat{}
			}
			if xattrs == nil {
				xattrs = make(map[string]string)
			}
			xattrs[d.NameAndValue[0:idx]] = d.NameAndValue[idx+1:]
		case FormatSymlink:
			if entry == nil {
				return nil, InvalidFormat{}
			}
			symlink = &d
		case FormatDevice:
			if entry == nil {
				return nil, InvalidFormat{}
			}
			device = &d
		case FormatFilename:
			if entry != nil { // Store and come back to it in the next iteration
				a.last = c
				break loop
			}
			if err := safeComponent(d.Name); err != nil {
				return nil, err
			}
			name = d.Name
		case FormatGoodbye: // This will effectively be a "cd .."
			if entry != nil {
				a.last = c
				break loop
			}
			// a.dir is built with path.Join and validated with path.Clean, so
			// it has to be shortened with the slash-only path package as well.
			// filepath.Dir would rewrite it with backslashes on Windows.
			a.dir = path.Dir(a.dir)
		case nil:
			return nil, nil
		default:
			return nil, fmt.Errorf("unsupported element %s in archive", reflect.TypeOf(d))
		}
	}

	// If it doesn't have a payload or is a device/symlink, it must be a directory
	if payload == nil && device == nil && symlink == nil {
		// Descend into the directory; following nodes are relative to it.
		a.dir = path.Join(a.dir, name)
		if !confined(a.dir) {
			return nil, InvalidFormat{Msg: fmt.Sprintf("entry %q escapes the archive root", a.dir)}
		}
		return NodeDirectory{
			Name:   a.dir,
			UID:    entry.UID,
			GID:    entry.GID,
			Mode:   entry.Mode,
			MTime:  entry.MTime,
			Xattrs: xattrs,
		}, nil
	}

	p := path.Join(a.dir, name)
	if !confined(p) {
		return nil, InvalidFormat{Msg: fmt.Sprintf("entry %q escapes the archive root", p)}
	}

	switch {
	case payload != nil: // Regular file
		return NodeFile{
			Name:   p,
			UID:    entry.UID,
			GID:    entry.GID,
			Mode:   entry.Mode,
			MTime:  entry.MTime,
			Xattrs: xattrs,
			Size:   payload.Size - 16,
			Data:   payload.Data,
		}, nil
	case device != nil: // Device
		return NodeDevice{
			Name:   p,
			UID:    entry.UID,
			GID:    entry.GID,
			Mode:   entry.Mode,
			MTime:  entry.MTime,
			Xattrs: xattrs,
			Major:  device.Major,
			Minor:  device.Minor,
		}, nil
	default: // Symlink, the only remaining possibility at this point
		return NodeSymlink{
			Name:   p,
			UID:    entry.UID,
			GID:    entry.GID,
			Mode:   entry.Mode,
			MTime:  entry.MTime,
			Xattrs: xattrs,
			Target: symlink.Target,
		}, nil
	}
}
desync-1.0.2/archive_escape_test.go 0000664 0000000 0000000 00000004060 15212466604 0017334 0 ustar 00root root 0000000 0000000 package desync
import (
"bytes"
"os"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestSafeComponent checks that plain single-component names are accepted and
// that empty, dot, separator-containing and absolute names are rejected.
func TestSafeComponent(t *testing.T) {
	for _, name := range []string{"file", "dir1", "a.b", "..foo", "foo..", "name with space"} {
		assert.NoError(t, safeComponent(name), "safeComponent(%q)", name)
	}

	for _, name := range []string{"", ".", "..", "evil/passwd", "a/b", "/abs", `a\b`, `\abs`} {
		assert.Error(t, safeComponent(name), "safeComponent(%q)", name)
	}
}
// TestConfined checks which cumulative paths are considered to stay inside
// the archive root and which ones escape it.
func TestConfined(t *testing.T) {
	for _, inside := range []string{".", "a", "a/b", "a/../b", "./a"} {
		assert.True(t, confined(inside), "confined(%q)", inside)
	}

	for _, outside := range []string{"..", "../x", "a/../..", "/abs", "/"} {
		assert.False(t, confined(outside), "confined(%q)", outside)
	}
}
// TestArchiveDecoderRejectsEmbeddedSlash builds a minimal archive consisting
// of a root directory entry followed by a FormatFilename whose name embeds a
// path separator ("evil/passwd"), the trick used to write through a
// previously-planted symlink. The decoder has to refuse that name, which
// protects every FilesystemWriter, including TarWriter which would otherwise
// forward the poisoned name into a produced tar.
func TestArchiveDecoderRejectsEmbeddedSlash(t *testing.T) {
	const poisoned = "evil/passwd"

	var archive bytes.Buffer
	encoder := NewFormatEncoder(&archive)

	_, err := encoder.Encode(FormatEntry{
		FormatHeader: FormatHeader{Size: 64, Type: CaFormatEntry},
		FeatureFlags: TarFeatureFlags,
		Mode:         os.ModeDir | 0755,
		MTime:        time.Unix(0, 0),
	})
	require.NoError(t, err)

	_, err = encoder.Encode(FormatFilename{
		FormatHeader: FormatHeader{Size: uint64(16 + len(poisoned) + 1), Type: CaFormatFilename},
		Name:         poisoned,
	})
	require.NoError(t, err)

	decoder := NewArchiveDecoder(&archive)

	// First node is the (unnamed) root directory.
	root, err := decoder.Next()
	require.NoError(t, err, "decoding root")
	require.IsType(t, NodeDirectory{}, root)

	// The embedded-slash filename must be rejected.
	_, err = decoder.Next()
	require.Error(t, err, "expected error for embedded-slash filename")
	require.IsType(t, InvalidFormat{}, err)
}
desync-1.0.2/archive_test.go 0000664 0000000 0000000 00000003530 15212466604 0016015 0 ustar 00root root 0000000 0000000 package desync
import (
"os"
"path"
"testing"
"github.com/stretchr/testify/require"
)
// TestArchiveDecoderTypes decodes testdata/flat.catar and checks that the
// nodes come out in the expected order and with the expected Go types.
func TestArchiveDecoderTypes(t *testing.T) {
	archive, err := os.Open("testdata/flat.catar")
	require.NoError(t, err)
	defer archive.Close()

	decoder := NewArchiveDecoder(archive)

	// Node types expected in the test file, in order. The trailing nil marks
	// the end of the archive.
	wantTypes := []any{
		NodeDirectory{},
		NodeDevice{},
		NodeFile{},
		NodeFile{},
		NodeSymlink{},
		nil,
	}
	for i := range wantTypes {
		node, err := decoder.Next()
		require.NoError(t, err)
		require.IsType(t, wantTypes[i], node)
	}
}
// TestArchiveDecoderNesting decodes testdata/nested.catar and checks that the
// decoder tracks the current directory correctly while descending into and
// leaving nested directories.
func TestArchiveDecoderNesting(t *testing.T) {
	archive, err := os.Open("testdata/nested.catar")
	require.NoError(t, err)
	defer archive.Close()

	decoder := NewArchiveDecoder(archive)

	// Nodes expected in the test file, in order. The entry with a nil Type
	// marks the end of the archive.
	// NOTE(review): the GID column is filled in but never asserted below.
	expected := []struct {
		Type any
		Name string
		UID  int
		GID  int
	}{
		{Type: NodeDirectory{}, Name: ".", UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: "dir1", UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: path.Join("dir1", "sub11"), UID: 500, GID: 500},
		{Type: NodeFile{}, Name: path.Join("dir1", "sub11", "f11"), UID: 500, GID: 500},
		{Type: NodeFile{}, Name: path.Join("dir1", "sub11", "f12"), UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: path.Join("dir1", "sub12"), UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: "dir2", UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: path.Join("dir2", "sub21"), UID: 500, GID: 500},
		{Type: NodeDirectory{}, Name: path.Join("dir2", "sub22"), UID: 500, GID: 500},
		{Type: nil},
	}

	for _, want := range expected {
		node, err := decoder.Next()
		require.NoError(t, err)
		require.IsType(t, want.Type, node)
		if want.Type == nil {
			break
		}

		// Only directories and files carry values that are compared here.
		var (
			gotName string
			gotUID  int
		)
		switch n := node.(type) {
		case NodeDirectory:
			gotName, gotUID = n.Name, n.UID
		case NodeFile:
			gotName, gotUID = n.Name, n.UID
		default:
			continue
		}
		require.Equal(t, want.Name, gotName)
		require.Equal(t, want.UID, gotUID)
	}
}
desync-1.0.2/assemble.go 0000664 0000000 0000000 00000022175 15212466604 0015136 0 ustar 00root root 0000000 0000000 package desync
import (
"context"
"fmt"
"golang.org/x/sync/errgroup"
"os"
)
// InvalidSeedAction represents the action that we will take if a seed
// happens to be invalid. There are currently three options:
// - fail with an error
// - skip the invalid seed and try to continue
// - regenerate the invalid seed index
type InvalidSeedAction int
const (
// InvalidSeedActionBailOut makes AssembleFile return the validation error.
// Being the first value of the iota sequence it is also the zero value.
InvalidSeedActionBailOut InvalidSeedAction = iota
// InvalidSeedActionSkip makes AssembleFile build a new plan and try again
// after a seed failed validation.
InvalidSeedActionSkip
// InvalidSeedActionRegenerate makes AssembleFile regenerate the invalid
// seeds before building a new plan. With this action a chunk whose data
// doesn't match after being copied from a seed is also written again.
InvalidSeedActionRegenerate
)
// AssembleOptions holds the settings that control AssembleFile.
type AssembleOptions struct {
// N is the number of worker goroutines started by AssembleFile. The same
// value is handed to plan validation and seed regeneration.
N int
// InvalidSeedAction selects what AssembleFile does when a seed turns out
// to be invalid.
InvalidSeedAction InvalidSeedAction
}
// writeChunk makes sure the data of chunk c ends up at its position in f. It
// goes through up to three sources, in this order:
//
//  1. The target itself. Unless the target is known to be blank, the bytes
//     currently at the chunk position are hashed and, if they match, nothing
//     is written. Doing this first avoids rewriting data that is already
//     correct, even for chunks that are also available in the self-seed.
//  2. The self-seed ss, which holds chunks written earlier in this run.
//  3. The store s.
//
// stats is updated according to the source that was used. An error is returned
// if reading or writing f fails, if the chunk can't be obtained from the
// store, or if the store delivers a chunk of unexpected size.
func writeChunk(c IndexChunk, ss *selfSeed, f *os.File, blocksize uint64, s Store, stats *ExtractStats, isBlank bool) error {
	offset := int64(c.Start)

	// When operating on an existing file there's a good chance the right
	// data is already in place, so compare before touching anything.
	if !isBlank {
		onDisk := make([]byte, c.Size)
		_, err := f.ReadAt(onDisk, offset)
		if err != nil {
			return err
		}
		if Digest.Sum(onDisk) == c.ID {
			// Record we kept this chunk in the file (in-place extract)
			stats.incChunksInPlace()
			return nil
		}
	}

	// A chunk that was written before can be copied from within the target
	segment := ss.getChunk(c.ID)
	if segment != nil {
		copied, cloned, err := segment.WriteInto(f, c.Start, c.Size, blocksize, isBlank)
		if err != nil {
			return err
		}
		stats.addBytesCopied(copied)
		stats.addBytesCloned(cloned)
		return nil
	}

	// Last resort is the store. The counter is bumped before the chunk is
	// requested, so failed requests are counted as well.
	stats.incChunksFromStore()
	chunk, err := s.GetChunk(c.ID)
	if err != nil {
		return err
	}
	plain, err := chunk.Data()
	if err != nil {
		return err
	}

	// The index tells us how large the chunk has to be, verify that
	if uint64(len(plain)) != c.Size {
		return fmt.Errorf("unexpected size for chunk %s", c.ID.String())
	}

	// Put the uncompressed data into the file at the right position
	_, err = f.WriteAt(plain, offset)
	return err
}
// AssembleFile re-assembles a file based on a list of index chunks. It runs
// options.N goroutines, creating one filehandle for the file "name" per
// goroutine, and writes to the file simultaneously. Progress is reported
// through progress bars created with NewProgressBar.
//
// If the input file exists and is not empty, the algorithm will first
// confirm if the data matches what is expected and only populate areas that
// differ from the expected content. This can be used to complete partly
// written files.
//
// Chunks are taken from the provided seeds where possible, from data already
// written to the target (self-seed), or from the store s. Before anything is
// written, the plan is validated; options.InvalidSeedAction decides what
// happens if a seed is found to be invalid.
//
// The returned stats are never nil, also when an error is returned. In that
// case they only reflect the work done up to the failure.
func AssembleFile(ctx context.Context, name string, idx Index, s Store, seeds []Seed, options AssembleOptions) (*ExtractStats, error) {
	type Job struct {
		segment IndexSegment
		source  SeedSegment
	}
	var (
		attempt     = 1
		in          = make(chan Job)
		inClosed    bool
		isBlank     bool
		isBlkDevice bool
		pb          ProgressBar
	)
	g, ctx := errgroup.WithContext(ctx)

	// Initialize stats to be gathered during extraction
	stats := &ExtractStats{
		BytesTotal:  idx.Length(),
		ChunksTotal: len(idx.Chunks),
	}

	// Determine if the target exists and create it if not
	info, err := os.Stat(name)
	switch {
	case os.IsNotExist(err): // File doesn't exist yet => create it
		f, err := os.Create(name)
		if err != nil {
			return stats, err
		}
		f.Close()
		isBlank = true
	case err != nil: // Some other error => bail
		return stats, err
	case isDevice(info.Mode()): // Dealing with a block device
		isBlkDevice = true
	case info.Size() == 0: // Is a file that exists, but is empty => use optimizations for blank files
		isBlank = true
	}

	// Truncate the output file to the full expected size. Not only does this
	// confirm there's enough disk space, but it allows for an optimization
	// when dealing with the Null Chunk
	if !isBlkDevice {
		if err := os.Truncate(name, idx.Length()); err != nil {
			return stats, err
		}
	}

	// Determine the blocksize of the target file which is required for reflinking
	blocksize := blocksizeOfFile(name)

	// Prepend a nullchunk seed to the list of seeds to make sure we read that
	// before any large null sections in other seed files
	ns, err := newNullChunkSeed(name, blocksize, idx.Index.ChunkSizeMax)
	if err != nil {
		return stats, err
	}
	defer ns.close()
	seeds = append([]Seed{ns}, seeds...)

	// Start a self-seed which will become usable once chunks are written contiguously
	// beginning at position 0. There is no need to add this to the seeds list because
	// when we create a plan it will be empty.
	ss, err := newSelfSeed(name, idx)
	if err != nil {
		return stats, err
	}

	// Record the total number of seeds and blocksize in the stats
	stats.Seeds = len(seeds)
	stats.Blocksize = blocksize

	// From here on workers are waiting on the job channel. It has to be
	// closed on every return path, including the error returns further down,
	// otherwise the workers would block on it forever. The flag makes sure
	// the channel is closed only once.
	closeIn := func() {
		if !inClosed {
			inClosed = true
			close(in)
		}
	}
	defer closeIn()

	// Start the workers, each having its own filehandle to write concurrently
	for i := 0; i < options.N; i++ {
		f, err := os.OpenFile(name, os.O_RDWR, 0666)
		if err != nil {
			return stats, fmt.Errorf("unable to open file %s, %w", name, err)
		}
		defer f.Close()
		g.Go(func() error {
			for job := range in {
				pb.Add(job.segment.lengthChunks())
				if job.source != nil {
					// If we have a seedSegment we expect 1 or more chunks between
					// the start and the end of this segment.
					stats.addChunksFromSeed(uint64(job.segment.lengthChunks()))
					offset := job.segment.start()
					length := job.segment.lengthBytes()
					copied, cloned, err := job.source.WriteInto(f, offset, length, blocksize, isBlank)
					if err != nil {
						return err
					}

					// Validate that the written chunks are exactly what we were expecting.
					// Because the seed might point to a RW location, if the data changed
					// while we were extracting an index, we might end up writing to the
					// destination some unexpected values.
					for _, c := range job.segment.chunks() {
						b := make([]byte, c.Size)
						if _, err := f.ReadAt(b, int64(c.Start)); err != nil {
							return err
						}
						sum := Digest.Sum(b)
						if sum != c.ID {
							if options.InvalidSeedAction == InvalidSeedActionRegenerate {
								// Try harder before giving up and aborting
								Log.WithField("ID", c.ID.String()).Info("The seed may have changed during processing, trying to take the chunk from the self seed or the store")
								if err := writeChunk(c, ss, f, blocksize, s, stats, isBlank); err != nil {
									return err
								}
							} else {
								return fmt.Errorf("written data in %s doesn't match its expected hash value, seed may have changed during processing", name)
							}
						}
					}

					stats.addBytesCopied(copied)
					stats.addBytesCloned(cloned)
					// Record this segment's been written in the self-seed to make it
					// available going forward
					ss.add(job.segment)
					continue
				}

				// If we don't have a seedSegment we expect an IndexSegment with just
				// a single chunk, that we can take from either the selfSeed, from the
				// destination file, or from the store.
				if len(job.segment.chunks()) != 1 {
					panic("Received an unexpected segment that doesn't contain just a single chunk")
				}
				c := job.segment.chunks()[0]
				if err := writeChunk(c, ss, f, blocksize, s, stats, isBlank); err != nil {
					return err
				}

				// Record this chunk's been written in the self-seed.
				// Even if we already confirmed that this chunk is present in the
				// self-seed, we still need to record it as being written, otherwise
				// the self-seed position pointer doesn't advance as we expect.
				ss.add(job.segment)
			}
			return nil
		})
	}

	// Let the sequencer break up the index into segments, create and validate a plan,
	// feed the workers, and stop if there are any errors
	seq := NewSeedSequencer(idx, seeds...)
	plan := seq.Plan()
	for {
		validatingPrefix := fmt.Sprintf("Attempt %d: Validating ", attempt)
		if err := plan.Validate(ctx, options.N, NewProgressBar(validatingPrefix)); err != nil {
			// This plan has at least one invalid seed
			switch options.InvalidSeedAction {
			case InvalidSeedActionBailOut:
				return stats, err
			case InvalidSeedActionRegenerate:
				Log.WithError(err).Info("Unable to use one of the chosen seeds, regenerating it")
				if err := seq.RegenerateInvalidSeeds(ctx, options.N, attempt); err != nil {
					return stats, err
				}
			case InvalidSeedActionSkip:
				// Recreate the plan. This time the seed marked as invalid will be skipped
				Log.WithError(err).Info("Unable to use one of the chosen seeds, skipping it")
			default:
				panic("Unhandled InvalidSeedAction")
			}

			attempt++
			seq.Rewind()
			plan = seq.Plan()
			continue
		}
		// Found a valid plan
		break
	}

	// The workers only touch pb after receiving a job, and jobs are only
	// sent below, so it is fine to set it up this late.
	pb = NewProgressBar(fmt.Sprintf("Attempt %d: Assembling ", attempt))
	pb.SetTotal(len(idx.Chunks))
	pb.Start()
	defer pb.Finish()

loop:
	for _, segment := range plan {
		select {
		case <-ctx.Done():
			break loop
		case in <- Job{segment.indexSegment, segment.source}:
		}
	}
	closeIn()

	return stats, g.Wait()
}
desync-1.0.2/assemble_test.go 0000664 0000000 0000000 00000025345 15212466604 0016177 0 ustar 00root root 0000000 0000000 package desync
import (
"context"
"crypto/md5"
"crypto/rand"
"fmt"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
func TestExtract(t *testing.T) {
// Make a test file that's guaranteed to have duplicate chunks.
b, err := os.ReadFile("testdata/chunker.input")
if err != nil {
t.Fatal(err)
}
for range 4 { // Replicate it a few times to make sure we get dupes
b = append(b, b...)
}
b = append(b, make([]byte, 2*ChunkSizeMaxDefault)...) // want to have at least one null-chunk in the input
tmp := t.TempDir()
in := filepath.Join(tmp, "in")
require.NoError(t, os.WriteFile(in, b, 0644))
// Record the checksum of the input file, used to compare to the output later
inSum := md5.Sum(b)
// Chunk the file to get an index
index, _, err := IndexFromFile(
context.Background(),
in,
10,
ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
NewProgressBar(""),
)
if err != nil {
t.Fatal(err)
}
// Chop up the input file into a (temporary) local store
store := t.TempDir()
s, err := NewLocalStore(store, StoreOptions{})
if err != nil {
t.Fatal(err)
}
if err := ChopFile(context.Background(), in, index.Chunks, s, 10, NewProgressBar("")); err != nil {
t.Fatal(err)
}
// Make a blank store - used to test a case where no chunk *should* be requested
blankstore := t.TempDir()
bs, err := NewLocalStore(blankstore, StoreOptions{})
if err != nil {
t.Fatal(err)
}
// Prepare output files for each test - first a non-existing one
out1 := filepath.Join(tmp, "out1")
// This one is a complete file matching what we expect at the end
out2 := filepath.Join(tmp, "out2")
require.NoError(t, os.WriteFile(out2, b, 0644))
// Incomplete or damaged file that has most but not all data
b[0] ^= 0xff // flip some bits
b[len(b)-1] ^= 0xff
b = append(b, 0) // make it longer
out3 := filepath.Join(tmp, "out3")
require.NoError(t, os.WriteFile(out3, b, 0644))
// At this point we have the data needed for the test setup
// in - Temp file that represents the original input file
// inSub - MD5 of the input file
// index - Index file for the input file
// s - Local store containing the chunks needed to rebuild the input file
// bs - A blank local store, all GetChunk fail on it
// out1 - Just a non-existing file that gets assembled
// out2 - The output file already fully complete, no GetChunk should be needed
// out3 - Partial/damaged file with most, but not all data correct
// seedIndex + seedFile - Seed file to help assemble the input
tests := map[string]struct {
outfile string
store Store
seed []Seed
}{
"extract to new file": {outfile: out1, store: s},
"extract to complete file": {outfile: out2, store: bs},
"extract to incomplete file": {outfile: out3, store: s},
}
for name, test := range tests {
t.Run(name, func(t *testing.T) {
if _, err := AssembleFile(context.Background(), test.outfile, index, test.store, nil,
AssembleOptions{10, InvalidSeedActionBailOut},
); err != nil {
t.Fatal(err)
}
b, err := os.ReadFile(test.outfile)
if err != nil {
t.Fatal(err)
}
outSum := md5.Sum(b)
if inSum != outSum {
t.Fatal("checksum of extracted file doesn't match expected")
}
})
}
}
// TestSeed assembles targets with the help of file seeds. Targets and seeds
// are put together from a small set of data slices so that the amount of
// duplication between a target and its seeds varies from test to test.
func TestSeed(t *testing.T) {
	// Building blocks for targets and seeds: real sample data, zeros, and
	// two different slices of random data
	sample, err := os.ReadFile("testdata/chunker.input")
	if err != nil {
		t.Fatal(err)
	}
	zeros := make([]byte, 4*ChunkSizeMaxDefault)
	noiseA := make([]byte, 4*ChunkSizeMaxDefault)
	rand.Read(noiseA)
	noiseB := make([]byte, 4*ChunkSizeMaxDefault)
	rand.Read(noiseB)

	// One temporary store is shared by all tests
	s, err := NewLocalStore(t.TempDir(), StoreOptions{})
	if err != nil {
		t.Fatal(err)
	}

	ctx := context.Background()

	// indexOf chunks a file with the default chunk sizes
	indexOf := func(t *testing.T, file string) Index {
		index, _, err := IndexFromFile(
			ctx,
			file,
			10,
			ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
			NewProgressBar(""),
		)
		if err != nil {
			t.Fatal(err)
		}
		return index
	}

	// Each test lists the slices that make up the target and those that
	// make up each of its seed files
	tests := map[string]struct {
		target [][]byte
		seeds  [][][]byte
	}{
		"extract without seed": {
			target: [][]byte{noiseA, noiseB},
			seeds:  nil,
		},
		"extract all null file": {
			target: [][]byte{zeros, zeros, zeros, zeros, zeros},
			seeds:  nil,
		},
		"extract repetitive file": {
			target: [][]byte{sample, sample, sample, sample, sample},
			seeds:  nil,
		},
		"extract with single file seed": {
			target: [][]byte{sample, zeros, zeros, noiseA, zeros},
			seeds: [][][]byte{
				{sample, zeros, noiseB, noiseB, sample},
			},
		},
		"extract with multiple file seeds": {
			target: [][]byte{zeros, zeros, noiseA, zeros, sample},
			seeds: [][][]byte{
				{noiseB, zeros, noiseB, noiseB, sample},
				{sample, zeros, noiseB, noiseB, sample},
				{noiseB},
			},
		},
		"extract with identical file seed": {
			target: [][]byte{sample, zeros, noiseA, zeros, sample},
			seeds: [][][]byte{
				{sample, zeros, noiseA, zeros, sample},
			},
		},
	}

	for name, tc := range tests {
		t.Run(name, func(t *testing.T) {
			// Write the destination file so it can be chunked
			dir := t.TempDir()
			dst := filepath.Join(dir, "dst")
			content := join(tc.target...)
			require.NoError(t, os.WriteFile(dst, content, 0644))

			// Checksum of the target, compared to the output later
			wantSum := md5.Sum(content)

			// Index the target and put its chunks into the store
			dstIndex := indexOf(t, dst)
			if err := ChopFile(ctx, dst, dstIndex.Chunks, s, 10, NewProgressBar("")); err != nil {
				t.Fatal(err)
			}

			// Write and index every seed file, collecting the seeds
			var seeds []Seed
			for i, parts := range tc.seeds {
				seedFile := filepath.Join(dir, fmt.Sprintf("seed%d", i))
				require.NoError(t, os.WriteFile(seedFile, join(parts...), 0644))
				seedIndex := indexOf(t, seedFile)
				seed, err := NewIndexSeed(dst, seedFile, seedIndex)
				if err != nil {
					t.Fatal(err)
				}
				seeds = append(seeds, seed)
			}

			opts := AssembleOptions{N: 10, InvalidSeedAction: InvalidSeedActionBailOut}
			if _, err := AssembleFile(ctx, dst, dstIndex, s, seeds, opts); err != nil {
				t.Fatal(err)
			}

			got, err := os.ReadFile(dst)
			if err != nil {
				t.Fatal(err)
			}
			if md5.Sum(got) != wantSum {
				t.Fatal("checksum of extracted file doesn't match expected")
			}
		})
	}
}
// TestSelfSeedInPlace is the same as TestSelfSeed but the target file is
// pre-populated with the correct content before extraction. Every chunk must
// be kept in-place and the self-seed must not cause any re-writes.
func TestSelfSeedInPlace(t *testing.T) {
	// Temporary store that receives all the fake chunks
	s, err := NewLocalStore(t.TempDir(), StoreOptions{})
	if err != nil {
		t.Fatal(err)
	}

	// A pool of random chunks. The tests refer to them by position and may
	// use them in any order and any number of times.
	type rawChunk struct {
		id   ChunkID
		data []byte
	}
	const (
		size      = 1024
		numChunks = 10
	)
	pool := make([]rawChunk, 0, numChunks)
	for range numChunks {
		payload := make([]byte, size)
		rand.Read(payload)
		chunk := NewChunk(payload)
		if err := s.StoreChunk(chunk); err != nil {
			t.Fatal(err)
		}
		pool = append(pool, rawChunk{id: chunk.ID(), data: payload})
	}

	// Each test is a sequence of pool positions that makes up the target.
	// The target is pre-written with the correct content, so every chunk
	// should be detected as in-place.
	tests := map[string]struct {
		index []int
	}{
		"single chunk":                      {index: []int{0}},
		"repeating single chunk":            {index: []int{0, 0, 0, 0, 0}},
		"repeating chunk sequence":          {index: []int{0, 1, 2, 0, 1, 2, 2}},
		"repeating chunk sequence mid file": {index: []int{1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}},
		"repeating chunk sequence reversed": {index: []int{0, 1, 2, 2, 1, 0}},
		"non-repeating chunks":              {index: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}},
	}

	for name, tc := range tests {
		t.Run(name, func(t *testing.T) {
			// Put together the index and the matching file content
			var (
				idx     Index
				content []byte
			)
			for pos, p := range tc.index {
				idx.Chunks = append(idx.Chunks, IndexChunk{
					ID:    pool[p].id,
					Start: uint64(pos * size),
					Size:  uint64(size),
				})
				content = append(content, pool[p].data...)
			}

			// Checksum the output has to have
			wantSum := md5.Sum(content)

			// The target starts out with the correct content
			dst := filepath.Join(t.TempDir(), "dst")
			require.NoError(t, os.WriteFile(dst, content, 0644))

			// Extract the file
			opts := AssembleOptions{N: 1, InvalidSeedAction: InvalidSeedActionBailOut}
			stats, err := AssembleFile(context.Background(), dst, idx, s, nil, opts)
			if err != nil {
				t.Fatal(err)
			}

			// The content must not have changed
			got, err := os.ReadFile(dst)
			if err != nil {
				t.Fatal(err)
			}
			if md5.Sum(got) != wantSum {
				t.Fatal("checksum of extracted file doesn't match expected")
			}

			// All chunks must be in-place. The in-place check in writeChunk
			// runs before the self-seed lookup, so repeated chunks are not
			// re-written from the self-seed.
			if stats.ChunksInPlace != uint64(len(tc.index)) {
				t.Fatalf("expected all %d chunks in-place, got %d", len(tc.index), stats.ChunksInPlace)
			}
		})
	}
}
// join concatenates the given byte slices, in order, into a newly built
// slice. The result is nil if there is nothing to concatenate.
func join(slices ...[]byte) []byte {
	var joined []byte
	for i := range slices {
		joined = append(joined, slices[i]...)
	}
	return joined
}
// readCaibxFile loads the index stored at indexLocation. The directory part
// of the location is opened as local index store and the file name is used
// as name of the index. Any failure ends the test.
func readCaibxFile(t *testing.T, indexLocation string) (idx Index) {
	store, err := NewLocalIndexStore(filepath.Dir(indexLocation))
	require.NoError(t, err)
	defer store.Close()

	idx, err = store.GetIndex(filepath.Base(indexLocation))
	require.NoError(t, err)
	return idx
}
// TestExtractWithNonStaticSeeds extracts with a seed whose data doesn't match
// its index. Validation of the seed is switched off with MockValidate, which
// simulates a seed that is valid at first but becomes corrupted halfway
// through the extraction. With InvalidSeedActionRegenerate the extraction
// still has to produce the correct output.
func TestExtractWithNonStaticSeeds(t *testing.T) {
	n := 10
	outDir := t.TempDir()
	out := filepath.Join(outDir, "out")

	// MockValidate is package-level state. Put the previous value back when
	// the test is done so tests running later are not affected by it.
	prevMockValidate := MockValidate
	MockValidate = true
	defer func() { MockValidate = prevMockValidate }()

	store, err := NewLocalStore("testdata/blob2.store", StoreOptions{})
	require.NoError(t, err)
	defer store.Close()

	index := readCaibxFile(t, "testdata/blob2.caibx")

	// The seed is built from the index of the corrupted blob
	srcIndex := readCaibxFile(t, "testdata/blob2_corrupted.caibx")
	seed, err := NewIndexSeed(out, "testdata/blob2_corrupted", srcIndex)
	require.NoError(t, err)
	seeds := []Seed{seed}

	// Test that the MockValidate works as expected
	seq := NewSeedSequencer(index, seeds...)
	plan := seq.Plan()
	err = plan.Validate(context.Background(), n, NullProgressBar{})
	require.NoError(t, err)

	options := AssembleOptions{n, InvalidSeedActionRegenerate}
	_, err = AssembleFile(context.Background(), out, index, store, seeds, options)
	require.NoError(t, err)

	// Test the output
	err = VerifyIndex(context.Background(), out, index, n, NullProgressBar{})
	require.NoError(t, err)
}
desync-1.0.2/blocksize.go 0000664 0000000 0000000 00000000512 15212466604 0015317 0 ustar 00root root 0000000 0000000 //go:build !windows
// +build !windows
package desync
import (
"os"
"syscall"
)
// blocksizeOfFile returns the block size the OS reports for the named file.
// DefaultBlockSize is returned if the file can't be stat'ed or if the stat
// result doesn't carry a *syscall.Stat_t.
func blocksizeOfFile(name string) uint64 {
	info, err := os.Stat(name)
	if err != nil {
		return DefaultBlockSize
	}
	if sys, ok := info.Sys().(*syscall.Stat_t); ok {
		return uint64(sys.Blksize)
	}
	return DefaultBlockSize
}
desync-1.0.2/blocksize_windows.go 0000664 0000000 0000000 00000000353 15212466604 0017074 0 ustar 00root root 0000000 0000000 package desync
// blocksizeOfFile returns the block size to assume for the named file. In
// this implementation the name is not looked at and the result is always
// DefaultBlockSize.
func blocksizeOfFile(name string) uint64 {
// TODO: Not that it really matters for reflink cloning of files on windows
// but it would be nice to determine the actual blocksize here anyway.
return DefaultBlockSize
}
desync-1.0.2/cache.go 0000664 0000000 0000000 00000005266 15212466604 0014410 0 ustar 00root root 0000000 0000000 package desync
import (
"fmt"
"github.com/pkg/errors"
)
// Cache is used to connect a (typically remote) store with a local store which
// functions as disk cache. Any request to the cache for a chunk will first be
// routed to the local store, and if that fails to the slower remote store.
// Any chunks retrieved from the remote store will be stored in the local one.
type Cache struct {
s Store // Store that is asked when the local store doesn't have a chunk
l WriteStore // Local store which is asked first and filled with chunks taken from s
}
// NewCache returns a cache router that uses a local store as cache before
// accessing a (supposedly slower) remote one. s is the store to fall back to,
// l is the writable local store acting as cache.
func NewCache(s Store, l WriteStore) Cache {
return Cache{s: s, l: l}
}
// GetChunk first asks the local store for the chunk and then the remote one.
// If we get a chunk from the remote, it's stored locally too.
//
// Only a ChunkMissing error from the local store, also when wrapped in
// another error, leads to the remote store being asked. Any other error from
// the local store is returned as is. If the chunk was obtained from the remote
// store but can't be written to the local one, the chunk is returned together
// with an error describing the failure.
func (c Cache) GetChunk(id ChunkID) (*Chunk, error) {
	chunk, err := c.l.GetChunk(id)
	if err == nil {
		return chunk, nil
	}

	// errors.As is used rather than a type switch so a ChunkMissing that
	// has been wrapped by the local store is recognized as well.
	var missing ChunkMissing
	if !errors.As(err, &missing) {
		return chunk, err
	}

	// At this point we failed to find chunk in the local cache. Ask the remote
	chunk, err = c.s.GetChunk(id)
	if err != nil {
		return chunk, err
	}

	// Got the chunk. Store it in the local cache for next time
	if err = c.l.StoreChunk(chunk); err != nil {
		return chunk, errors.Wrap(err, "failed to store in local cache")
	}
	return chunk, nil
}
// HasChunk reports whether the chunk is available, checking the local cache
// first. The remote store is only asked if the local one answered without
// error that it doesn't have the chunk.
func (c Cache) HasChunk(id ChunkID) (bool, error) {
	found, err := c.l.HasChunk(id)
	if err != nil || found {
		return found, err
	}
	return c.s.HasChunk(id)
}
// String returns a description of the cache that names the store first and
// the local cache store second.
func (c Cache) String() string {
return fmt.Sprintf("store:%s with cache %s", c.s, c.l)
}
// Close closes the local cache store as well as the store behind it. Both are
// always closed. If closing the store behind the cache fails, that error is
// returned; otherwise an error from closing the local cache store is returned
// rather than being dropped.
func (c Cache) Close() error {
	localErr := c.l.Close()
	if err := c.s.Close(); err != nil {
		return err
	}
	return localErr
}
// RepairableCache is a cache whose GetChunk() function will return a
// ChunkMissing error instead of ChunkInvalid so the caller can download an
// invalid chunk again from the store.
type RepairableCache struct {
l WriteStore // Wrapped store, all operations are passed on to it
}
// NewRepairableCache creates a new RepairableCache that wraps the WriteStore l
// and modifies its GetChunk() so a ChunkInvalid error is replaced by a
// ChunkMissing error.
func NewRepairableCache(l WriteStore) RepairableCache {
return RepairableCache{l: l}
}
// GetChunk returns the chunk from the wrapped store. If the wrapped store
// reports the chunk as invalid (ChunkInvalid, possibly wrapped), the error is
// replaced by a ChunkMissing error for the same ID. Every other result is
// passed through unchanged.
func (r RepairableCache) GetChunk(id ChunkID) (*Chunk, error) {
	chunk, err := r.l.GetChunk(id)

	// errors.As reports false for a nil error, so no separate nil check
	// is needed before it.
	var invalid ChunkInvalid
	if errors.As(err, &invalid) {
		return chunk, ChunkMissing{ID: invalid.ID}
	}
	return chunk, err
}
// HasChunk returns what the wrapped store reports for the chunk ID.
func (r RepairableCache) HasChunk(id ChunkID) (bool, error) {
return r.l.HasChunk(id)
}
// Close closes the wrapped store and returns its error, if any.
func (r RepairableCache) Close() error {
return r.l.Close()
}
// String returns the description of the wrapped store.
func (r RepairableCache) String() string {
return r.l.String()
}
// StoreChunk passes the chunk on to the wrapped store.
func (r RepairableCache) StoreChunk(c *Chunk) error {
return r.l.StoreChunk(c)
}
desync-1.0.2/chop.go 0000664 0000000 0000000 00000003257 15212466604 0014274 0 ustar 00root root 0000000 0000000 package desync
import (
"context"
"fmt"
"io"
"os"
"golang.org/x/sync/errgroup"
)
// ChopFile splits a file according to a list of chunks obtained from an Index
// and stores them in the provided store. It starts n workers, each with its
// own filehandle on the file, that read and store chunks concurrently. pb is
// incremented once for every chunk a worker picks up.
//
// The first error encountered by a worker stops the operation and is
// returned. An error is also returned if the file can't be opened.
func ChopFile(ctx context.Context, name string, chunks []IndexChunk, ws WriteStore, n int, pb ProgressBar) error {
	in := make(chan IndexChunk)
	g, ctx := errgroup.WithContext(ctx)

	// Setup and start the progressbar if any
	pb.SetTotal(len(chunks))
	pb.Start()
	defer pb.Finish()

	s := NewChunkStorage(ws)

	// Start the workers, each having its own filehandle to read concurrently
	for range n {
		f, err := os.Open(name)
		if err != nil {
			// Workers started in earlier iterations are waiting on the
			// channel. Close it and wait for them so they don't stay
			// around forever. They have not been given any work, so
			// there is no error to collect from them.
			close(in)
			_ = g.Wait()
			return fmt.Errorf("unable to open file %s, %w", name, err)
		}
		defer f.Close()
		g.Go(func() error {
			for c := range in {
				// Update progress bar if any
				pb.Increment()

				chunk, err := readChunkFromFile(f, c)
				if err != nil {
					return err
				}

				if err := s.StoreChunk(chunk); err != nil {
					return err
				}
			}
			return nil
		})
	}

	// Feed the workers, stop if there are any errors
loop:
	for _, c := range chunks {
		select {
		case <-ctx.Done():
			break loop
		case in <- c:
		}
	}
	close(in)

	return g.Wait()
}
// readChunkFromFile reads the data of chunk c from f and returns it as a
// chunk. The filehandle is moved to the start of the chunk and c.Size bytes
// are read from there. The data is verified against c.ID when the chunk is
// created.
func readChunkFromFile(f *os.File, c IndexChunk) (*Chunk, error) {
	data := make([]byte, c.Size)

	// Go to where the chunk is meant to come from within the file
	if _, err := f.Seek(int64(c.Start), io.SeekStart); err != nil {
		return nil, err
	}

	// Pull the whole (uncompressed) chunk into memory
	if _, err := io.ReadFull(f, data); err != nil {
		return nil, err
	}

	return NewChunkWithID(c.ID, data, false)
}
desync-1.0.2/chunk.go 0000664 0000000 0000000 00000006454 15212466604 0014455 0 ustar 00root root 0000000 0000000 package desync
import (
"errors"
)
// Chunk holds chunk data plain, storage format, or both. If a chunk is created
// from storage data, such as read from a compressed chunk store, and later the
// application requires the plain data, it'll be converted on demand by applying
// the given storage converters in reverse order. The converters can only be used
// to read the plain data, not to convert back to storage format.
type Chunk struct {
data []byte // Plain data if available
storage []byte // Storage format (compressed, encrypted, etc)
converters Converters // Modifiers to convert from storage format to plain
id ChunkID // ID given at creation, or the one calculated from the plain data by ID()
idCalculated bool // True if id doesn't need to be calculated (anymore)
}
// NewChunk creates a new chunk from plain data. The data is trusted and the ID is
// calculated on demand. The slice b is kept in the chunk, it is not copied.
func NewChunk(b []byte) *Chunk {
return &Chunk{data: b}
}
// NewChunkWithID creates a new chunk from plain (uncompressed) data and the
// ID it is expected to have. Unless skipVerify is true, the ID is calculated
// from the data and a ChunkInvalid error is returned if it differs from the
// given one. With skipVerify the given ID is trusted and used as is.
func NewChunkWithID(id ChunkID, b []byte, skipVerify bool) (*Chunk, error) {
	// When verification is skipped the ID counts as calculated already, so
	// it won't be calculated later either.
	c := &Chunk{id: id, data: b, idCalculated: skipVerify}
	if !skipVerify {
		if sum := c.ID(); sum != id {
			return nil, ChunkInvalid{ID: id, Sum: sum}
		}
	}
	return c, nil
}
// NewChunkFromStorage builds a new chunk from data that is in storage format
// rather than plain. The modifiers are kept with the chunk and are used to
// convert to plain data when needed. Unless skipVerify is true, that
// conversion happens right away to calculate the ID, and a ChunkInvalid error
// is returned if it differs from the given one.
func NewChunkFromStorage(id ChunkID, b []byte, modifiers Converters, skipVerify bool) (*Chunk, error) {
	// When verification is skipped the ID counts as calculated already, so
	// it won't be calculated later either.
	c := &Chunk{id: id, storage: b, converters: modifiers, idCalculated: skipVerify}
	if !skipVerify {
		if sum := c.ID(); sum != id {
			return nil, ChunkInvalid{ID: id, Sum: sum}
		}
	}
	return c, nil
}
// Data returns the plain chunk data. If the chunk only holds data in storage
// format, it is converted with the chunk's converters and the result is kept
// in the chunk. An error is returned if the conversion fails or if the chunk
// holds no data at all. The caller must not modify the data in the returned
// slice.
func (c *Chunk) Data() ([]byte, error) {
	switch {
	case len(c.data) > 0:
		return c.data, nil
	case len(c.storage) == 0:
		return nil, errors.New("no data in chunk")
	}

	plain, err := c.converters.fromStorage(c.storage)
	c.data = plain
	return plain, err
}
// ID returns the checksum/ID of the plain chunk data. It is calculated on the
// first call and remembered for later ones. Calculating it requires the plain
// data, which may mean converting from storage format first. If the plain
// data can't be obtained, a zero ChunkID is returned and nothing is
// remembered.
func (c *Chunk) ID() ChunkID {
	if !c.idCalculated {
		plain, err := c.Data()
		if err != nil {
			return ChunkID{}
		}
		c.id = Digest.Sum(plain)
		c.idCalculated = true
	}
	return c.id
}
// Storage returns the chunk data in the storage format defined by modifiers.
// If the chunk already holds storage data that was made with equal modifiers,
// that data is returned as is. Otherwise the plain data is converted. The
// caller must not modify the data in the returned slice.
func (c *Chunk) Storage(modifiers Converters) ([]byte, error) {
	reusable := len(c.storage) > 0 && modifiers.equal(c.converters)
	if reusable {
		return c.storage, nil
	}

	plain, err := c.Data()
	if err != nil {
		return nil, err
	}
	return modifiers.toStorage(plain)
}
desync-1.0.2/chunkadditionalinfo.go 0000664 0000000 0000000 00000000711 15212466604 0017350 0 ustar 00root root 0000000 0000000 package desync
// ChunkAdditionalInfo contains detailed information about a particular chunk.
// Some of this information, e.g. CompressedSize, is only exact for the store
// that was used when generating it, because other stores could potentially use
// different compression levels.
type ChunkAdditionalInfo struct {
ID ChunkID `json:"id"`
UncompressedSize uint64 `json:"uncompressed_size"`
CompressedSize int64 `json:"compressed_size,omitempty"`
}
desync-1.0.2/chunker.go 0000664 0000000 0000000 00000030325 15212466604 0014776 0 ustar 00root root 0000000 0000000 package desync
import (
"errors"
"fmt"
"io"
"math/bits"
)
// ChunkerWindowSize is the number of bytes in the rolling hash window. It is
// also the smallest value NewChunker accepts as minimum chunk size.
const ChunkerWindowSize = 48
// discriminatorFromAvg calculates the discriminator used by the chunker for a
// given average chunk size. The result is avg divided by a factor that is
// linear in avg, with the fraction cut off.
func discriminatorFromAvg(avg uint64) uint32 {
	a := float64(avg)
	divisor := -1.42888852e-7*a + 1.33237515
	return uint32(a / divisor)
}
// modInverse32 computes the modular multiplicative inverse of an odd number d
// modulo 2^32 using Newton's method, that is the x for which d*x is 1 in
// 32-bit arithmetic. This is used for Lemire's fast divisibility test which
// replaces expensive hardware division.
//
// The starting value is d itself, which is a correct inverse in the lowest
// 3 bits because d*d ≡ 1 (mod 8) for any odd d. Every step doubles the number
// of correct low bits, so the five steps done here are more than enough for
// all 32 bits. The result is meaningless if d is even.
func modInverse32(d uint32) uint32 {
	inv := d
	for step := 0; step < 5; step++ {
		inv *= 2 - d*inv
	}
	return inv
}
// hashTable is a table of 256 fixed 32-bit values. hashTableRotated is derived
// from it at init time.
// NOTE(review): presumably indexed by byte value in the chunker's rolling hash,
// in which case changing any entry would change where chunk boundaries fall -
// confirm against the chunking code before touching it.
var hashTable = [256]uint32{
0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68,
0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801,
0x7ebf5191, 0x841135c7, 0x65cc53b3, 0x280a597c,
0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494,
0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa,
0x882bf287, 0x3116737c, 0x05569956, 0xe8cc1f68,
0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7,
0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93,
0x9bfd7c64, 0x0b3e7276, 0xf2688e77, 0x8fad8abc,
0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00,
0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2,
0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2,
0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e,
0x7b7c222f, 0x2955ed16, 0x9f10ca59, 0xe840c4c9,
0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c,
0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5,
0xb19165cd, 0x9891c393, 0x325384ac, 0x0308459d,
0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed,
0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae,
0x977eb18c, 0xd8770976, 0x9833466a, 0xc674df7f,
0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874,
0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4,
0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493,
0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85,
0xbef8f0e1, 0x21d73653, 0x4e3d977a, 0x1e7b3929,
0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6,
0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617,
0xf5f7be70, 0xe795248a, 0x375a2fe9, 0x425570b6,
0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f,
0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f,
0x1bc0dfb5, 0xfb273589, 0x0ea47f7b, 0x3c1c2b50,
0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2,
0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c,
0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b,
0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729,
0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, 0xe0d8f8ae,
0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328,
0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f,
0x686a5b83, 0x50e072e5, 0xd9d3bb2a, 0x8befc475,
0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18,
0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09,
0xc0d0a81c, 0x7fa3429b, 0xe9158a1b, 0x225ea19a,
0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293,
0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140,
0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661,
0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9,
0x19727a23, 0x15a7e374, 0xc43a18d5, 0x3fb1aa73,
0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9,
0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2,
0x5388e5ee, 0xcd8a7510, 0xf901b4fd, 0xdbc13dbc,
0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49,
0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b,
0x32baf4a9, 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a,
0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c,
0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1,
0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319,
0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1,
0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, 0xc6eb57bb,
0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c,
0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b,
0x329e5388, 0x91dd236b, 0x2ecb0d93, 0xf4d82a3d,
0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964,
0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131,
0x7bf7cabc, 0xf9c18d66, 0x593ade65, 0xd95ddf11,
}
// hashTableRotated caches every hashTable entry rotated left by
// ChunkerWindowSize bits. The rolling hash needs exactly that value for the
// byte leaving the window, so looking it up here saves one RotateLeft32 call
// per input byte in the chunker's hot loop.
var hashTableRotated [256]uint32

// init derives hashTableRotated from hashTable once at package load.
func init() {
	for i, v := range hashTable {
		hashTableRotated[i] = bits.RotateLeft32(v, ChunkerWindowSize)
	}
}
// Chunker is used to break up a data stream into chunks of data. It buffers
// bytes read from r and uses a rolling hash over the last ChunkerWindowSize
// bytes to decide where one chunk ends and the next one begins.
type Chunker struct {
// r is the reader the data stream is pulled from.
r io.Reader
// min, avg and max are the chunk size limits given to NewChunker.
min, avg, max uint64
// start is the stream offset handed out as the start of the next chunk;
// split and Advance move it forward.
start uint64
// buf holds the bytes that have been read but not yet returned or skipped.
buf []byte
backingBuf []byte // reusable backing buffer for fillBuffer
hitEOF bool // true once the reader returned EOF
// rolling hash values: the current hash, the bytes inside the window
// (overwritten in ring fashion at hIdx) and the boundary discriminator
// derived from avg.
hValue uint32
hWindow [ChunkerWindowSize]byte
hIdx int
hDiscriminator uint32
// Precomputed values for Lemire's fast divisibility test.
// Tests "hValue % d == d-1" without hardware division by rewriting as
// "((hValue - (d-1)) >> trailShift) * inverseOdd < threshOdd"
// where d's power-of-2 factor is separated out.
// NOTE(review): the subtraction wraps around when hValue < d-1, so the
// rewritten form matches the modulo form only for hValue >= d-1.
hInverseOdd uint32 // modular inverse of odd part of hDiscriminator
hThreshOdd uint32 // ceil(2^32 / odd part of hDiscriminator)
hDiscMinus1 uint32 // hDiscriminator - 1
hTrailShift uint // number of trailing zeros in hDiscriminator
hTrailMask uint32 // (1 << hTrailShift) - 1, for checking low bits
}
// NewChunker initializes a chunker for a data stream according to min/avg/max
// chunk size.
//
// It returns an error if min is smaller than ChunkerWindowSize, if the sizes
// are not ordered min <= avg <= max, or if no usable boundary discriminator
// can be derived from avg. No data is read from r until Next is called.
func NewChunker(r io.Reader, min, avg, max uint64) (Chunker, error) {
	if min < ChunkerWindowSize {
		return Chunker{}, fmt.Errorf("min chunk size too small, must be over %d", ChunkerWindowSize)
	}
	if min > max {
		return Chunker{}, errors.New("min chunk size must not be greater than max")
	}
	if min > avg {
		return Chunker{}, errors.New("min chunk size must not be greater than avg")
	}
	if avg > max {
		return Chunker{}, errors.New("avg chunk size must not be greater than max")
	}

	disc := discriminatorFromAvg(avg)
	if disc == 0 {
		// A zero discriminator can never mark a boundary and would cause a
		// division by zero when computing the threshold below.
		return Chunker{}, errors.New("unable to derive a chunk boundary discriminator from avg chunk size")
	}

	// Precompute Lemire's fast divisibility constants.
	// We want to test: hValue % disc == disc-1
	// Rewritten as: (hValue - (disc-1)) is divisible by disc
	// Factor disc = odd * 2^shift, then use modular inverse of odd part.
	trailShift := uint(bits.TrailingZeros32(disc))
	oddPart := disc >> trailShift
	inverseOdd := modInverse32(oddPart)

	// threshOdd = ceil(2^32 / oddPart). For oddPart == 1 (disc is a power of
	// two) that is 2^32, which does not fit into 32 bits and used to wrap to
	// 0, making the "< threshOdd" comparison fail for every value. Saturate
	// instead: the shifted operand is at most 2^31-1 whenever trailShift >= 1,
	// so every value passes the odd-part test as it should.
	// NOTE(review): disc == 1 (trailShift 0) would still miss the single
	// value 0xFFFFFFFF; presumably unreachable given avg >= min >=
	// ChunkerWindowSize - confirm against discriminatorFromAvg.
	threshOdd := ^uint32(0)
	if oddPart > 1 {
		threshOdd = uint32((uint64(1)<<32 + uint64(oddPart) - 1) / uint64(oddPart))
	}

	return Chunker{
		r:              r,
		min:            min,
		avg:            avg,
		max:            max,
		hDiscriminator: disc,
		hInverseOdd:    inverseOdd,
		hThreshOdd:     threshOdd,
		hDiscMinus1:    disc - 1,
		hTrailShift:    trailShift,
		hTrailMask:     (1 << trailShift) - 1,
	}, nil
}
// fillBuffer tops up c.buf to 10*max bytes. Bytes that are still unconsumed
// are moved to the front of the (reused or newly allocated) backing buffer
// and the rest is read from c.r. It returns the number of bytes now held in
// c.buf. Reaching EOF is not an error: it is remembered in c.hitEOF and later
// calls return immediately with (0, nil).
func (c *Chunker) fillBuffer() (n int, err error) {
	// Nothing more will ever arrive after EOF, so don't touch the buffer.
	if c.hitEOF {
		return 0, nil
	}

	size := 10 * c.max

	// Keep using the backing buffer when it is large enough; otherwise
	// allocate a bigger one and remember it for the following calls.
	buf := c.backingBuf
	if uint64(cap(buf)) < size {
		buf = make([]byte, int(size))
		c.backingBuf = buf
	} else {
		buf = buf[:size]
	}

	// Carry over what is left from the previous round, then read until the
	// buffer is full or the reader reports an error (EOF included).
	n = copy(buf, c.buf)
	for uint64(n) < size {
		nn, readErr := c.r.Read(buf[n:])
		n += nn
		if readErr != nil {
			err = readErr
			break
		}
	}

	// Trim the buffer down to the bytes that are actually valid.
	c.buf = buf[:n]

	if err == io.EOF {
		c.hitEOF = true
		return n, nil
	}
	return n, err
}
// Next returns the starting position as well as the chunk data. Returns
// an empty byte slice when complete. The returned byte slice is only valid
// until the next call to Next; callers that pass the slice to other
// goroutines must copy it first.
//
// If no more than min bytes are left, they are returned as the final chunk.
// Otherwise the rolling hash is evaluated for every position past min and the
// chunk ends at the first position where hValue % hDiscriminator ==
// hDiscriminator-1, or at max (or the end of the data) if there is none.
// An error from the underlying reader is returned together with the bytes
// buffered so far.
func (c *Chunker) Next() (uint64, []byte, error) {
	if len(c.buf) < int(c.max) {
		n, err := c.fillBuffer()
		if err != nil {
			return c.split(n, err)
		}
	}

	// No need to carry on if we don't have enough bytes left to even fill the min chunk
	if len(c.buf) <= int(c.min) {
		return c.split(len(c.buf), nil)
	}

	// m is the upper boundary for the current chunk. It's either c.max if we have
	// enough bytes in the buffer, or len(c.buf)
	m := min(len(c.buf), int(c.max))

	// Initialize the rolling hash window with the ChunkerWindowSize bytes
	// immediately prior to min size. This relies on split having reset
	// hValue and hIdx to zero.
	window := c.buf[c.min-ChunkerWindowSize : c.min]
	for i, b := range window {
		c.hValue ^= bits.RotateLeft32(hashTable[b], ChunkerWindowSize-i-1)
	}
	copy(c.hWindow[:], window)

	// Position the pointer at the minimum size
	var pos = int(c.min)

	// Hoist frequently accessed struct fields into locals to avoid
	// repeated pointer dereferences in the hot loop.
	hValue := c.hValue
	hIdx := c.hIdx
	buf := c.buf
	win := &c.hWindow
	inverseOdd := c.hInverseOdd
	threshOdd := c.hThreshOdd
	discMinus1 := c.hDiscMinus1
	trailShift := c.hTrailShift
	trailMask := c.hTrailMask

	var out, in byte
	for {
		// Add a byte to the hash: the new byte replaces the oldest one in
		// the ring buffer and the hash is updated for both.
		in = buf[pos]
		out = win[hIdx]
		win[hIdx] = in
		hIdx++
		if hIdx >= ChunkerWindowSize {
			hIdx = 0
		}
		hValue = bits.RotateLeft32(hValue, 1) ^
			hashTableRotated[out] ^
			hashTable[in]

		pos++

		// didn't find a boundary before reaching the max?
		if pos >= m {
			// Write locals back (split resets them, but keep state consistent)
			c.hValue = hValue
			c.hIdx = hIdx
			return c.split(pos, nil)
		}

		// Did we find a boundary? Uses Lemire's fast divisibility test:
		// "hValue % d == d-1" ⟺ "(hValue - (d-1)) is divisible by d"
		// Factor d = oddPart * 2^trailShift, check 2-part divisibility.
		//
		// The equivalence only holds while the subtraction does not wrap
		// around. For hValue < d-1 the modulo form is always false, but the
		// wrapped difference 2^32 + hValue - (d-1) can still be a multiple
		// of d (e.g. d=3, hValue=1), which would produce a boundary that the
		// modulo definition does not have. Skip those values explicitly.
		if hValue >= discMinus1 {
			adjusted := hValue - discMinus1
			if adjusted&trailMask == 0 && (adjusted>>trailShift)*inverseOdd < threshOdd {
				c.hValue = hValue
				c.hIdx = hIdx
				return c.split(pos, nil)
			}
		}
	}
}
// split cuts the first i bytes off the buffer and returns them together with
// their starting offset in the stream and the given error, which is passed
// through untouched. The bytes after position i stay in the buffer for the
// next round, and the rolling hash state is cleared.
func (c *Chunker) split(i int, err error) (uint64, []byte, error) {
	chunkStart := c.start
	chunk, rest := c.buf[:i], c.buf[i:]

	c.buf = rest
	c.start = chunkStart + uint64(i)

	// The next chunk starts with a fresh hash.
	c.hValue, c.hIdx = 0, 0

	return chunkStart, chunk, err
}
// Advance n bytes without producing chunks. This can be used if the content of the next
// section in the file is known (i.e. it is known that there are a number of null chunks
// coming). This resets everything in the chunker and behaves as if the streams starts
// at (current position+n).
//
// A negative n is rejected with an error and leaves the chunker untouched.
// Bytes already buffered count towards n; anything beyond that is skipped on
// the underlying reader, by seeking if it implements io.Seeker and by reading
// and discarding otherwise. Errors from the reader are returned as-is.
func (c *Chunker) Advance(n int) error {
	// Moving backwards is not supported. Without this check a negative n
	// would wrap the start offset and panic when slicing the buffer.
	if n < 0 {
		return fmt.Errorf("cannot advance by a negative number of bytes: %d", n)
	}

	// We might still have bytes in the buffer. These count towards the move forward.
	// It's possible the advance stays within the buffer and doesn't impact the reader.
	c.start += uint64(n)
	if n <= len(c.buf) {
		c.buf = c.buf[n:]
		return nil
	}

	// The remainder has to be skipped on the reader itself.
	readerN := int64(n - len(c.buf))
	c.buf = nil
	if rs, ok := c.r.(io.Seeker); ok {
		_, err := rs.Seek(readerN, io.SeekCurrent)
		return err
	}
	_, err := io.CopyN(io.Discard, c.r, readerN)
	return err
}
// Min reports the minimum chunk size the chunker was created with.
func (c *Chunker) Min() uint64 {
	return c.min
}

// Avg reports the average chunk size the chunker was created with.
func (c *Chunker) Avg() uint64 {
	return c.avg
}

// Max reports the maximum chunk size the chunker was created with.
func (c *Chunker) Max() uint64 {
	return c.max
}
// Hash implements the rolling hash algorithm used to find chunk boundaries
// in a stream of bytes. Create one with NewHash, seed it with Initialize,
// feed it byte by byte with Roll and query it with IsBoundary.
type Hash struct {
value uint32 // current hash value
window []byte // bytes covered by the hash, overwritten in ring fashion by Roll
size int // length of the hash window in bytes
idx int // position in window that the next Roll replaces
discriminator uint32 // IsBoundary is true when value % discriminator == discriminator-1
}
// NewHash creates a rolling hash with a zeroed window of size bytes. The
// discriminator decides which hash values IsBoundary reports as a chunk
// boundary.
func NewHash(size int, discriminator uint32) Hash {
	var h Hash
	h.size = size
	h.discriminator = discriminator
	h.window = make([]byte, size)
	return h
}
// Roll pushes b into the hash window, replacing the oldest byte, and updates
// the hash value accordingly. The value is not meaningful until the window
// has been populated.
func (h *Hash) Roll(b byte) {
	// Swap the incoming byte for the one that drops out of the window.
	slot := h.idx
	leaving := h.window[slot]
	h.window[slot] = b
	h.idx = (slot + 1) % h.size

	// Age the hash by one position, cancel the contribution of the leaving
	// byte (which has been rotated once per window position by now) and mix
	// in the new byte.
	aged := bits.RotateLeft32(h.value, 1)
	expired := bits.RotateLeft32(hashTable[leaving], len(h.window))
	h.value = aged ^ expired ^ hashTable[b]
}
// Initialize seeds the hash with the bytes of b and copies them into the
// window. The length of b must match the window size.
func (h *Hash) Initialize(b []byte) {
	for pos := 0; pos < len(b); pos++ {
		// Earlier bytes are rotated further, as if they had been rolled in
		// one after the other.
		rotation := h.size - pos - 1
		h.value ^= bits.RotateLeft32(hashTable[b[pos]], rotation)
	}
	copy(h.window, b)
}
// IsBoundary reports whether the current hash value marks a chunk boundary,
// which is the case when its remainder modulo the discriminator is
// discriminator-1.
func (h *Hash) IsBoundary() bool {
	remainder := h.value % h.discriminator
	return remainder == h.discriminator-1
}
// Reset clears the hash value and moves the window position back to the
// start. The bytes stored in the window are left as they are.
func (h *Hash) Reset() {
	h.value, h.idx = 0, 0
}
desync-1.0.2/chunker_test.go 0000664 0000000 0000000 00000017351 15212466604 0016041 0 ustar 00root root 0000000 0000000 package desync
import (
"bytes"
"crypto/sha512"
"os"
"testing"
"github.com/stretchr/testify/require"
)
// Chunk sizes used by the chunker tests and benchmarks in this file: an
// average of 64 KiB, a minimum of a quarter and a maximum of four times the
// average.
const (
ChunkSizeAvgDefault uint64 = 64 * 1024
ChunkSizeMinDefault = ChunkSizeAvgDefault / 4
ChunkSizeMaxDefault = ChunkSizeAvgDefault * 4
)
// TestChunkerLargeFile chunks testdata/chunker.input with the default sizes
// and compares the ID, start offset and size of every chunk against a known
// good list.
func TestChunkerLargeFile(t *testing.T) {
	input, err := os.Open("testdata/chunker.input")
	require.NoError(t, err)
	defer input.Close()

	chunker, err := NewChunker(input, ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault)
	require.NoError(t, err)

	want := []struct {
		Start uint64
		Size  uint64
		ID    string
	}{
		{Start: 0, Size: 81590, ID: "ad951d7f65c27828ce390f3c81c41d75f80e4527169ad072ad720b56220f5be4"},
		{Start: 81590, Size: 46796, ID: "ef6df312072ccefe965f07669b2819902f4e9889ebe7c35a38f1dc11ee99f212"},
		{Start: 128386, Size: 36543, ID: "a816e22f4105741972eb34909b6f8ffa569759a1c2cf82ab88394b3db9019f23"},
		{Start: 164929, Size: 83172, ID: "8b8e4a274f06dc3c92d49869a699a5a8255c0bf0b48a4d3c3689aaa3e9cff090"},
		{Start: 248101, Size: 76749, ID: "583d08fc16d8d191af362a1aaecea6af062cc8afab1b301786bb717aa1b425b4"},
		{Start: 324850, Size: 79550, ID: "aefa8c5a3c86896110565b6a3748c2f985892e8ab0073730cac390cb478a913a"},
		{Start: 404400, Size: 41484, ID: "8e39f02975c8d0596e46f643b90cd290b7c0386845132eee4d415c63317773a4"},
		{Start: 445884, Size: 20326, ID: "d689ca889f2f7ba26896681214f0f0f5f5177d5820d99b1f11ddb76b693bddee"},
		{Start: 466210, Size: 31652, ID: "259de367c7ef2f51133d04e744f05918ceb93bd4b9c2bb6621ffeae70501dd09"},
		{Start: 497862, Size: 19995, ID: "01ae987ec457cacc8b3528e3254bc9c93b3f0c0b2a51619e15be16e678ef016d"},
		{Start: 517857, Size: 103873, ID: "78618b2d0539ecf45c08c7334e1c61051725767a76ba9108ad5298c6fd7cde1b"},
		{Start: 621730, Size: 38087, ID: "f44e6992cccadb08d8e18174ba3d6dd6365bdfb9906a58a9f82621ace0461c0d"},
		{Start: 659817, Size: 38377, ID: "abbf9935aaa535538c5fbff069481c343c2770207d88b94584314ee33050ae4f"},
		{Start: 698194, Size: 23449, ID: "a6c737b95ab514d6538c6ef4c42ef2f08b201c3426a88b95e67e517510cd1fb9"},
		{Start: 721643, Size: 47321, ID: "51d44e2d355d5c5b846543d47ba9569f12bbc3d49970c91913a8e3efef45e47e"},
		{Start: 768964, Size: 86692, ID: "90f7e061ed2fb1ed9594297851f8528d3ac355c98457b5dce08ee7d88f801b26"},
		{Start: 855656, Size: 28268, ID: "2dea144e5d771420e90b6e96c1e97e9c6afeda2c37ae7c95ceaf3ee2550efa08"},
		{Start: 883924, Size: 65465, ID: "7a94e051c82ec7abba32883b2eee9a2832e8e9bcc3b3151743fef533e2d46e70"},
		{Start: 949389, Size: 33255, ID: "32edd2d382045ad64d5fbd1a574f8191b700b9e0a2406bd90d2eefcf77168846"},
		{Start: 982644, Size: 65932, ID: "a8bfdadaecbee1ed16ce23d8bf771d1b3fbca2e631fc71b5adb3846c1bb2d542"},
	}

	for i := range want {
		num := i + 1
		offset, data, err := chunker.Next()
		require.NoError(t, err)

		id := ChunkID(sha512.Sum512_256(data))
		require.Equal(t, want[i].ID, id.String(), "chunk #%d hash", num)
		require.Equal(t, want[i].Start, offset, "chunk #%d start", num)
		require.Equal(t, want[i].Size, uint64(len(data)), "chunk #%d size", num)
	}

	// Once all expected chunks are out, the chunker must report the end of
	// the input with an empty chunk.
	_, rest, err := chunker.Next()
	require.NoError(t, err)
	require.Empty(t, rest, "expected size 0 at the end")
}
// TestChunkerEmptyFile confirms that chunking an empty input yields an empty
// chunk at offset 0 and no error.
func TestChunkerEmptyFile(t *testing.T) {
	chunker, err := NewChunker(
		bytes.NewReader([]byte{}),
		ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
	)
	require.NoError(t, err)

	offset, data, err := chunker.Next()
	require.NoError(t, err)
	require.Empty(t, data)
	require.Equal(t, uint64(0), offset)
}
// TestChunkerSmallFile confirms that an input shorter than the minimum chunk
// size comes back as a single chunk holding all of it.
func TestChunkerSmallFile(t *testing.T) {
	input := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

	chunker, err := NewChunker(
		bytes.NewReader(input),
		ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
	)
	require.NoError(t, err)

	offset, data, err := chunker.Next()
	require.NoError(t, err)
	require.Len(t, data, len(input))
	require.Equal(t, uint64(0), offset)
}
// TestChunkerNoBoundary feeds the chunker nothing but zero bytes. No chunk
// boundary is found in such data, so every chunk has to be cut at the max
// chunk size and start at a multiple of it.
func TestChunkerNoBoundary(t *testing.T) {
	zeros := make([]byte, 1024*1024)

	chunker, err := NewChunker(
		bytes.NewReader(zeros),
		ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
	)
	require.NoError(t, err)

	for {
		offset, data, err := chunker.Next()
		require.NoError(t, err)
		if len(data) > 0 {
			require.Equal(t, ChunkSizeMaxDefault, uint64(len(data)))
			require.Zero(t, offset%ChunkSizeMaxDefault, "unexpected start position %d", offset)
			continue
		}
		break
	}
}
// TestChunkerBounds chunks zero-filled inputs that are exactly as long as the
// min, avg and max chunk size and expects each to come back as one chunk.
func TestChunkerBounds(t *testing.T) {
	cases := []struct {
		name string
		size uint64
	}{
		{"chunker with exactly min chunk size data", ChunkSizeMinDefault},
		{"chunker with exactly avg chunk size data", ChunkSizeAvgDefault},
		{"chunker with exactly max chunk size data", ChunkSizeMaxDefault},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			input := make([]byte, tc.size)

			chunker, err := NewChunker(
				bytes.NewReader(input),
				ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault,
			)
			require.NoError(t, err)

			offset, data, err := chunker.Next()
			require.NoError(t, err)
			require.Len(t, data, len(input))
			require.Equal(t, uint64(0), offset)
		})
	}
}
// TestChunkerAdvance confirms that Advance skips over input without producing
// chunks, both when the skipped range is still inside the chunker's buffer
// and when it reaches beyond it.
func TestChunkerAdvance(t *testing.T) {
	// The input is laid out as null chunk + dataA + null chunk + dataB.
	// Skipping both data sections must leave nothing but the null chunks.
	dataA := bytes.Repeat([]byte{'a'}, 128) // Short slice

	// Long slice to ensure we read past the chunker-internal buffer
	dataB := bytes.Repeat([]byte{'b'}, int(12*ChunkSizeMaxDefault))

	nullChunk := NewNullChunk(ChunkSizeMaxDefault)
	input := join(nullChunk.Data, dataA, nullChunk.Data, dataB)

	chunker, err := NewChunker(bytes.NewReader(input), ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault)
	require.NoError(t, err)

	// The first chunk is the leading null chunk.
	_, data, err := chunker.Next()
	require.NoError(t, err)
	require.Equal(t, nullChunk.Data, data, "expected null chunk")

	// Jump over dataA; the second null chunk follows.
	require.NoError(t, chunker.Advance(len(dataA)))
	_, data, err = chunker.Next()
	require.NoError(t, err)
	require.Equal(t, nullChunk.Data, data, "expected null chunk")

	// Jump over dataB, which ends the input.
	require.NoError(t, chunker.Advance(len(dataB)))
	_, data, err = chunker.Next()
	require.NoError(t, err)
	require.Empty(t, data, "expected end of input")
}
// Package-level sinks for the chunker benchmarks. Each benchmark stores the
// start offset and data of the chunks it produces here so the results are
// used and the compiler cannot optimize the chunking work away.
var (
chunkStart uint64
chunkBuf []byte
)
// BenchmarkChunker measures chunking of testdata/chunker.input with the
// default chunk sizes. The file is loaded into memory up front, so only the
// chunker is timed.
func BenchmarkChunker(b *testing.B) {
	input, err := os.ReadFile("testdata/chunker.input")
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(int64(len(input)))
	b.ResetTimer()

	for b.Loop() {
		chunker, err := NewChunker(bytes.NewReader(input), ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault)
		if err != nil {
			b.Fatal(err)
		}
		for {
			offset, data, err := chunker.Next()
			if err != nil {
				b.Fatal(err)
			}
			if len(data) > 0 {
				// Keep the results alive in the package-level sinks.
				chunkStart, chunkBuf = offset, data
				continue
			}
			break
		}
	}
}
// benchmarkChunkNull measures chunking of size zero bytes with the default
// chunk sizes.
func benchmarkChunkNull(b *testing.B, size int) {
	zeros := make([]byte, size)
	b.SetBytes(int64(size))

	for b.Loop() {
		chunker, err := NewChunker(bytes.NewReader(zeros), ChunkSizeMinDefault, ChunkSizeAvgDefault, ChunkSizeMaxDefault)
		if err != nil {
			b.Fatal(err)
		}
		for {
			offset, data, err := chunker.Next()
			if err != nil {
				b.Fatal(err)
			}
			if len(data) > 0 {
				// Keep the results alive in the package-level sinks.
				chunkStart, chunkBuf = offset, data
				continue
			}
			break
		}
	}
}
// BenchmarkChunkNull1M chunks 1 MiB of zero bytes.
func BenchmarkChunkNull1M(b *testing.B) {
	benchmarkChunkNull(b, 1024*1024)
}

// BenchmarkChunkNull10M chunks 10 MiB of zero bytes.
func BenchmarkChunkNull10M(b *testing.B) {
	benchmarkChunkNull(b, 10*1024*1024)
}

// BenchmarkChunkNull50M chunks 50 MiB of zero bytes.
func BenchmarkChunkNull50M(b *testing.B) {
	benchmarkChunkNull(b, 50*1024*1024)
}

// BenchmarkChunkNull100M chunks 100 MiB of zero bytes.
func BenchmarkChunkNull100M(b *testing.B) {
	benchmarkChunkNull(b, 100*1024*1024)
}
desync-1.0.2/chunkstorage.go 0000664 0000000 0000000 00000003624 15212466604 0016036 0 ustar 00root root 0000000 0000000 package desync
import (
"sync"
)
// ChunkStorage stores chunks in a writable store. It can be safely used by multiple goroutines and
// contains an internal cache of what chunks have been stored previously.
type ChunkStorage struct {
// The embedded mutex guards the processed map.
sync.Mutex
// ws is the store the chunks are written to.
ws WriteStore
// processed holds the IDs of all chunks that have been marked as
// processed and not unmarked again.
processed map[ChunkID]struct{}
}
// NewChunkStorage returns a ChunkStorage that writes to ws and starts out
// with no chunks marked as processed.
func NewChunkStorage(ws WriteStore) *ChunkStorage {
	return &ChunkStorage{
		ws:        ws,
		processed: map[ChunkID]struct{}{},
	}
}
// markProcessed records id in the in-memory cache of processed chunks. It
// returns true if the ID had been marked before, meaning the chunk is
// presumably stored already, and false if this call marked it first.
func (s *ChunkStorage) markProcessed(id ChunkID) bool {
	s.Lock()
	defer s.Unlock()

	if _, seen := s.processed[id]; seen {
		return true
	}
	s.processed[id] = struct{}{}
	return false
}
// unmarkProcessed removes id from the in-memory cache of processed chunks.
// It is used when a chunk was marked as processed but then failed to be
// stored; unmarking makes the chunk eligible for another attempt.
func (s *ChunkStorage) unmarkProcessed(id ChunkID) {
	s.Lock()
	delete(s.processed, id)
	s.Unlock()
}
// StoreChunk stores a single chunk in a synchronous manner.
//
// A chunk that has already been marked as processed is skipped and nil is
// returned. Otherwise the chunk is written to the store unless the store
// reports that it has it already. If checking for or storing the chunk
// fails, the error is returned and the chunk is unmarked again so that a
// later call can retry it.
func (s *ChunkStorage) StoreChunk(chunk *Chunk) (err error) {
	id := chunk.ID()

	// Mark this chunk as done so no other goroutine will attempt to store it
	// at the same time. If this is the first time this chunk is marked, it'll
	// return false and we need to continue processing/storing the chunk below.
	if s.markProcessed(id) {
		return nil
	}

	// The chunk was marked as "processed" above. If anything below fails we
	// need to unmark it again. This has to be registered before the HasChunk
	// call: otherwise an error from HasChunk would leave the chunk marked and
	// every later attempt to store it would be skipped silently.
	defer func() {
		if err != nil {
			s.unmarkProcessed(id)
		}
	}()

	// Skip this chunk if the store already has it
	hasChunk, err := s.ws.HasChunk(id)
	if err != nil || hasChunk {
		return err
	}

	// Store the compressed chunk
	return s.ws.StoreChunk(chunk)
}
desync-1.0.2/cmd/ 0000775 0000000 0000000 00000000000 15212466604 0013550 5 ustar 00root root 0000000 0000000 desync-1.0.2/cmd/desync/ 0000775 0000000 0000000 00000000000 15212466604 0015035 5 ustar 00root root 0000000 0000000 desync-1.0.2/cmd/desync/.gitignore 0000664 0000000 0000000 00000000007 15212466604 0017022 0 ustar 00root root 0000000 0000000 desync
desync-1.0.2/cmd/desync/cache.go 0000664 0000000 0000000 00000006757 15212466604 0016446 0 ustar 00root root 0000000 0000000 package main
import (
"context"
"errors"
"github.com/folbricht/desync"
"github.com/spf13/cobra"
)
// cacheOptions holds the settings of the cache command.
type cacheOptions struct {
cmdStoreOptions
stores []string // source store(s), from -s/--store
cache string // target store the chunks are copied to, from -c/--cache
ignoreIndexes []string // index(es) with chunks to be excluded, from --ignore
ignoreChunks []string // text file(s) with chunk IDs to be excluded, from --ignore-chunks
}
// newCacheCommand builds the "cache" command, which copies the chunks
// referenced by one or more indexes from the source store(s) into a target
// store. ctx is handed on to runCache when the command is executed.
func newCacheCommand(ctx context.Context) *cobra.Command {
	var opt cacheOptions

	// opt is filled in by the flag parser before run is invoked.
	run := func(cmd *cobra.Command, args []string) error {
		return runCache(ctx, opt, args)
	}

	cmd := &cobra.Command{
		Use:   "cache [...]",
		Short: "Read indexes and copy the referenced chunks",
		Long: `Read chunk IDs from one or more index files (caibx or caidx) and copy the
referenced chunks from the source store(s) into the target store given with
-c, without assembling any blob on disk. This can be used to pre-populate a
cache, or to replicate the chunks referenced by indexes into another store.
Use '-' to read (a single) index from STDIN.
To exclude chunks that are known to exist in the target store already, use
--ignore which will skip any chunks from the given index. The same can
be achieved by providing the chunks in their ASCII representation in a text
file with --ignore-chunks .`,
		Example:      ` desync cache -s http://192.168.1.1/ -c /path/to/local file.caibx`,
		Args:         cobra.MinimumNArgs(1),
		RunE:         run,
		SilenceUsage: true,
	}

	fs := cmd.Flags()
	fs.StringSliceVarP(&opt.stores, "store", "s", nil, "source store(s)")
	fs.StringVarP(&opt.cache, "cache", "c", "", "target store the chunks are copied to")
	fs.StringSliceVarP(&opt.ignoreIndexes, "ignore", "", nil, "index(es) with chunks to be excluded")
	fs.StringSliceVarP(&opt.ignoreChunks, "ignore-chunks", "", nil, "text file with chunk IDs to be excluded")
	addStoreOptions(&opt.cmdStoreOptions, fs)

	return cmd
}
// runCache implements the cache command. It collects the chunk IDs referenced
// by the index files in args, drops those listed in the ignore indexes and
// ignore files, and copies what is left from the source store(s) into the
// target store.
func runCache(ctx context.Context, opt cacheOptions, args []string) error {
	if err := opt.cmdStoreOptions.validate(); err != nil {
		return err
	}
	switch {
	case len(opt.stores) == 0:
		return errors.New("no source store provided")
	case opt.cache == "":
		return errors.New("no target cache store provided")
	}

	// Gather the chunk IDs of all input indexes in a set to de-dup them.
	wanted := make(map[desync.ChunkID]struct{})
	for _, name := range args {
		index, err := readCaibxFile(name, opt.cmdStoreOptions)
		if err != nil {
			return err
		}
		for _, chunk := range index.Chunks {
			wanted[chunk.ID] = struct{}{}
		}
	}

	// Drop every chunk that is referenced in one of the indexes to ignore.
	for _, name := range opt.ignoreIndexes {
		index, err := readCaibxFile(name, opt.cmdStoreOptions)
		if err != nil {
			return err
		}
		for _, chunk := range index.Chunks {
			delete(wanted, chunk.ID)
		}
	}

	// Same for the chunk IDs listed in ASCII text files.
	for _, name := range opt.ignoreChunks {
		ignored, err := readChunkIDFile(name)
		if err != nil {
			return err
		}
		for _, id := range ignored {
			delete(wanted, id)
		}
	}

	// Flatten the set into a slice for the copy below.
	ids := make([]desync.ChunkID, 0, len(wanted))
	for id := range wanted {
		ids = append(ids, id)
	}

	src, err := multiStoreWithRouter(opt.cmdStoreOptions, opt.stores...)
	if err != nil {
		return err
	}
	defer src.Close()

	dst, err := WritableStore(opt.cache, opt.cmdStoreOptions)
	if err != nil {
		return err
	}
	defer dst.Close()

	// Progress is reported through a progress bar.
	pb := desync.NewProgressBar("")

	// Pull all the chunks, and load them into the cache in the process
	return desync.Copy(ctx, ids, src, dst, opt.n, pb)
}
desync-1.0.2/cmd/desync/cache_test.go 0000664 0000000 0000000 00000002163 15212466604 0017470 0 ustar 00root root 0000000 0000000 package main
import (
"context"
"io"
"os"
"testing"
"github.com/stretchr/testify/require"
)
// TestCacheCommand runs the cache command against the test stores and indexes
// and confirms that chunks end up in the target cache directory.
func TestCacheCommand(t *testing.T) {
	for _, test := range []struct {
		name string
		args []string
	}{
		{"singe store, single index",
			[]string{"--store", "testdata/blob1.store", "testdata/blob1.caibx"}},
		{"multiple store, single index",
			[]string{"--store", "testdata/blob1.store", "--store", "testdata/blob2.store", "testdata/blob1.caibx"}},
		{"multiple store, multiple index",
			[]string{"--store", "testdata/blob1.store", "--store", "testdata/blob2.store", "testdata/blob1.caibx", "testdata/blob2.caibx"}},
	} {
		t.Run(test.name, func(t *testing.T) {
			cache := t.TempDir()

			cmd := newCacheCommand(context.Background())
			cmd.SetArgs(append(test.args, "-c", cache))

			// Redirect the command's output to turn off the progressbar and run it.
			// stderr is a package-level variable, so put the previous writer
			// back once the subtest is done to keep it from leaking into
			// other tests.
			prevStderr := stderr
			t.Cleanup(func() { stderr = prevStderr })
			stderr = io.Discard

			// SetOut and SetErr replace the deprecated SetOutput.
			cmd.SetOut(io.Discard)
			cmd.SetErr(io.Discard)

			_, err := cmd.ExecuteC()
			require.NoError(t, err)

			// If the chunks were copied, the cache dir is no longer empty
			dirs, err := os.ReadDir(cache)
			require.NoError(t, err)
			require.NotEmpty(t, dirs)
		})
	}
}
desync-1.0.2/cmd/desync/cat.go 0000664 0000000 0000000 00000005112 15212466604 0016132 0 ustar 00root root 0000000 0000000 package main
import (
"context"
"errors"
"io"
"os"
"github.com/folbricht/desync"
"github.com/spf13/cobra"
)
// catOptions holds the settings of the cat command.
type catOptions struct {
cmdStoreOptions
stores []string
cache string
// NOTE(review): offset and length presumably select the byte range of the
// blob to output - confirm against the command's flag definitions.
offset, length int
}
func newCatCommand(ctx context.Context) *cobra.Command {
var opt catOptions
cmd := &cobra.Command{
Use: "cat [